from urllib.request import urlopen
import bs4
# A function to get the content of a web page as a string (basically, what we did manually in Lecture 39).
def urlreadtext(url, *args, **kwargs):
    """
    Retrieve URL `url` and return the response body decoded as a string.

    If the Content-Type header specifies a charset, that charset is used.
    Otherwise UTF-8 decoding is attempted. Additional arguments are passed
    to `urllib.request.urlopen`.

    A message naming the URL is printed before the request is made.

    Raises `UnicodeDecodeError` if the body cannot be decoded. The error's
    `reason` names the URL and the content type reported by the response,
    so a non-text response (an image, a PDF, ...) is easy to recognize.
    """
    print(f"Opening URL '{url}'")
    with urlopen(url, *args, **kwargs) as res:
        # Get raw data (bytes)
        data = res.read()
        # Charset declared in the Content-Type header, or None if absent
        encoding = res.headers.get_content_charset()
        # Remembered only so that a decoding failure can be explained
        content_type = res.headers.get_content_type()
    if encoding is None:
        # Danger: no encoding was specified in the headers
        # Try using UTF-8
        encoding = "UTF-8"
    try:
        # Convert to string and return
        return data.decode(encoding)
    except UnicodeDecodeError as err:
        # Re-raise the same exception type (so existing handlers still
        # match) with a reason that says what was being decoded.
        raise UnicodeDecodeError(
            err.encoding,
            err.object,
            err.start,
            err.end,
            f"{err.reason} (while decoding {url!r}, "
            f"content type {content_type!r})",
        ) from err
# Let's apply caching so that we only make one request to any given
# URL, even if `urlreadtext` is called many times
import functools
# maxsize=None makes the cache unbounded: results are never evicted.
# The cache key is built from the call arguments, so they must be hashable.
urlreadtext = functools.lru_cache(maxsize=None)(urlreadtext)
# Same URL twice: the second call is answered from the cache, so the
# function body (including its "Opening URL" message) runs only once.
urlreadtext("http://example.com")
urlreadtext("http://example.com")
# A function to generate URLs for MCS 275 related things.
def mcs275url(itemtype, number):
    """
    Return the URL of an MCS 275 course item.

    `itemtype` selects the kind of item and must be one of "homework",
    "homework solution", "worksheet", "worksheet solution", "project",
    or "lecture". `number` is substituted into the file name with
    `str.format`.

    Raises `ValueError` if `itemtype` is not one of the known kinds.
    """
    urlbase = "http://www.dumas.io/teaching/2022/spring/mcs275/"
    # Path (relative to `urlbase`) for each item type; `{}` is the number
    sub_fmts = {
        "homework": "nbview/homework/homework{}.html",
        "homework solution": "nbview/homework/homework{}soln.html",
        "worksheet": "nbview/worksheets/worksheet{}.html",
        "worksheet solution": "nbview/worksheets/worksheet{}soln.html",
        "project": "nbview/projects/project{}.html",
        "lecture": "slides/lecture{}.html",
    }
    try:
        sub_fmt = sub_fmts[itemtype]
    except KeyError:
        # List the valid names plainly instead of showing a dict_keys repr
        valid = ", ".join(repr(name) for name in sub_fmts)
        raise ValueError(
            "itemtype '{}' unknown; must be one of {}".format(itemtype, valid)
        ) from None
    return urlbase + sub_fmt.format(number)
# Spot-check the URL builder on a few item types. These calls only build
# strings; nothing is fetched, and the results are not stored.
mcs275url("lecture",40)
mcs275url("homework solution",12)
mcs275url("project",4) # reminder: Due Fri 29 April!
# Fetch the lecture 40 slides and look at the first 100 characters.
s = urlreadtext(mcs275url("lecture",40))
s[:100]
# Let's get the title of lecture 40.
# Naive approach: locate the <title> element by plain string search.
s = urlreadtext(mcs275url("lecture",40))
open_tag = "<title>"
start = s.find(open_tag)
# Only look for the closing tag after the opening one. str.find returns -1
# when the text is absent, which would otherwise turn into a bogus slice.
stop = s.find("</title>", start + len(open_tag)) if start != -1 else -1
if stop == -1:
    print("No <title> element found in lecture 40")
else:
    print("The title of lecture 40 is:",s[start+len(open_tag):stop])
import html.parser
class TitleExtractor(html.parser.HTMLParser):
    """
    HTML parser that records the text of the first <title> element it sees.

    After calling `feed`, the text is available in the `captured` attribute
    (an empty string if no <title> was encountered). `listening` is True
    only while the parser is inside that <title> element.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.listening = False
        self.captured = ""
        # Set once the first <title> has been closed, so that any later
        # <title> element does not get appended to the captured text.
        self._title_done = False

    def handle_starttag(self, tag, attrs):
        """Start capturing when the first <title> tag opens."""
        if tag == "title" and not self._title_done:
            self.listening = True

    def handle_endtag(self, tag):
        """Stop capturing when the <title> being captured closes."""
        if tag == "title" and self.listening:
            self.listening = False
            self._title_done = True

    def handle_data(self, data):
        """Append text to `captured` while inside the <title> element."""
        if self.listening:
            self.captured += data
# Run the lecture 40 HTML through the parser and report the text it
# collected from the <title> element.
X = TitleExtractor()
X.feed(urlreadtext(mcs275url("lecture",40)))
print("The title of lecture 40 is:",X.captured)
from bs4 import BeautifulSoup
# Same task with Beautiful Soup: parse the page, then read the text of its
# <title> element through the `title` attribute of the parsed document.
soup = BeautifulSoup(
urlreadtext(mcs275url("lecture",40)),
"html.parser" # Use Python's built-in parser
)
print("The title of lecture 40 is:",soup.title.text)
from bs4 import BeautifulSoup
import time
# Build a mapping from lecture number to the <title> text of its slides.
lecture_titles = dict()
for n in range(1, 41):
    # Short pause before each lookup (presumably to space out requests
    # to the server)
    time.sleep(0.1)
    soup = BeautifulSoup(
        urlreadtext(mcs275url("lecture", n)),
        features="html.parser",  # Python's built-in parser
    )
    lecture_titles[n] = soup.title.text
lecture_titles
from bs4 import BeautifulSoup
# Parse lecture 40 again and look at its <section> elements.
# NOTE(review): the comments below treat each <section> as one slide —
# confirm against the slide markup.
soup = BeautifulSoup(
urlreadtext(mcs275url("lecture",40)),
"html.parser" # Use Python's built-in parser
)
soup.find("section").h1 # Does the first slide have an h1?
len(soup.find_all("section")) # number of slides in lecture 40
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# Count the <section> elements (treated here as slides) in each lecture
# and show the counts as a bar chart.
lecture_numbers = list(range(1, 41))
lecture_slide_counts = []
# Iterate the same list that is used for the x axis, so the bar positions
# and the counts cannot get out of step if the range is edited.
for n in lecture_numbers:
    soup = BeautifulSoup(
        urlreadtext(mcs275url("lecture", n)),
        "html.parser"  # Use Python's built-in parser
    )
    lecture_slide_counts.append(len(soup.find_all("section")))
plt.bar(lecture_numbers, lecture_slide_counts)
plt.title("Number of slides in MCS 275 lectures")
plt.xlabel("Lecture number")
plt.show()