# Homework (HTMLParser, Request, Re)
# Topic source
from urllib import request
from html.parser import HTMLParser
import re
# 这个函数用来获取属性
def _attr(attrlist, attrname):
for attr in attrlist:
if attr[0] == attrname:
return attr[1]
return None
#
class MyHTMLParser(HTMLParser):
    """Print the text that follows event links, location spans and time tags.

    HTMLParser is meant to be subclassed with the handler methods
    overridden. This subclass keeps three boolean flags; a start tag of
    interest raises the matching flag and the next text chunk delivered to
    ``handle_data`` is printed with a label, after which that flag is
    lowered again.

    Attributes:
        title_flag: Set after ``<a href="/events/python-events/NNNN/">``.
        location_flag: Set after ``<span class="event-location">``.
        time_flag: Set after any ``<time>`` start tag.
    """

    # Compiled once instead of on every start tag. The pattern text is the
    # same one the original passed to re.match.
    _EVENT_HREF = re.compile(r'^/events/python-events/(\d{4})/$')

    def __init__(self):
        """Initialise the base parser and clear all three flags."""
        super().__init__()
        self.location_flag = False
        self.title_flag = False
        self.time_flag = False

    def handle_starttag(self, tag, attrs):
        """Raise the flag that matches *tag*, or clear every flag.

        Args:
            tag: Tag name as delivered by HTMLParser.
            attrs: List of ``(name, value)`` tuples for the tag's attributes.
        """
        # Only anchors are inspected for href. An <a> without an href (or
        # with a valueless one) yields None, which must not reach the regex:
        # matching against None raises TypeError.
        href = _attr(attrs, 'href') if tag == 'a' else None
        if href is not None and self._EVENT_HREF.match(href):
            self.title_flag = True
        elif tag == 'span' and _attr(attrs, 'class') == 'event-location':
            self.location_flag = True
        elif tag == 'time':
            self.time_flag = True
        else:
            # Any other start tag cancels a pending capture so that its text
            # is not mistaken for a title, location or time.
            self.location_flag = False
            self.title_flag = False
            self.time_flag = False

    def handle_endtag(self, tag):
        """Ignore end tags."""

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags.

        Overriding this as a no-op means ``<tag/>`` never reaches
        ``handle_starttag``, so such tags neither raise nor clear a flag.
        """

    def handle_data(self, data):
        """Print *data* with a label if a flag is pending, then lower it.

        At most one flag is consumed per call, checked in the order
        title, location, time.
        """
        if self.title_flag:
            print('event-title:', data)
            # Lower the flag so later text chunks are not printed as well.
            self.title_flag = False
        elif self.location_flag:
            print('event-location:', data)
            self.location_flag = False
        elif self.time_flag:
            print('time:', data)
            self.time_flag = False

    def handle_comment(self, data):
        """Ignore HTML comments."""

    def handle_entityref(self, name):
        """Print a named character reference as ``&name;``.

        NOTE(review): with HTMLParser's default ``convert_charrefs=True`` the
        base class converts references itself and does not call this hook;
        confirm whether this output is still wanted.
        """
        print('&%s;' % name)

    def handle_charref(self, name):
        """Ignore numeric character references."""
# Download the events listing and run it through the parser, which prints
# each captured title, location and time as it goes.
url = 'https://www.python.org/events/python-events/'
with request.urlopen(url) as response:
    # Decode with the charset announced in the response headers and fall
    # back to UTF-8 when none is declared.
    data = response.read().decode(
        response.headers.get_content_charset() or 'utf-8'
    )
parser = MyHTMLParser()
parser.feed(data)
# close() makes the parser process any input it is still buffering.
parser.close()
# - 1
# User 7686226831