from html.parser import HTMLParser
class PythonConferenceParser(HTMLParser):
    """Collect conference records from an HTML events listing.

    The whole document is parsed on construction. Each
    ``<h3 class="event-title">`` start tag opens a new record; a
    ``<time datetime=...>`` tag and a ``<span class="event-location">`` tag
    add further fields to the newest record. The next non-blank text seen by
    the parser fills the first field of the newest record that is still empty.

    Attributes:
        conferences: List of dicts, one per event title found. Possible keys
            are ``'event_title'``, ``'datetime'`` and ``'event_location'``;
            a key is present only if its tag was encountered, and its value
            stays ``None`` until text for it is seen.
    """

    def __init__(self, html):
        """Parse *html* immediately and populate ``self.conferences``."""
        super().__init__()
        self.conferences = []
        self.feed(html)

    def handle_starttag(self, name, attrs):
        """Open a new record or register a pending field on the newest one.

        Args:
            name: Lower-cased tag name.
            attrs: List of ``(attribute, value)`` tuples for the tag.
        """
        attrs = dict(attrs)
        css_class = attrs.get('class')
        if name == 'h3' and css_class == 'event-title':
            self.conferences.append({'event_title': None})
            return
        if not self.conferences:
            # A <time>/<span> seen before any event title has no record to
            # attach to; ignore it instead of failing on conferences[-1].
            return
        if name == 'time' and 'datetime' in attrs:
            self.conferences[-1]['datetime'] = None
        elif name == 'span' and css_class == 'event-location':
            self.conferences[-1]['event_location'] = None

    def handle_data(self, data):
        """Store *data* in the first still-empty field of the newest record."""
        if not data.strip():
            # Whitespace between tags (e.g. a newline before a nested <a>)
            # must not use up the slot meant for the real text.
            return
        key = self._get_next_attr()
        if key is not None:
            self.conferences[-1][key] = data

    def _get_next_attr(self):
        """Return the first key of the newest record whose value is None.

        Returns None when there are no records or no field is pending.
        """
        if not self.conferences:
            return None
        for key, value in self.conferences[-1].items():
            if value is None:
                return key
        return None

    def __str__(self):
        """Return the string form of the collected records list."""
        return str(self.conferences)
# Parse the page and print every extracted conference record, one per line.
# NOTE(review): `html` is not defined in the visible code; it presumably has
# to hold the page source before this point -- confirm.
_parser = PythonConferenceParser(html)
conferences = _parser.conferences
for conference in conferences:
    print(conference)
# 求实-探针  (stray non-code text; kept as a comment so the script can run)