Discuss / Python / 作业(HTMLParser, Request, Re)

作业(HTMLParser, Request, Re)

Topic source
from urllib import requestfrom html.parser import HTMLParserimport re# 这个函数用来获取属性def _attr(attrlist, attrname):    for attr in attrlist:        if attr[0] == attrname:            return attr[1]    return None#class MyHTMLParser(HTMLParser):    # 根据HTMLParser文档,用户需要自己创建相应的子类来继承HTMLParser,并且复写相应的handler方法。    # 初始函数    def __init__(self):        HTMLParser.__init__(self)        self.location_flag = False        self.title_flag = False        self.time_flag = False    # 处理开始标签,这里的attrs获取到的是属性列表,属性以元组方式表示    # 获取class,选取需要的标签    def handle_starttag(self, tag, attrs):        if tag == 'a' and re.match(r'^/events/python-events/(\d{4})/$', _attr(attrs, 'href')):            self.title_flag = True        elif tag == 'span' and _attr(attrs, 'class') == 'event-location':            self.location_flag = True        elif tag == 'time':            self.time_flag = True        else:            self.location_flag = False            self.title_flag = False            self.time_flag = False    def handle_endtag(self, tag):        # print('</%s>' % tag)        pass    def handle_startendtag(self, tag, attrs):        # print('<%s/>' % tag)        pass    def handle_data(self, data):        if self.title_flag == True:            print('event-title:', data)            self.title_flag = False  # 这一步赋值给flag是避免在后面判断时,相应flag始终为True        elif self.location_flag == True:            print('event-location:', data)            self.location_flag = False        elif self.time_flag == True:            print('time:', data)            self.time_flag = False    def handle_comment(self, data):        pass    def handle_entityref(self, name):        print('&%s;' % name)        # pass    def handle_charref(self, name):        # print('&#%s;' % name)        passurl = 'https://www.python.org/events/python-events/'with request.urlopen(url) as response:    data = response.read().decode('utf-8')parser = MyHTMLParser()parser.feed(data)parser.close()
from urllib import request
from html.parser import HTMLParser
import re


# 这个函数用来获取属性
def _attr(attrlist, attrname):
    for attr in attrlist:
        if attr[0] == attrname:
            return attr[1]
    return None

#
class MyHTMLParser(HTMLParser):
    """Print event titles, locations and times found in an HTML page.

    As the html.parser documentation describes, HTMLParser is used by
    subclassing it and overriding the handler methods. Here
    ``handle_starttag`` raises a flag when it meets a tag of interest, and
    ``handle_data`` prints the next piece of text under the matching label
    and lowers that flag again.
    """

    # href of a link to a single event: /events/python-events/<4 digits>/
    _EVENT_HREF = re.compile(r'^/events/python-events/(\d{4})/$')

    def __init__(self):
        """Initialise the base parser and start with every flag lowered."""
        super().__init__()
        self.location_flag = False
        self.title_flag = False
        self.time_flag = False

    def handle_starttag(self, tag, attrs):
        """Raise the flag matching an interesting start tag, else clear all.

        *attrs* is the tag's attribute list, each attribute being a
        (name, value) tuple. Tags of interest are:

        * ``<a>`` whose href matches ``_EVENT_HREF``  -> event title
        * ``<span class="event-location">``            -> event location
        * ``<time>``                                   -> event time

        Any other start tag lowers all three flags.
        """
        # An <a> without an href (or with a valueless one) yields None, which
        # re.match() rejects with TypeError, so only match real strings.
        href = _attr(attrs, 'href') if tag == 'a' else None
        if href is not None and self._EVENT_HREF.match(href):
            self.title_flag = True
        elif tag == 'span' and _attr(attrs, 'class') == 'event-location':
            self.location_flag = True
        elif tag == 'time':
            self.time_flag = True
        else:
            self.location_flag = False
            self.title_flag = False
            self.time_flag = False

    def handle_endtag(self, tag):
        """Ignore end tags."""
        pass

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags such as ``<br/>``."""
        pass

    def handle_data(self, data):
        """Print *data* under the label of the first raised flag, if any.

        The flag is lowered once its text has been printed; otherwise it
        would stay True and later, unrelated text would be printed too.
        """
        if self.title_flag:
            print('event-title:', data)
            self.title_flag = False
        elif self.location_flag:
            print('event-location:', data)
            self.location_flag = False
        elif self.time_flag:
            print('time:', data)
            self.time_flag = False

    def handle_comment(self, data):
        """Ignore HTML comments."""
        pass

    def handle_entityref(self, name):
        """Echo a named character reference such as ``&gt;``.

        NOTE(review): per the html.parser docs this handler is not called
        while ``convert_charrefs`` is True, which is the default used here.
        """
        print(f'&{name};')

    def handle_charref(self, name):
        """Ignore numeric character references."""
        pass


# Download the python.org events listing and push the whole page through the
# parser, which prints what it finds while it is being fed.
url = 'https://www.python.org/events/python-events/'
parser = MyHTMLParser()

with request.urlopen(url) as response:
    data = str(response.read(), 'utf-8')

parser.feed(data)
parser.close()

  • 1

Reply