Discuss - 廖雪峰的官方网站

Discuss / Python / 借鉴了其他同学的思路，我把注释完善下，方便新入坑的同学理解

Back

借鉴了其他同学的思路，我把注释完善下，方便新入坑的同学理解

Topic source

叫是先生

#1 Created at ... [Delete] [Delete and Lock User]

from html.parser import HTMLParser
from urllib import request
import urllib
import ssl
import re


class MyHTMLParser(HTMLParser):
    """Print selected pieces of text from an HTML event listing.

    The parser tracks a single "current field" marker. An opening tag that
    matches one of the rules below sets the marker, every closing tag clears
    it, and text encountered while the marker is set is printed to stdout.
    """

    # Each rule is (tag name, required (attribute, value) pair or None,
    # marker to set). HTMLParser hands ``attrs`` over as a list of
    # (name, value) tuples, so a membership test checks for an exact
    # attribute/value pair.
    _FIELD_RULES = (
        ('h3', ('class', 'event-title'), 'title'),
        ('time', None, 'time'),
        ('span', ('class', 'say-no-more'), 'year'),
        ('span', ('class', 'event-location'), 'location'),
    )

    def __init__(self):
        # Equivalent to super(MyHTMLParser, self).__init__().
        super().__init__()
        # Marker naming the field whose text is currently being read;
        # the empty string means "not inside a field of interest".
        self.__flag = ''

    def handle_starttag(self, tag, attrs):
        """Set the field marker when *tag*/*attrs* match a known rule.

        All rules are evaluated in order, so if several match the last
        one decides the marker. A non-matching tag leaves it untouched.
        """
        for wanted_tag, wanted_attr, field in self._FIELD_RULES:
            if tag != wanted_tag:
                continue
            if wanted_attr is None or wanted_attr in attrs:
                self.__flag = field

    def handle_endtag(self, tag):
        """Clear the field marker on any closing tag."""
        self.__flag = ''

    def handle_data(self, data):
        """Print *data* if it belongs to the field currently marked."""
        field = self.__flag
        if field in ('title', 'time', 'location'):
            print(data)
            if field == 'location':
                # The location is the last field of an entry, so close
                # the entry with a separator line.
                print('---------------')
        elif field == 'year':
            # Other <span class="say-no-more"> elements exist that do not
            # hold a year; keep only text that starts with one whitespace
            # character followed by four digits.
            if re.match(r'\s\d{4}', data):
                print(data)



# Fetch the python.org events page and run it through the parser, which
# prints the fields it recognises.
#
# SECURITY NOTE: ssl._create_unverified_context() is a private helper that
# turns off certificate verification. It is kept here as the original
# workaround for local SSL verification failures, but it leaves the request
# open to man-in-the-middle tampering; prefer ssl.create_default_context()
# once the local certificate store is fixed.
context = ssl._create_unverified_context()
parser = MyHTMLParser()
URL = 'https://www.python.org/events/python-events/'
with urllib.request.urlopen(URL, context=context) as f:
    # The read must happen inside the ``with`` body: the response is closed
    # as soon as the block exits.
    data = f.read()
parser.feed(data.decode('utf-8'))

1

Reply