Discuss / Python / HTML 没有XML格式严谨, 不好弄的感觉... 写了一个相当复杂的方式实现...

HTML 没有XML格式严谨, 不好弄的感觉... 写了一个相当复杂的方式实现...

Topic source

一雷叔一

#1 Created at ... [Delete] [Delete and Lock User]

from html.entities import name2codepoint
from html.parser import HTMLParser

# Sliding window over the (at most) two most recently opened tags.
# Stag[1] is overwritten with ' ' once that tag has been closed.
Stag = []
# Attributes of the tags currently in the window, keyed by tag name.
Satt = {}
# One list per event: the title text first, followed by any time /
# location text that was seen after it.
events = []


class MyHTMLParser(HTMLParser):
    """Collect event titles, times and locations into the module-level ``events``.

    The parser keeps no state of its own: it tracks the two most recently
    opened tags in ``Stag`` and their attributes in ``Satt``, and appends
    matching text to ``events``:

    * text inside ``<h3 class="event-title"><a>`` starts a new event;
    * text inside ``<p><time>`` is appended to the latest event;
    * text inside ``<span class="event-location">`` is appended to the
      latest event.
    """

    def handle_starttag(self, tag, attrs):
        """Push *tag* onto the window and remember its attributes.

        Args:
            tag: Name of the opened tag.
            attrs: Attribute ``(name, value)`` pairs supplied by HTMLParser.
        """
        Stag.append(tag)
        Satt[tag] = attrs
        if len(Stag) > 2:
            evicted = Stag.pop(0)
            # Satt is keyed by tag name, so only forget the attributes when
            # no tag of that name remains in the window; otherwise we would
            # throw away the attributes of a tag we still need to inspect.
            if evicted not in Stag:
                Satt.pop(evicted, None)

    def handle_endtag(self, tag):
        """Mark the innermost tracked tag as closed when *tag* ends it."""
        if len(Stag) > 1 and Stag[1] == tag:
            Satt.pop(tag, None)
            # Placeholder that can never equal a real tag name.
            Stag[1] = ' '

    def handle_data(self, data):
        """Record *data* when the current tag window matches an event field."""
        if len(Stag) < 2:
            return
        outer, inner = Stag[0], Stag[1]

        if outer == 'h3' and inner == 'a':
            if ('class', 'event-title') in Satt.get(outer, ()):
                events.append([data.strip()])
            return

        # Time / location text only makes sense once an event exists;
        # without this guard events[-1] would raise IndexError.
        if not events:
            return

        if outer == 'p' and inner == 'time':
            events[-1].append(data.strip())
        elif inner == 'span' and ('class', 'event-location') in Satt.get(inner, ()):
            events[-1].append(data.strip())

# Parse the page and print every collected event.
# NOTE(review): Htmldata is not defined in the visible snippet; presumably it
# holds the HTML text to parse -- confirm it is assigned before this point.
parser = MyHTMLParser()
parser.feed(Htmldata)

print('----重要事件----', len(events), '件')
for event in events:
    print(event)


  • 1

Reply