# 抓取网页https://www.cnblogs.com/ 推荐栏 from urllib import request from html.parser import HTMLParser from html.entities import name2codepoint from datetime import datetime from urllib.error import HTTPError import re,pprint class CnBlogHTMLParser(HTMLParser): def __init__(self, *, convert_charrefs: bool = True) -> None: super().__init__(convert_charrefs=convert_charrefs) self.items = {} self.__parseData = '' self. __parseIndex = 0 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: if tag == 'a': if self._attr(attrs, 'id') == 'editor_pick_lnk': self.items['item_%d' % self.__parseIndex] = { 'value': '暂无' } title = self._attr(attrs, 'title') if title and re.match(r'评论\d+, 推荐\d+, 阅读\d+', title): self.items['item_%d' % self.__parseIndex] = { 'value': title } if tag == 'span' and self._attr(attrs, 'class') == 'headline-label': self.__parseData = 'headline-label' def handle_endtag(self, tag: str) -> None: self.__parseData = '' if tag == 'html': self.__parseIndex = 0 else: self.__parseIndex += 1 def handle_data(self, data: str) -> None: if self.__parseData == 'headline-label': key = 'item_%d' % self.__parseIndex if key in self.items: self.items[key]['label'] = data # match_result = re.search(r'<span class="headline-label">(.*?)</span>(.*?)', data) # if match_result: # print(match_result.groups()) # else: # print('匹配失败') def _attr(self, attrs, target_key): for key, value in attrs: if key == target_key: return value return None class CnBlogRequest(object): def __init__(self, url) -> None: self.__url = url def request(self): req = request.Request(self.__url) req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57') with request.urlopen(req) as resource: if resource.status != 200: raise HTTPError(url=self.__url, code= resource.status, msg=resource.reason) return resource.read().decode('utf-8') if __name__ == '__main__': try: response = CnBlogRequest('https://www.cnblogs.com/').request() parser = CnBlogHTMLParser() parser.feed(response) parser.close() pprint.pprint(parser.items) except Exception as e: print("抓取失败:", e)
{'item_165': {'label': '【编辑推荐】', 'value': '暂无'}, 'item_169': {'label': '【最多推荐】', 'value': '评论16, 推荐31, 阅读1049'}, 'item_173': {'label': '【最多评论】', 'value': '评论17, 推荐16, 阅读613'}, 'item_177': {'label': '【新闻头条】', 'value': '评论3, 推荐8, 阅读609'}, 'item_181': {'label': '【推荐新闻】', 'value': '评论3, 推荐7, 阅读991'}}
Sign in to make a reply
B O O M!