# coding=utf-8
"""Multithreaded spider for qiushibaike.com.

Pipeline of daemon threads connected by queues:
    url_queue -> parse_url (x3) -> html_queue -> get_content_list
              -> content_queue -> save_content_list -> ./qiushi.txt
"""
import requests
from retrying import retry
from lxml import etree
import json
from queue import Queue
import threading


class QiushiSpider(object):
    """Crawl the first 13 list pages and append extracted items to ./qiushi.txt."""

    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # parsed lxml trees (None on fetch failure)
        self.content_queue = Queue()  # lists of extracted item dicts

    def get_url_list(self):
        """Producer: enqueue the 13 list-page URLs."""
        for page in range(1, 14):
            self.url_queue.put(self.url_temp.format(page))

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        """Fetch *url* and return its parsed lxml tree; retried up to 3 times."""
        response = requests.get(url, headers=self.headers, timeout=5)
        # Raise on non-2xx (which also triggers @retry) instead of the original
        # `assert`, which is silently stripped under `python -O`.
        response.raise_for_status()
        return etree.HTML(response.content)

    def parse_url(self):
        """Consumer of url_queue / producer of html_queue."""
        while True:
            url = self.url_queue.get()
            print(" now parsing", url)
            try:
                html = self._parse_url(url)
            except Exception as e:
                # Best effort: log the failure and push None so the pipeline
                # keeps moving; downstream skips None entries.
                print(e)
                html = None
            self.html_queue.put(html)
            self.url_queue.task_done()

    def get_content_list(self):
        """Consumer of html_queue / producer of content_queue: extract items."""
        while True:
            html = self.html_queue.get()
            # Bug fix: a failed fetch enqueues None; the original called
            # html.xpath(...) unconditionally, which killed this thread with
            # AttributeError and deadlocked html_queue.join() in run().
            if html is None:
                self.html_queue.task_done()
                continue
            content_list = []
            for div in html.xpath("//div[contains(@id,'qiushi_tag_')]"):
                item = {}
                src = div.xpath("./div[@class='author clearfix']/a[1]/img/@src")
                item["src"] = "https:" + src[0] if len(src) > 0 else None
                author_name = div.xpath("./div[@class='author clearfix']/a[1]/img/@alt")
                item["author_name"] = author_name[0] if len(author_name) > 0 else None
                # Class attribute looks like "articleGender womenIcon" -> "women".
                author_gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = (author_gender[0].split(" ")[-1].replace("Icon", "")
                                         if len(author_gender) > 0 else None)
                author_age = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["author_age"] = author_age[0] if len(author_age) > 0 else None
                duanzi_content = div.xpath(".//div[@class='content']/span/text()")
                item["duanzi_content"] = [i.replace("\n", "") for i in duanzi_content]
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Consumer of content_queue: append each item as pretty-printed JSON."""
        while True:
            content_list = self.content_queue.get()
            file_path = './qiushi.txt'
            with open(file_path, "a", encoding='utf-8') as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Start all pipeline threads and block until every queue is drained."""
        thread_list = []
        # 1. produce the URL list
        thread_list.append(threading.Thread(target=self.get_url_list))
        # 2. fetch pages with 3 worker threads
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.parse_url))
        # 3. extract data
        thread_list.append(threading.Thread(target=self.get_content_list))
        # 4. save results
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # t.setDaemon(True) is deprecated since Python 3.10; workers loop
            # forever, so they must not keep the process alive.
            t.daemon = True
            t.start()
        # Join the queues (not the threads): returns once every put() item
        # has been matched by a task_done().
        for q in (self.url_queue, self.html_queue, self.content_queue):
            q.join()


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
多线程爬虫之糗事百科
最新推荐文章于 2022-03-31 16:09:33 发布