# Multi-threaded crawler for Qiushibaike (糗事百科) jokes
from threading import Thread
from queue import Queue
import requests
from fake_useragent import UserAgent
from lxml import etree
class CrawlInfo(Thread):
    """Downloader thread: pulls URLs from url_queue, fetches each page,
    and puts the raw HTML of successful (HTTP 200) responses into html_queue.
    """

    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue    # Queue of page URLs to download
        self.html_queue = html_queue  # Queue receiving downloaded HTML text

    def run(self):
        # One random Chrome User-Agent per worker, reused for all its requests.
        headers = {
            'User-Agent': UserAgent().chrome
        }
        # Keep draining the queue until it is empty.
        # Bug fix vs. original: it did `return html_queue.put(...)`, which
        # (a) referenced the module-level global instead of self.html_queue and
        # (b) returned after the first page, so each thread fetched only one URL.
        while not self.url_queue.empty():
            response = requests.get(self.url_queue.get(), headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
class ParseInfo(Thread):
    """Parser thread: drains html_queue, extracts joke text from each page
    via XPath, and appends the text to 段子.txt.
    """

    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue  # Queue of downloaded HTML pages

    def run(self):
        # Bug fix vs. original: it executed `print(html_queue)` — leftover
        # debug output that referenced the module-level global instead of
        # self.html_queue. Removed.
        while not self.html_queue.empty():
            e = etree.HTML(self.html_queue.get())
            # First <span> inside each joke's content <div>.
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            with open('段子.txt', 'a', encoding='utf-8') as f:
                for span in span_contents:
                    # string(.) concatenates all text nodes within the span.
                    info = span.xpath('string(.)')
                    f.write(info + '\n')
if __name__ == '__main__':
    # Shared queues between the downloader workers and the parser.
    # NOTE: these global names are also referenced from the worker classes,
    # so they must keep these exact names.
    url_queue = Queue()
    html_queue = Queue()

    # Enqueue pages 1..9 of the text section.
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    for page in range(1, 10):
        url_queue.put(base_url.format(page))

    # Fan out: three downloader threads drain url_queue concurrently,
    # then wait for all of them to finish.
    crawlers = [CrawlInfo(url_queue, html_queue) for _ in range(3)]
    for worker in crawlers:
        worker.start()
    for worker in crawlers:
        worker.join()

    # Parse everything that was downloaded in a single parser thread.
    parser = ParseInfo(html_queue)
    parser.start()