# python爬虫多线程进程 — Python crawler demo: multithreading vs. multiprocessing
import requests
import json
from lxml import etree
import time
from retrying import retry
import threading
from queue import Queue
class QiushiSpider:
    """Multithreaded scraper for qiushibaike.com "hot" pages 1-13.

    Three queues form a producer/consumer pipeline:
        url_queue -> html_str_queue -> content_list_queue
    consumed respectively by fetcher, parser and saver daemon threads
    started in run().
    """

    def __init__(self):
        # NOTE(review): original pasted source had `def init` (missing the
        # dunder underscores) and curly "smart quotes" around the literals;
        # both were syntax/semantic errors and are fixed here.
        self.url = "https://www.qiushibaike.com/hot/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        self.url_queue = Queue()
        # Enqueue pages 1..13 (plain loop: a comprehension used purely for
        # its side effects is an anti-pattern).
        for i in range(1, 14):
            self.url_queue.put(self.url.format(i))
        self.html_str_queue = Queue()
        self.content_list_queue = Queue()

    @retry(stop_max_attempt_number=5)
    def _get_html(self, url):
        """Fetch one page; @retry re-invokes up to 5 times on any exception.

        The assert below raises AssertionError on a non-200 status so that
        @retry treats it as a failed attempt.
        """
        response = requests.get(url, headers=self.headers)
        print(response.status_code)
        assert response.status_code == 200
        return response.content.decode()

    def get_html(self):
        """Fetcher thread body: url_queue -> html_str_queue (None on failure)."""
        while True:
            url = self.url_queue.get()
            try:
                html_str = self._get_html(url)
            except Exception:
                # All 5 retry attempts failed; propagate a sentinel so the
                # parser knows to skip this page.
                html_str = None
            self.html_str_queue.put(html_str)
            self.url_queue.task_done()

    def get_content_list(self):
        """Parser thread body: html_str_queue -> content_list_queue."""
        while True:
            html_str = self.html_str_queue.get()
            if html_str is None:
                # Fetch failed upstream; etree.HTML(None) would raise, so
                # just acknowledge the queue item and move on.
                self.html_str_queue.task_done()
                continue
            html = etree.HTML(html_str)
            # contains(@id,'qiushi_tag'): each post's div id contains that tag
            div_list = html.xpath("//div[contains(@id,'qiushi_tag')]")
            content_list = []
            for div in div_list:
                cdict = {}
                # a[1] holds the avatar for logged-in authors, span[1] for
                # anonymous ones; the union xpath covers both layouts.
                cdict["header_img"] = div.xpath(".//div[@class='author clearfix']/a[1]/img/@src|.//div[@class='author clearfix']/span[1]/img/@src")
                # xpath returns a list; take the first hit
                cdict["header_img"] = "http:" + cdict["header_img"][0]
                cdict["username"] = div.xpath(".//div[@class='author clearfix']/a[1]/img/@alt|.//div[@class='author clearfix']/span[1]/img/@alt")[0]
                # Gender is encoded in the class attribute of the age div
                # (e.g. '...GenderManIcon' -> 'Man' after stripping 'Icon').
                cdict["sex"] = div.xpath(".//div[@class='author clearfix']/div/@class")
                cdict["sex"] = cdict["sex"][0].split(" ")[-1].replace("Icon", "") if len(cdict["sex"]) > 0 else None
                content_list.append(cdict)
            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()

    def save_data(self):
        """Saver thread body: append each parsed item to 1_qiushibk.txt as JSON."""
        while True:
            content_list = self.content_list_queue.get()
            # Open once per batch instead of once per item.
            with open("1_qiushibk.txt", "a", encoding="utf-8") as f:
                for item in content_list:
                    f.write(json.dumps(item, ensure_ascii=False, indent=2))
                    print(item)
            self.content_list_queue.task_done()

    def run(self):
        """Start the daemon worker threads, then block until all queues drain."""
        thread_list = []
        for _ in range(3):  # 3 fetchers (network-bound)
            # target must be the bound method itself, NOT a call: no ().
            thread_list.append(threading.Thread(target=self.get_html))
        for _ in range(2):  # 2 parsers
            thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_data))
        for t in thread_list:
            # Daemon threads die with the main thread; setDaemon() is
            # deprecated since Python 3.10, assign the attribute instead.
            t.daemon = True
            t.start()
        # Wait until every queued work item has been task_done()'d.
        for q in (self.url_queue, self.html_str_queue, self.content_list_queue):
            q.join()
# Entry point: time a full crawl of pages 1-13.
# NOTE(review): original had `if name == 'main'` (missing dunders) plus
# smart quotes — fixed to the standard script guard.
if __name__ == '__main__':
    t1 = time.time()
    guoke = QiushiSpider()
    guoke.run()
    # format() substitutes the elapsed seconds into the {} placeholder
    print("程序运行了{}".format(time.time() - t1))
import requests
from lxml import etree
import time
from retrying import retry
import json
from multiprocessing import Process
from multiprocessing import JoinableQueue as Queue
class QiushiSpider:
    """Multiprocess scraper for qiushibaike.com "hot" pages 1-13.

    Same three-stage pipeline as the threaded version, but using
    multiprocessing.Process workers and JoinableQueue (imported as Queue)
    so task_done()/join() semantics carry over.
    """

    def __init__(self):
        # NOTE(review): original pasted source had `def init` (missing the
        # dunder underscores) and smart quotes in the literals; fixed here.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        self.url = "https://www.qiushibaike.com/hot/page/{}/"
        self.url_list_queue = Queue()
        for i in range(1, 14):  # pages 1..13
            self.url_list_queue.put(self.url.format(i))
        self.html_str_queue = Queue()
        self.content_list_queue = Queue()

    @retry(stop_max_attempt_number=5)
    def _get_html(self, url):
        """Fetch one page; retried up to 5 times via @retry.

        assert raises on a non-200 status so @retry re-tries the request.
        """
        print(url)
        response = requests.get(url, headers=self.headers)
        assert response.status_code == 200
        return response.content.decode()

    def get_html(self):
        """Fetcher process body: url_list_queue -> html_str_queue (None on failure)."""
        while True:
            url = self.url_list_queue.get()
            try:
                html_str = self._get_html(url)
            except Exception:
                # All retries exhausted; push a sentinel so the parser skips it.
                html_str = None
            self.html_str_queue.put(html_str)
            self.url_list_queue.task_done()

    def xpath_cl(self):
        """Parser process body: extract one dict per post from each page."""
        while True:
            html_str = self.html_str_queue.get()
            if html_str is None:
                # Fetch failed upstream; etree.HTML(None) would raise.
                self.html_str_queue.task_done()
                continue
            elem = etree.HTML(html_str)
            div_list = elem.xpath('//div[contains(@id,"qiushi_tag")]')
            content_list = []
            for div in div_list:
                # One dict per post.
                item = {}
                # Avatar: a[1] for logged-in authors, span[1] for anonymous.
                item["head_img"] = div.xpath('./div[@class="author clearfix"]/a[1]/img/@src|./div[@class="author clearfix"]/span[1]/img/@src')
                item["head_img"] = "https:" + item["head_img"][0]
                # Screen name
                item["name"] = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()|./div[@class="author clearfix"]/span[2]/h2/text()')[0]
                # Gender is encoded in the class attribute, e.g.
                # 'articleGenderManIcon' -> 'Man' after stripping 'Icon'.
                item["sex"] = div.xpath('.//div[contains(@class,"articleGender")]/@class')
                item["sex"] = item["sex"][0].split(" ")[-1].replace("Icon", "") if len(item["sex"]) > 0 else None
                # Age (text of the same gender div)
                item["age"] = div.xpath('.//div[contains(@class,"articleGender")]/text()')
                item["age"] = item["age"][0] if len(item["age"]) > 0 else None
                # Post text: join the span fragments, dropping newlines
                item["content"] = div.xpath('.//div[@class="content"]/span/text()')
                item["content"] = "".join(i.replace("\n", "") for i in item["content"])
                # Permalink of the post
                item["url"] = div.xpath('./a[1]/@href')
                item["url"] = "https://www.qiushibaike.com" + item["url"][0]
                # Attached picture, if any
                item["img"] = div.xpath('.//div[@class="thumb"]/a/img/@src')
                item["img"] = "http:" + item["img"][0] if len(item["img"]) > 0 else None
                # Funny-vote count
                item["vote"] = div.xpath('.//span[@class="stats-vote"]/i/text()')[0]
                # Comment count
                item["comments"] = div.xpath('.//span[@class="stats-comments"]/a/i/text()')[0]
                # Top ("god") comment: author name (trailing colon stripped)
                item["cmt_name"] = div.xpath('.//div[@class="cmtMain"]/span[2]/text()')
                item["cmt_name"] = item["cmt_name"][0].replace(":", "") if len(item["cmt_name"]) > 0 else None
                # Top comment text
                item["cmt_text"] = div.xpath('.//div[@class="cmtMain"]/div/text()')
                item["cmt_text"] = item["cmt_text"][0] if len(item["cmt_text"]) > 0 else None
                # Top comment like count (last text node, newline stripped)
                item["likenum"] = div.xpath('.//div[@class="cmtMain"]//div[@class="likenum"]/text()')
                item["likenum"] = item["likenum"][-1].replace("\n", "") if len(item["likenum"]) > 0 else None
                content_list.append(item)
            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()

    def sava_date(self):
        """Saver process body: append each item to qiushinew.txt as JSON.

        (Name kept as `sava_date` — a typo for save_data — to preserve the
        public interface.)
        """
        while True:
            content_list = self.content_list_queue.get()
            # Open once per batch instead of once per item.
            with open("qiushinew.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
            self.content_list_queue.task_done()

    def run(self):
        """Start daemon worker processes, then block until all queues drain."""
        process_list = []
        for _ in range(10):  # 10 fetchers (network-bound)
            # target must be the bound method itself, NOT a call: no ().
            process_list.append(Process(target=self.get_html))
        for _ in range(6):  # 6 parsers
            process_list.append(Process(target=self.xpath_cl))
        process_list.append(Process(target=self.sava_date))
        for p in process_list:
            # Daemon processes are terminated when the main process exits.
            p.daemon = True
            p.start()
        # JoinableQueue.join() blocks until every item is task_done()'d.
        for q in (self.url_list_queue, self.html_str_queue, self.content_list_queue):
            q.join()
# Entry point: time a full crawl of pages 1-13.
# NOTE(review): original had `if name == 'main'` (missing dunders) plus
# smart quotes — fixed to the standard script guard, which is mandatory
# when using multiprocessing on spawn-based platforms.
if __name__ == '__main__':
    t1 = time.time()
    qiushi = QiushiSpider()
    qiushi.run()
    print("爬取花了:{}秒".format(time.time() - t1))