Reference article: I can no longer find the link, but the original author's code is included below and is written in great detail.
# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue
# https://docs.python.org/3/library/queue.html#module-queue
# Quick reference for the Queue API (a runnable sketch follows this listing):
# q.qsize()       returns the approximate size of the queue
# q.empty()       returns True if the queue is empty, False otherwise
# q.full()        returns True if the queue is full, False otherwise
#                 (only meaningful when the queue was created with a maxsize)
# q.get(block=True, timeout=None)  removes and returns an item; timeout is how long to wait
# q.get_nowait()  equivalent to q.get(False)
# q.put(item, block=True, timeout=None)  puts an item into the queue; timeout is how long to wait
# q.put_nowait(item)  equivalent to q.put(item, False)
# q.task_done()   signals that one previously fetched task has been fully processed
# q.join()        blocks until every item that was put has been marked done, then lets the caller continue
class QiubaiSpider:
def __init__(self):
self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
self.url_queue = Queue()
self.html_queue = Queue()
self.content_queue = Queue()
def get_url_list(self):
# return [self.url_temp.format(i) for i in range(1,14)]
for i in range(1,14):
            # put the URLs of the 13 index pages into url_queue
self.url_queue.put(self.url_temp.format(i))
def parse_url(self):
while True:
            # get() is paired with task_done(): every put() increments the
            # unfinished-task counter, and only task_done() decrements it
            # (get() removes the item but does not touch the counter that join() waits on)
url = self.url_queue.get()
print(url)
response = requests.get(url,headers=self.headers)
            # put the decoded index page into html_queue
            self.html_queue.put(response.content.decode())
            # mark this task as done; without it the main thread blocks on
            # url_queue.join() forever, believing the queue was never drained
            self.url_queue.task_done()
    def get_content_list(self):  # extract data
while True:
            # take an index page out of html_queue
html_str = self.html_queue.get()
html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # group by post
content_list = []
for div in div_list:
                item = {}
item["content"] = div.xpath(".//div[@class='content']/span/text()")
item["content"] = [i.replace("\n","") for i in item["content"]]
item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
item["author_img"] = "https:"+item["author_img"][0] if len(item["author_img"])>0 else None
item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
content_list.append(item)
            # put content_list into content_queue
self.content_queue.put(content_list)
self.html_queue.task_done()
    def save_content_list(self):  # save
while True:
content_list = self.content_queue.get()
            for i in content_list:
                print(i)
self.content_queue.task_done()
    def run(self):  # main logic
thread_list = []
        # 1. build the url list
        # threading.Thread needs no arguments passed in; every worker reads its input from the queues
t_url = threading.Thread(target=self.get_url_list)
thread_list.append(t_url)
        # 2. send requests and fetch responses
        for i in range(20):  # start 20 threads
t_parse = threading.Thread(target=self.parse_url)
thread_list.append(t_parse)
        # 3. extract data
        for i in range(2):  # start 2 threads
t_html = threading.Thread(target=self.get_content_list)
thread_list.append(t_html)
        # 4. save
t_save = threading.Thread(target=self.save_content_list)
thread_list.append(t_save)
for t in thread_list:
            t.daemon = True  # make the worker a daemon thread (setDaemon(True) in older code): it is expendable and dies when the main thread ends (the workers run "while True" and never exit on their own)
t.start()
for q in [self.url_queue,self.html_queue,self.content_queue]:
            q.join()  # block the main thread until every task in the queue has been marked done
        print("main thread finished")
if __name__ == '__main__':
    qiubai = QiubaiSpider()
qiubai.run()
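To make the cheat sheet at the top of the listing concrete, here is a minimal sketch of my own (not part of the original post) that exercises the non-blocking Queue calls; the maxsize of 2 is an arbitrary choice for the demo:

from queue import Queue, Empty, Full

q = Queue(maxsize=2)
q.put("a")
q.put_nowait("b")            # same as q.put("b", block=False)
print(q.qsize(), q.full())   # 2 True

try:
    q.put_nowait("c")        # queue is full, so this raises queue.Full instead of blocking
except Full:
    print("queue full")

print(q.get())               # "a" -- items come out in FIFO order
print(q.get_nowait())        # "b"
print(q.empty())             # True

try:
    q.get_nowait()           # empty queue raises queue.Empty instead of blocking
except Empty:
    print("queue empty")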
My own code is shown below; here I mainly walk through the problems I ran into.
Problem 1: self.dateInfo.task_done(). task_done() tells the queue that one fetched item has been fully processed; without it, the unfinished-task counter never reaches zero and the main thread hangs forever inside join().
Problem 2: t.setDaemon(True) (t.daemon = True in current Python) marks a child thread as a daemon: the thread is considered expendable, so it is killed as soon as the main thread ends. Since the workers are all written as while True loops, they would never finish on their own; the daemon flag lets the whole program exit when the main thread does. It must be set before start(), and it behaves as the opposite of t.join(), which would make the main thread wait for the worker. The sketch below demonstrates both points.
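A minimal self-contained sketch (mine, not from the original post) of the daemon-worker pattern: the workers loop forever, q.join() returns only once every put() has been matched by a task_done(), and the daemon flag lets the program exit afterwards:

import threading
from queue import Queue

q = Queue()

def worker():
    while True:                # never exits on its own
        item = q.get()
        print("processing", item)
        q.task_done()          # without this, q.join() below would hang forever

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True            # must be set before start(); the thread dies with the main thread
    t.start()

for i in range(10):
    q.put(i)                   # every put() increments the unfinished-task counter

q.join()                       # blocks until the counter is back at zero
print("main thread finished")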
from cui.mybaseCode import *  # presumably provides requests_method, pg, json, random, calculateDays and days used below
from queue import Queue
import time
import threading
class Batch:
def __init__(self,cookie,dates):
self.cookie = cookie
self.pgdb_conn = pg.connect(database="cui", user="postgres", password="4OZ5EvxekT", host="127.0.0.1",
port="5432")
self.cur = self.pgdb_conn.cursor()
self.endindex = 0
self.dates=dates
self.dateInfo = Queue()
self.cateInfo =Queue()
self.content_queue = Queue()
    def getCrowInfo(self):  # fetch the CATEGORY child dimensions from the dimension API
three_url = 'https://databank.yushanfang.com/api/paasapi?path=/api/dimension/listChildDimension&type=CATEGORY&id='
response = requests_method(self.cookie, 'get', three_url, {})
info = json.loads(response)
for item in info['data']:
self.cateInfo.put([item['bizId'], item['name']])
self.cateInfo.put(['', '全部'])
print(self.cateInfo)
    def test(self):
        if self.content_queue.empty():
            print('cui')
#
# def putCateInfo(self):
def putDate(self):
for mydate in self.dates:
self.dateInfo.put(mydate)
    def getData(self):  # crawl the data for each date
        print('start crawling')
        cate = ['', '全部']
        # the loop matters: without "while True" each worker handles a single
        # date and then exits, so dateInfo.join() never returns once there are
        # more dates than getData threads
        while True:
            date = self.dateInfo.get()
            first_url = 'https://databank.yushanfang.com/api/ecapi'
            params = {
                'path': '/databank/crowdFullLink/detail',
                'beginTheDate': date,
                'endTheDate': str(int(date) + 1)
            }
            if cate[0] != '':
                print(cate[0])
                params['cateId'] = str(cate[0])
            res = json.loads(requests_method(self.cookie, 'get', first_url, params))
            for item in res['data'][4:]:
                self.content_queue.put([item['name'], item['cnt'], str(date) + str(cate[1])])
            self.dateInfo.task_done()
            time.sleep(random.random())
            print(self.dateInfo.empty())
    def save_content_list(self):  # save
        print('start saving')
        while True:
            mycontent = self.content_queue.get()
            print('mycontent---------------:%s' % (mycontent))
            # parameterized insert; the DB driver fills in the %s placeholders
            self.cur.execute("insert into rightconner values(%s,%s,%s)",
                             (mycontent[0], mycontent[1], mycontent[2]))
            self.content_queue.task_done()
            print('saving data---------:%s' % (self.content_queue.empty()))
    def run(self):
        thread_list = []
        # t_CateInfo = threading.Thread(target=self.putCateInfo)
        # thread_list.append(t_CateInfo)
        # a single producer thread: with two putDate threads every date would be enqueued twice
        t_putDate = threading.Thread(target=self.putDate)
        thread_list.append(t_putDate)
for i in range(3):
t_getData = threading.Thread(target=self.getData)
thread_list.append(t_getData)
t_save = threading.Thread(target=self.save_content_list)
thread_list.append(t_save)
        for t in thread_list:
            # must be set before start()
            t.daemon = True  # daemon threads die with the main thread (the workers run "while True" and never exit on their own)
            t.start()
        for q in [self.dateInfo, self.content_queue]:
            q.join()  # block the main thread until every task in the queue has been marked done
self.pgdb_conn.commit()
self.cur.close()
        print("main thread finished")
if __name__ == '__main__':
cookie='cna=Xl4OFI+aFD8CAdINZPa5UoXn; _tb_token_=QNhOPtgFPRg96H1MLojb; bs_n_lang=zh_CN; c_token=6de723623112a374f649f951c25a6eab; ck2=09c36266afd1c4b7d35b734d859c4f55; an=5LiK5rW354K55q2j5LqS6IGU572R56eR5oqA; lg=true; sg=A09; lvc=sAhojP%2BY2S2dOA%3D%3D; isg=BJSUSroks7nCWicMTbkpDcFLZdIMtbhenvNPcC51Mp-iGTRjVvi0ZdafHVEk4fAv'
startDate = '2017-09-01'
endDate = '2017-09-10'
frequency='1'
start,end=calculateDays(startDate,endDate,frequency)
date=days(start,end)
tests=Batch(cookie,date)
tests.run()