#-*-coding:utf-8-*-
# -*-coding:utf-8-*-
import json
import threading
import time
from queue import Empty, Queue

import requests
class Pcoduct(threading.Thread):
    """Producer thread: download job-listing pages and pass the JSON on.

    Each producer drains page indices from the task queue ``q``, requests
    the matching page of the Tencent careers search API, and puts the
    decoded JSON onto the consumer queue.

    Args:
        i: Label printed in progress messages (e.g. ``'p1'``).
        q: Queue of page indices still to be fetched.
        out_queue: Queue that receives the decoded responses. When omitted,
            the module-level queue ``c`` (the one the consumers read from)
            is used.
    """

    # Seconds to wait on the HTTP server before abandoning one page, so a
    # stalled connection cannot hang the producer forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, i, q, out_queue=None):
        super().__init__()
        self.i = i
        self.q = q
        self.out_queue = out_queue

    def run(self):
        """Fetch pages until the task queue is exhausted."""
        while True:
            # A non-blocking get is the only race-free emptiness test when
            # several producers share the same queue.
            try:
                page_index = self.q.get(block=False)
            except Empty:
                break
            print(self.i, "任务执行")
            timestamp = time.time()
            url = f"https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&keyword=python&pageIndex={page_index}&pageSize=10&language=zh-cn&area=cn"
            try:
                self.get_html(url)
            except Exception as exc:
                # One bad page must not kill the worker, but the failure is
                # reported instead of being silently discarded.
                print(self.i, "任务失败", exc)
                continue
            print(self.i, "任务结束")

    def get_html(self, url):
        """Download ``url`` and enqueue the decoded JSON for the consumers."""
        response = self._fetch(url)
        # Responses go to the consumer queue, never back onto the task
        # queue of page indices.
        target = c if self.out_queue is None else self.out_queue
        target.put(response)

    def _fetch(self, url):
        """Return the JSON body of ``url``, requested with a browser UA."""
        headers = {  # browser-like User-Agent to avoid basic anti-bot checks
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        return requests.get(
            url=url, headers=headers, timeout=self.REQUEST_TIMEOUT
        ).json()
class Customer(threading.Thread):
    """Consumer thread: turn fetched job pages into lines of a text file.

    Reads decoded API responses from the module-level queue ``c`` and
    appends one line per job posting to ``腾讯招聘.txt``. The module-level
    ``flag`` tells the consumer that no more responses will arrive, and the
    module-level ``lock`` serialises writes to the shared output file.

    Args:
        j: Label printed in progress messages (e.g. ``'c1'``).
    """

    # Seconds one blocking get waits before the shutdown flag is re-checked;
    # blocking avoids spinning the CPU while producers are still working.
    POLL_INTERVAL = 0.1

    def __init__(self, j):
        super().__init__()
        self.j = j

    def run(self):
        """Process responses until the queue is drained and ``flag`` is set."""
        while True:
            try:
                response = c.get(timeout=self.POLL_INTERVAL)
            except Empty:
                # Check the flag first: once it is set nothing more is put,
                # so an empty queue really means the work is finished.
                if flag and c.empty():
                    break
                continue
            print(self.j, "任务执行")
            try:
                self.parse_html(response)
            except Exception as exc:
                # A malformed response must not kill the worker, but the
                # failure is reported instead of being silently discarded.
                print(self.j, '任务失败', exc)
                continue
            print(self.j, '任务结束')

    def parse_html(self, response):
        """Append every posting in ``response`` to the output file.

        Args:
            response: Decoded JSON dict holding the postings under
                ``response['Data']['Posts']``; an empty or null list is
                treated as "nothing to write".
        """
        job_list = response['Data']['Posts'] or []
        # Format outside the lock so it is held only for the actual write.
        lines = [self._format_job(job) + '\n' for job in job_list]
        if not lines:
            return
        with lock:
            with open('腾讯招聘.txt', 'a', encoding='utf-8') as fp:
                fp.writelines(lines)

    @staticmethod
    def _format_job(job):
        """Return the single-line text record for one posting dict."""
        name = job['RecruitPostName']
        address = job['LocationName']
        # Strip line breaks so each posting stays on one output line.
        Responsibility = job['Responsibility'].replace('\n', '').replace('\r', '')
        PostURL = job['PostURL']
        return f'工作名称:{name},工作地点:{address},岗位职责:{Responsibility},详情url:{PostURL}'
if __name__ == '__main__':
    # Module-level state read by the worker classes as globals.
    lock = threading.Lock()  # guards writes to the shared output file
    flag = False             # becomes True once every producer has finished
    start = time.time()

    # Task queue for the producers: page indices 1..20.
    q = Queue()
    for page_index in range(1, 21):
        q.put(page_index)

    # Queue the consumers read from.
    c = Queue()

    # Start the producer threads first, then the consumer threads.
    producers = []
    for producer_name in ('p1', 'p2', 'p3'):
        producer = Pcoduct(producer_name, q)
        producer.start()
        producers.append(producer)

    consumers = []
    for consumer_name in ('c1', 'c2', 'c3'):
        consumer = Customer(consumer_name)
        consumer.start()
        consumers.append(consumer)

    # Block the main thread until every producer is done, then tell the
    # consumers that no further work will be queued.
    for producer in producers:
        producer.join()
    flag = True

    for consumer in consumers:
        consumer.join()

    # Report the total wall-clock time in seconds.
    end = time.time()
    print(end - start)
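# 爬虫多线程生产者与消费者 (multithreaded producer/consumer web crawler)
# 最新推荐文章于 2020-08-05 13:09:37 发布 (source-page metadata: "latest recommended article published 2020-08-05 13:09:37")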