爬虫多线程生产者与消费者

# -*- coding: utf-8 -*-
import json
import threading
import time
from queue import Empty, Queue

import requests
class Pcoduct(threading.Thread):
    """Producer thread: fetch job-listing pages and queue the decoded JSON.

    Page indexes are taken from ``q`` until it is empty. Each page is
    requested from the Tencent careers query API and the decoded JSON
    response is put on the output queue for the consumer threads.

    Args:
        i: Label printed in the progress messages.
        q: Queue of page indexes still to be fetched.
        out_q: Queue that receives each decoded response. When omitted, the
            module-level queue ``c`` is looked up at the time of the put.
    """

    # Seconds to wait for the server before giving up on one request.
    REQUEST_TIMEOUT = 10

    def __init__(self, i, q, out_q=None):
        super().__init__()
        self.i = i
        self.q = q
        self.out_q = out_q

    def run(self):
        """Drain the task queue, fetching one page per task."""
        while True:
            # A non-blocking get is the only race-free emptiness test when
            # several producers share the queue.
            try:
                page = self.q.get(block=False)
            except Empty:
                break
            print(self.i, "任务执行")
            time3 = time.time()
            url = f"https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={time3}&keyword=python&pageIndex={page}&pageSize=10&language=zh-cn&area=cn"
            try:
                self.get_html(url)
            except (requests.RequestException, ValueError) as exc:
                # Network failure or a body that is not valid JSON: report it
                # and move on to the next page instead of hiding the error.
                print(self.i, "任务失败", exc)
                continue
            print(self.i, "任务结束")

    def get_html(self, url):
        """Request ``url`` and put the decoded JSON body on the output queue.

        Raises:
            requests.RequestException: If the request fails or times out.
            ValueError: If the response body cannot be decoded as JSON.
        """
        headers = {  # Browser-like user agent sent with every request.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        response = requests.get(
            url=url, headers=headers, timeout=self.REQUEST_TIMEOUT).json()
        # Results go to the consumer queue, never back onto the task queue
        # that this thread is draining.
        target = self.out_q if self.out_q is not None else c
        target.put(response)
class Customer(threading.Thread):
    """Consumer thread: turn queued API responses into lines of a text file.

    Reads decoded responses from the module-level queue ``c`` and stops once
    that queue stays empty while the module-level ``flag`` is true. File
    writes are serialised through the module-level ``lock``.

    Args:
        j: Label printed in the progress messages.
    """

    # Seconds to block on the queue before re-checking the stop flag.
    POLL_INTERVAL = 0.1

    def __init__(self, j):
        super().__init__()
        self.j = j

    def run(self):
        """Process responses until the queue is empty and ``flag`` is set."""
        while True:
            # Block briefly instead of spinning on a non-blocking get, so an
            # idle consumer does not burn CPU while waiting for work.
            try:
                response = c.get(timeout=self.POLL_INTERVAL)
            except Empty:
                if flag:
                    break
                continue
            print(self.j, "任务执行")
            try:
                self.parse_html(response)
            except (KeyError, TypeError, AttributeError, OSError) as exc:
                # Unexpected response shape or a failed write: report it and
                # keep consuming rather than silently dropping the error.
                print(self.j, "任务失败", exc)
                continue
            print(self.j, '任务结束')

    @staticmethod
    def _format_job(job):
        """Return the one-line summary written to the output file for ``job``."""
        name = job['RecruitPostName']
        address = job['LocationName']
        # Strip line breaks so each job occupies exactly one line.
        Responsibility = job['Responsibility'].replace('\n', '').replace('\r', '')
        PostURL = job['PostURL']
        return f'工作名称:{name},工作地点:{address},岗位职责:{Responsibility},详情url:{PostURL}'

    def parse_html(self, response):
        """Append one line per job in ``response`` to ``腾讯招聘.txt``.

        A null or empty ``Posts`` value is treated as "no jobs" and nothing
        is written.

        Raises:
            KeyError: If ``response`` or a job lacks an expected key.
        """
        job_list = response['Data']['Posts'] or []
        lines = [self._format_job(job) + '\n' for job in job_list]
        if not lines:
            return
        # One locked open/write per response instead of one per job.
        with lock:
            with open('腾讯招聘.txt', 'a', encoding='utf-8') as fp:
                fp.writelines(lines)
if __name__ == '__main__':
    # Module-level state that the worker classes read as globals.
    lock = threading.Lock()
    flag = False
    start = time.time()

    # Task queue handed to the producers: page indexes 1..20.
    q = Queue()
    for page in range(1, 21):
        q.put(page)
    # Queue the Customer threads read from.
    c = Queue()

    # Start each producer as soon as it is created, then do the same for
    # the consumers.
    producers = []
    for label in ('p1', 'p2', 'p3'):
        producer = Pcoduct(label, q)
        producer.start()
        producers.append(producer)

    consumers = []
    for label in ('c1', 'c2', 'c3'):
        consumer = Customer(label)
        consumer.start()
        consumers.append(consumer)

    # Wait for every producer before raising the stop flag the consumers
    # check once their queue is empty.
    for producer in producers:
        producer.join()
    flag = True
    for consumer in consumers:
        consumer.join()

    # Total wall-clock time in seconds.
    print(time.time() - start)


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值