# Multithreaded crawler (多线程爬虫)


# import requests
# import pymysql
# from queue import Queue
# import threading
# import time


# class BaiDu:
#     def __init__(self):
#         self.url = 'https://talent.baidu.com/httservice/getPostListNew'
#         self.headers = {
#             'Host': 'talent.baidu.com',
#             'Origin': 'https://talent.baidu.com',
#             'Referer': 'https://talent.baidu.com/jobs/social-list',
#             'User-Agent': '',
#             'Cookie': 'BIDUPSID=F4445B37EDD903DFE4635B2BAE533C0F; PSTM=1656736794; BAIDUID=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=36557_38112_38470_38439_38404_38468_38289_38486_37929_26350_38423_37881; BAIDUID_BFESS=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BA_HECTOR=002g2l000k2g8h85058425bu1i2o27g1n; ZFY=1pIkyXYuGTdDYvHq4SRq4FJSivK95:Bb4qRVK43CU510:C; PSINO=5; delPer=0; Hm_lvt_50e85ccdd6c1e538eb1290bc92327926=1680617574; Hm_lpvt_50e85ccdd6c1e538eb1290bc92327926=1680617679; RT="z=1&dm=baidu.com&si=9fc57594-e773-4241-b0f6-ea0a982d68dd&ss=lg2cb410&sl=4&tt=97p&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"'
#         }
#         self.db = pymysql.connect(host='localhost', user='root', password='55555', port=3306, db='job')
#         self.cursor = self.db.cursor()
#         self.form_queue = Queue()
#         self.json_queue = Queue()
#         self.content_queue = Queue()


#     def form_data(self):
#         for page in range(1, 5):
#             data = {
#                 'recruitType': 'SOCIAL',
#                 'pageSize': '10',
#                 'keyWord': '',
#                 'curPage': f'{page}',
#                 'projectType': ''
#             }
#             self.form_queue.put(data)


#     def get_data(self):
#         while True:
#             data = self.form_queue.get()
#             response = requests.post(self.url, headers=self.headers, data=data)
#             self.json_queue.put(response.json())
#             self.form_queue.task_done()


#     def parse_data(self):
#         while True:
#             result = self.json_queue.get()
#             for one in result['data']['list']:
#                 education = one['education']
#                 name_ = one['name']
#                 publishDate = one['publishDate']
#                 serviceCondition = one['serviceCondition']
#                 updateDate = one['updateDate']
#                 workContent = one['workContent']
#                 workPlace = one['workPlace']
#                 workYears = one['workYears']
#                 content = [education, name_, publishDate, serviceCondition, updateDate, workContent, workPlace, workYears]
#                 self.content_queue.put(content)
#             self.json_queue.task_done()

#     def create_table(self):
#         create_table_sql = '''
#             create table if not exists baidu(
#             id int unsigned primary key auto_increment,
#             education varchar(10),
#             name_ varchar(100),
#             publishDate varchar(10),
#             serviceCondition text,
#             updateDate varchar(10),
#             workContent text,
#             workPlace varchar(10),
#             workYears varchar(2)
#             ) default charset=utf8 default collate=utf8_general_ci
#         '''
#         try:
#             self.cursor.execute(create_table_sql)
#             print('创建表成功')
#         except Exception as e:
#             print('创建表失败', repr(e))


#     def save_data(self):
#         while True:
#             content = self.content_queue.get()
#             education = content[0]
#             name_ = content[1]
#             publishDate = content[2]
#             serviceCondition = content[3]
#             updateDate = content[4]
#             workContent = content[5]
#             workPlace = content[6]
#             workYears = content[7]
#             insert_sql = """
#                 insert into baidu(id, education, name_, publishDate, serviceCondition, updateDate, workContent, workPlace, workYears)
#                 values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
#             """
#             try:
#                 self.cursor.execute(insert_sql, (0, content[0], content[1], content[2], content[3], content[4], content[5], content[6], content[7]))
#                 self.db.commit()
#                 print('插入数据成功')
#             except Exception as e:
#                 print('插入数据失败', repr(e))
#                 self.db.rollback()
#             self.content_queue.task_done()


#     def main(self):
#         self.create_table()
#         thread_list = []
#         t_form_data = threading.Thread(target=self.form_data)
#         thread_list.append(t_form_data)
#         for i in range(4):
#             t_get_data = threading.Thread(target=self.get_data)
#             thread_list.append(t_get_data)
#         t_parse_data = threading.Thread(target=self.parse_data)
#         thread_list.append(t_parse_data)
#         t_save_data = threading.Thread(target=self.save_data)
#         thread_list.append(t_save_data)
#         for thread in thread_list:
#             thread.setDaemon(True)
#             thread.start()
#         time.sleep(1)
#         for queue in [self.form_queue, self.json_queue, self.content_queue]:
#             queue.join()
#         self.db.close()


# if __name__ == '__main__':
#     b = BaiDu()
#     b.main()



import requests
import pymongo
import time
from queue import Queue
import threading


class BaiDu:
    """Multithreaded crawler for Baidu's social-recruitment job listings.

    Producer/consumer pipeline built on three queues:
      form_queue    -> POST form payloads, one per result page
      json_queue    -> raw JSON responses awaiting parsing
      content_queue -> parsed job dicts awaiting storage in MongoDB
    """

    def __init__(self):
        # JSON endpoint behind https://talent.baidu.com/jobs/social-list.
        self.url = 'https://talent.baidu.com/httservice/getPostListNew'
        self.headers = {
            'Host': 'talent.baidu.com',
            'Origin': 'https://talent.baidu.com',
            'Referer': 'https://talent.baidu.com/jobs/social-list',
            'User-Agent': '',
            'Cookie': 'BIDUPSID=F4445B37EDD903DFE4635B2BAE533C0F; PSTM=1656736794; BAIDUID=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=36557_38112_38470_38439_38404_38468_38289_38486_37929_26350_38423_37881; BAIDUID_BFESS=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BA_HECTOR=002g2l000k2g8h85058425bu1i2o27g1n; ZFY=1pIkyXYuGTdDYvHq4SRq4FJSivK95:Bb4qRVK43CU510:C; PSINO=5; delPer=0; Hm_lvt_50e85ccdd6c1e538eb1290bc92327926=1680617574; Hm_lpvt_50e85ccdd6c1e538eb1290bc92327926=1680617679; RT="z=1&dm=baidu.com&si=9fc57594-e773-4241-b0f6-ea0a982d68dd&ss=lg2cb410&sl=4&tt=97p&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"'
        }
        # MongoClient connects lazily; items go to db "python", collection "work".
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.work
        self.form_queue = Queue()
        self.json_queue = Queue()
        self.content_queue = Queue()

    def form_data(self):
        """Producer: enqueue one POST payload per result page (pages 1-4)."""
        for page in range(1, 5):
            data = {
                'recruitType': 'SOCIAL',
                'pageSize': '10',
                'keyWord': '',
                'curPage': f'{page}',
                'projectType': ''
            }
            self.form_queue.put(data)

    def get_data(self):
        """Worker loop: POST each queued payload and enqueue the JSON reply."""
        while True:
            data = self.form_queue.get()
            try:
                response = requests.post(self.url, headers=self.headers, data=data)
                self.json_queue.put(response.json())
            finally:
                # Always balance get() with task_done(), even when the request
                # or JSON decoding raises; otherwise form_queue.join() in
                # main() would block forever after a worker-thread failure.
                self.form_queue.task_done()

    def parse_data(self):
        """Worker loop: extract job fields from each JSON page and enqueue them."""
        while True:
            result = self.json_queue.get()
            try:
                for one in result['data']['list']:
                    item = {}
                    item['education'] = one['education']
                    item['name'] = one['name']
                    item['publishDate'] = one['publishDate']
                    item['serviceCondition'] = one['serviceCondition']
                    item['updateDate'] = one['updateDate']
                    item['workContent'] = one['workContent']
                    item['workPlace'] = one['workPlace']
                    item['workYears'] = one['workYears']
                    self.content_queue.put(item)
            finally:
                # See get_data(): task_done() must run even if the response
                # shape is unexpected, or json_queue.join() deadlocks.
                self.json_queue.task_done()

    def save_data(self):
        """Worker loop: insert each parsed job dict into MongoDB."""
        while True:
            item = self.content_queue.get()
            try:
                self.collection.insert_one(item)
                print('插入数据成功')
            finally:
                self.content_queue.task_done()

    def main(self):
        """Start the pipeline threads and wait for all queued work to drain."""
        thread_list = []
        t_form_data = threading.Thread(target=self.form_data)
        thread_list.append(t_form_data)
        for i in range(4):
            t_get_data = threading.Thread(target=self.get_data)
            thread_list.append(t_get_data)
        t_parse_data = threading.Thread(target=self.parse_data)
        thread_list.append(t_parse_data)
        t_save_data = threading.Thread(target=self.save_data)
        thread_list.append(t_save_data)
        for thread in thread_list:
            # Daemon threads die with the main thread, so the infinite
            # `while True` worker loops cannot keep the process alive.
            # Completion is instead guaranteed by the queue.join() calls
            # below. Must be set before start().
            # (Assignment replaces the setDaemon() call, which is deprecated
            # since Python 3.10.)
            thread.daemon = True
            thread.start()
        # Give the threads a moment to start and put work on the queues;
        # otherwise all queue counts could still be 0 when join() runs and
        # the main thread would fall through before any work happened.
        time.sleep(1)
        for queue in [self.form_queue, self.json_queue, self.content_queue]:
            # join() blocks until the queue's unfinished-task count is zero
            # (put() increments it, task_done() decrements it).
            queue.join()
        self.db.close()


if __name__ == '__main__':
    # Entry point: build the crawler and run the full pipeline.
    crawler = BaiDu()
    crawler.main()