# import requests
# import pymysql
# from queue import Queue
# import threading
# import time
# class BaiDu:
# def __init__(self):
# self.url = 'https://talent.baidu.com/httservice/getPostListNew'
# self.headers = {
# 'Host': 'talent.baidu.com',
# 'Origin': 'https://talent.baidu.com',
# 'Referer': 'https://talent.baidu.com/jobs/social-list',
# 'User-Agent': '',
# 'Cookie': 'BIDUPSID=F4445B37EDD903DFE4635B2BAE533C0F; PSTM=1656736794; BAIDUID=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=36557_38112_38470_38439_38404_38468_38289_38486_37929_26350_38423_37881; BAIDUID_BFESS=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BA_HECTOR=002g2l000k2g8h85058425bu1i2o27g1n; ZFY=1pIkyXYuGTdDYvHq4SRq4FJSivK95:Bb4qRVK43CU510:C; PSINO=5; delPer=0; Hm_lvt_50e85ccdd6c1e538eb1290bc92327926=1680617574; Hm_lpvt_50e85ccdd6c1e538eb1290bc92327926=1680617679; RT="z=1&dm=baidu.com&si=9fc57594-e773-4241-b0f6-ea0a982d68dd&ss=lg2cb410&sl=4&tt=97p&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"'
# }
# self.db = pymysql.connect(host='localhost', user='root', password='55555', port=3306, db='job')
# self.cursor = self.db.cursor()
# self.form_queue = Queue()
# self.json_queue = Queue()
# self.content_queue = Queue()
# def form_data(self):
# for page in range(1, 5):
# data = {
# 'recruitType': 'SOCIAL',
# 'pageSize': '10',
# 'keyWord': '',
# 'curPage': f'{page}',
# 'projectType': ''
# }
# self.form_queue.put(data)
# def get_data(self):
# while True:
# data = self.form_queue.get()
# response = requests.post(self.url, headers=self.headers, data=data)
# self.json_queue.put(response.json())
# self.form_queue.task_done()
# def parse_data(self):
# while True:
# result = self.json_queue.get()
# for one in result['data']['list']:
# education = one['education']
# name_ = one['name']
# publishDate = one['publishDate']
# serviceCondition = one['serviceCondition']
# updateDate = one['updateDate']
# workContent = one['workContent']
# workPlace = one['workPlace']
# workYears = one['workYears']
# content = [education, name_, publishDate, serviceCondition, updateDate, workContent, workPlace, workYears]
# self.content_queue.put(content)
# self.json_queue.task_done()
# def create_table(self):
# create_table_sql = '''
# create table if not exists baidu(
# id int unsigned primary key auto_increment,
# education varchar(10),
# name_ varchar(100),
# publishDate varchar(10),
# serviceCondition text,
# updateDate varchar(10),
# workContent text,
# workPlace varchar(10),
# workYears varchar(2)
# ) default charset=utf8 default collate=utf8_general_ci
# '''
# try:
# self.cursor.execute(create_table_sql)
# print('创建表成功')
# except Exception as e:
# print('创建表失败', repr(e))
# def save_data(self):
# while True:
# content = self.content_queue.get()
# education = content[0]
# name_ = content[1]
# publishDate = content[2]
# serviceCondition = content[3]
# updateDate = content[4]
# workContent = content[5]
# workPlace = content[6]
# workYears = content[7]
# insert_sql = """
# insert into baidu(id, education, name_, publishDate, serviceCondition, updateDate, workContent, workPlace, workYears)
# values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
# """
# try:
# self.cursor.execute(insert_sql, (0, content[0], content[1], content[2], content[3], content[4], content[5], content[6], content[7]))
# self.db.commit()
# print('插入数据成功')
# except Exception as e:
# print('插入数据失败', repr(e))
# self.db.rollback()
# self.content_queue.task_done()
# def main(self):
# self.create_table()
# thread_list = []
# t_form_data = threading.Thread(target=self.form_data)
# thread_list.append(t_form_data)
# for i in range(4):
# t_get_data = threading.Thread(target=self.get_data)
# thread_list.append(t_get_data)
# t_parse_data = threading.Thread(target=self.parse_data)
# thread_list.append(t_parse_data)
# t_save_data = threading.Thread(target=self.save_data)
# thread_list.append(t_save_data)
# for thread in thread_list:
# thread.setDaemon(True)
# thread.start()
# time.sleep(1)
# for queue in [self.form_queue, self.json_queue, self.content_queue]:
# queue.join()
# self.db.close()
# if __name__ == '__main__':
# b = BaiDu()
# b.main()
import requests
import pymongo
import time
from queue import Queue
import threading
class BaiDu:
    """Multi-threaded crawler for Baidu social-recruitment job postings.

    Pipeline stages are decoupled by queues:

        form_data -> form_queue -> get_data (x4) -> json_queue
                  -> parse_data -> content_queue -> save_data -> MongoDB

    Each worker pairs every ``get()`` with a ``task_done()`` so that
    ``Queue.join()`` in :meth:`main` can detect when the pipeline drains.
    """

    # Fields copied verbatim from each posting into the stored document.
    FIELDS = ('education', 'name', 'publishDate', 'serviceCondition',
              'updateDate', 'workContent', 'workPlace', 'workYears')

    def __init__(self):
        # POST endpoint returning the job list as JSON.
        self.url = 'https://talent.baidu.com/httservice/getPostListNew'
        self.headers = {
            'Host': 'talent.baidu.com',
            'Origin': 'https://talent.baidu.com',
            'Referer': 'https://talent.baidu.com/jobs/social-list',
            'User-Agent': '',
            'Cookie': 'BIDUPSID=F4445B37EDD903DFE4635B2BAE533C0F; PSTM=1656736794; BAIDUID=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=36557_38112_38470_38439_38404_38468_38289_38486_37929_26350_38423_37881; BAIDUID_BFESS=F4445B37EDD903DF0CF2B2CD5B621428:FG=1; BA_HECTOR=002g2l000k2g8h85058425bu1i2o27g1n; ZFY=1pIkyXYuGTdDYvHq4SRq4FJSivK95:Bb4qRVK43CU510:C; PSINO=5; delPer=0; Hm_lvt_50e85ccdd6c1e538eb1290bc92327926=1680617574; Hm_lpvt_50e85ccdd6c1e538eb1290bc92327926=1680617679; RT="z=1&dm=baidu.com&si=9fc57594-e773-4241-b0f6-ea0a982d68dd&ss=lg2cb410&sl=4&tt=97p&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"'
        }
        # Documents land in database "python", collection "work".
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.work
        # Hand-off queues between pipeline stages.
        self.form_queue = Queue()
        self.json_queue = Queue()
        self.content_queue = Queue()

    def form_data(self):
        """Producer: enqueue one POST form payload per result page (1-4)."""
        for page in range(1, 5):
            self.form_queue.put({
                'recruitType': 'SOCIAL',
                'pageSize': '10',
                'keyWord': '',
                'curPage': f'{page}',
                'projectType': ''
            })

    def get_data(self):
        """Worker: POST each queued form; enqueue the decoded JSON reply."""
        while True:
            data = self.form_queue.get()
            try:
                response = requests.post(self.url, headers=self.headers, data=data)
                self.json_queue.put(response.json())
            finally:
                # Always acknowledge the task, even on a failed request,
                # otherwise form_queue.join() in main() deadlocks.
                self.form_queue.task_done()

    def parse_data(self):
        """Worker: split each JSON page into one document per job posting."""
        while True:
            result = self.json_queue.get()
            try:
                for one in result['data']['list']:
                    item = {field: one[field] for field in self.FIELDS}
                    self.content_queue.put(item)
            finally:
                self.json_queue.task_done()

    def save_data(self):
        """Worker: insert each parsed document into MongoDB."""
        while True:
            item = self.content_queue.get()
            try:
                self.collection.insert_one(item)
                print('插入数据成功')
            except Exception as e:
                # One bad document must not kill the writer thread.
                print('插入数据失败', repr(e))
            finally:
                self.content_queue.task_done()

    def main(self):
        """Start all pipeline threads, wait until every queue drains, clean up."""
        threads = [threading.Thread(target=self.form_data)]
        # Network fetches dominate, so run four downloader workers.
        threads.extend(threading.Thread(target=self.get_data) for _ in range(4))
        threads.append(threading.Thread(target=self.parse_data))
        threads.append(threading.Thread(target=self.save_data))
        for thread in threads:
            # Daemon threads die with the main thread, so the endless
            # `while True` workers cannot keep the process alive.  Must be
            # set before start(); `thread.daemon = True` replaces the
            # deprecated Thread.setDaemon().
            thread.daemon = True
            thread.start()
        # Give the producer a head start: if main() reached the joins before
        # anything was put(), all queue counts would still be zero and the
        # joins would return immediately.
        time.sleep(1)
        for queue in (self.form_queue, self.json_queue, self.content_queue):
            # join() blocks until every put() has a matching task_done().
            queue.join()
        self.db.close()
if __name__ == '__main__':
    # Entry point: build the crawler and run the whole pipeline.
    BaiDu().main()
# Multi-threaded crawler (多线程爬虫)
# First published 2023-04-08 21:43:01