import csv
import re
import threading
import time
from queue import Empty
from queue import Queue

import requests
class Producer(threading.Thread):
    """Worker thread that fetches job-listing API pages and enqueues one dict per post.

    Pulls page URLs from ``url_queue`` until it is drained, fetches each page's
    JSON, and puts a record dict per post onto ``page_queue`` for a consumer.
    """

    # Browser-like User-Agent so the careers API does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
    }

    # Splits a detail URL into its prefix (group 1) and trailing postId digits (group 2).
    # Compiled once instead of twice per post inside the loop.
    _POST_ID_RE = re.compile(r'(.*?)postId=(\d+)')

    def __init__(self, url_queue, page_queue):
        """url_queue: Queue of API page URLs to fetch; page_queue: receives post dicts."""
        super(Producer, self).__init__()
        self.url_queue = url_queue
        self.page_queue = page_queue

    def run(self):
        # Drain the URL queue. get_nowait()/Empty avoids the check-then-get race:
        # with several producers, empty() could report False and then a blocking
        # get() would hang forever after a sibling thread took the last item.
        # BUG FIX: the original read the module-level global url_queue here
        # instead of self.url_queue.
        while True:
            try:
                pageurl = self.url_queue.get_nowait()
            except Empty:
                break
            self.pares_page(pageurl)

    def pares_page(self, pageurl):
        """Fetch one API page and enqueue a record for each post in it.

        NOTE(review): method name kept for backward compatibility
        ('parse' misspelled in the original).
        """
        # BUG FIX: requests.get's second positional parameter is `params`,
        # not `headers` -- the original appended the header dict to the query
        # string and sent no User-Agent. Pass it by keyword.
        response = requests.get(pageurl, headers=Producer.headers)
        time.sleep(1)  # crude rate limiting between API calls
        response.encoding = 'utf-8'
        results = response.json()
        for post in results['Data']['Posts']:
            self.page_queue.put(self._build_post_record(post))

    @staticmethod
    def _build_post_record(post):
        """Build the CSV record dict for one post, repairing mismatched detail URLs.

        Some scraped detail URLs carry a postId that disagrees with the post's
        own PostId field; in that case the URL is rebuilt from the authoritative id.
        """
        postdict = {
            'PostName': post.get('RecruitPostName'),
            'CountryName': post.get('CountryName'),
            'LocationName': post.get('LocationName'),
        }
        # str() so the comparison and concatenation below work even when the
        # API returns PostId as an integer (the original raised TypeError then).
        postid = str(post.get('PostId'))
        postu = post.get('PostURL')
        match = Producer._POST_ID_RE.search(postu)
        if postid != match.group(2):
            postdict['PostURL'] = match.group(1) + 'postId=' + postid
        else:
            postdict['PostURL'] = postu
        return postdict
class Consumer(threading.Thread):
    """Worker thread that drains page_queue and writes all collected posts to tencent.csv."""

    def __init__(self, page_queue):
        """page_queue: Queue of post record dicts produced by Producer threads."""
        super(Consumer, self).__init__()
        self.page_queue = page_queue

    def run(self):
        # Drain whatever is currently queued. get_nowait()/Empty avoids the
        # check-then-get race when several threads share the queue.
        # BUG FIX: the original read the module-level global page_queue here
        # instead of self.page_queue.
        post_list = []
        while True:
            try:
                post_list.append(self.page_queue.get_nowait())
            except Empty:
                break
        # NOTE(review): mode 'w' truncates the file, so concurrent Consumer
        # threads would overwrite each other's output -- run a single consumer.
        # utf-8-sig adds a BOM so Excel detects the encoding correctly.
        with open('tencent.csv', 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=('PostName', 'CountryName', 'LocationName', 'PostURL'))
            writer.writeheader()
            writer.writerows(post_list)
if __name__ == '__main__':
    url_queue = Queue()
    page_queue = Queue()

    # Enqueue the first 10 result pages of the Tencent careers query API.
    for u in range(1, 11):
        url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1650801412555&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001005&parentCategoryId=&attrId=&keyword=&pageIndex={u}&pageSize=10&language=zh-cn&area=cn'
        url_queue.put(url)

    # Fetch the pages concurrently with three producer threads, and wait for
    # all of them so page_queue is fully populated before consuming.
    p_list = []
    for i in range(3):
        t = Producer(url_queue, page_queue)
        t.start()
        p_list.append(t)
    for p in p_list:
        p.join()

    # BUG FIX: the original started three Consumer threads, each of which
    # opened tencent.csv in 'w' mode and truncated the others' output, so the
    # final file held only one thread's slice of the posts. A single consumer
    # writes every record exactly once.
    consumer = Consumer(page_queue)
    consumer.start()
    consumer.join()
# 本文适合初学者,如果你是python大神,欢迎留言指点。
# (This article is aimed at beginners; if you are a Python expert, feel free to leave comments with pointers.)