生产者与消费者模式:
1.导入模块:
import time
from queue import Empty, Queue
from threading import Thread

import requests
from lxml import etree
2.生产者:
def get_page(url, page_list):
    """Producer: download listing pages and queue their HTML for parsing.

    Page numbers are taken from ``page_list`` until it is empty. Each page
    number is converted to the offset ``(page - 1) * 10``, appended to
    ``url`` and fetched. The response body is put on the module-level
    ``url_list`` queue.

    Args:
        url: Base URL; the numeric offset is appended directly to it.
        page_list: ``queue.Queue`` of page numbers. It may be shared by
            several producer threads.

    Returns:
        None. The function returns once ``page_list`` is exhausted.
    """
    headers = {
        'User-agent': 'ios/xxxx.3.3.3.10'
    }
    while True:
        # Take the item and detect emptiness in one step. A separate
        # empty() check followed by a blocking get() can hang forever when
        # another producer thread takes the last item in between.
        try:
            page = page_list.get_nowait()
        except Empty:
            break
        # Do the arithmetic before converting to text: str(page - 1) * 10
        # would repeat the digit string ten times instead of multiplying.
        full_url = url + str((page - 1) * 10)
        try:
            response = requests.get(full_url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One failed page must not kill the worker; report and move on.
            print('failed to fetch %s: %s' % (full_url, exc))
            continue
        url_list.put(response.text)
3.消费者(解析):
def parse_page():
    """Consumer: parse queued listing pages and append jobs to position.txt.

    Runs until the module-level ``exit_flag`` becomes true. Each item taken
    from the module-level ``url_list`` queue is parsed as HTML. Every
    ``<tr class="even">`` or ``<tr class="odd">`` row that has both a link
    text in its first cell and a text in its third cell produces one
    ``name,people_number`` line, appended to ``position.txt`` as UTF-8.
    """
    while not exit_flag:
        # Wait with a timeout so the loop condition is re-evaluated
        # regularly; a plain get() would block forever on an empty queue
        # and the thread could never observe exit_flag.
        try:
            html = url_list.get(timeout=1)
        except Empty:
            continue

        try:
            tree = etree.HTML(html)
        except (etree.LxmlError, ValueError) as exc:
            print('skipping unparsable page: %s' % exc)
            continue
        if tree is None:
            # Nothing parseable in this item.
            continue

        rows = tree.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        jobs = []
        for row in rows:
            name = row.xpath('.//td[1]/a/text()')
            people_number = row.xpath('.//td[3]/text()')
            # Skip only the incomplete row instead of abandoning the rest
            # of the page on an IndexError.
            if not name or not people_number:
                continue
            jobs.append(name[0] + ',' + people_number[0] + '\n')

        # Open the output file once per page rather than once per row.
        if jobs:
            with open('position.txt', 'a', encoding='utf-8') as f:
                f.writelines(jobs)
4.存放已获取页面内容的队列与退出标志:
# Queue of fetched page bodies: get_page() puts each response text here
# and parse_page() takes them off for parsing.
url_list = Queue()
# Loop condition of parse_page(): the consumer keeps running while this
# is False.
exit_flag = False
5.调用函数:
# Script entry point: queue the page numbers, run the producer threads and
# one consumer thread, then shut the consumer down once all fetched pages
# have been taken off url_list.
if __name__ == '__main__':
    url = 'https://hr.tencent.com/position.php?start='

    # Work queue of page numbers 1..279 shared by the producers.
    page_list = Queue()
    for page in range(1, 280):
        page_list.put(page)

    # A single consumer keeps the appends to position.txt sequential.
    # It is a daemon thread so that a consumer stuck in a blocking get()
    # cannot keep the process alive after the work is done.
    consumers = [Thread(target=parse_page, daemon=True)]
    for t in consumers:
        t.start()

    producers = [
        Thread(target=get_page, args=(url, page_list)) for _ in range(3)
    ]
    for t in producers:
        t.start()
    for t in producers:
        t.join()

    # All pages are fetched; wait until the consumer has taken every one
    # of them. Stop waiting if no consumer is left alive to drain the queue.
    while not url_list.empty() and any(t.is_alive() for t in consumers):
        time.sleep(0.1)

    # Ask the consumer loop to finish, and give it a bounded time to do so.
    exit_flag = True
    for t in consumers:
        t.join(timeout=5)