# -*- coding: utf-8 -*-
import os
import logging
import requests
from queue import Queue
from bs4 import BeautifulSoup
#==============================================================================
# Start URL
#==============================================================================
START_URL = (
"http://search.51job.com/list/010000%252C020000%252C030200%252C040000"
",000000,0000,00,9,99,Python,2,{}.html? lang=c&stype=1&postchannel=00"
"00&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lon"
"lat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&"
"address=&line=&specialarea=00&from=&welfare="
)
urls = [START_URL.format(p) for p in range(1, 16)]
# fill the {} placeholder with page numbers 1-15, i.e. crawl 15 list pages
HEADERS = {
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
"(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
}
def get_logger():
"""
创建日志实例
"""
formatter = logging.Formatter("%(asctime)s - %(message)s")
logger = logging.getLogger("monitor")
logger.setLevel(LOG_LEVEL)
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
LOG_LEVEL = logging.INFO  # logging level
logger = get_logger()
count = 1  # counter for the record currently being crawled (re-initialised inside post_require)
company = []  # holds the scraped job/company records
desc_url_queue = Queue()  # queue of job-detail URLs to crawl
# pool = Pool(POOL_MAXSIZE)  # gevent pool that would cap the number of concurrent greenlets (left disabled)
os.makedirs("data", exist_ok=True)  # make sure the output directory used by post_require() exists
#==============================================================================
# Start crawling the company list pages
#==============================================================================
for i, url in enumerate(urls, 1):
    logger.info("Crawling list page {}".format(i))
html = requests.get(url, headers=HEADERS).content.decode("gbk")
bs = BeautifulSoup(html, "lxml").find("div", class_="dw_table").find_all(
"div", class_="el"
)
    for b in bs:
        try:
            href, post = b.find("a")["href"], b.find("a")["title"]
            locate = b.find("span", class_="t3").text
            salary = b.find("span", class_="t4").text
            item = {
                "href": href, "post": post, "locate": locate, "salary": salary
            }
            desc_url_queue.put(href)  # push the job-detail URL onto the queue
            company.append(item)
        except Exception:
            # rows without the expected fields (e.g. the header row) are skipped
            pass
# log the queue length, i.e. how many job-detail URLs were collected
logger.info("Queue length: {}".format(desc_url_queue.qsize()))
#==============================================================================
# The company list has been collected
#==============================================================================
def post_require():
"""
爬取职位描述
"""
count = 1 # 记录当前爬第几条数据
    while True:
        # take a URL from the queue
        url = desc_url_queue.get()
        resp = requests.get(url, headers=HEADERS)
        if resp.status_code == 200:
            logger.info("Crawling job detail {}".format(count))
            html = resp.content.decode("gbk")
            desc_url_queue.task_done()
            # task_done() signals that this queue item has been fully processed so a
            # later queue.join() can unblock; it does not empty the queue
            count += 1
        else:
            # request failed: put the URL back so it can be retried later
            desc_url_queue.put(url)
            continue
try:
bs = BeautifulSoup(html, "lxml").find(
"div", class_="bmsg job_msg inbox"
).text
s = bs.replace("微信", "").replace("分享", "").replace("邮件", "").replace(
"\t", ""
).strip()
with open(
os.path.join("data", "post_require.txt"), "a", encoding="utf-8"
) as f:
f.write(s)
except Exception as e:
logger.error(e)
logger.warning(url)
        if desc_url_queue.empty():
            print("Queue length: " + str(desc_url_queue.qsize()))
            break
#==============================================================================
# Fetch the job description for every page queued in desc_url_queue above
#==============================================================================
post_require()
Using Python's Queue across multiple threads
queue.Queue
The Queue class implements a basic first-in, first-out (FIFO) container: put() adds an element at the tail of the sequence and get() removes an element from the head.
q = Queue()
for i in range(3):
q.put(i)
while not q.empty():
print(q.get())
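The example above is single-threaded. Below is a minimal sketch of the same Queue shared by several worker threads, which is the pattern the crawler's desc_url_queue is meant for; the worker body and the thread count here are placeholders, not part of the original script.

import threading
from queue import Queue

task_queue = Queue()

def worker():
    # each worker keeps pulling items; daemon threads simply die with the main thread
    while True:
        url = task_queue.get()
        try:
            pass  # fetch and parse the page here
        finally:
            task_queue.task_done()  # always mark the item as finished

for u in ("url-1", "url-2", "url-3"):
    task_queue.put(u)

for _ in range(3):
    threading.Thread(target=worker, daemon=True).start()

task_queue.join()  # blocks until every put() item has been marked task_done()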
gevent.pool
Python provides basic coroutine support through yield, but it is incomplete; the third-party gevent library offers much more complete support.
gevent is a third-party library that implements coroutines on top of greenlet. The basic idea is:
when a greenlet hits an I/O operation, such as a network request, it automatically switches to another greenlet, and switches back at a suitable point once the I/O has finished. Since I/O is slow and would otherwise leave the program waiting, gevent's automatic switching keeps some greenlet running at all times instead of blocking on I/O.
Because the switching happens automatically at I/O calls, gevent has to modify some of Python's standard library modules; this is done at startup through monkey patching:
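A minimal sketch of that startup step; gevent must be installed, and the fetch function and URL below are placeholders:

from gevent import monkey
monkey.patch_all()  # patch socket, time, etc. so blocking calls yield to other greenlets

import gevent
import requests  # imported after patching so its sockets become cooperative

def fetch(url):
    return requests.get(url).status_code

jobs = [gevent.spawn(fetch, "http://www.51job.com") for _ in range(3)]
gevent.joinall(jobs)  # run all greenlets to completion
print([job.value for job in jobs])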
Pool
The Pool class provides, among others, the following methods (a short usage sketch follows the list):
apply()
apply_async()
map()
map_async()
close()
terminate()
join()
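A short usage sketch of Pool; the URL list is a placeholder, and spawn()/join() are the calls the commented-out pool in the script above would rely on:

from gevent import monkey
monkey.patch_all()

from gevent.pool import Pool
import requests

def fetch_page(url):
    return requests.get(url).status_code

pool = Pool(4)                      # at most 4 greenlets run at the same time
for u in ["http://www.51job.com"] * 8:
    pool.spawn(fetch_page, u)       # spawn() waits whenever the pool is already full
pool.join()                         # wait for every spawned greenlet to finish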
A blocking queue can keep the thread from ever finishing
Queues are the most common way to exchange data between threads. The queue module provides these operations; it is simple to use, but a careless call can still lead to surprises such as:
"gevent.hub.LoopExit: This operation would block forever"
Workaround: avoid gevent here (see https://www.cnblogs.com/cjaaron/p/9178083.html).
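The error typically means a blocking get() was issued on an empty queue with no other greenlet left to put anything into it, so the call really would block forever. A minimal standard-library sketch that avoids the situation with a timeout; the processing step is a placeholder:

from queue import Queue, Empty

q = Queue()

def drain(q):
    while True:
        try:
            item = q.get(timeout=2)   # raises queue.Empty instead of blocking forever
        except Empty:
            break                     # nothing arrived within the grace period, stop
        try:
            print(item)               # process the item here
        finally:
            q.task_done()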