上次写的python多线程爬虫,并没有充分利用cpu资源,实际上只使用了一个核。这里简单地介绍一下几个函数:
pool = Pool(3) #根据cpu核数设置进程个数
pool.map(get_infor,urls) #map函数的作用:将urls中的每个元素放入到get_infor中执行一遍
pool.close() #关闭进程池
pool.join() #等待全部进程结束
代码:
#encoding:utf-8
'''
Created on 2017年12月
'''
from multiprocessing import Pool
import Queue
import urllib2
from bs4 import BeautifulSoup
import time
start = time.time()
request_headers = {
'host': "www.lagou.com",
'connection': "keep-alive",
'cache-control': "no-cache",
'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
'accept': "application/json, text/javascript, */*; q=0.01",
'accept-language': "zh-CN,zh;q=0.9",
'cookie':" "
}
f = open('ab.txt','a+')
def get_infor(url):
requset = urllib2.Request(url,headers=request_headers)
try:
response=urllib2.urlopen(requset)
allcon=response.read()
soup = BeautifulSoup(allcon,"html.parser",from_encoding="gb18030")
except urllib2.URLError, e:
print e.reason
res = get_result(soup)
print url,res
def get_result(soup):
    """Extract every job posting from a parsed listing page.

    soup: BeautifulSoup tree of one lagou.com listing page.
    Returns a string with one tab-separated, newline-terminated line per
    posting: name, location, date, salary, experience, company name,
    company URL, industry.
    """
    rows = []
    # `item` instead of the original `list`, which shadowed the builtin.
    for item in soup.select('.list_item_top'):
        top = item.select('.p_top')[0]
        job_name = top.find('h3').text.strip()
        spans = top.find_all('span')
        job_location = spans[0].text.strip()
        job_date = spans[1].text.strip()
        # .p_bot holds "salary\nexperience" on two lines.
        money_experience = item.select('.p_bot')[0].text.strip().split('\n')
        money = money_experience[0].strip()
        experience = money_experience[1].strip()
        company = item.select('.company')[0]
        company_name = company.select('.company_name')[0].text.strip()
        company_url = company.find('a').get('href').strip()
        industry = company.select('.industry')[0].text.strip()
        # Bug fix: the original joined company_name and company_url with a
        # literal 't' instead of a tab character.
        rows.append('\t'.join([job_name, job_location, job_date, money,
                               experience, company_name, company_url,
                               industry]) + '\n')
    return ''.join(rows)
if __name__ == '__main__':
urls = []
type = 'Java'
for i in range(1,31):
urls.append('https://www.lagou.com/zhaopin/'+str(type)+'/'+str(i)+'/?filterOption='+str(i)+'&city=北京')
pool = Pool(3)
pool.map(get_infor,urls)
pool.close()
pool.join()
print "Elapsed Time: %s" % (time.time() - start)
还有一点,多进程爬虫容易被封。
如有问题,欢迎指正。