# 复习一下另一种提取方式:Python根据xpath造的轮子,美味汤。/20170110
# 复习findAll和select两种提取方法。/20170110
# coding: utf-8
__author__ = '姜枫渔火'
import requests
from bs4 import BeautifulSoup
import json
import sys
from fake_useragent import UserAgent
from multiprocessing import Pool
def getOnePage(num, timeout=10):
    """Fetch one page of the job listing and return its HTML text.

    Args:
        num: Offset of the first listing on the page; it is appended to the
            URL as the value of the ``start`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single
            unresponsive connection would block the caller forever.

    Returns:
        The response body decoded to ``str`` by ``requests``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # A freshly randomised User-Agent header is sent with every request.
    headers = {'User-Agent': UserAgent().random}
    url = 'http://hr.tencent.com/position.php?&start=' + str(num)
    res = requests.get(url, headers=headers, timeout=timeout)
    return res.text
def prasePage(html):
    """Yield one dict per job posting found in a listing page.

    The markup is parsed with the ``lxml`` parser and every ``<tr>`` element
    is collected. The first row and the last two rows are skipped
    (presumably the table header and footer rows; confirm against the live
    page). For each remaining row the posting title, link, category,
    headcount, location and publish date are read from its ``<td>`` cells.

    Args:
        html: HTML text of one listing page.

    Yields:
        A dict keyed by the Chinese field names with the extracted text.

    Raises:
        IndexError: If a row has no ``<td> <a>`` link or fewer than five
            ``<td>`` cells.
    """
    soup = BeautifulSoup(html, 'lxml')

    # The listing rows are <tr> tags that alternate between the "odd" and
    # "even" classes, so ``soup.select('.odd') + soup.select('.even')`` is an
    # alternative way to collect them; here findAll is used instead.
    rows = soup.findAll('tr')  # a bs4.element.ResultSet

    for row in rows[1:-2]:
        # Each row is a bs4.element.Tag, so it can be queried further with
        # bs4 itself and no regular expressions are needed.
        link = row.select('td a')[0]
        cells = row.select('td')
        yield {
            '职位名称': link.get_text(),
            '职位链接': 'http://hr.tencent.com/' + link.attrs['href'],
            '职位类别': cells[1].get_text(),
            '招聘人数': cells[2].get_text(),
            '工作地点': cells[3].get_text(),
            '发布时间': cells[4].get_text(),
        }
def writeFile(content, path='招聘信息1.txt'):
    """Append ``content`` to a text file as one JSON document per line.

    Args:
        content: Any JSON-serialisable object (the scraper passes a dict).
        path: File to append to. Defaults to the original output file name.
    """
    # ensure_ascii=False keeps Chinese characters readable instead of
    # emitting \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # Because the line contains raw non-ASCII text, the encoding is pinned to
    # UTF-8; relying on the platform default can raise UnicodeEncodeError or
    # produce a file in a locale-specific encoding.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(i):
    """Scrape the listing page at offset ``i``; print and save each posting.

    Args:
        i: Listing offset passed on to ``getOnePage``.
    """
    for record in prasePage(getOnePage(i)):
        print(record)
        writeFile(record)
if __name__ == '__main__':
    # Offsets 0, 10, ..., 490: fifty listing pages.
    offsets = list(range(0, 500, 10))
    workers = Pool()  # start a pool of worker processes
    workers.map(main, offsets)
    # Stop accepting new tasks, then wait for the workers to exit.
    workers.close()
    workers.join()