我们以腾讯社招页面来做演示:http://hr.tencent.com/position.php?&start=10#a
使用BeautifulSoup4解析器,将招聘网页上的职位名称、职位类别、招聘人数、工作地点、发布时间,以及每个职位详情的点击链接提取并存储下来。
from bs4 import BeautifulSoup
import urllib.request
import json # 使用了json格式存储
def tencent():
    """Scrape one page of Tencent job listings and save it to ``tencent.json``.

    Downloads ``position.php?&start=10#a`` from ``http://hr.tencent.com/``,
    collects every table row whose ``class`` attribute is ``even`` or ``odd``
    (all ``even`` rows first, then all ``odd`` rows), and writes the records
    as one JSON array, UTF-8 encoded, to ``tencent.json`` in the current
    working directory.

    Each record carries the keys ``name``, ``detailLink``, ``catalog``,
    ``recruitNumber``, ``publishTime`` and ``workLocation``.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        IndexError: if a matched row has no link or fewer than five cells.
    """
    url = 'http://hr.tencent.com/'
    request = urllib.request.Request(url + 'position.php?&start=10#a')
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        resHtml = response.read()

    html = BeautifulSoup(resHtml, 'lxml')

    # CSS selectors: the listing table alternates "even" and "odd" rows.
    result = html.select('tr[class="even"]')
    result.extend(html.select('tr[class="odd"]'))
    print(result)

    items = []
    for site in result:
        # Query the row once instead of re-running the selector per field.
        link = site.select('td a')[0]
        cells = site.select('td')
        items.append({
            'name': link.get_text(),
            # The href is relative to the site root, so prefix the base URL.
            'detailLink': url + link.attrs['href'],
            'catalog': cells[1].get_text(),
            'recruitNumber': cells[2].get_text(),
            'publishTime': cells[4].get_text(),
            'workLocation': cells[3].get_text(),
        })

    # Keep non-ASCII characters readable in the file instead of \uXXXX escapes.
    line = json.dumps(items, ensure_ascii=False)

    # Open the output only after scraping succeeded, so a failure above does
    # not leave an empty file behind; ``with`` guarantees the handle is closed.
    with open('tencent.json', 'wb') as output:
        output.write(line.encode('utf-8'))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    tencent()
爬取结果(tencent.json):
[
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40821&keywords=&tid=0&lid=0",
"workLocation": "北京",
"catalog": "职能类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "S2-MIG风险管理经理(北京)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40825&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "市场类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "19116-互联网+医疗行业经理(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40815&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "技术类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "24012-H5游戏开发工程师(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40818&keywords=&tid=0&lid=0",
"workLocation": "北京",
"catalog": "设计类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "23674-视觉设计(北京)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40820&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "设计类",
"publishTime": "2018-05-24",
"recruitNumber": "2",
"name": "24491-高级多媒体设计师(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40824&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "技术类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "26564-后台开发工程师(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40828&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "市场类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "MIG15-腾讯叮当高级销售经理"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40817&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "产品/项目类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "26564-项目经理(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40819&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "产品/项目类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "24491-游戏英文文案翻译(深圳)"
},
{
"detailLink": "http://hr.tencent.com/position_detail.php?id=40822&keywords=&tid=0&lid=0",
"workLocation": "深圳",
"catalog": "产品/项目类",
"publishTime": "2018-05-24",
"recruitNumber": "1",
"name": "SD3-海外PM(日语方向)"
}
]