基于requests的51job数据爬取并储存

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import csv

import requests
from fake_useragent import UserAgent
from lxml import etree
# Scrape the first result page of a 51job "python" search and append one
# (job name, post date, salary) row per listing to job.csv.
agent = UserAgent()
url = 'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,11,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=21&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
response = requests.get(
    url,
    # Random desktop User-Agent to reduce the chance of being blocked.
    headers={'User-Agent': agent.random}
)
# 51job serves GBK-family content; let requests sniff the real charset
# instead of defaulting to ISO-8859-1.
response.encoding = response.apparent_encoding
root = etree.HTML(response.text)
# One <div class="el"> per job listing inside the results table.
div_list = root.xpath("//div[@class='dw_table']/div[@class='el']")

# Open the output file ONCE (the original re-opened it per listing) and
# write through csv.writer so fields containing commas are quoted and no
# stray trailing comma is emitted.  newline='' is required by the csv docs.
with open('job.csv', 'a', encoding='gb18030', newline='') as f:
    writer = csv.writer(f)
    for div in div_list:
        # Salary may be absent ("negotiable"); fall back to the placeholder.
        money = div.xpath("span[@class='t4']/text()")
        money = money[0] if money else "面议"
        print(money)
        # The job-title anchor is always present, so no empty-list guard.
        a = div.xpath("p/span/a")[0]
        job_name = a.xpath("text()")[0].strip()
        print(job_name)
        # Post date may also be missing; keep the original placeholder text.
        date_time = div.xpath("span[@class='t5']/text()")
        date_time = date_time[0] if date_time else "没有时间"
        print(date_time)
        writer.writerow([job_name, date_time, money])

 

转载于:https://my.oschina.net/u/3771014/blog/1629381

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值