xpath 爬取51job,存于excel

#coding=gb18030
import requests
from bs4 import BeautifulSoup
from lxml import etree
import os
import time
import xlwt
import urllib2
import string

# Module-level state shared by the rest of the script.
ur=[]   # search-result page URLs, filled by the loop below
th=[]   # never read in the visible script; kept so existing globals are unchanged
zp=u'招聘_'   # same marker text the detail-page loop looks for inside <title>
xl=xlwt.Workbook()
# cell_overwrite_ok=True: writing the same cell twice does not raise in xlwt.
st=xl.add_sheet('job',cell_overwrite_ok=True)

# Fixed parts of the 51job search URL; only the page number between them varies,
# so they are built once instead of on every loop iteration.
# NOTE: the query parameter is "degreefrom". It has to stay spelled out in ASCII:
# if "&deg" is decoded as an HTML entity the parameter name is destroyed and a
# non-ASCII character ends up in the URL.
url_head='http://search.51job.com/list/040000,000000,0000,00,9,99,python,2,'
url_tail=('.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
          '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
          '&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line='
          '&specialarea=00&from=&welfare=')

# Queue result pages 1 and 2.
for i in range(1,3):
    url=url_head+str(i)+url_tail
    ur.append(url)

# Browser-like User-Agent sent with every search-page request.
header={'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)'}

# Download each search-result page and collect the href of every matching <a>.
job_link=[]
for u in ur:
    req=urllib2.Request(u,headers=header)
    resp=urllib2.urlopen(req)
    try:
        response=resp.read()
    finally:
        # Release the connection even when reading the body fails.
        resp.close()
    html=etree.HTML(response)
    if html is None:
        # etree.HTML may hand back None when it could not build an element
        # tree; skip the page instead of failing on html.xpath below.
        continue
    # The attribute name must be plain ASCII "onmousedown". Look-alike Greek
    # omicron characters in the name can never match the attribute in the page
    # (and make the expression a non-ASCII byte string under Python 2).
    result=html.xpath('//a[@onmousedown=""]/@href')
    job_link.extend(result)

# Visit every collected job page and write one spreadsheet row per job.
# Rows are numbered from 1 in job_link order (row 0 is never written), and a
# job that fails keeps its row number, so its row stays empty or partly filled.
for k,job in enumerate(job_link,1):
    try:
        # Send the same User-Agent as the search-page requests for consistency.
        req=urllib2.Request(job,headers=header)
        res=urllib2.urlopen(req)
        try:
            respon=res.read()
        finally:
            # Always release the connection, even if read() raises.
            res.close()
        sou=etree.HTML(respon)

        # Column 0: the <title> text after the marker, minus its last 12 characters.
        title=sou.xpath('//title/text()')[0]
        pos=title.find(zp)
        # find() returns -1 when the marker is absent; start at 0 in that case
        # instead of silently slicing from a meaningless offset.
        ss=pos+len(zp) if pos!=-1 else 0
        st.write(k,0,title[ss:-12].strip())

        # Column 1: first <h1> text.
        st.write(k,1,sou.xpath('//h1/text()')[0])

        # Column 2: every text node under span.sp4, comma separated.
        t1=sou.xpath('//span[@class="sp4"]//text()')
        st.write(k,2,','.join(t1))

        # Column 3: first text of span.lname.
        st.write(k,3,sou.xpath('//span[@class="lname"]/text()')[0])

        # Column 4: concatenated text of the "bmsg inbox" div.
        dz=sou.xpath('//div[@class="bmsg inbox"]//text()')
        st.write(k,4,''.join(dz).strip())

        # Column 5: the second <strong> text on the page.
        st.write(k,5,sou.xpath('//strong/text()')[1])

        # Column 6: concatenated text of the "bmsg job_msg inbox" div, newlines removed.
        di=sou.xpath('//div[@class="bmsg job_msg inbox"]//text()')
        st.write(k,6,''.join(di).replace('\n','').strip())
    except Exception as e:
        # Best effort: one bad page must not abort the crawl, but report it
        # rather than hiding it. Catching Exception (not a bare except) lets
        # Ctrl-C still stop the script. %r keeps the message ASCII-safe.
        print('skipped %r: %r' % (job,e))

# Persist everything collected so far.
xl.save('d:\\job_python.xls')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值