Python 51job Crawler

Had some free time, so I scraped salary listings from 51job. It's a low-effort target: the site serves a simple list layout, there are no real anti-crawling measures, and I didn't even bother with multiprocessing. Straight to the code.

# -*-encoding:utf-8 -*-
'''
created by zwg in 2017-04-03
'''

import urllib2
from bs4 import BeautifulSoup
import csv
import sys

# Python 2 workaround so Chinese strings can be encoded as UTF-8 when
# written out to the CSV file.
reload(sys)
sys.setdefaultencoding('utf8')


def set_opener():
    # Build and install a global opener. The auth handler carries no
    # credentials; it is only used to construct a reusable opener object.
    handle1 = urllib2.HTTPBasicAuthHandler()
    myopener = urllib2.build_opener(handle1)
    urllib2.install_opener(myopener)
    return myopener


def get_html(myopener, url):
    # Fetch a page; return '' on any network error so the caller can
    # treat the failure as "no more pages".
    try:
        page = myopener.open(url)
        html = page.read()
    except Exception:
        html = ''
    return html

def get_one_page_info(html):
    # Parse one result page: extract the five fields of every job row
    # plus the URL of the "next page" link.
    soup = BeautifulSoup(html, 'lxml')
    job_info = soup.find_all('div', class_='el')
    jobs = []

    def field_text(tag):
        # Return the tag's string, or '' when the tag is missing.
        return tag.string if tag is not None else ''

    for job in job_info:
        zhiwei = field_text(job.find('a', target='_blank'))  # position
        gongsi = field_text(job.find('span', class_='t2'))   # company
        dizhi = field_text(job.find('span', class_='t3'))    # location
        xinchou = field_text(job.find('span', class_='t4'))  # salary
        riqi = field_text(job.find('span', class_='t5'))     # posting date
        one_job = [zhiwei, gongsi, xinchou, dizhi, riqi]
        try:
            one_job = [s.strip() for s in one_job]
        except AttributeError:
            # A field's .string was None (nested tags); keep the row as-is.
            pass
        # Keep rows with at least one non-empty field, and skip the table
        # header, whose date column contains '时间' ("time").
        if False in [s == '' for s in one_job] and '时间' not in (riqi or ''):
            jobs.append(one_job)
    try:
        next_page_url = soup.find('a', text='下一页')['href']  # '下一页' = "next page"
    except (TypeError, KeyError):
        next_page_url = ''
    return jobs, next_page_url


url="http://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,1.html?lang=" \
    "c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&compa" \
    "nysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fro" \
    "mType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

myopener = set_opener()
next_page_url = '0'  # sentinel so the loop body runs at least once
jobs_file = open('jobs.csv', 'wb')
writer = csv.writer(jobs_file)
k = 1
while next_page_url != '':
    html = get_html(myopener, url)
    jobs, next_page_url = get_one_page_info(html)
    print 'Page %d: got %d jobs' % (k, len(jobs))
    writer.writerows(jobs)
    url = next_page_url  # follow the "next page" link
    k += 1
jobs_file.close()
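
The script above is Python 2 (urllib2, the print statement, sys.setdefaultencoding). For readers on Python 3, here is a minimal sketch of the same crawler using urllib.request instead. The CSS classes ('el', 't2' through 't5') and the GBK decoding are assumptions carried over from the 2017 page layout and may not match the live site today.

# -*- coding: utf-8 -*-
# Python 3 sketch of the crawler above. Assumptions: 51job list pages are
# GBK-encoded and still use the 2017 markup ('el' rows, 't2'..'t5' spans).
import csv
import urllib.request

from bs4 import BeautifulSoup


def get_html(url):
    # Fetch a page; '' on any error means "stop paginating".
    try:
        with urllib.request.urlopen(url) as page:
            return page.read().decode('gbk', errors='ignore')
    except Exception:
        return ''


def get_one_page_info(html):
    soup = BeautifulSoup(html, 'lxml')
    jobs = []
    for job in soup.find_all('div', class_='el'):
        fields = [job.find('a', target='_blank'),  # position
                  job.find('span', class_='t2'),   # company
                  job.find('span', class_='t4'),   # salary
                  job.find('span', class_='t3'),   # location
                  job.find('span', class_='t5')]   # posting date
        row = [f.string.strip() if f is not None and f.string else ''
               for f in fields]
        # Keep non-empty rows; skip the header row, whose date cell says '时间'.
        if any(row) and '时间' not in row[-1]:
            jobs.append(row)
    nxt = soup.find('a', string='下一页')  # "next page" link
    return jobs, (nxt['href'] if nxt is not None and nxt.has_attr('href') else '')


url = ('http://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,1.html?lang='
       'c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&compa'
       'nysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fro'
       'mType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
with open('jobs.csv', 'w', newline='', encoding='utf-8') as jobs_file:
    writer = csv.writer(jobs_file)
    k = 1
    while url:
        jobs, url = get_one_page_info(get_html(url))
        print('Page %d: got %d jobs' % (k, len(jobs)))
        writer.writerows(jobs)
        k += 1

The logic is unchanged: fetch a list page, pull the five columns from each row, write them to CSV, and follow the '下一页' link until it disappears.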


