# A spare-time scraper for 51job salary listings. Low-effort target: simple
# list pages, no notable anti-bot measures, not even multiprocessing needed.
# -*-encoding:utf-8 -*-
'''
created by zwg in 2017-04-03
'''
import urllib2
from bs4 import BeautifulSoup
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def set_opener():
    """Build and globally install a urllib2 opener for the crawl.

    Adds a browser-like User-Agent header: the default
    "Python-urllib/x.y" agent is commonly rejected or served a
    bot-block page by job sites. The opener is also installed
    globally so any plain urllib2.urlopen() call uses it too.

    Returns:
        The configured urllib2 OpenerDirector.
    """
    handle1 = urllib2.HTTPBasicAuthHandler()
    myopener = urllib2.build_opener(handle1)
    # Present as a regular browser instead of the default Python agent.
    myopener.addheaders = [('User-Agent',
                            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
    urllib2.install_opener(myopener)
    return myopener
def get_html(myopener, url):
    """Fetch *url* through *myopener* and return the body as a string.

    Best-effort: returns '' on any fetch failure so the crawl loop can
    terminate cleanly instead of crashing mid-run.

    Args:
        myopener: a urllib2 OpenerDirector (see set_opener()).
        url: absolute URL to fetch.

    Returns:
        The raw response body, or '' on error.
    """
    try:
        page = myopener.open(url)
        try:
            return page.read()
        finally:
            # Always release the underlying socket, even if read() fails.
            page.close()
    except Exception:
        # Narrower than the original bare `except:` — no longer swallows
        # KeyboardInterrupt/SystemExit, but keeps best-effort semantics.
        return ''
def get_one_page_info(html):
    """Parse one 51job list page into job rows plus the next-page link.

    Args:
        html: page source as a string (may be '' for a failed fetch).

    Returns:
        (jobs, next_page_url) where jobs is a list of
        [title, company, salary, location, date] string rows and
        next_page_url is '' when there is no further page.
    """
    soup = BeautifulSoup(html, 'lxml')
    jobs = []

    def _text(node):
        # Stripped text of a bs4 node, or '' when the node is missing or
        # has no direct string. Bug fix: the old code could leave None in
        # a row and then crash with TypeError on `'时间' in riqi`.
        if node is None or node.string is None:
            return ''
        return node.string.strip()

    for job in soup.find_all('div', class_='el'):
        zhiwei = _text(job.find('a', target='_blank'))      # job title
        gongsi = _text(job.find('span', class_='t2'))       # company
        dizhi = _text(job.find('span', class_='t3'))        # location
        xinchou = _text(job.find('span', class_='t4'))      # salary
        riqi = _text(job.find('span', class_='t5'))         # post date
        one_job = [zhiwei, gongsi, xinchou, dizhi, riqi]
        # Keep rows with at least one non-empty field (same semantics as
        # the original `False in [s=='' ...]`), and drop the table header
        # row, whose date cell contains the literal '时间'.
        if any(one_job) and '时间' not in riqi:
            jobs.append(one_job)

    try:
        next_page_url = soup.find('a', text='下一页')['href']
    except (TypeError, KeyError):
        # find() returned None (TypeError on subscript) or no href.
        next_page_url = ''
    return jobs, next_page_url
url="http://search.51job.com/list/030200,000000,0000,00,9,99,%2520,2,1.html?lang=" \
"c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&compa" \
"nysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fro" \
"mType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
myopener=set_opener()
html=get_html(myopener,url)
next_page_url='0'
jobs_file=file('jobs.csv','wb+')
writer=csv.writer(jobs_file)
k=1
while next_page_url!='':
html = get_html(myopener, url)
jobs,next_page_url=get_one_page_info(html)
print 'The %s page: get %s jobs'%(str(k),str(len(jobs)))
writer.writerows(jobs)
url=next_page_url
k+=1
jobs_file.close()