https://search.51job.com/list/060000,000000,0000,00,9,99,python,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
这是要爬取的网址
需要爬取这些内容,不多说,直接开始写了。
import re
import random
import urllib.request
import xlwt
# 获取html页面信息
def getHtml():
    """Download the 51job Python job-listing page and return it as text.

    Builds an opener around one randomly chosen proxy, installs it as the
    process-wide default urllib opener, requests the hard-coded search URL
    and decodes the response body as GBK.

    Returns:
        str: The decoded HTML of the listing page.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    # The query string used to read "cotype=99°reefrom=99": the "&deg" of
    # "&degreefrom" had been rendered as a degree sign, which dropped the
    # parameter and put a non-ASCII character into the request target.
    url = (
        'https://search.51job.com/list/060000,000000,0000,00,9,99,python,2,1.html'
        '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
        '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
        '&specialarea=00&from=&welfare='
    )
    req = urllib.request.Request(url)

    proxies = ["121.31.159.231:8123", "110.73.10.193:8123", "110.73.10.204:8123"]
    # NOTE(review): only the "http" scheme is mapped here while the URL is
    # https, so the proxy is presumably not applied to this request - confirm.
    proxy_handler = urllib.request.ProxyHandler({"http": random.choice(proxies)})
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)

    # Use the response as a context manager so the connection is always
    # released, even if reading or decoding fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('gbk')
    return html
# getHtml()
# 获取想要爬取的数据
def getdata(html):
    """Extract job records from the listing page HTML.

    Args:
        html: The page markup as a string.

    Returns:
        list: One 5-tuple of strings per match, holding the captured text of
        the "t1" title, "t2" title, "t3", "t4" and "t5" spans in that order.
        Empty when nothing matches.
    """
    # re.S lets ".*?" run across line breaks between the spans of one record.
    pattern = re.compile(
        r'class="t1 ">.*?<a target="_blank" title="(.*?)".*? '
        r'<span class="t2"><a target="_blank" title="(.*?)".*? '
        r'<span class="t3">(.*?)</span>.*?'
        r'<span class="t4">(.*?)</span>.*?'
        r'<span class="t5">(.*?)</span>',
        re.S,
    )
    return pattern.findall(html)
# Module-level accumulator of scraped rows; each entry is a 5-item list.
dataList = []


def saveDataList():
    """Fetch the listing page, parse it and append every record to dataList.

    Each tuple returned by getdata() is copied into a new 5-element list
    before being appended. Returns None.
    """
    for record in getdata(getHtml()):
        dataList.append([record[k] for k in range(5)])
# 存入Excel文件中
def saveExcel(path):
    """Write the rows collected in dataList to an .xls workbook.

    Row 0 of the sheet holds the five column headers; every entry of
    dataList follows on its own row, in order.

    Args:
        path: Destination handed to the workbook's save() call.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('51job职位信息')

    headers = ['职位名', '公司名', '工作地点', '薪资', '发布时间']
    for column, title in enumerate(headers):
        worksheet.write(0, column, title)

    # Data rows start at 1 because row 0 is taken by the header.
    for row_index, record in enumerate(dataList, start=1):
        for column in range(5):
            worksheet.write(row_index, column, record[column])

    workbook.save(path)
# Script entry: fetch and parse the listing page into dataList, then write
# the collected rows to '51job.xls'.
saveDataList()
saveExcel('51job.xls')