参考其他项目的思路和代码,爬取某网站的招聘信息。程序创建后可以运行且没有报错,但没有爬取到任何数据,求助解决方法。参考文章:https://www.cnblogs.com/Beyond-Ricky/p/6771028.html
# -*- coding:utf-8 -*-
import urllib.request
import re
import xlwt #用来创建excel文档并写入数据
#获取网页源码
def get_content(page):
    """Download one page of job-search results and return it as text.

    Args:
        page: Number of the result page to request.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The page number belongs in the ``page`` query parameter.  The previous
    # URL hard-coded ``page=1`` and glued ``<page>.html`` onto ``listValue``,
    # so the ``page`` argument never selected a different result page.
    url = ('http://s.gxrc.com/sJob?district=1&workProperty=-1'
           '&page=' + str(page) +
           '&pageSize=20&orderType=0&listValue=1')
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
def get(html):
    """Extract job records from the HTML of a search-result page.

    Args:
        html: Page source as a string.

    Returns:
        A list of 5-tuples of strings, one per matched record, holding the
        five captured groups in pattern order (the ``title`` attribute of
        the first link, the ``title`` attribute of the link inside the
        ``w2`` heading, and the contents of the ``w3``, ``w4`` and ``w5``
        headings).  The list is empty when nothing matches.
    """
    # Adjacent raw literals are concatenated into one pattern; each piece
    # contributes exactly one capture group.
    pattern = re.compile(
        r'class="w1 ">.*? <a target="_blank" title="(.*?)"'
        r'.*? <h3 class="w2"><a target="_blank" title="(.*?)"'
        r'.*?<h3 class="w3">(.*?)</h3>'
        r'.*?<h3 class="w4">(.*?)</h3>'
        r'.*? <h3 class="w5">(.*?)</h3>',
        re.S,  # let "." match newlines so a record may span several lines
    )
    return pattern.findall(html)
def excel_write(items,index):
    """Write scraped job records into the module-level worksheet ``ws``.

    Each record takes one row, beginning at row ``index``.  The first five
    fields of a record are written to columns 0 through 4, and the row
    number is printed once the record has been written.

    Args:
        items: Iterable of indexable records with at least five fields.
        index: Row number at which the first record is written.
    """
    for row, record in enumerate(items, index):
        for col in range(5):
            ws.write(row, col, record[col])  # row, column, value
        print(row)
newTable = "gxrcw.xls"  # name of the output workbook
wb = xlwt.Workbook(encoding='utf-8')  # create the workbook, declaring its text encoding
ws = wb.add_sheet('sheet1')  # worksheet that excel_write() fills in
headData = ['职位名称','公司名称','薪资','工作地','更新时间']  # header row titles
headStyle = xlwt.easyxf('font: bold on')  # built once instead of once per column
for colnum, title in enumerate(headData):
    ws.write(0, colnum, title, headStyle)  # row, column, value, style

# Row 0 holds the header, so data starts at row 1.  A running counter keeps
# the rows contiguous no matter how many records a page yields; the previous
# offset of (each-1)*50+1 assumed 50 records per page.
index = 1
try:
    for each in range(1, 732):
        items = get(get_content(each))
        if not items:
            # Make an empty page visible instead of silently writing nothing.
            print('page %d: no records matched' % each)
        excel_write(items, index)
        index += len(items)
finally:
    # Save whatever was collected, even if a request failed part-way through.
    wb.save(newTable)