目录
需求:
编写函数,可以指明抓取某职位以及页数实现抓不同职位时(用正则和request),存到同一个excel中的不同sheet中,并将数据存到mysql中
初版:
以下是第一次编写代码时的思路
#编写正则
def getjobname(jobname,startnum,endnum):
allresult=[]
jobname1=request.quote(request.quote(jobname))
for i in range(startnum,endnum):
print('正在抓取第',i,'页')
url='https://search.51job.com/list/000000,000000,0000,00,9,99,'+jobname1+',2,'+str(i)+'.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
response=request.urlopen(url)
html=response.read().decode('gbk')
#编写正则用来提取信息,去掉的东西都用.*?来代替
reg=re.compile('<div class="el">.*?<p class="t1 ">.*?<a target="_blank" title="(.*?)".*?<span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?</div>',re.S)
result=re.findall(reg,html)
allresult+=result
这里对jobname进行了解码处理,将函数执行时输入的汉字职位转化为匹配51job代码的值
#存储到xls:
if os.path.exists('51job职位信息.xlsx'):
workbook = load_workbook("51job职位信息.xlsx")
sheetNames = workbook.sheetnames
if jobname in sheetNames:
print("sheetName 重复")
return
else:
sheet=workbook.create_sheet(jobname,len(sheetNames)+1)
sheet.append('职位名,公司名,工作地点,薪资,发布时间'.split(','))
for each in allresult:
sheet.append(each)
else:
workbook=Workbook()
sheet=workbook.active
sheet.title=jobname
sheet.append('职位名,公司名,工作地点,薪资,发布时间'.split(','))
for each in allresult:
sheet.append(each)
workbook.save('51job职位信息.xlsx')
#存储到mysql
connection = pymysql.connect(
host='localhost',
user='root',
password='1234',
db='jobinfo',
charset='utf8'
)
cursor = connection.cursor()
for index, each in enumerate(allresult):
sql = "insert into info