import pandas as pd
import requests
from lxml import etree
import chardet
import numpy as np
# Accumulators filled from the search-result listing pages, one entry per job:
# title, company, location, lowest/highest salary, publish date, detail URL.
zwmc, gsmc, gzdd = [], [], []
xz_low, xz_height = [], []
ptime, href = [], []

# Accumulators filled from each job's detail page.  a-d later become the
# region / experience / education / headcount columns; e is a spare list.
a, b, c, d, e = [], [], [], [], []
def _salary_unit(label):
    """Return the (magnitude, period) characters of a salary label, or None.

    The magnitude is the third character from the end and the period is the
    last one, e.g. ``'1.5万/月'`` -> ``('万', '月')``.  Labels shorter than
    three characters have no unit.
    """
    if len(label) < 3:
        return None
    return label[-3], label[-1]


def _parse_salary(text):
    """Convert a salary label from the listing page into ``(low, high)``.

    Recognised shapes (anything else yields ``(0, 0)``):

    * ``'<low>-<high>万/月'`` -> both bounds multiplied by 10000
    * ``'<low>-<high>万/年'`` -> multiplied by 10000, divided by 12 and
      rounded to one decimal
    * ``'<low>-<high>千/月'`` -> both bounds multiplied by 1000
    * ``'<n>元/天'``          -> ``(n, n)``; the per-day figure is stored
      as-is, not converted to a monthly amount

    Args:
        text: Raw text of the salary cell; may be empty or whitespace.

    Returns:
        A ``(low, high)`` tuple of numbers.
    """
    # Strip first so a whitespace-only cell is treated like an empty one
    # instead of reaching the character indexing below.
    text = text.strip()
    if not text:
        return 0, 0
    parts = text.split('-')
    try:
        if len(parts) > 1:
            low, high = parts[0], parts[1]
            unit = _salary_unit(high)
            if unit == ('万', '月'):
                return float(low) * 10000, float(high[:-3]) * 10000
            if unit == ('万', '年'):
                return (round(float(low) * 10000 / 12, 1),
                        round(float(high[:-3]) * 10000 / 12, 1))
            if unit == ('千', '月'):
                return float(low) * 1000, float(high[:-3]) * 1000
        elif _salary_unit(parts[0]) == ('元', '天'):
            # Stored as a number so the salary columns keep a single type.
            daily = float(parts[0][:-3])
            return daily, daily
    except ValueError:
        # Non-numeric bounds are treated like any other unrecognised label.
        pass
    return 0, 0


# Crawl the search-result pages and append one entry per job to the
# module-level column lists (zwmc, gsmc, gzdd, xz_low, xz_height, ptime, href).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# NOTE(review): the page index starts at 0 here -- confirm the site accepts it.
for page in range(2):
    url = 'https://search.51job.com/list/070200,000000,0000,00,9,99,%2B,2,{0}.html'.format(page)
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = chardet.detect(response.content)['encoding']
    result = etree.HTML(response.text)
    if result is None:
        # Nothing parseable came back for this page.
        continue
    # Read every field relative to its own row so that a row with a missing
    # cell cannot shift the values of the rows after it.
    for row in result.xpath('//div[@class="el"]'):
        titles = row.xpath('p/span/a/text()')
        links = row.xpath('p/span/a/@href')
        if not titles or not links:
            # Not a job row (no title link).
            continue
        company = row.xpath('span[@class="t2"]/a/text()')
        place = row.xpath('span[@class="t3"]/text()')
        posted = row.xpath('span[@class="t5"]/text()')
        low, high = _parse_salary(row.xpath('string(span[@class="t4"])'))

        zwmc.append(titles[0].strip())
        gsmc.append(company[0] if company else np.nan)
        gzdd.append(place[0] if place else np.nan)
        xz_low.append(low)
        xz_height.append(high)
        # NOTE(review): the year prefix is hard-coded -- confirm it is still
        # the intended value when re-running the crawl.
        ptime.append('2018-' + posted[0] if posted else np.nan)
        href.append(links[0])
# Visit every job-detail URL collected in `href` and split the summary line
# of the page into the module-level lists a, b, c, d and e.  Exactly one
# value is appended to each list per URL so they stay aligned with `href`.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
for link in href:
    print(link)
    try:
        # A timeout keeps one stalled page from hanging the whole crawl.
        response = requests.get(link, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Keep going: a single unreachable page becomes a row of NaN instead
        # of discarding everything gathered so far.
        print('request failed: {0}'.format(exc))
        fields = []
    else:
        response.encoding = chardet.detect(response.content)['encoding']
        result = etree.HTML(response.text)
        if result is None:
            fields = []
        else:
            fields = result.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()')

    if len(fields) == 5:
        first, second, third, fourth, last = fields
    elif len(fields) == 4:
        # NOTE(review): a four-part line is assumed to be missing its second
        # field -- confirm against real pages.
        first, third, fourth, last = fields
        second = np.nan
    else:
        # Any other layout (e.g. pages where the XPath matches nothing).
        first = second = third = fourth = last = np.nan

    a.append(first)
    b.append(second)
    c.append(third)
    d.append(fourth)
    # Keep e in step with a-d: it receives the trailing field of the line.
    e.append(last)
# Captured output (apparently from the print() call in the detail loop above):
# https://jobs.51job.com/nanjing/120765370.html?s=01&t=0
# https://jobs.51job.com/nanjing-qhq/111504132.html?s=01&t=0
# https://jobs.51job.com/nanjing/119195851.html?s=01&t=0
# https://jobs.51job.com/fuzhou/120763909.html?s=01&t=0
# https://jobs.51job.com/nanjing/119679799.html?s=01&t=0
# https://jobs.51job.com/nanjing/120759263.html?s=01&t=0
# https://jobs.51job.com/nanjing-qxq/120758430.html?s=01&t=0
# https://jobs.51job.com/nanjing-jnq/117194442.html?s=01&t=0
# https://jobs.51job.com/nanjing-qhq/116958794.html?s=01&t=0
# https://jobs.51job.com/nanjing/120749829.html?s=01&t=0
# https://jobs.51job.com/nanjing-qhq/120747304.html?s=01&t=0
# https://jobs.51job.com/nanjing/120748516.html?s=01&t=0
# https://jobs.51job.com/nanjing/120747138.html?s=01&t=0
# https://jobs.51job.com/nanjing/120746319.html?s=01&t=0
# http://astrazeneca.51job.com/sc/show_job_detail.php?jobid=120745775
# https://jobs.51job.com/nanjing-jyq/120116860.html?s=01&t=0
# https://jobs.51job.com/nanjing-xwq/119839700.html?s=01&t=0
# https://jobs.51job.com/nanjing-yhtq/120735484.html?s=01&t=0
# https://jobs.51job.com/nanjing/120734422.html?s=01&t=0
# https://jobs.51job.com/nanjing/117008657.html?s=01&t=0
# https://jobs.51job.com/nanjing/119619851.html?s=01&t=0
# https://jobs.51job.com/nanjing/120763081.html?s=01&t=0
# https://jobs.51job.com/nanjing/94954959.html?s=01&t=0
# https://jobs.51job.com/nanjing-qhq/105410257.html?s=01&t=0
# https://jobs.51job.com/nanjing/120751843.html?s=01&t=0
# https://jobs.51job.com/chuzhou/120750509.html?s=01&t=0
# http://schaeffler.51job.com/sc/show_job_detail.php?jobid=111620488
# https://jobs.51job.com/nanjing/114661488.html?s=01&t=0
# http://deppon.51job.com/sc/show_job_detail.php?jobid=119032948
# https://jobs.51job.com/nanjing/120759749.html?s=01&t=0
# https://jobs.51job.com/nanjing/120335790.html?s=01&t=0
# https://jobs.51job.com/nanjing/120077889.html?s=01&t=0
# https://jobs.51job.com/nanjing-yhtq/120736428.html?s=01&t=0
# https://jobs.51job.com/nanjing/118627153.html?s=01&t=0
# https://jobs.51job.com/chuzhou/120741653.html?s=01&t=0
# https://jobs.51job.com/nanjing-xwq/120741607.html?s=01&t=0
# https://jobs.51job.com/nanjing-qxq/107120958.html?s=01&t=0
# https://jobs.51job.com/nanjing/120738083.html?s=01&t=0
# https://jobs.51job.com/nanjing/108133718.html?s=01&t=0
# https://jobs.51job.com/nanjing/119959950.html?s=01&t=0
# https://jobs.51job.com/nanjing-jnq/117650408.html?s=01&t=0
# https://jobs.51job.com/nanjing-glq/104704234.html?s=01&t=0
# https://jobs.51job.com/nanjing/114939099.html?s=01&t=0
# https://jobs.51job.com/nanjing-lsq/120621444.html?s=01&t=0
# https://jobs.51job.com/nanjing-lsq/120432024.html?s=01&t=0
# https://jobs.51job.com/nanjing-qxq/104835579.html?s=01&t=0
# https://jobs.51job.com/nanjing-qxq/120766217.html?s=01&t=0
# https://jobs.51job.com/nanjing/120766170.html?s=01&t=0
# https://jobs.51job.com/nanjing-qxq/120750779.html?s=01&t=0
# https://jobs.51job.com/nanjing-jnq/120517790.html?s=01&t=0
# Column-name -> value-list mapping used to build the result table.
# Key order is the column order of the resulting table.
data = {
    # Values collected from the search-result listing pages.
    '职位名称': zwmc,        # job title
    '公司名称': gsmc,        # company name
    '工作地点': gzdd,        # work location
    '职位最低月薪': xz_low,     # lowest monthly salary
    '职位最高月薪': xz_height,  # highest monthly salary
    '发布时间': ptime,       # publish date
    '网站地址': href,        # detail-page URL
    # Values collected from each job's detail page.
    '地区': a,             # region
    '经验': b,             # experience
    '学历': c,             # education
    '招聘人数': d,           # number of openings
}
import MySQLdb
def dic2sql(dic, sql):
    """Render *dic* as a SQL ``VALUES`` list and substitute it into *sql*.

    Every ``key: value`` pair becomes one row written as ``str((key, value))``;
    the rows are joined with commas and inserted into *sql* with ``%``
    formatting.

    Args:
        dic: Mapping whose items are turned into ``(key, value)`` rows, in
            iteration order.
        sql: Statement template containing a single ``%s`` placeholder.

    Returns:
        The statement with the placeholder replaced by the rendered rows.
        An empty mapping yields an empty replacement.
    """
    # NOTE(review): values are quoted by Python's repr(), not by the database
    # driver, so this is only safe for trusted, simple values.  Prefer
    # cursor.execute()/executemany() with parameters for anything else.
    rows = ','.join(str((key, value)) for key, value in dic.items())
    return sql % rows
if __name__ == '__main__':
    # Demo: render a small dict as an INSERT statement and run it against
    # the local `test` database.
    dic = {'apple': 216, 'jar': 138}
    sql = "insert into users (login,userid) VALUES %s;"
    ret = dic2sql(dic, sql)
    # NOTE(review): credentials are hard-coded -- move them to configuration.
    cxn = MySQLdb.connect(user='root', password='password', db='test')
    try:
        cur = cxn.cursor()
        try:
            cur.execute(ret)
        finally:
            cur.close()
        cxn.commit()
    finally:
        # Always release the connection, even when execute/commit fails.
        cxn.close()
# Turn the collected columns into a table, then persist it twice:
# appended to the `cnblog` table of the local `spider` MySQL database and
# written to an Excel workbook in the working directory.
test = pd.DataFrame(data)

from sqlalchemy import create_engine

# NOTE(review): connection credentials are embedded in the URL.
engine = create_engine(
    "mysql+pymysql://root:123456@localhost:3306/spider?charset=utf8"
)
test.to_sql(
    name='cnblog',
    con=engine,
    if_exists='append',
    index=False,
    index_label=False,
)
test.to_excel('test.xlsx')