20200318_抓取51job招聘数据存数据库

import pandas as pd
import requests
from lxml import etree
import chardet
import numpy as np
# Columns filled from the search-result listing pages.
zwmc, gsmc, gzdd = [], [], []   # job title, company name, work location
xz_low, xz_height = [], []      # lowest / highest monthly salary
ptime = []                      # publish time
href = []                       # URL of the job detail page

# Columns filled from each job detail page.
a = []  # region
b = []  # experience
c = []  # education
d = []  # number of openings
e = []  # time
# url='https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html'
from datetime import date


def _parse_salary(raw):
    """Convert a listing-page salary string into ``(low, high)`` numbers.

    Recognised shapes (the unit sits at ``[-3]`` and the period at ``[-1]``
    of the upper bound, e.g. ``'1-1.5万/月'``):

    * ``X-Y万/月``  -> both bounds multiplied by 10000
    * ``X-Y万/年``  -> both bounds multiplied by 10000, divided by 12,
      rounded to one decimal
    * ``X-Y千/月``  -> both bounds multiplied by 1000
    * ``N元/天``    -> ``(N, N)`` as floats; the daily figure is stored as-is
      and is NOT converted to a monthly amount

    Anything else (empty text, unknown unit, non-numeric bounds) yields
    ``(0, 0)``.
    """
    text = raw.strip()
    if not text:
        return 0, 0
    parts = text.split('-')
    try:
        if len(parts) > 1:
            upper = parts[1]
            # Need at least '<unit>/<period>' before indexing [-3] and [-1].
            if len(upper) < 3:
                return 0, 0
            unit, period = upper[-3], upper[-1]
            if unit == '万' and period == '月':
                return float(parts[0]) * 10000, float(upper[:-3]) * 10000
            if unit == '万' and period == '年':
                return (round(float(parts[0]) * 10000 / 12, 1),
                        round(float(upper[:-3]) * 10000 / 12, 1))
            if unit == '千' and period == '月':
                return float(parts[0]) * 1000, float(upper[:-3]) * 1000
            return 0, 0
        single = parts[0]
        if len(single) >= 3 and single[-1] == '天' and single[-3] == '元':
            # Stored as a float so the salary columns hold one numeric type.
            daily = float(single[:-3])
            return daily, daily
    except ValueError:
        # Non-numeric bound: treat like any other unrecognised format.
        return 0, 0
    return 0, 0


_LIST_URL = 'https://search.51job.com/list/070200,000000,0000,00,9,99,%2B,2,{0}.html'
_LIST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# The listing only provides month-day, so the current year is prepended.
# NOTE(review): postings seen across a year boundary would get the wrong year.
_YEAR_PREFIX = '{0}-'.format(date.today().year)

# NOTE(review): range(2) requests page numbers 0 and 1 -- confirm whether the
# site's paging is 1-based, in which case this should be range(1, 3).
for page in range(2):
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(_LIST_URL.format(page), headers=_LIST_HEADERS, timeout=10)
    response.encoding = chardet.detect(response.content)['encoding']
    result = etree.HTML(response.text)
    if result is None:
        # Empty or unparseable body: nothing to extract from this page.
        continue

    # Extract every field relative to its own row so that a missing cell
    # cannot shift the values of the following rows into the wrong record.
    for row in result.xpath('//div[@class="el"]'):
        titles = row.xpath('./p/span/a/text()')        # job title
        links = row.xpath('./p/span/a/@href')          # detail link
        if not titles or not links:
            # Not a job row (no title link); skip it.
            continue
        companies = row.xpath('./span[@class="t2"]/a/text()')  # company name
        places = row.xpath('./span[@class="t3"]/text()')       # work location
        salary_nodes = row.xpath('./span[@class="t4"]')        # salary
        dates = row.xpath('./span[@class="t5"]/text()')        # publish time

        salary_text = salary_nodes[0].xpath('string(.)') if salary_nodes else ''
        low, high = _parse_salary(salary_text)

        zwmc.append(titles[0].strip())
        gsmc.append(companies[0] if companies else np.nan)
        gzdd.append(places[0] if places else np.nan)
        xz_low.append(low)
        xz_height.append(high)
        ptime.append((_YEAR_PREFIX + dates[0]) if dates else np.nan)
        href.append(links[0])
# Visit every job detail page and split its summary line into columns.
# Exactly one value is appended to each of a, b, c, d and e per URL, so the
# lists stay the same length as href.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
for detail_url in href:
    print(detail_url)
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(detail_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # One unreachable page should not discard everything scraped so far;
        # record an all-NaN row for it instead.
        print('request failed: {0}'.format(exc))
        fields = []
    else:
        response.encoding = chardet.detect(response.content)['encoding']
        result = etree.HTML(response.text)
        if result is None:
            fields = []
        else:
            # Text nodes of the summary paragraph under the job header.
            fields = result.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()')

    if len(fields) == 5:
        area, experience, education, headcount, posted = fields
    elif len(fields) == 4:
        # Four fields: the experience entry is treated as absent.
        area, education, headcount, posted = fields
        experience = np.nan
    else:
        # Unexpected layout (e.g. company-branded pages): no usable fields.
        area = experience = education = headcount = posted = np.nan

    a.append(area)
    b.append(experience)
    c.append(education)
    d.append(headcount)
    # NOTE(review): the last field is presumably the posting time -- confirm.
    e.append(posted)
https://jobs.51job.com/nanjing/120765370.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/111504132.html?s=01&t=0
https://jobs.51job.com/nanjing/119195851.html?s=01&t=0
https://jobs.51job.com/fuzhou/120763909.html?s=01&t=0
https://jobs.51job.com/nanjing/119679799.html?s=01&t=0
https://jobs.51job.com/nanjing/120759263.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120758430.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/117194442.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/116958794.html?s=01&t=0
https://jobs.51job.com/nanjing/120749829.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/120747304.html?s=01&t=0
https://jobs.51job.com/nanjing/120748516.html?s=01&t=0
https://jobs.51job.com/nanjing/120747138.html?s=01&t=0
https://jobs.51job.com/nanjing/120746319.html?s=01&t=0
http://astrazeneca.51job.com/sc/show_job_detail.php?jobid=120745775
https://jobs.51job.com/nanjing-jyq/120116860.html?s=01&t=0
https://jobs.51job.com/nanjing-xwq/119839700.html?s=01&t=0
https://jobs.51job.com/nanjing-yhtq/120735484.html?s=01&t=0
https://jobs.51job.com/nanjing/120734422.html?s=01&t=0
https://jobs.51job.com/nanjing/117008657.html?s=01&t=0
https://jobs.51job.com/nanjing/119619851.html?s=01&t=0
https://jobs.51job.com/nanjing/120763081.html?s=01&t=0
https://jobs.51job.com/nanjing/94954959.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/105410257.html?s=01&t=0
https://jobs.51job.com/nanjing/120751843.html?s=01&t=0
https://jobs.51job.com/chuzhou/120750509.html?s=01&t=0
http://schaeffler.51job.com/sc/show_job_detail.php?jobid=111620488
https://jobs.51job.com/nanjing/114661488.html?s=01&t=0
http://deppon.51job.com/sc/show_job_detail.php?jobid=119032948
https://jobs.51job.com/nanjing/120759749.html?s=01&t=0
https://jobs.51job.com/nanjing/120335790.html?s=01&t=0
https://jobs.51job.com/nanjing/120077889.html?s=01&t=0
https://jobs.51job.com/nanjing-yhtq/120736428.html?s=01&t=0
https://jobs.51job.com/nanjing/118627153.html?s=01&t=0
https://jobs.51job.com/chuzhou/120741653.html?s=01&t=0
https://jobs.51job.com/nanjing-xwq/120741607.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/107120958.html?s=01&t=0
https://jobs.51job.com/nanjing/120738083.html?s=01&t=0
https://jobs.51job.com/nanjing/108133718.html?s=01&t=0
https://jobs.51job.com/nanjing/119959950.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/117650408.html?s=01&t=0
https://jobs.51job.com/nanjing-glq/104704234.html?s=01&t=0
https://jobs.51job.com/nanjing/114939099.html?s=01&t=0
https://jobs.51job.com/nanjing-lsq/120621444.html?s=01&t=0
https://jobs.51job.com/nanjing-lsq/120432024.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/104835579.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120766217.html?s=01&t=0
https://jobs.51job.com/nanjing/120766170.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120750779.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/120517790.html?s=01&t=0
# Column header -> scraped value list, in the order the columns should appear.
column_names = (
    '职位名称', '公司名称', '工作地点',
    '职位最低月薪', '职位最高月薪',
    '发布时间', '网站地址',
    '地区', '经验', '学历', '招聘人数',
)
column_values = (
    zwmc, gsmc, gzdd,
    xz_low, xz_height,
    ptime, href,
    a, b, c, d,
)
data = dict(zip(column_names, column_values))
import MySQLdb

def dic2sql(dic, sql):
    """Render the items of *dic* as SQL value tuples and splice them into *sql*.

    Each ``(key, value)`` pair is formatted with ``str()`` (for example
    ``('apple', 216)``), the pairs are joined with commas, and the result is
    substituted for the single ``%s`` placeholder in *sql*.

    NOTE(review): the statement is built by string formatting, so this is
    only safe for trusted data; prefer driver-side parameter binding when
    the values come from outside.
    """
    rendered_rows = ','.join(str((key, value)) for key, value in dic.items())
    return sql % rendered_rows

if __name__ == '__main__':
    # Demo: insert two sample rows into the `users` table.
    dic = {'apple': 216, 'jar': 138}
    sql = "insert into users (login,userid) VALUES %s;"

    ret = dic2sql(dic, sql)
    # print(ret)

    # Connect to MySQL and submit the data. The try/finally blocks make sure
    # the cursor and connection are released even if execute/commit raises.
    cxn = MySQLdb.connect(user='root', password='password', db='test')
    try:
        cur = cxn.cursor()
        try:
            cur.execute(ret)
        finally:
            cur.close()
        cxn.commit()
    finally:
        cxn.close()
# Turn the collected columns into a table, append it to MySQL and also save
# a spreadsheet copy.
test = pd.DataFrame(data)

from sqlalchemy import create_engine

connection_url = "mysql+pymysql://root:123456@localhost:3306/spider?charset=utf8"
engine = create_engine(connection_url)

# Append to the existing table; the DataFrame index is not written as a column.
to_sql_options = {
    'name': 'cnblog',
    'con': engine,
    'if_exists': 'append',
    'index': False,
    'index_label': False,
}
test.to_sql(**to_sql_options)

test.to_excel('test.xlsx')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值