37. Scraping 前程无忧 (51job)

前程无忧 shows only a handful of jobs per results page, the top-ranked listings are not necessarily the best ones, and the details are only visible after clicking through to each posting. An ordinary job seeker therefore cannot review many listings, let alone filter out a satisfying position, which is one reason applicants end up mass-mailing resumes. After scraping a large number of postings, I also found that a keyword search on 前程无忧 returns a great many completely unrelated jobs; there is so much junk that casual browsing mostly surfaces junk. Scraping the listings in bulk and filtering them locally gets around both problems.

import time
import random

import requests
import pandas as pd
from lxml import etree


def get_detail(detail_url, headers):
    """Fetch one job's detail page and return the requirements text."""
    resp = requests.get(detail_url, headers=headers)
    # 51job pages are served as GBK, so decode accordingly.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # xpath() returns a list of text nodes; join them instead of
    # str()-ing the list and regex-stripping the brackets.
    ask = html.xpath("/html/body/div[3]/div[2]/div[3]/div[1]/div/p/text()")
    return ''.join(ask).replace(' ', '')

def get_detail_urls(url, headers):
    """Scrape ygdy8.net movie listing pages; unrelated leftover, not used for 51job."""
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    film_names = html.xpath("//table[@class='tbspan']//a/text()")
    detail_urls = ['http://www.ygdy8.net' + href
                   for href in html.xpath("//table[@class='tbspan']//a/@href")]

    data = pd.DataFrame({'film_name': film_names, 'detail_url': detail_urls})
    data.to_csv('film.csv', index=False, sep=';', mode='a', header=False)


HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

# Commented-out driver loop for the ygdy8.net movie scraper above.
'''
for i in range(1, 100):
    url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'.format(i)
    time.sleep(random.random())
    print('Page {}'.format(i))
    get_detail_urls(url, HEADERS)
'''
def get_jobs(HEADERS):
    """Walk the 51job search results and collect one record per posting."""
    job_, salary_, area_, company_, detail_url_, ask_ = [], [], [], [], [], []
    try:
        for page in range(1, 200):
            url = ('https://search.51job.com/list/090200,000000,0000,00,9,99,%2B,2,{}.html'
                   '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
                   '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0'
                   '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00'
                   '&from=&welfare=').format(page)
            resp = requests.get(url, headers=HEADERS)
            text = resp.content.decode('gbk', errors='ignore')
            html = etree.HTML(text)
            print('Page %s' % page)
            time.sleep(random.random())  # throttle requests a little
            # Each results page lists 50 jobs in div[4]..div[53] of #resultList.
            for i in range(4, 54):
                job = html.xpath("//*[@id='resultList']/div[{}]/p/span/a/text()".format(i))[0]
                job = job.replace(' ', '').replace('\r\n', '')
                # Some postings omit the salary span, so guard the [0] lookup.
                salary = html.xpath("//*[@id='resultList']/div[{}]/span[3]/text()".format(i))
                salary = salary[0] if salary else ''
                area = html.xpath("//*[@id='resultList']/div[{}]/span[2]/text()".format(i))[0]
                company = html.xpath("//*[@id='resultList']/div[{}]/span[1]/a/text()".format(i))[0].replace(' ', '')
                detail_url = html.xpath("//*[@id='resultList']/div[{}]/p/span/a/@href".format(i))[0]

                ask_.append(get_detail(detail_url, HEADERS))
                job_.append(job)
                salary_.append(salary)
                area_.append(area)
                company_.append(company)
                # Record the posting's own URL, not the search-page URL.
                detail_url_.append(detail_url)
    except Exception:
        # A short page or a layout change simply ends the crawl.
        pass
    print(job_, '\n', salary_, '\n', company_, '\n', detail_url_)
    print(ask_)
    return job_, salary_, area_, company_, detail_url_, ask_


job_, salary_, area_, company_, detail_url_, ask_ = get_jobs(HEADERS)
df1 = pd.DataFrame({'job': job_, 'salary': salary_, 'area': area_,
                    'company': company_, 'ask': ask_, 'detail_url': detail_url_})
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('test.xlsx') as writer:
    df1.to_excel(writer, sheet_name='Sheet1')
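
With all the listings in one local table, the irrelevant results described above can be filtered out offline. Below is a minimal sketch of that post-filtering step, assuming the test.xlsx produced by the code above; the keyword 'python' and the output name filtered.xlsx are hypothetical placeholders:

import pandas as pd

# Load the spreadsheet written by the scraper above.
df = pd.read_excel('test.xlsx', sheet_name='Sheet1')

# Keep only postings whose title actually contains the search keyword,
# dropping the unrelated junk results the site's own search returns.
keyword = 'python'  # hypothetical example keyword
relevant = df[df['job'].str.contains(keyword, case=False, na=False)]
print('{} of {} postings mention "{}"'.format(len(relevant), len(df), keyword))
relevant.to_excel('filtered.xlsx', sheet_name='Sheet1', index=False)

Filtering on the 'ask' column instead would match against the full job requirements rather than just the title.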