Sharing simple code for scraping XX recruitment listings

A simple crawler, shared for reference

This is a simple crawler for the XX recruitment site, for learning and exchange only. Illegal use is strictly prohibited, and I take no responsibility for any such use.

The crawler mainly uses requests + BeautifulSoup, with a small amount of regular expressions.
A proxy pool is also included; thanks to J_hao104 for providing the method for fetching proxies.
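
As a quick sketch of how these pieces fit together (this reuses the proxy-pool endpoint from the full script and assumes it returns a plain "ip:port" string; the address may no longer be online):

import requests
from bs4 import BeautifulSoup

# ask the proxy pool for one proxy address
proxy = requests.get("http://123.207.35.36:5010/get/").text.strip()

# fetch a page through that proxy and parse it with BeautifulSoup
resp = requests.get("https://www.zhaopin.com/",
                    proxies={"http": "http://{}".format(proxy)},
                    timeout=10)
soup = BeautifulSoup(resp.text, "html.parser")
print(soup.title)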

The scraped data is finally written to an Excel file. The Python version is 3.6, and as usual I recommend Anaconda for managing Python versions and their packages.
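
The export itself is just a pandas call. A minimal sketch with a made-up two-row DataFrame (the real one is built in getOnePageJobs below):

import pandas as pd

jobs = pd.DataFrame({"岗位名称": ["数据分析师", "数据工程师"],
                     "薪水": ["15K-25K", "20K-30K"]})
jobs.to_excel("jobs.xls")  # .xls needs xlwt; use a .xlsx filename to write via openpyxl instead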

To keep the scraping running smoothly, the crawler must be disguised as a browser, and it must not fetch pages too frequently: sleep for a while between requests.
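
A minimal sketch of both tricks, assuming the fake_useragent package is installed (the full script below does the same):

import random
import time

import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {"User-Agent": ua.chrome}  # look like a Chrome browser instead of python-requests
resp = requests.get("https://www.zhaopin.com/", headers=headers, timeout=10)

time.sleep(random.randint(1, 5))  # wait 1-5 seconds before the next request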

The full code is pasted below, for learning and exchange only. Where the code falls short, please feel free to optimize it yourselves!

#-*- encoding: utf-8 -*-
import requests
import pandas as pd
import random
import time
from bs4 import BeautifulSoup as BS
import re
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
        'User-Agent': ua.chrome,
        #'Host': 'sou.zhaopin.com',  # the search API is served from fe-api.zhaopin.com, so don't pin the Host header
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
# fetch a proxy address from the proxy pool
def get_proxy():
    # the proxy-pool API returns a plain "ip:port" string; .text keeps it usable in a URL
    return requests.get("http://123.207.35.36:5010/get/").text.strip()

get_proxy = get_proxy()  # grab one proxy up front and reuse it for every request
#print(get_proxy)
def getHtml(start,city,kw):
    # Zhaopin search API: start/pageSize control paging; cityId and kw come from user input
    url = "https://fe-api.zhaopin.com/c/i/sou?start=" + str(start) + \
          "&pageSize=90&cityId=" +city+\
          "&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&sortType=publish&kw="+kw+\
          "&kt=3&lastUrlQuery=%7B%22jl%22:%22538%22,%22kw%22:%22%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88%22,%22kt%22:%223%22%7D&_v=0.79472005&" \
          "x-zp-page-request-id=15621c9c7abd41679024b2ac5cf0f992-1541566903009-311773"
    #print(url)
    #response = requests.get(url, headers)
    response = requests.get(url, headers=headers, proxies={"http": "http://{}".format(get_proxy)})
    html = response.json()
    return html


def getOnePageJobs(html):
    # parse one API page into a DataFrame; each job's detail page is fetched for its description
    global job_datas
    job_datas = html["data"]["results"]
    jobs = pd.DataFrame(
        # columns=["compay", "type", "size", "jobName", "salary", "workingExp", "eduLevel", "welfare", "createDate",
        #          "endDate", "city", "geo_lon", "geo_lat"])
        columns = ["公司名称", "公司类型", "公司规模", "岗位名称", "薪水", "工作经验", "学历", "岗位亮点", "创建时间",
                       "截止时间", "城市", "位置坐标经度", "位置坐标纬度", "岗位职责"])
    for i in range(len(job_datas)):
        company = job_datas[i]["company"]["name"]
        company_type = job_datas[i]["company"]["type"]["name"]
        company_size = job_datas[i]["company"]["size"]["name"]
        jobName = job_datas[i]["jobName"]
        salary = job_datas[i]["salary"]
        workingExp = job_datas[i]["workingExp"]['name']
        eduLevel = job_datas[i]["eduLevel"]["name"]
        welfare = job_datas[i]["welfare"]
        createDate = job_datas[i]["createDate"]
        endDate = job_datas[i]["endDate"]
        positionURL = job_datas[i]["positionURL"]
        city = job_datas[i]["city"]["display"]
        geo_lon = job_datas[i]['geo']["lon"]
        geo_lat = job_datas[i]['geo']["lat"]
        # second layer: fetch the job detail page for the description text
        response = requests.get(positionURL, headers=headers, proxies={"http": "http://{}".format(get_proxy)})
        soup = BS(response.text, 'html.parser')
        content = soup.find_all('div', class_='pos-ul')  # the job-description block
        # keep only runs of Chinese characters from the description
        pattern = re.compile(u"[\u4e00-\u9fa5]+")
        content = re.findall(pattern, str(content))
        # strip stray font names (新魏/华文楷体/宋体/微软雅黑/磅) and leftover punctuation
        content = re.sub("[新魏华文楷体宋体微软雅黑磅\[\]\"\'\,\.\,\。]", "", str(content))
        job = pd.DataFrame(
            [company, company_type, company_size, jobName, salary, workingExp, eduLevel, welfare, createDate, endDate,
             city, geo_lon, geo_lat,content]).T
        # job.columns = ["compay", "type", "size", "jobName", "salary", "workingExp", "eduLevel", "welfare", "createDate",
        #                "endDate", "city", "geo_lon", "geo_lat","content"]
        job.columns = ["公司名称", "公司类型", "公司规模", "岗位名称", "薪水", "工作经验", "学历", "岗位亮点", "创建时间",
                                       "截止时间", "城市", "位置坐标经度", "位置坐标纬度","岗位职责"]
        jobs = pd.concat([jobs, job], ignore_index=True)
    return jobs


if __name__ == "__main__":
    jobs = []
    i = 1
    city = input("请输入您想待的城市:")
    kw = input("请输入您想干的岗位名称:")
    page = input("请输入您想查询前多少页的内容:")
    while i<(int(page)+1):
        print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+" 正在爬取第" + str(i) + "页...")
        start = 90 * (i - 1)
        startTime = time.time()
        html = getHtml(start,city,kw)
        onepagejobs = getOnePageJobs(html)
        jobs.append(onepagejobs)
        # pause a random 1-5 seconds on each page to avoid triggering anti-scraping
        s = random.randint(1, 5)
        time.sleep(s)
        endTime = time.time()
        print("爬取第" + str(i) + "页,耗时:" + str(endTime - startTime)+"s")
        i += 1

    print("全部爬取完毕!")

    alljobs = pd.concat(jobs, ignore_index=True)
    alljobs.to_excel(str(city)+"_"+str(kw)+"_"+"智联招聘.xls")  # writing .xls needs the xlwt package