Python Web Scraping: 51job (前程无忧)

Introduction

  51job (前程无忧) publishes a large number of job postings and offers one way to inform employment decisions. The first step is to check the site's robots.txt: the request returns a 404, meaning no crawling rules are published, which is taken here to mean the site's pages may be crawled.
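
  A quick way to confirm this is to request robots.txt directly and check the status code; a minimal sketch using the requests library (the robots.txt URL is assumed here):

import requests

# Check whether the site publishes a robots.txt; a 404 response means none is available.
resp = requests.get('https://www.51job.com/robots.txt')
print(resp.status_code)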

Target Content

  For example, scrape the job postings returned for the keyword "python".
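
  The search keyword appears in the list-page URL in double-percent-encoded form; the sample URLs in the code below encode the keyword 家教. A minimal sketch of building the URL for an arbitrary keyword (build_page_url is a hypothetical helper, assuming the same URL pattern):

from urllib.parse import quote

# Hypothetical helper: 51job appears to double-percent-encode the keyword in the list-page URL.
def build_page_url(keyword, page):
    encoded = quote(quote(keyword))
    return ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
            '{},2,{}.html'.format(encoded, page))

print(build_page_url('python', 1))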

Technical Workflow

  1. Specify the URLs

# Build the list-page URLs (pages 1 to 49 here); the URL pattern can be found in the
# browser developer tools by visiting successive result pages.
pages_url = []
for i in range(1,50):
    pages_url.append('https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25AE%25B6%25E6%2595%2599,2,{}.html'.format(i))
# UA spoofing: generate a random User-Agent header
ua=UserAgent()
headers = {
    'User-Agent': ua.random
}
param = {
    'lang': 'c',
    'postchannel': 0000,
    'workyear': 99,
    'cotype': 99,
    'degreefrom': 99,
    'jobterm': 99,
    'companysize': 99,
    'ord_field': 0,
    'dibiaoid': 0,
    'line': '',
    'welfare': ''
}

  2. Request the list pages

dic_page_text = []
def get_dic_page_text(url):
    response = rq.get(url = url,headers = headers,params = param).text
    # The HTML returned here differs from what the developer tools render; it can be inspected
    # with BeautifulSoup. A regular expression then extracts the embedded JSON.
    techer_page_text = re.compile('window.__SEARCH_RESULT__ = (.*?)</script>').findall(response)[0]
    # Decode the extracted string as JSON and keep the engine_search_result list.
    dic_page_text.append(json.loads(techer_page_text)['engine_search_result'])
    # Pause for 1 second to avoid overloading the server.
    time.sleep(1)
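
  As the comment above notes, the raw HTML can be inspected with BeautifulSoup to locate where the job data is embedded; a minimal sketch, assuming the bs4 package and the variables defined above:

from bs4 import BeautifulSoup

# Fetch one list page and pretty-print the HTML to locate the window.__SEARCH_RESULT__ script.
raw_html = rq.get(url = pages_url[0],headers = headers,params = param).text
soup = BeautifulSoup(raw_html, 'html.parser')
print(soup.prettify()[:2000])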

  3. Extract the response data

# Store the extracted fields in lists so they can later be assembled into a DataFrame.
# This loop is the body of get_page_text in the complete code below; each call receives one page's list of job dicts.
for dic in dic_page_text:
        job_href.append(dic['job_href'])
        job_name.append(dic['job_name'])
        company_name.append(dic['company_name'])
        company_href.append(dic['company_href'])
        providesalary_text.append(dic['providesalary_text'])
        workarea_text.append(dic['workarea_text'])
        updatedate.append(dic['updatedate'])
        iscommunicate.append(dic['iscommunicate'])
        companytype_text.append(dic['companytype_text'])
        degreefrom.append(dic['degreefrom'])
        workyear.append(dic['workyear'])
        issuedate.append(dic['issuedate'])
        jobwelf.append(dic['jobwelf'])
        attribute_text.append(dic['attribute_text'])
        companysize_text.append(dic['companysize_text'])
        companyind_text.append(dic['companyind_text'])
        url = dic['job_href']
        response = rq.get(url = url,headers = headers).content
        tree = etree.HTML(response)
        page_init = tree.xpath('string(//div[@class = "tBorderTop_box"]/div)')
        # Initial cleanup of the job description text.
        page = re.sub(r'\r\n','',page_init).replace(' ', '')
        job_information.append(page[:-4])
        time.sleep(1)
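
  In the complete code below, this loop is wrapped in a function named get_page_text and run through a thread pool. A minimal sequential sketch of driving both steps, assuming the functions and lists above are defined:

# Sequential driver (no thread pool): fetch every list page, then every job's detail page.
for page_url in pages_url:
    get_dic_page_text(page_url)
for page_results in dic_page_text:
    get_page_text(page_results)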

  4. Persist the data

init_data = pd.DataFrame({'job_href':job_href,
                          'job_name':job_name,
                          'company_name':company_name,
                          'company_href':company_href,
                          'providesalary_text':providesalary_text,
                          'workarea_text':workarea_text,
                          'updatedate':updatedate,
                          'iscommunicate':iscommunicate,
                          'companytype_text':companytype_text,
                          'degreefrom':degreefrom,
                          'workyear':workyear,
                          'issuedate':issuedate,
                          'jobwelf':jobwelf,
                          'attribute_text':attribute_text,
                          'companysize_text':companysize_text,
                          'companyind_text':companyind_text,
                          'job_information':job_information})
# The data could also be stored in a database, depending on requirements.
init_data.to_csv('./init_data0.csv') 
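
  As the comment notes, the same DataFrame can be written to a database instead of a CSV file; a minimal sketch using SQLite through DataFrame.to_sql (the database file name is an assumption):

import sqlite3

# Write the scraped records to a local SQLite database instead of a CSV file.
with sqlite3.connect('./jobs.db') as conn:
    init_data.to_sql('jobs', conn, if_exists='replace', index=False)
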
Complete Code

# Import modules
import re
import requests as rq
from lxml import etree
from time import sleep
from fake_useragent import UserAgent
import json
from multiprocessing.dummy import Pool as ThreadPool
import time
import modin.pandas as pd
import numpy as np

pages_url = []
for i in range(1,10):
    pages_url.append('https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25AE%25B6%25E6%2595%2599,2,{}.html'.format(i))

ua=UserAgent()
headers = {
    'User-Agent': ua.random
}
param = {
    'lang': 'c',
    'postchannel': 0000,
    'workyear': 99,
    'cotype': 99,
    'degreefrom': 99,
    'jobterm': 99,
    'companysize': 99,
    'ord_field': 0,
    'dibiaoid': 0,
    'line': '',
    'welfare': ''
}
dic_page_text = []
def get_dic_page_text(url):
    response = rq.get(url = url,headers = headers,params = param).text
    techer_page_text = re.compile('window.__SEARCH_RESULT__ = (.*?)</script>').findall(response)[0]
    dic_page_text.append(json.loads(techer_page_text)['engine_search_result'])
    time.sleep(1)

job_href = []
job_name  =[]
company_name = []
company_href = []
providesalary_text = []
workarea_text = []
updatedate = []
iscommunicate =[]
companytype_text = []
degreefrom = []
workyear = []
issuedate = []
jobwelf = []
attribute_text = []
companysize_text = []
companyind_text = []
job_information = []

def get_page_text(dic_page_text):
    for dic in dic_page_text:
        job_href.append(dic['job_href'])
        job_name.append(dic['job_name'])
        company_name.append(dic['company_name'])
        company_href.append(dic['company_href'])
        providesalary_text.append(dic['providesalary_text'])
        workarea_text.append(dic['workarea_text'])
        updatedate.append(dic['updatedate'])
        iscommunicate.append(dic['iscommunicate'])
        companytype_text.append(dic['companytype_text'])
        degreefrom.append(dic['degreefrom'])
        workyear.append(dic['workyear'])
        issuedate.append(dic['issuedate'])
        jobwelf.append(dic['jobwelf'])
        attribute_text.append(dic['attribute_text'])
        companysize_text.append(dic['companysize_text'])
        companyind_text.append(dic['companyind_text'])
        url = dic['job_href']
        response = rq.get(url = url,headers = headers).content
        tree = etree.HTML(response)
        page_init = tree.xpath('string(//div[@class = "tBorderTop_box"]/div)')
        page = re.sub(r'\r\n','',page_init).replace(' ', '')
        job_information.append(page[:-4])
        # This function is run in a thread pool (see below) for concurrency; sleep 1 second per detail page to be polite to the server.
        time.sleep(1)

if __name__ == '__main__' :
    start = time.time()   
    ThreadPool_1 = ThreadPool(10)
    ThreadPool_1.map(get_dic_page_text,pages_url)
    ThreadPool_1.close()
    ThreadPool_1.join()
    ThreadPool_2 = ThreadPool(12)
    ThreadPool_2.map(get_page_text,dic_page_text)
    ThreadPool_2.close()
    ThreadPool_2.join()
    end = time.time()
    print(end-start)
    init_data = pd.DataFrame({'job_href':job_href,
                          'job_name':job_name,
                          'company_name':company_name,
                          'company_href':company_href,
                          'providesalary_text':providesalary_text,
                          'workarea_text':workarea_text,
                          'updatedate':updatedate,
                          'iscommunicate':iscommunicate,
                          'companytype_text':companytype_text,
                          'degreefrom':degreefrom,
                          'workyear':workyear,
                          'issuedate':issuedate,
                          'jobwelf':jobwelf,
                          'attribute_text':attribute_text,
                          'companysize_text':companysize_text,
                          'companyind_text':companyind_text,
                          'job_information':job_information})
    init_data.to_csv('./init_data0.csv')
Acknowledgements

First, a note: this article is for non-commercial use only!
If you have questions, feel free to leave a comment. Corrections are welcome, and I would appreciate your generous guidance. Thank you!
Thanks also to Bilibili (小破站) for the web-scraping learning materials, such as:
[1]: https://book.apeland.cn/details/69/
