Getting Started with Python Web Scraping (2): Elegantly Scraping H3C (新华三) Job Postings with BeautifulSoup

  1. Initialize the output file

Create the scraper class and initialize where the results will be saved.

class My(object):
    def __init__(self):
        # Where the scraped data is written (one JSON record per line)
        path = "D://shuju"
        file = "XH3.json"
        self.file_path = os.path.join(path, file)
        # Append mode so repeated runs do not overwrite earlier results
        self.fp = open(self.file_path, "a", encoding="utf-8")
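If D://shuju does not already exist, the open() call in __init__ will raise FileNotFoundError. A minimal addition (my own suggestion, not part of the original code) is to create the directory first:

import os

path = "D://shuju"
os.makedirs(path, exist_ok=True)  # create the folder if it is missing; no-op if it already exists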
  2. Rotate the request headers dynamically to avoid anti-scraping IP bans

On every request, randomly swap the User-Agent value in the headers to mimic a real browser and make the traffic look less uniform.

    def get_soup(self, url):
        # Pool of User-Agent strings; one is picked at random for each request
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; rv:27.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/21.0.1180.110 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/34.0.1838.2 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        ]
        random_header = random.choice(user_agents)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': random_header
        }
        response = requests.get(url, headers=headers).content.decode("utf-8")
        soup = BeautifulSoup(response, "html5lib")
        return soup
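The request above has no timeout or error handling, so one slow or failed page can hang or abort the whole crawl. A minimal sketch of a hardened fetch, assuming the same headers dict built in get_soup (the retry count and 10-second timeout are my own choices, not from the original):

import time
import requests
from bs4 import BeautifulSoup

def fetch_soup(url, headers, retries=3, timeout=10):
    # Fetch a page and return a BeautifulSoup object, retrying on network errors
    for _ in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # raise on 4xx/5xx so the attempt is retried
            return BeautifulSoup(resp.content.decode("utf-8"), "html5lib")
        except requests.RequestException:
            time.sleep(2)  # brief pause before retrying
    return None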
  3. Get the total page count

Method 1: extract the page count from the footer text with a regular expression.

    def get_page(self, url):
        soup = self.get_soup(url)
        # The footer text looks like "当前第1/N页"; capture N with a regex
        a = soup.select(".tablefooter span")[0].text
        page = re.findall(r"当前第1/(\d+)页", a)
        return int(page[0])
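For example, on a footer string like "当前第1/24页" (the 24 is invented for illustration), the pattern captures the total page count:

import re

sample = "当前第1/24页"                        # hypothetical footer text
print(re.findall(r"当前第1/(\d+)页", sample))  # ['24'] -> int('24') gives 24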

Method 2: read the page count out of the pager links with a bs4 selector.

    def get_page2(self, url):
        soup = self.get_soup(url)
        # The fourth pager link points at the last page; read its PageIndex query parameter
        a = soup.select(".tablefooter span a")[3]
        page = parse_qs(a['href'])['PageIndex'][0]
        return int(page)
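parse_qs still finds PageIndex even though the href contains the path as well as the query string; a quick demonstration with a made-up pager link:

from urllib.parse import parse_qs

href = "/search/?p=1%5E-1%2C3%5E-1&PageIndex=24"   # hypothetical pager href
print(parse_qs(href)['PageIndex'][0])              # '24' (a string, hence the int() above)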
  4. Get the job detail page
    def get_urls(self, item, url):
        # Fetch the detail page for one posting and fill item with its fields
        url = "https://h3c.zhiye.com/" + url
        print(url)
        soup = self.get_soup(url)
        values = soup.select(".nvalue")
        item["招聘类别"] = values[0].text.replace("\n", "").replace(" ", "")
        item["工作性质"] = values[1].text.replace("\n", "").replace(" ", "")
        item["薪资范围"] = values[2].text.replace("\n", "").replace(" ", "")
        # Note: this key repeats the one above, so values[3] overwrites values[0]
        item["招聘类别"] = values[3].text.replace("\n", "").replace(" ", "")
        item["发布时间"] = values[4].text.replace("\n", "").replace(" ", "")
        item["截止时间"] = values[5].text.replace("\n", "").replace(" ", "")
        paras = soup.select(".xiangqingtext p")
        item["工作地点"] = paras[1].text.replace("\n", "").replace(" ", "")
        item["工作职责"] = paras[3].text.replace("\n", "").replace(" ", "").replace("\t", "")
        item["任职资格"] = paras[5].text.replace("\n", "").replace(" ", "").replace("\t", "")
  5. Get the job listings

Parse the listing table on each results page, fetch every posting's detail page, and sleep a random interval between requests.

    def crewl(self, page):
        for i in range(1, page + 1):
            url = "https://h3c.zhiye.com/search/?p=1%5E-1%2C3%5E-1&PageIndex={}".format(i)
            test = requests.get(url).text
            soups = BeautifulSoup(test, "html5lib")
            tr = soups.select(".listtable tbody tr")
            tm = [1, 2, 3]  # candidate sleep intervals in seconds
            for t in tr:
                item = {}
                tds = t.findAll("td")
                name = tds[0].text.replace("\n", "").replace(" ", "")
                lie = tds[1].text.replace("\n", "").replace(" ", "")
                loc = tds[2].text.replace("\n", "").replace(" ", "")
                newtime = tds[3].text.replace("\n", "").replace(" ", "")
                urls = tds[0].a['href']
                self.get_urls(item, urls)
                # Sleep a random 1-3 seconds between detail requests to stay polite
                time.sleep(random.choice(tm))
                item["name"] = name
                item["lie"] = lie
                item["loc"] = loc
                item["time"] = newtime
                print('正在保存数据%s' % item)
                self.save(item)
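random.choice([1, 2, 3]) only ever sleeps a whole number of seconds; if you prefer continuous jitter (my own variant, not in the original), random.uniform does the same job:

import random
import time

time.sleep(random.uniform(1.0, 3.0))  # pause somewhere between 1 and 3 seconds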
  6. Save a record
    def save(self, item):
        # One JSON object per line (JSON Lines), with ensure_ascii=False keeping the Chinese text readable
        data = json.dumps(item, ensure_ascii=False)
        self.fp.write(data + "\n")
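Because each record is written as one JSON object per line, the file can be read back record by record, for example:

import json

with open("D://shuju/XH3.json", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
print(len(records), "records loaded")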
  7. Close the file
    def save_close(self):
        self.fp.close()
        print("数据保存在"+self.file_path)
  8. Main entry point
if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    page = my.get_page(url)
    my.crewl(page)
    my.save_close()  # flush and close the output file when the crawl finishes
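If a request fails partway through, the output file should still be closed; a hedged variant of the entry point using try/finally:

if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    try:
        my.crewl(my.get_page(url))
    finally:
        my.save_close()  # close the output file even if the crawl raises midway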
  9. Complete code
import requests
from bs4 import BeautifulSoup
from urllib.parse import parse_qs
import random
import time
import json
import re
import os


class My(object):
    def __init__(self):
        path = "D://shuju"
        file = "XH3.json"
        self.file_path = os.path.join(path, file)
        self.fp = open(self.file_path, "a", encoding="utf-8")

    def get_soup(self, url):
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; rv:27.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/21.0.1180.110 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/34.0.1838.2 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        ]
        random_header = random.choice(user_agents)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': random_header
        }
        response = requests.get(url, headers=headers).content.decode("utf-8")
        soup = BeautifulSoup(response, "html5lib")
        return soup

    # Method 1: regex over the footer text
    def get_page(self, url):
        soup = self.get_soup(url)
        a = soup.select(".tablefooter span")[0].text
        page = re.findall(r"当前第1/(\d+)页", a)
        return int(page[0])

    # Method 2: PageIndex from the pager link
    def get_page2(self, url):
        soup = self.get_soup(url)
        a = soup.select(".tablefooter span a")[3]
        page = parse_qs(a['href'])['PageIndex'][0]
        return int(page)

    def get_urls(self, item, url):
        url = "https://h3c.zhiye.com/" + url
        print(url)
        soup = self.get_soup(url)
        values = soup.select(".nvalue")
        item["招聘类别"] = values[0].text.replace("\n", "").replace(" ", "")
        item["工作性质"] = values[1].text.replace("\n", "").replace(" ", "")
        item["薪资范围"] = values[2].text.replace("\n", "").replace(" ", "")
        # Note: this key repeats the one above, so values[3] overwrites values[0]
        item["招聘类别"] = values[3].text.replace("\n", "").replace(" ", "")
        item["发布时间"] = values[4].text.replace("\n", "").replace(" ", "")
        item["截止时间"] = values[5].text.replace("\n", "").replace(" ", "")
        paras = soup.select(".xiangqingtext p")
        item["工作地点"] = paras[1].text.replace("\n", "").replace(" ", "")
        item["工作职责"] = paras[3].text.replace("\n", "").replace(" ", "").replace("\t", "")
        item["任职资格"] = paras[5].text.replace("\n", "").replace(" ", "").replace("\t", "")

    def crewl(self, page):
        for i in range(1, page + 1):
            url = "https://h3c.zhiye.com/search/?p=1%5E-1%2C3%5E-1&PageIndex={}".format(i)
            test = requests.get(url).text
            soups = BeautifulSoup(test, "html5lib")
            tr = soups.select(".listtable tbody tr")
            tm = [1, 2, 3]
            for t in tr:
                item = {}
                tds = t.findAll("td")
                name = tds[0].text.replace("\n", "").replace(" ", "")
                lie = tds[1].text.replace("\n", "").replace(" ", "")
                loc = tds[2].text.replace("\n", "").replace(" ", "")
                newtime = tds[3].text.replace("\n", "").replace(" ", "")
                urls = tds[0].a['href']
                self.get_urls(item, urls)
                time.sleep(random.choice(tm))
                item["name"] = name
                item["lie"] = lie
                item["loc"] = loc
                item["time"] = newtime
                print('正在保存数据%s' % item)
                self.save(item)

    def save(self, item):
        data = json.dumps(item, ensure_ascii=False)
        self.fp.write(data + "\n")

    def save_close(self):
        self.fp.close()
        print("数据保存在" + self.file_path)


if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    page = my.get_page(url)
    my.crewl(page)
    my.save_close()