抓取 BOSS 直聘的信息

最新推荐文章于 2024-04-30 23:36:05 发布

二八定律

最新推荐文章于 2024-04-30 23:36:05 发布

阅读量1.4k

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/weixin_42961417/article/details/82141734

版权

爬虫专栏收录该内容

11 篇文章 0 订阅

订阅专栏

from bs4 import BeautifulSoup
import requests
import ip_proxy
from urllib import parse

# Request headers sent with every HTTP call in this script; only a desktop
# Chrome User-Agent string is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

def get_boss_info(my_ip, detailed_url):
    """Download one job-detail page through a proxy and print its fields.

    Four values are printed, in this order:
      1. the text of the first ``<h1>`` element,
      2. the text of the first ``<span class="badge">`` (presumably the
         salary), with newlines removed and surrounding whitespace stripped,
      3. the text of the first ``div.info-primary > p`` element, with
         newlines removed and surrounding whitespace stripped,
      4. the raw text of the first ``div.text`` element.

    Example URL:
    https://www.zhipin.com/job_detail/7e883f0c3a336cb51n142968FFM~.html?ka=search_list_1

    Args:
        my_ip: Object exposing an ``ip_proxy_str`` attribute; that string is
            appended to ``'http://'`` to form the proxy address.
        detailed_url: URL of the job-detail page to fetch.

    Raises:
        Any exception raised by ``requests.get`` (the request uses a
        5-second timeout), ``AttributeError`` if the ``<h1>`` or badge
        element is missing, and ``IndexError`` if either CSS selector
        matches nothing. Nothing is caught here.
    """
    # The same HTTP proxy endpoint is used for both http and https traffic.
    proxy_address = 'http://' + my_ip.ip_proxy_str
    proxy = {
        'http': proxy_address,
        'https': proxy_address,
    }
    response = requests.get(detailed_url, headers=headers, proxies=proxy, timeout=5)

    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('h1').text
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)

    primary_info = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(primary_info)

    job_text = soup.select('div.text')[0].text
    print(job_text)

def get_detail_url(my_ip, url):
    """Fetch one search-result page and process every detail link on it.

    The page is requested through the proxy, every anchor matching
    ``div.job-list > ul > li div.info-primary > h3 > a`` is collected, and
    each anchor's ``href`` is resolved against ``url``. For every resolved
    link ``get_boss_info`` is attempted up to three times; after each failed
    attempt the exception is printed and ``my_ip.update_ip_proxy_str()`` is
    called (presumably to switch to a fresh proxy) before retrying. A link
    that fails three times is skipped.

    Example URL:
    https://www.zhipin.com/c101010100/h_101010100/?query=python&page=2&ka=page-2

    Args:
        my_ip: Object exposing ``ip_proxy_str`` and ``update_ip_proxy_str()``.
        url: URL of the search-result (listing) page.

    Raises:
        Any exception raised while fetching the listing page itself
        (the request uses a 5-second timeout); only the per-link detail
        fetches are wrapped in a retry.
    """
    # The same HTTP proxy endpoint is used for both http and https traffic.
    proxy_address = 'http://' + my_ip.ip_proxy_str
    proxy = {
        'http': proxy_address,
        'https': proxy_address,
    }
    response = requests.get(url, headers=headers, proxies=proxy, timeout=5)

    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.select('div.job-list > ul > li div.info-primary > h3 > a')

    for anchor in anchors:
        # Tag attributes are read with dictionary-style access.
        relative_href = anchor['href']
        # Build the absolute detail-page link from the listing URL.
        href = parse.urljoin(url, relative_href)
        print('详情页的href: ' + href)
        # Up to three attempts per link; give up on this link afterwards.
        for _ in range(3):
            try:
                get_boss_info(my_ip, href)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()

def get_all_info(my_ip):
    """Walk listing pages 1 through 3 and process each one.

    For every page number the listing URL is built from a fixed template
    (the page number fills both the ``page`` and ``ka`` query parameters)
    and ``get_detail_url`` is attempted up to four times. After each failed
    attempt the exception is printed and ``my_ip.update_ip_proxy_str()`` is
    called (presumably to switch to a fresh proxy). A page that fails four
    times is skipped.

    Args:
        my_ip: Object exposing ``ip_proxy_str`` and ``update_ip_proxy_str()``;
            it is passed through unchanged to ``get_detail_url``.
    """
    base_url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s'
    for page in range(1, 4):
        # URL of this listing page.
        url = base_url % (page, page)
        # Retry with a different proxy on failure; skip the page after four
        # failed attempts. The retry counter is deliberately not named like
        # the page index so the two loops cannot be confused.
        for _ in range(4):
            try:
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()

if __name__ == '__main__':
    # Build the proxy provider object used by every request.
    # NOTE(review): the proxy string appears to be in 'host:port' form,
    # e.g. '36.27.143.72:21450' — confirm against ip_proxy.
    my_ip = ip_proxy.ip_getter()
    # Scrape all listing pages and their job-detail pages.
    get_all_info(my_ip)