可以去某云某宝或其他渠道购买,具体使用看自己购买商家的API文档,查看使用方法。
ip_proxy.py
-
import requests
-
class ip_getter(object):
    """Holds the current proxy address (``host:port``) bought from the IP vendor.

    The address is fetched from the vendor API at construction time and can be
    refreshed with :meth:`update_ip_proxy_str` whenever the proxy goes stale.
    """

    def __init__(self):
        # Fetch an initial proxy address as soon as the object is created.
        self.ip_proxy_str = get_ip_string()

    def update_ip_proxy_str(self):
        """Discard the current proxy address and fetch a fresh one from the API."""
        self.ip_proxy_str = get_ip_string()
        print('get one ip : ' + self.ip_proxy_str)
-
def get_ip_string():
    """Request one proxy address from the purchased proxy API and return the raw text.

    The endpoint placeholder must be replaced with the vendor's real API URL
    (see the vendor's API documentation for the response format).
    """
    api_url = 'API接口'  # placeholder — substitute the vendor's real endpoint
    return requests.get(api_url).text
boos_bs4.py（BOSS直聘 爬虫实例）
-
from bs4 import BeautifulSoup
-
import requests
-
import ip_proxy
-
from urllib import parse
-
headers = {
    # Present as a desktop Chrome browser; sites commonly serve different
    # (or no) content to clients without a browser User-Agent.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
-
def get_boss_info(my_ip, detailed_url):
    """Fetch one job-detail page through the current proxy and print its fields.

    Prints, in order: job title, salary, the summary line (location/experience
    etc.), and the full job description.  Any network error or missing page
    element propagates to the caller, which retries with a fresh proxy.
    """
    # Route both schemes through the same HTTP proxy held by my_ip.
    proxies = {scheme: 'http://' + my_ip.ip_proxy_str for scheme in ('http', 'https')}

    response = requests.get(detailed_url, headers=headers, proxies=proxies, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')

    title = soup.find('h1').text
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)

    # One-line summary (location, experience, education, ...).
    summary = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(summary)

    # Full job-description block.
    description = soup.select('div.text')[0].text
    print(description)
-
# 获取详情页的url
-
def get_detail_url(my_ip, url):
    """Fetch one search-result page, extract every job-detail link, and scrape each.

    Every detail link is attempted up to three times; after each failure the
    proxy is refreshed before retrying, and after three failures the link is
    skipped so one dead posting does not abort the whole page.
    """
    proxies = {
        'http': 'http://' + my_ip.ip_proxy_str,
        'https': 'http://' + my_ip.ip_proxy_str,
    }
    response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')

    # One <a> per job posting on this result page.
    for anchor in soup.select('div.job-list > ul > li div.info-primary > h3 > a'):
        # Tag attributes support dict-style access; hrefs are page-relative,
        # so resolve them against the result-page URL.
        detail_href = parse.urljoin(url, anchor['href'])
        print('详情页的href: ' + detail_href)

        # Up to three attempts per detail page, rotating the proxy on failure.
        for _attempt in range(3):
            try:
                get_boss_info(my_ip, detail_href)
                break
            except Exception as err:
                print(err)
                my_ip.update_ip_proxy_str()
-
def get_all_info(my_ip):
    """Scrape the first three result pages of the python query via rotating proxies.

    Each page URL is attempted up to four times; every failure refreshes the
    proxy before retrying, and after four failed attempts the page is skipped.
    All output goes to stdout via the functions this calls.
    """
    base_url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s'
    for page in range(1, 4):
        # URL of one paginated result page.
        url = base_url % (page, page)
        # Fix: the retry counter previously reused the name `i`, shadowing the
        # outer page index — harmless only by accident of statement order.
        # A distinct name makes the two loops independent.
        for _attempt in range(4):
            try:
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
-
if __name__ == '__main__':
    # Build the proxy holder; fetches its first proxy IP on construction.
    my_ip = ip_proxy.ip_getter()
    # A fixed proxy can be used instead for debugging:
    # proxy_str = '36.27.143.72:21450'
    # print(proxy_str)
    # Scrape all BOSS Zhipin job listings for the configured query.
    get_all_info(my_ip)
    # Left over from debugging: dump the raw page to disk for inspection.
    # with open('boss.html', 'wb') as f:
    #     f.write(response.content)