技术要点:
1、python操作redis的基础,如建立连接,插入数据,读取数据等等。
2、代理技术反爬。
目标:
爬取拉勾网站数据分析师岗位信息,将公司、薪水、城市及岗位详细信息的url地址写入redis数据库,然后从redis读取url,并爬取每个岗位详细信息,获取职位诱惑和岗位要求的信息,再次写入redis。
import requests
import time
import random
import redis
from bs4 import BeautifulSoup
# Three User-Agent/Referer header sets (desktop Chrome, Android, Opera Mini)
# rotated across requests to make traffic look less uniform to anti-scraping checks.
h1 = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', \
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='} # rotate different headers to counter anti-scraping
h2 = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17', \
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
h3 = {'User-Agent': 'Opera/9.80 (iPhone; Opera Mini/7.1.32694/27.1407; U; en) Presto/2.8.119 Version/11.10', \
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
proxies1 = {"http": "http://110.243.21.179:9999"} # proxies may expire; swap in live proxy IPs before running
proxies2 = {"http": "http://180.119.68.43:9999"} # rotate different proxies to counter anti-scraping
proxies3 = {"http": "http://123.163.96.166:9999"}
r = redis.Redis(host="localhost", port='6379', db=5, password='XXXXX', decode_responses=True) # Redis connection; real password redacted as XXXXX. decode_responses=True returns str instead of bytes
def get_Cookies(proxies, headers):
    """Obtain a fresh cookie dict from the Lagou listing page.

    Posts to the job-list URL through the given proxy with the given
    headers, then returns the cookies the server set on the session.

    Args:
        proxies: requests-style proxy mapping, e.g. {"http": "http://ip:port"}.
        headers: request headers (User-Agent / Referer) to send.

    Returns:
        dict mapping cookie names to values, suitable for the
        ``cookies=`` argument of later requests.
    """
    listing_url = 'https://www.lagou.com/jobs/list_python'
    sess = requests.session()
    sess.post(listing_url, headers=headers, proxies=proxies)
    return sess.cookies.get_dict()
def get_joblist(url, position):
    """Scrape up to 30 pages of Lagou job listings and store them in Redis.

    For every job found, writes a Redis hash keyed by the company's short
    name with fields: companyFullName, city, job_detail_url, positionName,
    salary.

    Args:
        url: the positionAjax.json endpoint to POST search queries to.
        position: keyword to search for (e.g. 'python', '数据分析').
    """
    numJob = 0  # running count of jobs stored
    for j in range(1, 31):
        para = {'first': 'true', 'pn': j, 'kd': position}  # POST form data; pn = page number, kd = keyword
        # Rotate proxy/header/cookie combinations by page range — the key
        # anti-scraping countermeasure. Pages 11+ also pause briefly.
        if j < 11:
            f = requests.post(url, cookies=get_Cookies(proxies3, h3), headers=h3, data=para, proxies=proxies3)
        elif j < 21:
            time.sleep(1)
            f = requests.post(url, cookies=get_Cookies(proxies1, h1), headers=h1, data=para, proxies=proxies1)
        else:
            time.sleep(1)
            f = requests.post(url, cookies=get_Cookies(proxies2, h2), headers=h2, data=para, proxies=proxies2)
        try:
            js = f.json()  # response body is JSON; raises if the page blocked us
            for i in js['content']['positionResult']['result']:
                numJob += 1
                positionId = i['positionId']
                job_detail_url = 'https://www.lagou.com/jobs/' + str(positionId) + '.html'
                # Bug fix: the original stored positionId under the
                # 'positionName' field; store the actual position name.
                r.hmset(i['companyShortName'],
                        {'companyFullName': i['companyFullName'],
                         'city': i['city'],
                         'job_detail_url': job_detail_url,
                         'positionName': i['positionName'],
                         'salary': i['salary']})
        except Exception as e:
            # Best-effort: a blocked/garbled page is logged and skipped,
            # letting the remaining pages continue.
            print('wrong', e)
        time.sleep(random.random() * 2)  # random 0-2 s pause between pages
    print('共爬取了%d个岗位' % numJob)
    return
def get_job_detail(url):
    """Scrape one job-detail page and enrich the company's Redis hash.

    Fetches the detail page, extracts the job perks ('职位诱惑') and the
    job description ('职位描述'), and writes both fields into the Redis
    hash keyed by the company's short name shown on the page.

    Args:
        url: absolute URL of a Lagou job-detail page.
    """
    page = requests.get(url, headers=h1, cookies=get_Cookies(proxies1, h1))
    parsed = BeautifulSoup(page.text, 'lxml')
    company_key = parsed.select_one('#job_company > dt > a > div > h3 > em').text.strip()
    perks = parsed.select_one('#job_detail > dd.job-advantage >p').text.strip()
    description = parsed.select_one('#job_detail > dd.job_bt > div').text.strip()
    r.hmset(company_key, {'职位诱惑': perks, '职位描述': description})
if __name__ == '__main__':
    # Ajax endpoint that returns job listings as JSON (Hangzhou station,
    # city parameter URL-encoded) — discovered by inspecting the site's
    # network traffic; hitting it directly is the key to a successful crawl.
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%9D%AD%E5%B7%9E&needAddtionalResult=false'
    time1 = time.time()
    position = input('请输入要搜索的岗位名称,如python,数据分析,java等:')
    print('正在爬取,请稍候!')
    get_joblist(url, position)
    # Second pass: read each stored job's detail URL back out of Redis
    # and fetch the full description for it.
    for n, company in enumerate(r.keys(), start=1):
        url = r.hget(company, 'job_detail_url')
        print('获取第{}个岗位的详细信息,地址:{}'.format(n, url))
        get_job_detail(url)
    print('Time Used:', time.time() - time1)  # total wall-clock runtime