"""爬拉勾网 — scrape job listings from Lagou (lagou.com).

以前看视频教程的练习例子,现在应该用不了了
(Practice example from an old video tutorial; the site's anti-crawling
measures have since changed, so this probably no longer works.)
使用 requests 数据请求爬取 (fetches the data with ``requests``).
"""
import requests
from lxml import etree
import time
import re
# Request headers shared by all calls to Lagou.  The Referer and Cookie
# values were captured from a real browser session in 2018 — Lagou's
# positionAjax endpoint rejects requests without them.  NOTE(review): the
# JSESSIONID/LG* cookie values are session-bound and long expired, so they
# almost certainly need refreshing for the requests to succeed today.
headers = {
'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl='
'false&fromSearch=true&labelWords=&suginput=',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/70.0.3538.102 Safari/537.36',
'Cookie': 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542976104; _ga=GA1.2.348676795.1542976104; _'
'gid=GA1.2.1925661825.1542976104; '
'user_trace_token=20181123202703-157ced4d-ef1b-11e8-b745-525400f775ce; '
'LGSID=20181123202703-157cf045-ef1b-11e8-b745-525400f775ce; '
'LGUID=20181123202703-157cf2ce-ef1b-11e8-b745-525400f775ce; '
'index_location_city=%E5%85%A8%E5%9B%BD; '
'JSESSIONID=ABAAABAACEFAACG974B7CC20D98AA2220FE807548E02F78; '
'LGRID=20181123203737-8f0e481b-ef1c-11e8-b745-525400f775ce; '
'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542976739'
}
def request_page(pages=20, keyword='python'):
    """Fetch job-listing pages from Lagou's search API and scrape each hit.

    POSTs to the ``positionAjax.json`` endpoint once per page, extracts the
    position IDs from the JSON payload, builds each position's detail-page
    URL and hands it to ``parse_detall`` for scraping.

    Args:
        pages: number of result pages to request (default 20, matching the
            original hard-coded ``range(1, 21)``).
        keyword: search keyword sent as the ``kd`` form field
            (default ``'python'``).
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    data = {
        'first': 'false',
        'pn': 1,  # page number; updated on every iteration below
        'kd': keyword,
    }
    for page in range(1, pages + 1):
        data['pn'] = page  # 'pn' is the page-number parameter of the API
        response = requests.post(url, headers=headers, data=data)
        result = response.json()  # endpoint returns JSON; parse it to a dict
        # When Lagou detects a crawler it returns an error payload without
        # the 'content' key; the original code crashed with a KeyError here.
        # Stop gracefully instead, so already-fetched pages aren't lost.
        try:
            positions = result['content']['positionResult']['result']
        except (KeyError, TypeError):
            print('Unexpected response on page {}, stopping: {}'.format(page, result))
            return
        for position in positions:
            position_id = position['positionId']  # each listing carries its own id
            # The detail page lives at /jobs/<positionId>.html
            position_url = 'https://www.lagou.com/jobs/{}.html'.format(position_id)
            print(position_url)
            parse_detall(position_url)  # scrape the detail page
        time.sleep(1)  # be polite between pages; also lowers the ban risk
def parse_detall(url):
lagous = []
response = requests.get(url, headers=headers) # 访问详情页面
print(response)
text = response.text # 转换为str
html = etree.HTML(text) # 创建xpath
name = html.xpath('//span[@class="name"]/text()')[0] # 获取职位名称
job_spans = html.xpath('//dd[@class="job_request"]//span') # 获取薪水 工作地点 经验等信息的span标签
salary_span = job_spans[0] # 薪水span标签
salary = salary_span.xpath('.//text()')[0].strip() # 薪水span标签里的字符串
city = job_spans[1].xpath('.//text()')[0].strip() # 工作地点
city = re.sub(r'[\s/]', '', city) # 用正则替换左右斜杠为空字符串
years = job_spans[2]