# Scrape dynamic (AJAX-loaded) page data by building the POST request headers
# to simulate the browser's XHR request.
import requests
from lxml import etree
import json
import sys
class Lagou(object):
    """Scrape job postings from lagou.com via its positionAjax JSON endpoint.

    The HTML listing page is fetched once to discover how many result pages
    exist, then each page is requested as JSON and appended, one JSON object
    per line, to a file named after the city.
    """

    def __init__(self, city=None, position=None):
        """Create a scraper.

        city/position default to the module-level globals so the original
        invocation style (``python script.py <city> <position>`` followed by
        ``Lagou()``) keeps working unchanged.
        """
        self.city = city if city is not None else globals().get('city')
        self.position = position if position is not None else globals().get('position')
        # All header values must be str: requests raises on non-string values
        # (the original used int 0 and None here).
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Cookie': 'user_trace_token=20170920183457-58cc73d5-9def-11e7-9c29-525400f775ce; LGUID=20170920183457-58cc7899-9def-11e7-9c29-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=search_code; _gid=GA1.2.1674249864.1506152972; _ga=GA1.2.661938952.1505903691; LGRID=20170924124247-cffed120-a0e2-11e7-9278-5254005c3644; JSESSIONID=ABAAABAACDBABJB5CBA63393ECA49354BFB77C6B0BD0B5B; SEARCH_ID=4b82b54b5eab451392d2f73823a14a00',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            # Repaired percent-encoding: original had the invalid '%E8%99AB'
            # (missing '%'); '%E8%99%AB' is the correct UTF-8 encoding of 虫.
            'Referer': 'https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=default&city=%E6%88%90%E9%83%BD',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
        }
        self.base_url = 'https://www.lagou.com/jobs/list_%s?px=default&city=%s'

    def get_combine_list(self):
        """Fetch the HTML listing page and return the scrape plan.

        Returns a one-element list of dicts with keys 'city', 'position' and
        'total_page' (the raw xpath text nodes for the total-page span; may
        be an empty list when the span is absent, e.g. anti-crawler page).
        """
        url = self.base_url % (self.position, self.city)
        html = requests.get(url, headers=self.headers).content.decode()
        selector = etree.HTML(html)
        total_page = selector.xpath('//span[@class="span totalNum"]/text()')
        return [{
            'city': self.city,
            'position': self.position,
            'total_page': total_page,
        }]

    def get_position_data(self, url, post_data):
        """POST one result page and return a list of flattened job dicts.

        Raises KeyError/ValueError when the response is not the expected JSON
        structure (e.g. when Lagou rate-limits the client).
        """
        raw = requests.post(url, data=post_data, headers=self.headers).content
        payload = json.loads(raw.decode('utf-8'))
        results = payload['content']['positionResult']['result']
        data_list = []
        for job in results:
            data_list.append({
                'position_city': job['city'],
                # NOTE: output key spellings ('creat_time', 'positon') are
                # kept as-is for compatibility with existing consumers.
                'creat_time': job['createTime'],
                'company_name': job['companyFullName'],
                'district': job['district'],
                'education': job['education'],
                'workyear': job['workYear'],
                'salary': job['salary'],
                'positon': job['positionLables'],
            })
        return data_list

    def save_data(self, data_list, city=None):
        """Append each record as one JSON line (comma-terminated) to '<city>.xml'.

        city defaults to self.city; previously this read a module-level
        global, which broke when the class was used as a library.
        """
        target = (city if city is not None else self.city) + '.xml'
        # Explicit utf-8 so Chinese text survives on platforms whose default
        # locale encoding is not UTF-8 (e.g. Windows).
        with open(target, 'a', encoding='utf-8') as f:
            for data in data_list:
                f.write(json.dumps(data, ensure_ascii=False) + ',\n')

    def run(self):
        """Drive the full scrape: discover the page count, then fetch and save each page."""
        json_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false&isSchoolJob=0'
        for combo in self.get_combine_list():
            city_name = combo['city']
            keyword = combo['position']
            pages = combo['total_page']  # raw xpath text nodes (list of str)
            # Default to a single page when the total-page span was not found;
            # the original crashed with TypeError on an empty list here.
            total_page = int(pages[-1]) if pages else 1
            url = json_url % city_name
            for page in range(1, total_page + 1):
                post_data = {
                    # The endpoint expects first='true' only on the first page.
                    'first': 'true' if page == 1 else 'false',
                    'pn': str(page),
                    'kd': keyword,
                }
                self.save_data(self.get_position_data(url, post_data))
if __name__ == "__main__":
    # Usage: python <script> <city> <position>
    # Fail with a readable usage message instead of a bare IndexError when
    # the command-line arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <city> <position>' % sys.argv[0])
    city = sys.argv[1]
    position = sys.argv[2]
    lagou = Lagou()
    lagou.run()