# Scrape dynamic (AJAX-loaded) page data by building the POST request headers
# to simulate the browser's XHR request.
import requests
from lxml import etree
import json
import sys
class Lagou(object):
    """Scrape job postings from lagou.com via its positionAjax JSON endpoint.

    The HTML listing page is fetched once to discover how many result pages
    exist, then each page is requested as JSON and appended, one JSON object
    per line, to a file named after the city.
    """

    def __init__(self, city=None, position=None):
        """Create a scraper.

        city/position default to the module-level globals so the original
        invocation style (``python script.py <city> <position>`` followed by
        ``Lagou()``) keeps working unchanged.
        """
        self.city = city if city is not None else globals().get('city')
        self.position = position if position is not None else globals().get('position')
        # All header values must be str: requests raises on non-string values
        # (the original used int 0 and None here).
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Cookie': 'user_trace_token=20170920183457-58cc73d5-9def-11e7-9c29-525400f775ce; LGUID=20170920183457-58cc7899-9def-11e7-9c29-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=search_code; _gid=GA1.2.1674249864.1506152972; _ga=GA1.2.661938952.1505903691; LGRID=20170924124247-cffed120-a0e2-11e7-9278-5254005c3644; JSESSIONID=ABAAABAACDBABJB5CBA63393ECA49354BFB77C6B0BD0B5B; SEARCH_ID=4b82b54b5eab451392d2f73823a14a00',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            # Repaired percent-encoding: original had the invalid '%E8%99AB'
            # (missing '%'); '%E8%99%AB' is the correct UTF-8 encoding of 虫.
            'Referer': 'https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=default&city=%E6%88%90%E9%83%BD',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
        }
        self.base_url = 'https://www.lagou.com/jobs/list_%s?px=default&city=%s'

    def get_combine_list(self):
        """Fetch the HTML listing page and return the scrape plan.

        Returns a one-element list of dicts with keys 'city', 'position' and
        'total_page' (the raw xpath text nodes for the total-page span; may
        be an empty list when the span is absent, e.g. anti-crawler page).
        """
        url = self.base_url % (self.position, self.city)
        html = requests.get(url, headers=self.headers).content.decode()
        selector = etree.HTML(html)
        total_page = selector.xpath('//span[@class="span totalNum"]/text()')
        return [{
            'city': self.city,
            'position': self.position,
            'total_page': total_page,
        }]

    def get_position_data(self, url, post_data):
        """POST one result page and return a list of flattened job dicts.

        Raises KeyError/ValueError when the response is not the expected JSON
        structure (e.g. when Lagou rate-limits the client).
        """
        raw = requests.post(url, data=post_data, headers=self.headers).content
        payload = json.loads(raw.decode('utf-8'))
        results = payload['content']['positionResult']['result']
        data_list = []
        for job in results:
            data_list.append({
                'position_city': job['city'],
                # NOTE: output key spellings ('creat_time', 'positon') are
                # kept as-is for compatibility with existing consumers.
                'creat_time': job['createTime'],
                'company_name': job['companyFullName'],
                'district': job['district'],
                'education': job['education'],
                'workyear': job['workYear'],
                'salary': job['salary'],
                'positon': job['positionLables'],
            })
        return data_list

    def save_data(self, data_list, city=None):
        """Append each record as one JSON line (comma-terminated) to '<city>.xml'.

        city defaults to self.city; previously this read a module-level
        global, which broke when the class was used as a library.
        """
        target = (city if city is not None else self.city) + '.xml'
        # Explicit utf-8 so Chinese text survives on platforms whose default
        # locale encoding is not UTF-8 (e.g. Windows).
        with open(target, 'a', encoding='utf-8') as f:
            for data in data_list:
                f.write(json.dumps(data, ensure_ascii=False) + ',\n')

    def run(self):
        """Drive the full scrape: discover the page count, then fetch and save each page."""
        json_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false&isSchoolJob=0'
        for combo in self.get_combine_list():
            city_name = combo['city']
            keyword = combo['position']
            pages = combo['total_page']  # raw xpath text nodes (list of str)
            # Default to a single page when the total-page span was not found;
            # the original crashed with TypeError on an empty list here.
            total_page = int(pages[-1]) if pages else 1
            url = json_url % city_name
            for page in range(1, total_page + 1):
                post_data = {
                    # The endpoint expects first='true' only on the first page.
                    'first': 'true' if page == 1 else 'false',
                    'pn': str(page),
                    'kd': keyword,
                }
                self.save_data(self.get_position_data(url, post_data))
if __name__ == "__main__":
    # Usage: python <script> <city> <position>
    # Fail with a readable usage message instead of a bare IndexError when
    # the command-line arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <city> <position>' % sys.argv[0])
    city = sys.argv[1]
    position = sys.argv[2]
    lagou = Lagou()
    lagou.run()