python爬虫（三）之虎嗅网汽车文章爬虫

最新推荐文章于 2024-05-27 08:52:59 发布

JavaGPT

最新推荐文章于 2024-05-27 08:52:59 发布

阅读量374

点赞数 4

分类专栏： python 文章标签： python 爬虫 汽车

本文链接：https://blog.csdn.net/weixin_46294086/article/details/138679687

版权

python 专栏收录该内容

20 篇文章 0 订阅

订阅专栏

python爬虫（三）之虎嗅网汽车文章爬虫

闲来没事，闲鱼上有位好兄弟请我写一个爬虫，从虎嗅网上抓取汽车频道的文章。于是大力出奇迹，我写了一个 Python 程序，把这个网站上所有的汽车文章全部抓取了下来，并保存到本地的 虎嗅.csv 文件中。

import requests
import json
import csv
from lxml import etree
import time
import random
from datetime import datetime


class Huxiu:
    """Crawl Huxiu channel articles and dump them into ``虎嗅.csv``.

    The crawl pages through the article-list API (``channel_id=21``, which
    the accompanying write-up describes as the automotive channel) using
    the ``last_time`` cursor the API returns, downloads each article's HTML
    page, extracts the text of the ``article-content`` element and writes
    one CSV row per article.
    """

    # Column titles of the output file; ``_build_row`` must emit values in
    # exactly this order.
    CSV_HEADER = ["标题", "作者", "发布时间", "原文地址", "正文"]

    # Seconds to wait on the server before giving up on a request. Without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the endpoint URL and the HTTP headers for both request kinds."""
        self.article_list_pre_url = "https://api-article.huxiu.com/web/channel/articleList"
        # NOTE: the next three attributes are not read anywhere in this
        # class; they are kept so existing attribute access keeps working.
        self.article_list_post_url = "&pageSize=10&orderBy=createTime&order=DESC&isProfessional=true&userType=0"
        self.start_page = 1
        self.end_page = 1000
        # Headers sent with the article-list API (POST) requests.
        self.article_list_headers = {
            'authority': 'api-article.huxiu.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/x-www-form-urlencoded',
            'cookie': 'Hm_lvt_502e601588875750790bbe57346e972b=1710422257; huxiu_analyzer_wcy_id=9wau9zilte4pu8mg6b7z; hx_object_visit_referer_1_2702514=https%3A%2F%2Fwww.huxiu.com%2Fchannel%2F21.html; Hm_lpvt_502e601588875750790bbe57346e972b=1710422520',
            'origin': 'https://www.huxiu.com',
            'referer': 'https://www.huxiu.com/',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }

        # Headers sent with the article detail page (GET) requests.
        self.article_detail_headers = {
            'authority': 'www.huxiu.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'Hm_lvt_502e601588875750790bbe57346e972b=1710422257; huxiu_analyzer_wcy_id=9wau9zilte4pu8mg6b7z; Hm_lpvt_502e601588875750790bbe57346e972b=1710422520',
            'referer': 'https://www.huxiu.com/channel/21.html',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }

    def post_request(self, url, headers, payload):
        """POST ``payload`` to ``url`` and return the response body as text.

        Raises whatever ``requests`` raises on connection failure or when
        ``REQUEST_TIMEOUT`` elapses.
        """
        response = requests.request(
            "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    def get_request(self, url, headers):
        """GET ``url`` and return the response body as text.

        Raises whatever ``requests`` raises on connection failure or when
        ``REQUEST_TIMEOUT`` elapses.
        """
        response = requests.request(
            "GET", url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    def do_work(self):
        """Run the crawl and write every fetched article to ``虎嗅.csv``.

        The output file is truncated on each run. Paging stops when the API
        returns an empty ``datalist`` or stops advancing its cursor.
        """
        # 'utf-8-sig' prepends a BOM to the file; newline='' is what the csv
        # module requires so it can control line endings itself.
        with open('虎嗅.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            writer.writerow(self.CSV_HEADER)

            page_no = 1
            # Hard-coded starting cursor (a Unix timestamp); articles are
            # requested going backwards from this point.
            last_time = 1710425737
            # Original author's note: earliest timestamp is 1684505520.
            while True:
                print("=====================> 当前第" + str(page_no) + "页 =======================")
                payload = 'platform=www&last_time=' + str(last_time) + '&channel_id=21'
                print(datetime.fromtimestamp(last_time).strftime('%Y-%m-%d %H:%M:%S'))
                text = self.post_request(self.article_list_pre_url, headers=self.article_list_headers, payload=payload)
                json_data = json.loads(text)
                data = json_data["data"]["datalist"]
                if not data:
                    break
                self.write_page(writer, data)

                next_last_time = int(json_data["data"]["last_time"])
                if next_last_time == last_time:
                    # The cursor did not move: requesting again would fetch
                    # the same page forever and duplicate its rows.
                    break
                last_time = next_last_time
                page_no += 1

    def write_page(self, writer, data):
        """Fetch every article listed in ``data`` and write one row for each.

        ``writer`` is any object with a ``writerow`` method; ``data`` is the
        ``datalist`` array of one article-list API response. Each item must
        provide ``aid``, ``title``, ``formatDate`` and
        ``user_info["username"]``.
        """
        for item in data:
            article_url = "https://www.huxiu.com/article/" + str(item["aid"]) + ".html"
            text = self.get_request(article_url, headers=self.article_detail_headers)

            html = etree.HTML(text)
            # Guard against a response that could not be parsed into a tree
            # so one bad page yields an empty body instead of aborting.
            if html is None:
                result = ""
            else:
                result = html.xpath('normalize-space(//*[@id="article-content"])')

            writer.writerow(self._build_row(item, article_url, result))
            print("===========> 当前文章 " + article_url + " 写入完毕")

    @staticmethod
    def _build_row(item, article_url, body):
        """Return one CSV row whose values line up with ``CSV_HEADER``.

        The publish date goes before the article URL, matching the header
        order (title, author, publish time, source URL, body).
        """
        return [
            item["title"],
            item["user_info"]["username"],
            item["formatDate"],
            article_url,
            body,
        ]


if __name__ == '__main__':
    # Script entry point: build the crawler and run the full crawl.
    Huxiu().do_work()