python爬虫(三)之虎嗅网汽车文章爬虫
闲来没事,闲鱼上有个好兄弟要我写一个从虎嗅网上抓取汽车文章的爬虫,于是大力出奇迹,我写了一个python程序,将这个网站上所有的汽车文章全部抓取下来,存储到了本地的虎嗅.csv文件中。
import requests
import json
import csv
from lxml import etree
import time
import random
from datetime import datetime
class Huxiu:
    """Crawler that dumps Huxiu channel-21 articles into a local CSV file.

    The article list is fetched page by page from the list API using a
    ``last_time`` cursor; each article's detail page is then downloaded and
    the text of its ``#article-content`` element is written out together
    with the list metadata.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot hang the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the API endpoint and the header sets used for requests."""
        self.article_list_pre_url = "https://api-article.huxiu.com/web/channel/articleList"
        self.article_list_post_url = "&pageSize=10&orderBy=createTime&order=DESC&isProfessional=true&userType=0"
        # NOTE(review): start_page / end_page are not read anywhere in this
        # class; kept in case external code relies on them.
        self.start_page = 1
        self.end_page = 1000
        # Headers sent with the article-list API (POST) requests.
        self.article_list_headers = {
            'authority': 'api-article.huxiu.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/x-www-form-urlencoded',
            'cookie': 'Hm_lvt_502e601588875750790bbe57346e972b=1710422257; huxiu_analyzer_wcy_id=9wau9zilte4pu8mg6b7z; hx_object_visit_referer_1_2702514=https%3A%2F%2Fwww.huxiu.com%2Fchannel%2F21.html; Hm_lpvt_502e601588875750790bbe57346e972b=1710422520',
            'origin': 'https://www.huxiu.com',
            'referer': 'https://www.huxiu.com/',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }
        # Headers sent with the article detail page (GET) requests.
        self.article_detail_headers = {
            'authority': 'www.huxiu.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'Hm_lvt_502e601588875750790bbe57346e972b=1710422257; huxiu_analyzer_wcy_id=9wau9zilte4pu8mg6b7z; Hm_lpvt_502e601588875750790bbe57346e972b=1710422520',
            'referer': 'https://www.huxiu.com/channel/21.html',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }

    def post_request(self, url, headers, payload):
        """POST ``payload`` to ``url`` and return the response body as text."""
        response = requests.request(
            "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    def get_request(self, url, headers):
        """GET ``url`` and return the response body as text."""
        response = requests.request(
            "GET", url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    def do_work(self):
        """Crawl every list page and write all articles to ``虎嗅.csv``.

        Paging stops when the API returns an empty article list, or when the
        ``last_time`` cursor it returns does not change (requesting again
        would only fetch the same page forever).
        """
        with open('虎嗅.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            csv_title = ["标题", "作者", "发布时间", "原文地址", "正文"]
            writer.writerow(csv_title)
            page_no = 1
            # Starting cursor for the list API.
            last_time = 1710425737
            # Earliest timestamp: 1684505520
            while True:
                print("=====================> 当前第" + str(page_no) + "页 =======================")
                payload = 'platform=www&last_time=' + str(last_time) + '&channel_id=21'
                print(datetime.fromtimestamp(last_time).strftime('%Y-%m-%d %H:%M:%S'))
                text = self.post_request(self.article_list_pre_url, headers=self.article_list_headers, payload=payload)
                json_data = json.loads(text)
                data = json_data["data"]["datalist"]
                if not data:
                    break
                self.write_page(writer, data)
                next_last_time = int(json_data["data"]["last_time"])
                if next_last_time == last_time:
                    # Cursor did not advance: stop instead of looping forever.
                    break
                last_time = next_last_time
                page_no += 1

    def write_page(self, writer, data):
        """Fetch each article in ``data`` and append one CSV row per article.

        ``writer`` is a ``csv.writer``-like object; ``data`` is the list of
        article dicts taken from the list API response (each item is read for
        ``aid``, ``title``, ``user_info.username`` and ``formatDate``).
        """
        for item in data:
            article_url = "https://www.huxiu.com/article/" + str(item["aid"]) + ".html"
            text = self.get_request(article_url, headers=self.article_detail_headers)
            content = self._extract_content(text)
            # Column order must match the header row written in do_work:
            # title, author, publish time, article URL, body.
            row = [item["title"], item["user_info"]["username"], item["formatDate"], article_url, content]
            writer.writerow(row)
            print("===========> 当前文章 " + article_url + " 写入完毕")

    def _extract_content(self, text):
        """Return the whitespace-normalized text of ``#article-content``.

        Returns an empty string when ``text`` is empty or cannot be parsed
        into a document, so one bad page does not abort the crawl.
        """
        html = etree.HTML(text) if text else None
        if html is None:
            return ""
        return html.xpath('normalize-space(//*[@id="article-content"])')
# Script entry point: build the crawler and run the full crawl.
if __name__ == '__main__':
    Huxiu().do_work()
下面是程序的运行结果,最终的数据存储在同级目录下的虎嗅.csv文件中。
写在最后
代码精选(www.codehuber.com),程序员的终身学习网站已上线!
如果这篇【文章】有帮助到你,希望可以给【JavaGPT】点个赞👍,创作不易,如果有对【后端技术】、【前端领域】感兴趣的小可爱,也欢迎关注❤️❤️❤️ 【JavaGPT】❤️❤️❤️,我将会给你带来巨大的【收获与惊喜】💝💝💝!