Scraping news content from The Paper (thepaper.cn) with Python

The script below POSTs channel parameters to the site's content API, parses the JSON article lists it gets back, and appends one row per article to spider.csv.

import requests
import csv
import os
import json
def init():
    # Create the CSV file with a header row on first run.
    if not os.path.exists('spider.csv'):
        with open('spider.csv', 'a', encoding='utf8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                'News ID',
                'Category',
                'Title',
                'Comment count',
                'Like count',
                'URL',
                'Hotness'
            ])

def write_row(row):
    # Append a single record to the CSV file.
    with open('spider.csv', 'a', encoding='utf8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(row)

def get_json(url, nav):
    headers = {
        'User-Agent': '',  # fill in your own User-Agent
        'Cookie': '',      # fill in your own cookie
        'Content-Type': 'application/json'
    }
    # Send the POST request with the channel parameters as a JSON body.
    response = requests.post(url, data=json.dumps(nav), headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        return None
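# Note: requests.post(url, json=nav, headers=headers) is equivalent; the json=
# keyword serializes the payload and sets the Content-Type header automatically.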


def parse_json(label, response):
    # get_json() returns None on a failed request; skip that page.
    if response is None:
        return
    for article in response['data']['list']:
        cont_id = article['contId']
        name = article['name']
        interactionNum = article['interactionNum']
        praiseTimes = article['praiseTimes']
        # Simple popularity score: comments plus likes.
        hot = praiseTimes + interactionNum
        url = f"https://www.thepaper.cn/newsDetail_forward_{cont_id}"
        write_row([
            cont_id,
            label,
            name,
            interactionNum,
            praiseTimes,
            url,
            hot
        ])

def start():
    init()
    # One request payload per channel; pageNum is overwritten in the crawl loop.
    nav_list = [
        {"channelId": "128409",
         "excludeContIds": [27145950, 26992923, 25503233, 27211286, 26889619, 26938065, 27027414, 27215250, 27032914,
                            26691161, 25495819, 26207966, 26966986, 27215244, 27215243, 27215242, 26440518, 26889612,
                            26966915, 12256226, 25061346, 27088893, 26691318, 27090044, 27004665, 25808367, 27004663,
                            26678587, 27172977, 25992227, 26682534, 26925039, 27147685, 27148773],
         "listRecommendIds": [], "pageSize": 20,  "pageNum": 2},
        {"channelId": "136261", "excludeContIds": [27240236, 25822121, 27224491, 27007403, 27146035],
         "listRecommendIds": [], "pageSize": 20,  "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "139090", "excludeContIds": [27259483, 27265756], "listRecommendIds": [27259483], "pageSize": 10,
          "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "26916", "excludeContIds": [27265585, 27033129, 27263192, 27265724, 27215250],
         "listRecommendIds": [27215250, 27265724, 27265585, 27263192, 27033129], "pageSize": 20,
          "cornerLabelDesc": 'true', "pageNum": 2},
        {"channelId": "25950",
         "excludeContIds": [27265762, 27265764, 27265769, 27265930, 27265931, 27265771, 27265934, 27265520, 27266096,
                            27266065, 27265906, 27265497, 27261924, 27265725, 27265758, 27265855],
         "listRecommendIds": [27265855, 27265906, 27265769, 27265758, 27265762], "pageSize": 20,
         "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "122908",
         "excludeContIds": [27265841, 27261547, 27262875, 27260795, 27265812, 27259672, 27265934, 27265790],
         "listRecommendIds": [27265790, 27265812, 27259672], "pageSize": 20,
         "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "25951", "excludeContIds": [27265717, 27265718, 27265499],
         "listRecommendIds": [27265718, 27265499], "pageSize": 20, "cornerLabelDesc": 'false',
         "pageNum": 2},
        {"channelId": "119908", "excludeContIds": [27230285, 27240265, 27265935],
         "listRecommendIds": [27240265, 27230285], "pageSize": 20,  "cornerLabelDesc": 'false',
         "pageNum": 2}
    ]

    for pageNum in range(1, 20):
        for nav in nav_list:
            nav["pageNum"] = pageNum
            print(f'Fetching page {pageNum} of channel {nav["channelId"]}')
            # Send the request and process the response.
            url = 'https://api.thepaper.cn/contentapi/nodeCont/getByChannelId'
            response = get_json(url, nav)
            parse_json(nav["channelId"], response)


if __name__ == '__main__':
    # Crawl the news articles once.
    start()
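
The original imports pulled in APScheduler even though start() is only called once, which suggests the crawl was meant to run on a schedule. A minimal sketch of that, assuming the script above is on the import path; the hourly interval is an assumption:

from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()
# Re-run the full crawl every hour (the interval is an assumption).
scheduler.add_job(start, 'interval', hours=1)
scheduler.start()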

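
pandas also appeared in the original imports; one plausible use is loading the finished CSV and ranking articles by the hotness score. A sketch, assuming the column names written by init() above:

import pandas as pd

df = pd.read_csv('spider.csv')
# Highest combined comment + like score first.
top = df.sort_values('Hotness', ascending=False).head(10)
print(top[['Title', 'Hotness', 'URL']])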