import time
import requests
import csv
import os
import re
from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import urllib.parse
import json
def init():
    """Create spider.csv with its header row if the file does not exist yet."""
    if os.path.exists('spider.csv'):
        return
    header = [
        '新闻id',
        '新闻类型',
        '新闻名',
        '评论数',
        '点赞数',
        '网址',
        '热度值',
    ]
    # newline='' is required so csv.writer controls line endings itself.
    with open('spider.csv', 'a', encoding='utf8', newline='') as csvfile:
        csv.writer(csvfile).writerow(header)
def wirterRow(row):
    """Append one record to spider.csv.

    Name is kept as-is (including the typo) because parse_json calls it.
    """
    with open('spider.csv', 'a', encoding='utf8', newline='') as out:
        csv.writer(out).writerow(row)
def get_json(url, nav):
    """POST the channel query *nav* (a dict) to *url* and return the decoded JSON.

    Returns None on any failure — non-200 status, network error, or timeout —
    so callers must check the result before indexing into it.
    """
    headers = {
        'User-Agent': '',  # fill in your User-Agent
        'Cookie': '',  # fill in your cookie
        'Content-Type': 'application/json'
    }
    try:
        # json=nav lets requests serialize the body itself (replaces the
        # manual json.dumps); timeout prevents the crawl hanging forever
        # on a stalled connection.
        response = requests.post(url, json=nav, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat network-level failures the same way as a bad status code:
        # signal failure with None instead of aborting the whole crawl.
        return None
    if response.status_code == 200:
        return response.json()
    return None
def parse_json(label, response):
    """Extract every article from one API response page and append it to spider.csv.

    label    -- the channelId string, stored as the article's type.
    response -- decoded JSON from get_json, or None when the request failed.
    """
    # get_json returns None on failure; the original code crashed here with
    # a TypeError on response['data'] in that case.
    if not response:
        return
    articles = response.get('data', {}).get('list', [])
    for article in articles:
        cont_id = article['contId']  # renamed: 'id' shadows the builtin
        name = article['name']
        interaction_num = article['interactionNum']
        praise_times = article['praiseTimes']
        # "hot" score is simply comments + likes, as in the original.
        hot = praise_times + interaction_num
        url = f"https://www.thepaper.cn/newsDetail_forward_{cont_id}"
        wirterRow([
            cont_id,
            label,
            name,
            interaction_num,
            praise_times,
            url,
            hot,
        ])
def start():
    """Crawl pages 1-19 of every configured channel and persist results to spider.csv."""
    init()
    # One request template per channel. excludeContIds / listRecommendIds were
    # captured from real browser traffic; pageNum is overwritten in the loop.
    # (Renamed from 'list', which shadowed the builtin.)
    channels = [
        {"channelId": "128409",
         "excludeContIds": [27145950, 26992923, 25503233, 27211286, 26889619, 26938065, 27027414, 27215250, 27032914,
                            26691161, 25495819, 26207966, 26966986, 27215244, 27215243, 27215242, 26440518, 26889612,
                            26966915, 12256226, 25061346, 27088893, 26691318, 27090044, 27004665, 25808367, 27004663,
                            26678587, 27172977, 25992227, 26682534, 26925039, 27147685, 27148773],
         "listRecommendIds": [], "pageSize": 20, "pageNum": 2},
        {"channelId": "136261", "excludeContIds": [27240236, 25822121, 27224491, 27007403, 27146035],
         "listRecommendIds": [], "pageSize": 20, "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "139090", "excludeContIds": [27259483, 27265756], "listRecommendIds": [27259483], "pageSize": 10,
         "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "26916", "excludeContIds": [27265585, 27033129, 27263192, 27265724, 27215250],
         "listRecommendIds": [27215250, 27265724, 27265585, 27263192, 27033129], "pageSize": 20,
         "cornerLabelDesc": 'true', "pageNum": 2},
        {"channelId": "25950",
         "excludeContIds": [27265762, 27265764, 27265769, 27265930, 27265931, 27265771, 27265934, 27265520, 27266096,
                            27266065, 27265906, 27265497, 27261924, 27265725, 27265758, 27265855],
         "listRecommendIds": [27265855, 27265906, 27265769, 27265758, 27265762], "pageSize": 20,
         "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "122908",
         "excludeContIds": [27265841, 27261547, 27262875, 27260795, 27265812, 27259672, 27265934, 27265790],
         "listRecommendIds": [27265790, 27265812, 27259672], "pageSize": 20,
         "cornerLabelDesc": 'false', "pageNum": 2},
        {"channelId": "25951", "excludeContIds": [27265717, 27265718, 27265499],
         "listRecommendIds": [27265718, 27265499], "pageSize": 20, "cornerLabelDesc": 'false',
         "pageNum": 2},
        {"channelId": "119908", "excludeContIds": [27230285, 27240265, 27265935],
         "listRecommendIds": [27240265, 27230285], "pageSize": 20, "cornerLabelDesc": 'false',
         "pageNum": 2}
    ]
    # Hoisted out of the loops: the endpoint never changes per request.
    api_url = 'https://api.thepaper.cn/contentapi/nodeCont/getByChannelId'
    for page_num in range(1, 20):
        for nav in channels:
            nav["pageNum"] = page_num
            print('正在爬取类型:' + nav["channelId"] + '中的第' + str(page_num) + '页数据')
            # get_json may return None on failure; parse_json handles that.
            response = get_json(api_url, nav)
            parse_json(nav["channelId"], response)
if __name__ == '__main__':
    # Crawl the news articles (entry point when run as a script).
    start()
# Source-page residue (blog title and publish date) kept as comments — as bare
# text these two lines were a SyntaxError:
# python爬取澎湃新闻内容 (blog title: "Scraping thepaper.cn news with Python")
# 最新推荐文章于 2024-05-19 23:31:32 发布 (blog footer: latest recommended article publish time)