Scrape Sina domestic news and their hot comments
Only expected to work in the short term, since the page structure and feed/comment APIs may change
import requests
import json
import re
import pandas
from bs4 import BeautifulSoup
from datetime import datetime
# import sys
# sys.stdout = open('1.out', 'w', encoding='utf-8')
commentURL = 'https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&uid=unlogin_user'
sinaURL = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback'
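# How the two templates are used below (parameter meanings are inferred from
# this script's own usage; both endpoints are Sina internals and may change):
#   commentURL - JSON comment API; '{}' is filled with the news id taken from
#                the article URL, and page_size/h_size cap how many hot
#                comments come back per request.
#   sinaURL    - rolling news feed for the domestic-news channel; '{}' is the
#                list page number, num=20 articles per page, and the response
#                is wrapped in a feedCardJsonpCallback(...) JSONP call.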
def getComments(newsurl):
    """Return the total comment count and the hot comments for one article."""
    hcomment = []
    # The comment API is keyed by the news id embedded in the article URL.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    newsid = m.group(1)
    commentres = requests.get(commentURL.format(newsid))
    jd = json.loads(commentres.text)
    comment_num = jd['result']['count']['total']
    for h in jd['result']['hot_list']:
        hcomment.append(h['nick'] + ' ' + h['area'] + ' ' + h['content'].strip())
    hcomments = '|'.join(hcomment)
    return comment_num, hcomments
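# Standalone check of getComments() (hypothetical URL; substitute any current
# article link of the form .../doc-i<id>.shtml):
#   num, hot = getComments('https://news.sina.com.cn/c/2024-01-01/doc-i<id>.shtml')
#   print(num, hot)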
def getNewsDetails(newsurl):
    """Parse one article page into a dict of title, date, source, body, editor and comments."""
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    ds = soup.select('.date-source')[0]
    date = ds.select('.date')[0].text
    result['dt'] = datetime.strptime(date, '%Y年%m月%d日 %H:%M')
    result['source'] = ds.select('.source')[0].text
    # Skip the last <p>, which is not part of the article body.
    passages = [p.text.strip() for p in soup.select('#article p')[:-1]]
    result['article'] = ' '.join(passages)
    # Strip the leading "责任编辑:" (editor-in-charge) label.
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:').rstrip()
    result['commentNum'], result['hotComments'] = getComments(newsurl)
    return result
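# Each dict returned above holds: title, dt (a datetime object), source,
# article (paragraph texts joined by spaces), editor, commentNum, and
# hotComments ('|'-separated hot comments); each dict becomes one row of the
# final spreadsheet.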
def parseListLinks(url):
    """Fetch one page of the news feed and scrape every article it lists."""
    newsdetails = []
    res = requests.get(url)
    res.encoding = 'utf-8'
    # The feed comes back as JSONP: strip the feedCardJsonpCallback(...) wrapper.
    m = re.search(r'try\{feedCardJsonpCallback\((.*)\);\}catch\(e\)\{\};',
                  res.text)
    jd = json.loads(m.group(1))
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetails(ent['url']))
    return newsdetails
def main(pages):
    news_total = []
    for i in range(1, pages + 1):
        news_total.extend(parseListLinks(sinaURL.format(i)))
    # One row per article; column names come from the dict keys set in getNewsDetails().
    df = pandas.DataFrame(news_total)
    df.to_excel('sinaNews.xlsx')
if __name__ == '__main__':
    main(5)  # number of list pages to crawl; results are written to sinaNews.xlsx in the current directory
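# Dependencies: requests, beautifulsoup4 and pandas, plus an Excel writer such
# as openpyxl for DataFrame.to_excel(). The crawl is sequential (every article
# page and its comment API call, 20 articles per list page), so expect it to
# take a while.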