Scrape Sina domestic news and their hot comments
Only expected to work in the short term, since the page structure and feed/comment APIs may change
import requests
import json
import re
import pandas
from bs4 import BeautifulSoup
from datetime import datetime
# import sys
# sys.stdout = open('1.out', 'w', encoding='utf-8')
commentURL = 'https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&uid=unlogin_user'
sinaURL = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback'
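# How the two templates are used below (parameter meanings are inferred from
# this script's own usage; both endpoints are Sina internals and may change):
#   commentURL - JSON comment API; '{}' is filled with the news id taken from
#                the article URL, and page_size/h_size cap how many hot
#                comments come back per request.
#   sinaURL    - rolling news feed for the domestic-news channel; '{}' is the
#                list page number, num=20 articles per page, and the response
#                is wrapped in a feedCardJsonpCallback(...) JSONP call.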
def getComments(newsurl):
    """Return the total comment count and the hot comments for one article."""
    hcomment = []
    # The comment API is keyed by the news id embedded in the article URL.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    newsid = m.group(1)
    commentres = requests.get(commentURL.format(newsid))
    jd = json.loads(commentres.text)
    comment_num = jd['result']['count']['total']
    for h in jd['result']['hot_list']:
        hcomment.append(h['nick'] + ' ' + h['area'] + ' ' + h['content'].strip())
    hcomments = '|'.join(hcomment)
    return comment_num, hcomments
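# Standalone check of getComments() (hypothetical URL; substitute any current
# article link of the form .../doc-i<id>.shtml):
#   num, hot = getComments('https://news.sina.com.cn/c/2024-01-01/doc-i<id>.shtml')
#   print(num, hot)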
def getNewsDetails(newsurl):
    """Parse one article page into a dict of title, date, source, body, editor and comments."""
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    ds = soup.select('.date-source')[0]
    date = ds.select('.date')[0].text
    result['dt'] = datetime.strptime(date, '%Y年%m月%d日 %H:%M')
    result['source'] = ds.select('.source')[0].text
    # Skip the last <p>, which is not part of the article body.
    passages = [p.text.strip() for p in soup.select('#article p')[:-1]]
    result['article'] = ' '.join(passages)
    # Strip the leading "责任编辑:" (editor-in-charge) label.
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:').rstrip()
    result['commentNum'], result['hotComments'] = getComments(newsurl)
    return result
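# Each dict returned above holds: title, dt (a datetime object), source,
# article (paragraph texts joined by spaces), editor, commentNum, and
# hotComments ('|'-separated hot comments); each dict becomes one row of the
# final spreadsheet.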
def parseListLinks(url):
    """Fetch one page of the news feed and scrape every article it lists."""
    newsdetails = []
    res = requests.get(url)
    res.encoding = 'utf-8'
    # The feed comes back as JSONP: strip the feedCardJsonpCallback(...) wrapper.
    m = re.search(r'try\{feedCardJsonpCallback\((.*)\);\}catch\(e\)\{\};',
                  res.text)
    jd = json.loads(m.group(1))
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetails(ent['url']))
    return newsdetails
def main(pages):
    news_total = []
    for i in range(1, pages + 1):
        news_total.extend(parseListLinks(sinaURL.format(i)))
    # One row per article; column names come from the dict keys set in getNewsDetails().
    df = pandas.DataFrame(news_total)
    df.to_excel('sinaNews.xlsx')
if __name__ == '__main__':
    main(5)  # number of list pages to crawl; results are written to sinaNews.xlsx in the current directory
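# Dependencies: requests, beautifulsoup4 and pandas, plus an Excel writer such
# as openpyxl for DataFrame.to_excel(). The crawl is sequential (every article
# page and its comment API call, 20 articles per list page), so expect it to
# take a while.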