python新闻评论分析_python爬取新浪财经新闻正文和评论并对评论做情感分析

最新推荐文章于 2024-05-02 17:43:48 发布

weixin_39637049

最新推荐文章于 2024-05-02 17:43:48 发布

阅读量415

点赞数

文章标签： python新闻评论分析

import urllib.request

from urllib.error import HTTPError

from bs4 import BeautifulSoup

import json

import requests

import pandas as pd

def write_article(data, flag):
    """Persist scraped data to a file in the current working directory.

    Args:
        data: When ``flag == 2``, a sequence of two equal-length columns
            (comment texts, then agree counts) that is transposed into
            rows. For any other flag, the text to append.
        flag: ``1`` appends ``data`` to ``newscontent.txt``; ``2`` writes
            ``data`` to ``newscomment.csv``, replacing the file; any other
            value appends ``data`` to ``newscomment.txt``.
    """
    if flag == 2:
        # zip(*data) turns the column lists into (comment, agree) rows.
        frame = pd.DataFrame(list(zip(*data)), columns=['comment', 'agree'])
        frame.to_csv('newscomment.csv')
        return

    file_name = 'newscontent.txt' if flag == 1 else 'newscomment.txt'
    # The context manager closes the file even if a write fails.
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(data)
        f.write("\n\n")

def get_content(url):
    """Download a news page and save the text of each article container.

    Every ``<div class="article">`` element found in the page is printed
    and passed to ``write_article(text, 1)``, which appends it to
    ``newscontent.txt``.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. If the download raises ``HTTPError`` the error is swallowed
        and the function returns without writing anything.
    """
    try:
        # The context manager closes the HTTP response once it is parsed.
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response.read(), "html.parser")
    except HTTPError:
        return None

    for block in soup.find_all("div", {"class": "article"}):
        # Extract the text once; the name avoids shadowing the built-in str.
        text = block.get_text()
        print(text)
        write_article(text, 1)

# Download the article page; get_content prints each article body and
# appends it to newscontent.txt.
get_content("https://finance.sina.com.cn/chanjing/gsnews/2019-12-11/doc-iihnzahi6659560.shtml")

# Accumulators for the comment texts and their "agree" counts.
listAll = []
listComments = []
listAgree = []

# Fetch the hot comments of the article (at most three are requested).
comments = requests.get("https://comment.sina.com.cn/page/info?version=1&format=json&channel=cj&newsid=comos-ihnzahi6659560&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&uid=unlogin_user")
comments.encoding = 'utf-8'
jd = json.loads(comments.text)

# Iterate over the entries actually returned instead of assuming exactly
# three, so an article with fewer hot comments does not raise IndexError.
for hot in jd['result']['hot_list'][:3]:
    str1 = hot['content']
    str2 = hot['agree']
    print(str1)
    # Flag 3 appends the comment text to newscomment.txt.
    write_article(str1, 3)
    listComments.append(str1)
    listAgree.append(str2)

# Fetch the first page (page_size=10) of the full comment list, i.e. the
# "more comments" view. HTTPS is used, matching the hot-comment request to
# the same endpoint, so the traffic is not sent in clear text.
comments2 = requests.get("https://comment.sina.com.cn/page/info?version=1&format=json&channel=cj&newsid=comos-ihnzahi6659560&group=0&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user")
comments2.encoding = 'utf-8'
jd1 = json.loads(comments2.text)

# Presumably the number of comment threads shown for the article; confirm
# against the API response before relying on it as a list length.
count = jd1['result']['count']['thread_show']

#print(jd1)

# Subtract 5 from the count to avoid indexing past the end of cmntlist.

for x in range(count-5):

print(jd1['result']['cmntlist'][x]['content'])

str1 = jd1['result']['cmntlist'][x]['content']

str2 = jd1['result']['cmntlist'][x]['agree']

write_article(str1,3)

listComments.append(str1)