Scraping Sina Finance news articles and comments with Python, and running sentiment analysis on the comments

First, the imports and a helper that writes the scraped data to local files: flag 1 appends article text to `newscontent.txt`, flag 2 writes the comment/like lists to `newscomment.csv`, and anything else appends a single comment to `newscomment.txt`.

```python
import urllib.request
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import requests
import pandas as pd

# Save scraped data locally.
def write_article(data, flag):
    if flag == 1:
        # Append the article body text.
        with open('newscontent.txt', 'a', encoding='utf-8') as f:
            f.write(data)
            f.write("\n\n")
    elif flag == 2:
        # data is [comments, agree_counts]; zip(*data) pivots the two
        # parallel lists into (comment, agree) rows.
        comments = pd.DataFrame(list(zip(*data)), columns=['comment', 'agree'])
        comments.to_csv('newscomment.csv')
    else:
        # Append a single comment.
        with open('newscomment.txt', 'a', encoding='utf-8') as f:
            f.write(data)
            f.write("\n\n")
```
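For reference, the `flag == 2` branch relies on `zip(*data)` to turn the two parallel lists into rows; a quick illustration of that pivot (the sample values are made up):

```python
import pandas as pd

data = [["comment one", "comment two"], [12, 3]]  # [comments, agree counts]
rows = list(zip(*data))  # [("comment one", 12), ("comment two", 3)]
print(pd.DataFrame(rows, columns=['comment', 'agree']))
#        comment  agree
# 0  comment one     12
# 1  comment two      3
```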

Next, fetch the article page and save its body text:

```python
# Fetch the article page and save its body text.
def get_content(url):
    try:
        html = urllib.request.urlopen(url)
    except HTTPError:
        return None
    soup = BeautifulSoup(html.read(), "html.parser")
    # Sina article bodies live in <div class="article">.
    for node in soup.find_all("div", {"class": "article"}):
        text = node.get_text()
        print(text)
        write_article(text, 1)

get_content("https://finance.sina.com.cn/chanjing/gsnews/2019-12-11/doc-iihnzahi6659560.shtml")
```

With the article saved, move on to the comments. The first request hits Sina's comment API for the current page's hot comments (at most three):

```python
# Lists to hold the comments and their like ("agree") counts.
listAll = []
listComments = []
listAgree = []

# Fetch the current page's hot comments (at most three).
comments = requests.get("https://comment.sina.com.cn/page/info?version=1&format=json&channel=cj&newsid=comos-ihnzahi6659560&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&uid=unlogin_user")
comments.encoding = 'utf-8'
jd = json.loads(comments.text)

for item in jd['result']['hot_list']:
    print(item['content'])
    write_article(item['content'], 3)
    listComments.append(item['content'])
    listAgree.append(item['agree'])
```
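For orientation, here is the response shape the code assumes, reconstructed only from the fields it reads (all other keys are omitted, and the exact schema is an assumption):

```python
example_response = {
    "result": {
        "count": {"thread_show": 42},           # total comments displayed
        "hot_list": [                           # up to three hot comments
            {"content": "...", "agree": "12"},
        ],
        "cmntlist": [                           # one page of regular comments
            {"content": "...", "agree": "5"},
        ],
    }
}
```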

Then open "more comments" to pull the full comment list. The API reports the total comment count in `result.count.thread_show`, but returns at most `page_size` items per page, so iterate the returned `cmntlist` directly rather than indexing by the count:

```python
# "More comments": fetch the full comment list (first page of 10).
comments2 = requests.get("http://comment.sina.com.cn/page/info?version=1&format=json&channel=cj&newsid=comos-ihnzahi6659560&group=0&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user")
comments2.encoding = 'utf-8'
jd1 = json.loads(comments2.text)

for item in jd1['result']['cmntlist']:
    print(item['content'])
    write_article(item['content'], 3)
    listComments.append(item['content'])
    listAgree.append(item['agree'])

# Put the comments and their likes into one array, then write them to CSV.
listAll.append(listComments)
listAll.append(listAgree)
write_article(listAll, 2)
```
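The request above only reads page 1 with `page_size=10`. Assuming the endpoint honors the `page` parameter consistently (an assumption based on the URLs used so far, not documented API behavior), here is a sketch that keeps paging until the list runs dry:

```python
import json
import requests

BASE = ("https://comment.sina.com.cn/page/info?version=1&format=json"
        "&channel=cj&newsid=comos-ihnzahi6659560&group=0&compress=0"
        "&ie=utf-8&oe=utf-8&page_size=10&thread=1&uid=unlogin_user")

def fetch_all_comments():
    all_comments = []
    page = 1
    while True:
        resp = requests.get(f"{BASE}&page={page}")
        resp.encoding = 'utf-8'
        cmntlist = json.loads(resp.text)['result'].get('cmntlist', [])
        if not cmntlist:
            break  # empty page: no more comments
        all_comments.extend((c['content'], c['agree']) for c in cmntlist)
        page += 1
    return all_comments
```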

Below is a simple end-to-end example that uses Python and BeautifulSoup to scrape a Sina Finance article and run sentiment analysis on it.

First, install the required libraries: BeautifulSoup, requests, and TextBlob (for the sentiment analysis).

```python
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
```

Next, define a function that fetches the title and body of an article on the Sina Finance site:

```python
def get_news(url):
    # Send the request
    response = requests.get(url)
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title
    title = soup.find('h1', class_='main-title').text
    # Extract the news body
    content = ''
    for p in soup.find_all('p'):
        content += p.text
    return title, content
```

Then define a function for the sentiment analysis:

```python
def analyze_sentiment(text):
    # Sentiment analysis with TextBlob
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'
```

Finally, put it all together:

```python
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup

def get_news(url):
    # Send the request
    response = requests.get(url)
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title
    title = soup.find('h1', class_='main-title').text
    # Extract the news body
    content = ''
    for p in soup.find_all('p'):
        content += p.text
    return title, content

def analyze_sentiment(text):
    # Sentiment analysis with TextBlob
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'

# Test
url = 'https://finance.sina.com.cn/stock/usstock/c/2022-05-11/doc-imcwipii6960623.shtml'
title, content = get_news(url)
sentiment = analyze_sentiment(content)
print('Title:', title)
print('Sentiment:', sentiment)
```

This is only a minimal example and leaves plenty of room for improvement: for instance, a sentiment model better suited to the text at hand (see the sketch below), or multithreading/multiprocessing to speed up the crawling.
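One concrete caveat: TextBlob's polarity model is built for English, so on Chinese article text or comments it tends to score near 0 and everything comes back 'neutral'. For Chinese text, the SnowNLP library is a common alternative; a minimal sketch, where the 0.4/0.6 cutoffs are a judgment call rather than anything prescribed by the library:

```python
from snownlp import SnowNLP

def analyze_sentiment_zh(text):
    # SnowNLP returns the probability in [0, 1] that the text is positive.
    score = SnowNLP(text).sentiments
    if score > 0.6:
        return 'positive'
    elif score < 0.4:
        return 'negative'
    return 'neutral'

print(analyze_sentiment_zh('这家公司前景很好'))  # likely 'positive'
```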