python爬虫及其分析

re从零开始的代码生活

于 2024-09-11 21:34:31 发布

阅读量109

点赞数

文章标签： python 爬虫 开发语言

本文链接：https://blog.csdn.net/qq_46136833/article/details/142150892

版权

本次代码实现了搜集微博评论，以及对评论内容进行情感分析和词云图绘制等功能。

搜集代码如下：

import urllib.parse
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape Weibo search results for one keyword and save one row per post
# (comment count, like count, author nickname, post time, post text) to Excel.
alldata = []

# Search keyword; it is URL-encoded into the query string of every request.
keyword = '亚文化'

# The headers are the same for every page, so they are built once.
# NOTE(review): the Cookie below is a logged-in session credential pasted
# from a browser. It should not live in published source and will stop
# working once the session expires - load it from configuration instead.
headers = {
    'referer': 'https://s.weibo.com/',
    'Cookie': 'SINAGLOBAL=3217528387671.389.1710902126918; ALF=1713494144; SUB=_2A25I_jvRDeRhGeFK71MW9CfNwjmIHXVocjEZrDV8PUJbkNAGLVX9kW1NQ0r1inHgjEyADZa5ngTxKjFc2w9WWV_G; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWjy3Toac90fFYhoRnbbfqP5JpX5KzhUgL.FoMXSh2NSh.p1K-2dJLoIE5LxKMLB.zL12qLxK-L12-L1hqLxKnLB.qL1hM0eh.ceBtt; _s_tentry=weibo.com; Apache=9125693823788.748.1710903100151; ULV=1710903100184:2:2:2:9125693823788.748.1710903100151:1710902126952',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
}

# range(1, 50) requests result pages 1 through 49.
for page in range(1, 50):
    url = f'https://s.weibo.com/weibo?q={urllib.parse.quote(keyword)}&Refer=user_weibo&page={page}'

    # A timeout keeps one stalled connection from hanging the whole crawl,
    # and a failed page is skipped so the rows already collected still get
    # written to disk at the end.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'page {page}: request failed ({exc}); skipping')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    comments = soup.find_all('div', {'class': 'content', 'node-type': 'like'})
    cards = soup.find_all('div', {'class': 'card-act'})

    # The two lists are collected independently and paired by position, so a
    # post missing either element would shift every following pair.
    if len(cards) != len(comments):
        print(f'page {page}: {len(comments)} content blocks but {len(cards)} '
              f'action bars; rows on this page may be misaligned')

    for card, comment in zip(cards, comments):
        # Any of these lookups yields None (or lacks the attribute) when the
        # markup differs from what is expected; skip that post instead of
        # aborting the crawl.
        try:
            like_text = (
                card.find('a', {'action-type': 'feed_list_like'})
                .find('span', {'class': 'woo-like-count'})
                .get_text()
                .strip()
            )
            comment_text = card.find('a', {'action-type': 'feed_list_comment'}).get_text().strip()
            nickname = comment.find('a', class_='name')['nick-name']
            post_time = comment.find('div', class_='from').find('a').text.strip()
            scomment = comment.find('p', class_='txt').get_text(strip=True)
        except (AttributeError, KeyError, TypeError):
            continue

        # A bare '赞' / '评论' label (no number) is treated as zero. Plain
        # digit strings become ints so the Excel columns are numeric; any
        # other text is kept unchanged.
        if like_text == '赞':
            like_count = 0
        elif like_text.isdecimal():
            like_count = int(like_text)
        else:
            like_count = like_text

        if comment_text == '评论':
            comment_count = 0
        elif comment_text.isdecimal():
            comment_count = int(comment_text)
        else:
            comment_count = comment_text

        # Assemble the row in the same order as the DataFrame columns below.
        alldata.append([comment_count, like_count, nickname, post_time, scomment])

dataframe = pd.DataFrame(data=alldata, columns=['评论数量', '点赞数量', '评论者昵称', '评论时间', '评论内容'])
dataframe.to_excel('亚文化.xlsx', index=False)

分析代码如下：

from collections import Counter
from snownlp import SnowNLP
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
# Load the scraped comments from the Excel workbook.
data = pd.read_excel('亚文化.xlsx')

# Empty cells are read back as NaN (a float) and purely numeric cells as
# numbers, neither of which is text that can be fed to SnowNLP. Drop rows
# without any comment text and make every remaining value a string.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original.
data = data.dropna(subset=['评论内容']).copy()
data['评论内容'] = data['评论内容'].astype(str)

num_comments = len(data)
print("一共读取了", num_comments, "条评论")

# Score boundaries used below: a score at or above POSITIVE_THRESHOLD counts
# as positive, at or below NEGATIVE_THRESHOLD as negative, and strictly in
# between as neutral.
POSITIVE_THRESHOLD = 0.55
NEGATIVE_THRESHOLD = 0.45

# Run sentiment analysis on every comment.
data['情感分析'] = data['评论内容'].apply(lambda x: SnowNLP(x).sentiments)

# Split the comments into the three sentiment groups.
positive_comments = data[data['情感分析'] >= POSITIVE_THRESHOLD]
z_comments = data[(NEGATIVE_THRESHOLD < data['情感分析']) & (data['情感分析'] < POSITIVE_THRESHOLD)]
negative_comments = data[data['情感分析'] <= NEGATIVE_THRESHOLD]

# Report how many comments fall into each group.
print("积极评论数量：", len(positive_comments))
print("中立评论数量：",len(z_comments))
print("消极评论数量：", len(negative_comments))

# Join every comment into one string. Missing cells are dropped and values
# are cast to str so that join() does not fail on NaN or numeric cells.
all_comments = ' '.join(data['评论内容'].dropna().astype(str))

# Extra counts added on top of the measured frequencies for terms that
# should stand out in the cloud.
custom_words = {
    "二次元": 3,
    "同性恋": 1,
    # Add more custom words with their frequencies
}

# Remove unwanted fragments before counting.
# NOTE(review): replacing 'c' deletes every lowercase "c", including inside
# Latin words - presumably it targets a stray letter left next to '展开';
# confirm this is intended.
filtered_comments = all_comments
for unwanted in ('展开', 'c', '俄罗斯', '超话', '微博视频'):
    filtered_comments = filtered_comments.replace(unwanted, '')

# Chinese text has no spaces between words, so str.split() would count whole
# sentence fragments rather than words. Segment with jieba instead, and keep
# only tokens longer than one character so punctuation and single-character
# particles do not dominate the cloud.
tokens = (token.strip() for token in jieba.lcut(filtered_comments))
comment_counter = Counter(token for token in tokens if len(token) > 1)

# Merge the custom weights into the measured frequencies (counts are added).
comment_counter.update(custom_words)

# Render the word cloud once. simhei.ttf is passed as the font file used to
# draw the words.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    width=800,
    height=400,
    scale=2,
    max_font_size=150,
    max_words=100,
    background_color='white',
    collocations=False,
).generate_from_frequencies(comment_counter)
plt.figure(dpi=300)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Word-frequency analysis: segment the combined comment text with jieba.
words = jieba.lcut(all_comments)

# Tokens excluded from the count: punctuation, a zero-width space (which
# str.strip() does not remove), very common particles, and the '展开'
# fragment. Pure-whitespace tokens are rejected by the emptiness check below.
EXCLUDED_TOKENS = frozenset({
    '\u200b', '，', '。', '的', '和', '#', '是', '在', '我', '人', '展开', '、',
})

# Keep the stripped form of every token that is non-empty and not excluded.
clean_words = [
    word.strip()
    for word in words
    if word.strip() and word not in EXCLUDED_TOKENS
]

# Count occurrences and report the ten most frequent words.
word_counts = Counter(clean_words)
top_words = word_counts.most_common(10)
print('词频分析结果:', top_words)