利用 Python 进行 QQ 聊天记录分析

该代码使用Python对聊天记录进行处理,包括解析文件、筛选2023年后的记录、进行情感分析、计算对话间的时间差以及统计词频。通过SnowNLP进行情感分析,使用jieba进行分词,并生成词云图展示高频词汇。
摘要由CSDN通过智能技术生成
import re
import jieba
import pandas as pd
import numpy as np
from datetime import datetime
from snownlp import SnowNLP
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from datetime import timedelta


# 解析聊天记录文件
def parse_chat_records(file_path, min_year=2023):
    """Parse an exported chat-log text file into (datetime, user, message) tuples.

    Each record in the file is expected to look like:
        ``YYYY-MM-DD HH:MM:SS <user>\\n<message>\\n``

    Args:
        file_path: path to the UTF-8 chat-log file.
        min_year: keep only records from this year onward
            (defaults to 2023, matching the original hard-coded filter).

    Returns:
        List of (date_time_str, user, message) tuples.
    """
    with open(file_path, encoding='utf-8') as f:
        content = f.read()

    record_re = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (.*?)\n(.*?)\n')
    # The timestamp is zero-padded ISO-like text, so the first 4 chars are the year.
    return [
        (date_time, user, message)
        for date_time, user, message in record_re.findall(content)
        if int(date_time[:4]) >= min_year
    ]

# 情感分析
def sentiment_analysis(messages):
    """Score each non-empty message with SnowNLP.

    Args:
        messages: iterable of (date_time, user, message) tuples.

    Returns:
        List of sentiment scores in [0, 1], one per non-blank message.
    """
    return [
        SnowNLP(text).sentiments
        for _, _, text in messages
        if text.strip()  # blank/whitespace-only messages carry no sentiment
    ]

# 计算时间差值
def time_differences(messages):
    """Compute gaps (in seconds) between consecutive messages.

    Gaps longer than 8 hours are treated as separate conversations and
    dropped; negative gaps (out-of-order timestamps) are dropped too,
    because ``timedelta.seconds`` on a negative delta would previously
    yield a bogus value near 86400.

    Args:
        messages: list of (date_time_str, user, message) tuples, where
            date_time_str is formatted '%Y-%m-%d %H:%M:%S'.

    Returns:
        List of gap lengths in seconds (floats).
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    max_gap = timedelta(hours=8)
    diffs = []
    for i in range(1, len(messages)):
        time1 = datetime.strptime(messages[i - 1][0], fmt)
        time2 = datetime.strptime(messages[i][0], fmt)
        delta = time2 - time1

        # Keep only non-negative gaps within the same-conversation window.
        if timedelta(0) <= delta <= max_gap:
            # total_seconds() is correct for any delta, unlike .seconds.
            diffs.append(delta.total_seconds())
    return diffs

# 词频统计
def word_frequency(messages, stopwords_path=None):
    """Count word frequencies across all messages with jieba segmentation.

    Args:
        messages: iterable of (date_time, user, message) tuples.
        stopwords_path: optional path to a UTF-8 file with one stopword
            per line; these are filtered in addition to the built-ins.

    Returns:
        List of (word, count) pairs, most common first.
    """
    # Built-in noise words (chat-client placeholders and filler words).
    # Folding them into one set gives a single O(1) membership test per
    # token instead of scanning an inline list for every word.
    stopwords = {'图片', '表情', '这些', '那些', '就是', '那个', '之前', '一个', '现在'}

    if stopwords_path:
        with open(stopwords_path, encoding='utf-8') as f:
            stopwords.update(line.strip() for line in f)

    words = [
        word
        for _, _, message in messages
        for word in jieba.cut(message)
        # Single-character tokens are mostly punctuation/particles.
        if len(word) > 1 and word not in stopwords
    ]
    return Counter(words).most_common()

# 生成词云
def generate_wordcloud(words, file_path):
    """Render a word cloud from (word, count) pairs and save it to an image.

    Args:
        words: sequence of (word, frequency) pairs, e.g. Counter.most_common().
        file_path: output image path.
    """
    if not words:
        print('没有足够的词汇生成词云。')
        return

    wc = WordCloud(font_path='simhei.ttf', background_color='white', width=1920, height=1080)
    wc.generate_from_frequencies(dict(words))
    # Draw on a dedicated figure and close it afterwards, so pyplot's
    # global current-figure state does not leak into later plots
    # (otherwise a subsequent chart would be drawn over this image).
    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(file_path)
    plt.close(fig)
# 生成圆饼图
def generate_pie_chart(labels, values, file_path):
    """Draw a pie chart and save it to an image file.

    Args:
        labels: sequence of slice labels.
        values: sequence of slice sizes (same length as labels).
        file_path: output image path.
    """
    # Use a dedicated figure so this chart never inherits (or pollutes)
    # pyplot's global current figure shared with other plotting helpers.
    fig = plt.figure()
    plt.pie(values, labels=labels, autopct='%1.1f%%')
    plt.savefig(file_path)
    plt.close(fig)
    
    
# 主程序
def analyze_chat_records(file_path):
    """Run the full analysis pipeline on a chat-log file and print results.

    Parses the log, scores sentiment, measures reply gaps, counts word
    frequencies, and writes a word cloud to 'wordcloud.png'.

    Args:
        file_path: path to the UTF-8 chat-log file.
    """
    messages = parse_chat_records(file_path)
    sentiment_scores = sentiment_analysis(messages)
    time_diffs = time_differences(messages)
    words = word_frequency(messages)
    generate_wordcloud(words, 'wordcloud.png')

    # np.mean([]) emits a RuntimeWarning and returns nan; guard explicitly
    # so an empty log (e.g. no post-2023 messages) prints cleanly.
    avg_sentiment = np.mean(sentiment_scores) if sentiment_scores else float('nan')
    avg_gap = np.mean(time_diffs) if time_diffs else float('nan')

    # 输出结果
    print('情感分析(平均分):', avg_sentiment)
    print('每次对话之间的时间差值(平均秒数):', avg_gap)
    print('词频统计(前10):', words[:10])



# Script entry point: analyze the sample chat log in the working directory.
if __name__ == '__main__':
    analyze_chat_records('test.txt')

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值