Read a CSV file and print its column names:
import pandas as pd
# data = pd.read_csv("guba_fc_result_20230413.csv")
data = pd.read_csv("guba_all_newtext_20230413.csv")
data.columns
Save to a file:
data.to_csv("guba_all_cutwords_20230413.csv", index=False)
Count rows per ticker name:
data['ticker_name'].value_counts()
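A possible follow-up (not in the original notes; the threshold of 100 is illustrative) keeps only the stocks with enough posts:
counts = data['ticker_name'].value_counts()
active = counts[counts >= 100].index
data_active = data[data['ticker_name'].isin(active)]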
Filter by string length:
# Keep rows where 'matches' is not the empty-list string '[]'
filtered_df = data[data['matches'] != '[]']
long_text = filtered_df[filtered_df['text'].str.len() > 100]
Plot a histogram of string lengths:
import numpy as np
from matplotlib import pyplot as plt
len_text = [len(text) for text in filtered_df['text']]
#len_text = [len(text) for text in data['content']]
#len_text = [len(text) for text in data['rateContent']]
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(len_text, bins=20)
plt.show()
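To complement the histogram, a quick numeric summary of the same lengths (a sketch reusing the numpy import above):
len_arr = np.array(len_text)
print("mean:", len_arr.mean(), "median:", np.median(len_arr), "p90:", np.percentile(len_arr, 90))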
Filter by ticker name and post date:
v_data = data[data['ticker_name'].isin(['迈瑞医疗'])]
v_data = v_data[v_data['post_date'].isin(['2023-03-01'])]
Drop NaN values:
data.dropna(inplace=True)
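If only certain columns matter, a gentler variant (a sketch; the subset column names are assumptions) drops a row only when those columns are missing:
data.dropna(subset=['text', 'seg'], inplace=True)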
Merge rows with the same name:
# Combine all rows belonging to the same stock
# Group by ticker_name and join each group's seg strings together
data = data.groupby('ticker_name')['seg'].apply(lambda x: ' '.join(x)).reset_index()
data
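A minimal sketch of what the groupby-join does, on toy data:
demo = pd.DataFrame({'ticker_name': ['A', 'A', 'B'], 'seg': ['x y', 'z', 'w']})
demo.groupby('ticker_name')['seg'].apply(lambda x: ' '.join(x)).reset_index()
#   ticker_name    seg
# 0           A  x y z
# 1           B      w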
Filter rows by word count:
# Count the number of tokens in the seg column
data['word_count'] = data['seg'].str.split().apply(len)
# Keep only rows with more than 200 tokens
data = data[data['word_count'] > 200]
# Drop the helper word_count column
data = data.drop('word_count', axis=1)
data
Count tokens per stock:
word_counts = data.groupby('ticker_name')['seg'].apply(lambda x: sum(len(text.split()) for text in x)).reset_index()
# Print the result
print(word_counts)
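The count column inherits the name seg from the source column, which is easy to misread; a small sketch (the new column name is my own) renames and sorts it:
word_counts = word_counts.rename(columns={'seg': 'n_words'})
print(word_counts.sort_values('n_words', ascending=False))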
Split the tokenized text into chunks and save them as new rows:
import math

def split_seg(seg, chunk_size):
    # Split a space-separated token string into chunks of chunk_size tokens
    chunks = []
    words = seg.split()
    num_chunks = math.ceil(len(words) / chunk_size)
    # print("num_chunks:", num_chunks)
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
    return chunks
# Split the seg column row by row
new_rows = []
for _, row in data.iterrows():
    ticker_name = row['ticker_name']
    seg = row['seg']
    num_words = len(seg.split())
    if num_words > 1000:
        chunked_segs = split_seg(seg, 3000)
        for i, chunk in enumerate(chunked_segs):
            new_ticker_name = ticker_name + '_' + str(i)
            new_rows.append({'ticker_name': new_ticker_name, 'seg': chunk})
    else:
        new_rows.append({'ticker_name': ticker_name, 'seg': seg})
# Build a new DataFrame from the chunked rows
new_data = pd.DataFrame(new_rows)
new_data
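A quick sanity check of split_seg on toy input:
toy = ' '.join(str(i) for i in range(7))
print(split_seg(toy, 3))  # ['0 1 2', '3 4 5', '6']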
Apply the TF-IDF algorithm to the chunked, tokenized text:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# The text is already space-separated, so the tokenizer just splits on whitespace
def tokenizer(text):
    return text.split()

# Compute the tf-idf matrix
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
tfidf_matrix = tfidf.fit_transform(new_data['seg'])
# Get the feature-name list (get_feature_names() was removed in scikit-learn 1.2)
feature_names = tfidf.get_feature_names_out()
# Iterate over each document
for _, group in new_data.groupby('ticker_name'):
    # Slice this document's rows out of the tf-idf matrix
    tfidf_scores = tfidf_matrix[group.index, :]
    # Sum each word's tf-idf value over the group's rows
    word_scores = list(zip(feature_names, tfidf_scores.sum(axis=0).tolist()[0]))
    # Sort by tf-idf value, descending
    word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)
    # Print the 10 words with the highest tf-idf values
    print(group['ticker_name'].iloc[0])
    for word, score in word_scores[:10]:
        print(word, score)
    print()
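To keep the keywords rather than just printing them, a sketch (the column names and output filename are my own) that collects each document's top-10 words into a DataFrame:
rows = []
for _, group in new_data.groupby('ticker_name'):
    scores = tfidf_matrix[group.index, :].sum(axis=0).tolist()[0]
    top = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)[:10]
    for word, score in top:
        rows.append({'ticker_name': group['ticker_name'].iloc[0], 'word': word, 'tfidf': score})
keywords = pd.DataFrame(rows)
keywords.to_csv("guba_tfidf_keywords_20230413.csv", index=False)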