import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from collections import Counter
import time
# import requests
# from scrapy import Selector
# import seaborn as sns
import jieba
import jieba.posseg as psg
plt.rcParams['font.family'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
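# SimHei ships with Windows; on macOS/Linux a locally installed CJK font would have to be
# substituted here for the Chinese labels to render. The font name below is only an
# example and depends on what is installed on the machine:
# plt.rcParams['font.family'] = ['Noto Sans CJK SC']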
#
wk_dir = "2022——爬虫与数据分析"                     # course working directory
data_dir = "2022——爬虫与数据分析/data_dir_爬虫课程"  # directory holding the scraped data files
#
# import jieba.analyse
#
# file = "sanguo.txt"
# topK = 12
# content = open(file, 'rb').read()
#
# tags = jieba.analyse.extract_tags(content, topK=topK)
# print(tags)
# # ['玄德', '程远志', '张角', '云长', '张飞', '黄巾', '封谞', '刘焉', '邓茂', '邹靖', '姓名', '招军']
#
# # withWeight=True: also return each keyword's weight
# tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=True)
# print(tags)
# # [('玄德', 0.1038549799467099), ('程远志', 0.07787459004363208), ('张角', 0.0722532891360849),
# # ('云长', 0.07048801593691037), ('张飞', 0.060972692853113214), ('黄巾', 0.058227157790330185),
# # ('封谞', 0.0563904127495283), ('刘焉', 0.05470798376886792), ('邓茂', 0.04917692565566038),
# # ('邹靖', 0.04427258239705188), ('姓名', 0.04219704283997642), ('招军', 0.04182041076757075)]
#
#
# import jieba
# import jieba.posseg as pseg
#
# # Default mode (the sentence matches the sample output shown below)
# seg_list = pseg.cut("我今天吃早饭了")
# for word, flag in seg_list:
#     print(word + " " + flag)
#
# """
# The output in jieba's default mode is:
# 我 r
# Prefix dict has been built successfully.
# 今天 t
# 吃 v
# 早饭 n
# 了 ul
# """
#
# # paddle mode
# words = pseg.cut("我今天吃早饭了", use_paddle=True)
# """
# The output in paddle mode is:
# 我 r
# 今天 TIME
# 吃 v
# 早饭 n
# 了 xc
# """
#
# import paddlehub as hub
# senta = hub.Module(name="senta_bilstm")
# test_text = ["这家餐厅很好吃", "这部电影真的很差劲", "我爱自然语言处理"]
# input_dict = {"text": test_text}
# results = senta.sentiment_classify(data=input_dict)
# for result in results:
#     print(result)
#
# {'positive_probs': 0.9363, 'text': '这家餐厅很好吃', 'sentiment_key': 'positive', 'negative_probs': 0.0637,
# 'sentiment_label': 2}
# {'positive_probs': 0.0213, 'text': '这部电影真的很差劲', 'sentiment_key': 'negative', 'negative_probs': 0.9787,
# 'sentiment_label': 0}
# {'positive_probs': 0.9501, 'text': '我爱自然语言处理', 'sentiment_key': 'positive', 'negative_probs': 0.0499,
# 'sentiment_label': 2}
'''
Load the data
'''
#---------------------------------------------------------#
#---- * Step 1: data cleaning * ----#
#---------------------------------------------------------#
os.listdir(data_dir)
df = pd.read_excel(os.path.join(data_dir, "new_new_wukelan_(3-26)_pageByPage_0-200.xlsx"))
df.shape
#---------------------------------------------------------#
#---- * Clean the web-scraper-order column * ----#
#---------------------------------------------------------#
def clean_order(df):
    """Strip the leading 'timestamp-' prefix from the web-scraper-order column,
    keeping only the sequential index of each scraped row."""
    order = []
    for i in range(df.shape[0]):
        doi = df.iloc[i, ]['web-scraper-order']
        doi = re.sub(r'(\d.+?-)(\d)', r'\2', doi)
        order.append(doi)
    return order
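# clean_order() is defined above but never applied to df; a minimal usage sketch
# (the target column name "order" is an assumption, not part of the original script):
# df['order'] = clean_order(df)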
df.keys()
content = df['content_of_postPage']
content
#---------------------------------------------------------#
#---- * Aside: a quick sampling-distribution demo (histogram of sample means) * ----#
#---------------------------------------------------------#
# A small demo of the sampling distribution of the mean: repeatedly draw samples from
# the population 6..99 and plot the histogram of the sample means.
def demo_sample_fun(n_of_sample_size, times_of_sampling):
    pop = np.arange(start=6, stop=100)
    sample_mean = []
    for i in range(times_of_sampling):
        sample_of_one_time = np.random.choice(pop, size=n_of_sample_size, replace=False)
        mean_of_sample = np.mean(sample_of_one_time)
        sample_mean.append(mean_of_sample)
    plt.hist(sample_mean)
demo_sample_fun(2, 100)
demo_sample_fun(3, 100)
demo_sample_fun(50, 10000)   # larger samples: the means concentrate around the population mean
plt.show()
content
#---------------------------------------------------------#
#---- * Step 1 (again): reload and clean the data * ----#
#---------------------------------------------------------#
os.listdir(data_dir)
df = pd.read_excel(os.path.join(data_dir, "new_new_wukelan_(3-26)_pageByPage_0-200.xlsx"))
df.shape
content = df['content_of_postPage']
#---------------------------------------------------------#
#---- * Step 2: sentiment analysis (first filter the posts that mention aissuR / airkU) * ----#
#---------------------------------------------------------#
lst_of_country = []
for ct in content:
    # keep only string posts that mention at least one of the two country keywords
    if isinstance(ct, str):
        if "aissuR" in ct or "airkU" in ct:
            lst_of_country.append(ct)
len(lst_of_country)
lst_of_country[0:2]
#---------------------------------------------------------#
#---- * Split each post into sentences and group the sentences by country keyword * ----#
#---------------------------------------------------------#
pattern = r'[。？！?!]'   # sentence-ending punctuation (full- and half-width)
lst_of_sentence = []
russ_sentence = []
ukr_sentence = []
russ_uk_sentence = []
for post in lst_of_country:
    result_list = re.split(pattern, post)
    for res in result_list:
        lst_of_sentence.append(res)
        # route each sentence by which country keyword(s) it mentions
        if "aissuR" in res and "airkU" in res:
            russ_uk_sentence.append(res)
        elif "aissuR" in res:
            russ_sentence.append(res)
        elif "airkU" in res:
            ukr_sentence.append(res)
#---------------------------------------------------------#
#---- * The posts are now split into short sentences, each containing at least airkU or aissuR * ----#
#---------------------------------------------------------#
len(russ_sentence)
len(ukr_sentence)
russ_sentence[0]
ukr_sentence[0]
#---------------------------------------------------------#
#---- * Sentiment analysis with PaddleHub (senta_bilstm) * ----#
#---------------------------------------------------------#
import paddlehub as hub
senta = hub.Module(name="senta_bilstm")
#---------------------------------------------------------#
#---- * Sentiment labels for sentences mentioning “aissuR” * ----#
#---------------------------------------------------------#
test_text = russ_sentence
input_dict = {"text":test_text}
results = senta.sentiment_classify(data = input_dict)
results[0]
type_of_senti_rus = []
for result in results:
    x = result['sentiment_key']   # 'positive' or 'negative'
    type_of_senti_rus.append(x)
    # print(result)
type_of_senti_rus
pd.Series(type_of_senti_rus).value_counts()
#---------------------------------------------------------#
#---- * Sentiment labels for sentences mentioning “airkU” * ----#
#---------------------------------------------------------#
test_text = ukr_sentence
input_dict = {"text":test_text}
results = senta.sentiment_classify(data = input_dict)
results[0]
type_of_senti_uk = []
for result in results:
    x = result['sentiment_key']
    type_of_senti_uk.append(x)
    # print(result)
type_of_senti_uk
uk = pd.Series(type_of_senti_uk).value_counts()
rus = pd.Series(type_of_senti_rus).value_counts()
df_sentiment = pd.DataFrame({"russia": rus, "ukria":uk})
df_sentiment = df_sentiment.T
df_sentiment
df_sentiment['percentage'] = df_sentiment['positive'] / (df_sentiment['negative'] + df_sentiment['positive'])
df_sentiment['percentage']
df_sentiment
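# Optional: visualise the positive-sentiment share computed above as a bar chart.
# A minimal sketch; the title and axis-label wording are assumptions, not from the
# original analysis.
df_sentiment['percentage'].plot(kind='bar')
plt.ylabel("share of positive sentences")
plt.title("Positive-sentiment share by country keyword")
plt.tight_layout()
plt.show()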
#---------------------------------------------------------#
#---- * Judging by the positive/negative ratio, support for aissuR is slightly higher * ----#
#---------------------------------------------------------#
#---------------------------------------------------------#
#---- * Step 3: build the word cloud * ----#
#---------------------------------------------------------#
#---------------------------------------------------------#
#---- * Step 3.1: jieba word segmentation * ----#
#---------------------------------------------------------#
content
# Load the user dictionary, the stop-word list, and an extra list of symbols to drop
jieba.load_userdict("ukria/data_dir_of_ukVsRs/user_dict_ukria.txt")
stopwords = open("JKM/data_dir_jkm/stopwords_baidu.txt", encoding="utf8").readlines()
stopwords = [a.strip() for a in stopwords]
out_list = ['。', '的', '了', ',', '?', ':', '(', ', ', ')', '“', '、', ',', '”', '…', '!', '[', ']', ',', '*', '!', '(', '-', '→', ':', ')', '.', '......', '...', '......',
'《', ]
content
all_word = []
for ct in content:
    if isinstance(ct, str):
        seg_list = jieba.lcut(ct, cut_all=False)
        for seg in seg_list:
            # keep tokens of length >= 2 that are neither stop words nor excluded symbols
            if seg not in stopwords and seg not in out_list and len(seg) >= 2:
                all_word.append(seg)
#---------------------------------------------------------#
#---- * Count word frequencies * ----#
#---------------------------------------------------------#
c = Counter(all_word)
c.most_common(100)
c.most_common(300)
#---------------------------------------------------------#
#---- * Draw the word cloud from the 300 most frequent words * ----#
#---------------------------------------------------------#
from wordcloud import WordCloud
wc = WordCloud(
    scale=4,                                     # higher scale -> higher-resolution output image
    background_color='white',
    font_path="c:\\windows\\fonts\\SimHei.ttf",  # a CJK font is needed to render Chinese words
    prefer_horizontal=True
)
most_freq_word_dct = dict(c.most_common(300))
most_freq_word_dct
wc.generate_from_frequencies(most_freq_word_dct)
plt.imshow(wc)
plt.tight_layout()
plt.xticks([])
plt.yticks([])
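# When this file is executed as a plain script rather than line by line in a console,
# the figure has to be shown or written to disk explicitly. A minimal sketch; the
# output file name is an assumption:
# plt.show()
# wc.to_file(os.path.join(data_dir, "wordcloud_top300.png"))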
#---------------------------------------------------------#
#---- * Build the keyword co-occurrence matrix * ----#
#---------------------------------------------------------#
content[0]
"""
高频 300词
"""
most_freq_word_dct
top300_words = [ x[0] for x in most_freq_word_dct.items() ]
for k, v in most_freq_word_dct.items():
print(k)
top300_words
#---------------------------------------------------------#
#---- * Segment content with jieba to build the corpus * ----#
#---------------------------------------------------------#
corpus = []
for ct in content:
    if isinstance(ct, str):
        seg_list = jieba.lcut(ct, cut_all=False)
        # keep only tokens that are among the top-300 words
        topword_in_sentence = list(filter(lambda x: x in top300_words, seg_list))
        # a document enters the corpus only if it holds at least two top words
        if len(topword_in_sentence) >= 2:
            corpus.append(topword_in_sentence)
len(corpus)
corpus[0]
corpus[1]
corpus[3]
# Build the DataFrame
#---------------------------------------------------------#
#---- * Build and return the keyword co-occurrence matrix * ----#
#---------------------------------------------------------#
def makeK_coExist_matrix(corpus, topwords):
    """
    :param corpus: the segmented corpus, as a list of word lists
    :param topwords: the list of keywords
    :return: a DataFrame of keyword co-occurrence counts
    """
    shp_of_matrix = len(topwords)
    df_coEx_test = pd.DataFrame(np.zeros([shp_of_matrix, shp_of_matrix]), index=topwords, columns=topwords)
    for cp in corpus:
        # cp = eval(cp)  # only needed if the corpus were reloaded from text; not needed here
        for i in range(len(cp)):
            wordi = cp[i]
            if wordi in topwords:
                for j in range(i, len(cp)):
                    wordj = cp[j]
                    if wordi == wordj:
                        pass
                    elif wordj in topwords:
                        # count the pair symmetrically, so that make_nodeFile() below,
                        # which reads only the lower triangle, sees every co-occurrence
                        df_coEx_test.loc[wordi, wordj] += 1
                        df_coEx_test.loc[wordj, wordi] += 1
    return df_coEx_test
df_coEx = makeK_coExist_matrix(corpus, top300_words)
df_coEx
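# For comparison, the same co-occurrence counts can be accumulated with a Counter over
# unordered word pairs, avoiding the nested index loops above. This is an equivalent
# sketch of the counting logic (pairs treated as unordered), not the function the rest
# of the script uses.
from itertools import combinations
pair_counter = Counter()
for cp in corpus:
    # corpus entries already contain only top-300 words (filtered when corpus was built)
    for w1, w2 in combinations(cp, 2):
        if w1 != w2:
            pair_counter[tuple(sorted((w1, w2)))] += 1
pair_counter.most_common(10)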
#---------------------------------------------------------#
#---- * Convert the weight matrix into a Source/Target/Weight edge-list DataFrame for easy export * ----#
#---------------------------------------------------------#
def make_nodeFile(df_result_coex):
    """Flatten the lower triangle of the co-occurrence matrix into a
    Source / Target / Weight edge-list DataFrame."""
    Source = []
    Target = []
    Weight = []
    for i in range(df_result_coex.shape[0]):
        for j in range(0, i):
            source = df_result_coex.index[i]
            target = df_result_coex.columns[j]
            weight = df_result_coex.iloc[i, j]
            Source.append(source)
            Target.append(target)
            Weight.append(weight)
    df_node = pd.DataFrame({"Source": Source, "Target": Target, "Weight": Weight})
    return df_node
df_node = make_nodeFile(df_coEx)
#---------------------------------------------------------#
#---- * After filtering, write out the edge file * ----#
#---------------------------------------------------------#
df_node.shape
df_node.sort_values('Weight', ascending=False, inplace=True )
df_node.head()
df_node.tail()
# drop edges that involve page-boilerplate words ("click", "expand", "full", "image")
for w in ["点击", "展开", "完整", "图片"]:
    df_node = df_node[df_node['Source'] != w]
    df_node = df_node[df_node['Target'] != w]
df_node.head()
#---------------------------------------------------------#
#---- * Keep the first 1000 rows (edges) * ----#
#---------------------------------------------------------#
df_node2 = df_node.iloc[0:1000, ]
df_node2.shape
df_node2
df_node2.head()
df_node3 = df_node2[df_node2['Weight']>=5]
df_node3.shape
df_node3.to_excel(os.path.join(data_dir, "df_node_and_edge_first3.xlsx"), index = False)
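# The section banner above mentions a CSV node/edge file; if a CSV is preferred
# (e.g. for importing into a network-visualisation tool), a minimal sketch with an
# assumed file name:
# df_node3.to_csv(os.path.join(data_dir, "df_node_and_edge_first3.csv"), index=False, encoding="utf-8-sig")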
# A purely technical post. Because of 红色 步步哦, it was put together this way; only the code is posted.
# The result is as follows: