Data Source
Kaggle: https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge
Data preview:
{
  "paper_id": "0ab9404566ff5d8d81aebd0e9d9128b9589d9015",
  "metadata": {
    "title": "A CASE STUDY OF BRAZIL",
    "authors": [
      {
        "first": "Pedro",
        "middle": [
          "V"
        ],
        "last": "Savi",
        "suffix": "",
        "affiliation": {
          "laboratory": "",
          "institution": "Universidade Federal do Rio de Janeiro",
          "location": {
            "addrLine": "21.941.972 -Rio de Janeiro -RJ",
            "postBox": "P.O. Box 68.503",
            "country": "Brazil"
          }
        },
        "email": "pedrov.savi@gmail.com"
      },
    ]
  },
............ (truncated; these files go on much, much longer)
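Because the files are so long, it helps to peek at just the top-level structure of one paper first. A minimal sketch (the filename is hypothetical, built from the paper_id above; adjust it to whatever sits in your data/ directory):
import json
# Inspect the top-level keys of a single paper (hypothetical filename)
with open('data/0ab9404566ff5d8d81aebd0e9d9128b9589d9015.json', encoding='utf8') as fp:
    paper = json.load(fp)
print(paper.keys())  # expect keys such as 'paper_id', 'metadata', 'abstract', 'body_text'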
Data Preprocessing
1. JSON data extraction
# 1. Read the files
import os
import json

def readfile(path):
    files = os.listdir(path)
    file_list = []
    for file in files:  # iterate over the directory
        # join with path before testing, or isdir only ever sees a bare filename
        if not os.path.isdir(os.path.join(path, file)):
            file_list.append(path + '/' + file)
    return file_list

file_list = readfile('data')
all_content = []
# todo for all files
for file_path in file_list:
    with open(file_path, 'r', encoding='utf8') as fp:
        paper_data = json.load(fp)
        all_abstract = paper_data['abstract']
        # body_texts = paper_data['body_text']
        content = ''
        for text in all_abstract:
            content += text['text']
        all_content.append(content)
2. Lowercasing
for i, content in enumerate(all_content):
    all_content[i] = content.lower()  # convert the text to lowercase
3. Lemmatization
First install nltk, then download the WordNet data; running the code below is enough:
import nltk
nltk.download('wordnet')
If the download is blocked where you are, the data is also available at http://www.nltk.org/nltk_data/
The direct download link for WordNet is https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
Offline installation instructions are at http://www.nltk.org/data.html
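For the offline route, a minimal sketch (the directory below is an assumption; point it at wherever you unpacked wordnet.zip, keeping the nltk_data/corpora layout):
import nltk
# Tell NLTK about a manually created data directory (hypothetical path)
nltk.data.path.append('/path/to/nltk_data')
from nltk.corpus import wordnet
print(wordnet.synsets('virus'))  # succeeds once the data is found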
The lemmatization code is as follows:
from nltk.stem import WordNetLemmatizer

# 3. Lemmatization
lemmatizer = WordNetLemmatizer()  # create once, outside the loop
for i, content in enumerate(all_content):
    # Note: lemmatize() expects a single word, so on a whole string it
    # returns most of the text unchanged; see the per-token variant below.
    all_content[i] = lemmatizer.lemmatize(content)
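Since lemmatize() works per word, a more effective variant (a sketch that reorders steps 3 and 4) lemmatizes each token after tokenization:
from nltk.stem import WordNetLemmatizer
import nltk.tokenize as tk

lemmatizer = WordNetLemmatizer()
tokenizer = tk.WordPunctTokenizer()
sample = "viruses were detected on surfaces"
tokens = tokenizer.tokenize(sample)
# With the default noun POS, e.g. 'viruses' -> 'virus', 'surfaces' -> 'surface'
print([lemmatizer.lemmatize(tok) for tok in tokens])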
4. Tokenization
This also needs the NLTK package; if it was already installed for the lemmatization step, skip ahead.
import nltk.tokenize as tk

tokenizer = tk.WordPunctTokenizer()
for i, content in enumerate(all_content):
    tokens = tokenizer.tokenize(content)
    all_content[i] = tokens
5. Removing stop words
def drop_stopwords(contents, stopwords):
    contents_clean = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            elif not word.isalpha():  # keep only purely alphabetic tokens
                continue
            line_clean.append(word)
        contents_clean.append(line_clean)
    return contents_clean

with open("eng_stop_words.txt", "r") as f:  # one stop word per line
    stopwords = [i.strip() for i in f.readlines()]
content_words_list = drop_stopwords(all_content, stopwords)
We can also save an intermediate copy at this point:
import pandas as pd
data = pd.DataFrame(content_words_list)
data.to_csv('data.csv', header=None, index=None)
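Reading the intermediate file back takes one extra step, because DataFrame pads ragged rows with NaN. A sketch:
import pandas as pd
# Each row is one document; drop the NaN padding added for shorter documents
df = pd.read_csv('data.csv', header=None)
content_words_list = [row.dropna().tolist() for _, row in df.iterrows()]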
6. Word frequency statistics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus = []
for content in content_words_list:
    str_content = " ".join(content)
    corpus.append(str_content)
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
counts = vectorizer.fit_transform(corpus)  # term counts for the whole corpus
tfidf = transformer.fit_transform(counts)  # TF-IDF weights, reused in the analysis section
# Word frequencies for corpus[0]. Note that vectorizer.vocabulary_ maps
# word -> column index, not word -> count, so read the counts from the matrix.
word_count = dict(zip(vectorizer.get_feature_names(), counts.toarray()[0]))
Now we can plot the word frequencies:
import matplotlib.pyplot as plt
import seaborn as sns

word_sort = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
word_sort = word_sort[:16]  # the 16 most frequent words
x = []
y = []
for word in word_sort:
    x.append(word[0])
    y.append(word[1])
plt.style.use({'figure.figsize': (17, 8)})
sns.barplot(x=x, y=y, palette="BuPu_r")  # visualization: bar chart
plt.ylabel('Word count')
sns.despine(bottom=True)
plt.show()
The resulting chart looks like this:
Alternatively, pick some words at random and plot their frequencies:
import matplotlib.pyplot as plt
import seaborn as sns
import random

word_sort = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
word_sort = random.sample(word_sort, 16)  # 16 words chosen at random
x = []
y = []
for word in word_sort:
    x.append(word[0])
    y.append(word[1])
plt.style.use({'figure.figsize': (17, 8)})
sns.barplot(x=x, y=y, palette="BuPu_r")  # visualization: bar chart
plt.ylabel('Word count')
sns.despine(bottom=True)
plt.show()
Or build the familiar word cloud:
import numpy as np
import wordcloud  # word cloud library
from PIL import Image  # image handling

plt.style.use({'figure.figsize': (10, 16)})
mask = np.array(Image.open('amvw4-olmxk.jpg'))  # background shape for the cloud
wc = wordcloud.WordCloud(
    mask=mask,  # set the background mask
    max_words=50,  # maximum number of words displayed
    max_font_size=100  # maximum font size
)
word_cloud = dict(word_sort)
wc.generate_from_frequencies(word_cloud)  # build the cloud from a word -> frequency dict
image_colors = wordcloud.ImageColorGenerator(mask)  # colour scheme taken from the background image
wc.recolor(color_func=image_colors)  # recolour the cloud to match that scheme
plt.imshow(wc)  # display the word cloud
plt.axis('off')  # hide the axes
plt.show()  # show the figure
The original background image looks like this:
Data Analysis
1. TF-IDF: text similarity
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import heapq

word = vectorizer.get_feature_names()  # every word in the bag-of-words model
weight = tfidf.toarray()  # TF-IDF matrix: weight[i][j] is the weight of word j in document i
# Use the TF-IDF weights to get the topic words of the k-th article
k = 3
paper_weight = list(weight[k])
max_num_index = list(map(paper_weight.index, heapq.nlargest(10, paper_weight)))
# These are the 10 words with the highest TF-IDF values
tfidf_topic_word = []
for index in max_num_index:
    tfidf_topic_word.append(word[index])
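One caveat: list.index always returns the first match, so tied TF-IDF values can yield duplicate indices. A ties-safe sketch of the same extraction using numpy:
import numpy as np
# Indices of the 10 largest weights, highest first; no duplicates even with ties
top10 = np.argsort(weight[k])[-10:][::-1]
tfidf_topic_word = [word[i] for i in top10]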
2. LDA modeling: topic extraction
First save the corpus to disk; this also makes it easy to move training onto a server, since training time depends heavily on the amount of data.
A Python implementation of LDA: https://blog.csdn.net/wind_blast/article/details/53815757
import pandas as pd
data = pd.DataFrame(corpus)
data.to_csv('data.txt', header=None, index=None)
Here is the core code for LDA modeling:
from gensim import corpora
import gensim  # pip install gensim

def get_topic(all_contents, num_topic=10):
    # num_topic: how many topics the LDA model should learn
    try:
        def lda_analyze(all_contents, num_topic=10):
            """The core routine that trains the LDA model."""
            dictionary = corpora.Dictionary(all_contents)
            corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
            lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)  # the key call
            return lda

        # all_contents is a list of token lists
        lda = lda_analyze(all_contents, num_topic=num_topic)
        print("------------------------- " + str(num_topic) + " ---------------")
        for topic in lda.print_topics(num_words=10):  # print the topics LDA found
            print(topic[1])
        # save model
        # lda.save('lda_' + str(num_topic) + '.model')
    except Exception as e:
        print(e)

# Load the saved corpus back as a list of token lists
data = list(iter(open('data.txt')))
data = [content.split() for content in data]
for i in range(16):
    get_topic(data, i + 1)  # train with 1 up to 16 topics and record the results
The results look like this (possibly the training data is too small):
Since LDA is an unsupervised method, the number on each divider line is how many topics the texts were split into.
------------------------- 1 ---------------
0.010*"time" + 0.009*"þ" + 0.009*"covid" + 0.008*"infected" + 0.007*"disease" + 0.007*"data" + 0.007*"model" + 0.007*"à" + 0.006*"population" + 0.006*"system"
------------------------- 2 ---------------
0.010*"infected" + 0.010*"time" + 0.008*"data" + 0.008*"covid" + 0.007*"model" + 0.007*"population" + 0.006*"þ" + 0.006*"rate" + 0.006*"disease" + 0.005*"system"
0.011*"time" + 0.011*"þ" + 0.009*"covid" + 0.008*"disease" + 0.008*"à" + 0.007*"infected" + 0.006*"system" + 0.006*"model" + 0.006*"infection" + 0.006*"data"
------------------------- 3 ---------------
0.012*"covid" + 0.009*"infected" + 0.009*"þ" + 0.009*"time" + 0.007*"disease" + 0.007*"à" + 0.006*"infection" + 0.006*"model" + 0.006*"data" + 0.005*"rate"
0.014*"þ" + 0.014*"time" + 0.010*"à" + 0.009*"disease" + 0.009*"system" + 0.008*"infected" + 0.008*"data" + 0.007*"model" + 0.007*"rate" + 0.006*"population"
0.011*"covid" + 0.008*"time" + 0.008*"data" + 0.007*"infected" + 0.007*"model" + 0.006*"population" + 0.006*"virus" + 0.005*"individuals" + 0.005*"infection" + 0.005*"disease"
------------------------- 4 ---------------
0.014*"time" + 0.012*"infected" + 0.007*"disease" + 0.007*"model" + 0.006*"covid" + 0.006*"data" + 0.006*"þ" + 0.006*"rate" + 0.006*"population" + 0.005*"virus"
0.016*"þ" + 0.013*"à" + 0.011*"covid" + 0.009*"disease" + 0.008*"system" + 0.007*"infected" + 0.007*"time" + 0.006*"population" + 0.006*"model" + 0.006*"infection"
0.011*"þ" + 0.009*"time" + 0.009*"covid" + 0.008*"disease" + 0.007*"data" + 0.007*"à" + 0.007*"population" + 0.006*"infected" + 0.006*"infection" + 0.006*"system"
0.010*"time" + 0.010*"covid" + 0.009*"data" + 0.008*"model" + 0.007*"infected" + 0.005*"population" + 0.005*"individuals" + 0.005*"rate" + 0.005*"infection" + 0.004*"disease"
3. Word2Vec model: topic word similarity
from gensim.models import Word2Vec

data = list(iter(open('data.txt')))
data = [content.split() for content in data]
# data is a list of token lists
# (gensim 3.x API: in gensim 4+ the `size` parameter is named `vector_size`)
word2vec_model = Word2Vec(data, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
wv = word2vec_model.wv  # the trained word vectors

# Saving and loading word2vec word vectors
wv.save('word_vector.model.wv')  # save the word vectors
from gensim.models import KeyedVectors
wv = KeyedVectors.load('word_vector.model.wv', mmap='r')  # load the saved word vectors

# Get the vector of a word
print(wv['water'])
# Get the whole vocabulary (gensim 3.x; in gensim 4+ use wv.key_to_index)
wv.vocab.keys()
# Cosine similarity between two words
wv.similarity('culture', 'cardboard')
# Find words whose vectors are closest to v("surfaces") + v("survival") - v("steel")
wv.most_similar(positive=["surfaces", "survival"], negative=["steel"], topn=10)  # topn: how many nearest words to return (default 10)
# The words closest to "surfaces", with their similarities
wv.similar_by_word("surfaces", topn=10)
# Find the word that does not belong with the others
word2vec_model.doesnt_match("culture cardboard substrates surfaces".split())
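To connect this back to the section title, a sketch that expands the TF-IDF topic words from section 1 with their nearest neighbours in the embedding space (assumes tfidf_topic_word is in scope; uses the gensim 3.x vocab check):
for topic_word in tfidf_topic_word:
    if topic_word in wv.vocab:  # skip words dropped by min_count
        print(topic_word, wv.most_similar(topic_word, topn=5))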
4. LSI modeling: text similarity
TF-IDF can also be used to compute text similarity, but LSI generally works better.
from gensim import corpora, models, similarities

data = list(iter(open('data.txt')))
data = [content.split() for content in data]
# Build the bag-of-words model
dictionary = corpora.Dictionary(data)
doc_vectors = [dictionary.doc2bow(text) for text in data]
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
# Fit the LSI model, assuming 5 topics here
lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=5)
lsi.print_topics(2)
lsi_vector = lsi[tfidf_vectors]
for vec in lsi_vector:
    print(vec)
# Use the first document as a test query
query = data[0]
query_bow = dictionary.doc2bow(query)
print(query_bow)
query_lsi = lsi[query_bow]
index = similarities.MatrixSimilarity(lsi_vector)
sims = index[query_lsi]
print(list(enumerate(sims)))
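Finally, a short sketch that ranks those similarity scores to retrieve the documents closest to the query (the query itself should come out on top):
# Sort (document index, similarity) pairs by descending similarity
best = sorted(enumerate(sims), key=lambda item: -item[1])[:5]
print(best)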