English Text Analysis: Text Analysis of COVID-19-Related Papers

Data Source

Kaggle: https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge

Data preview:

{
    "paper_id": "0ab9404566ff5d8d81aebd0e9d9128b9589d9015",
    "metadata": {
        "title": "A CASE STUDY OF BRAZIL",
        "authors": [
            {
                "first": "Pedro",
                "middle": [
                    "V"
                ],
                "last": "Savi",
                "suffix": "",
                "affiliation": {
                    "laboratory": "",
                    "institution": "Universidade Federal do Rio de Janeiro",
                    "location": {
                        "addrLine": "21.941.972 -Rio de Janeiro -RJ",
                        "postBox": "P.O. Box 68.503",
                        "country": "Brazil"
                    }
                },
                "email": "pedrov.savi@gmail.com"
            }
        ]
    },
    ... (truncated; the full record is very long)

Data Preprocessing

1. Extracting the JSON data


# 1. Read the files
import os
import json

def readfile(path):
    files = os.listdir(path)
    file_list = []
    for file in files:  # iterate over the entries in the folder
        if not os.path.isdir(os.path.join(path, file)):  # keep only files, checked with the full path
            file_list.append(path + '/' + file)
    return file_list


file_list = readfile('data')
all_content = []
# collect the abstract text of every paper
for file_path in file_list:
    with open(file_path, 'r', encoding='utf8') as fp:
        paper_data = json.load(fp)
    all_abstract = paper_data['abstract']
    # body_texts = paper_data['body_text']
    content = ''
    for text in all_abstract:
        content += text['text']
    all_content.append(content)

2. Converting the text to lowercase

for i, content in enumerate(all_content):
    all_content[i] = content.lower()  # convert the text to lowercase

3. Lemmatization

First install nltk and download the WordNet data; running the following code is enough:

import nltk
nltk.download('wordnet')

If the download is blocked, you can get the data from http://www.nltk.org/nltk_data/ instead.

The direct download link for WordNet is https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip

The offline installation procedure is described at http://www.nltk.org/data.html; a sketch is shown below.
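
A minimal sketch of the offline route (the /path/to/nltk_data directory is just an example; the assumption is that wordnet.zip was downloaded manually and unzipped into its corpora subfolder):

import nltk

# Assumption: wordnet.zip was unzipped into /path/to/nltk_data/corpora/wordnet
# (unzipping avoids version-specific issues with reading the corpus from the zip).
nltk.data.path.append('/path/to/nltk_data')  # register the custom data directory

from nltk.corpus import wordnet
print(wordnet.synsets('virus'))  # prints a list of synsets once the corpus is found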

The lemmatization code is as follows:


from nltk.stem import WordNetLemmatizer

# 3. Lemmatization
lemmatizer = WordNetLemmatizer()
for i, content in enumerate(all_content):
    # WordNetLemmatizer works on single words, so lemmatize word by word and rejoin
    all_content[i] = ' '.join(lemmatizer.lemmatize(word) for word in content.split())

4. Tokenization

First download the NLTK package; if it was already installed for the lemmatization step, this can be skipped.

import nltk.tokenize as tk

tokenizer = tk.WordPunctTokenizer()
for i, content in enumerate(all_content):
    tokens = tokenizer.tokenize(content)
    all_content[i] = tokens

5. Removing stop words

def drop_stopwords(contents, stopwords):
    contents_clean = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            elif not word.isalpha():  # keep only purely alphabetic words
                continue
            line_clean.append(word)
        contents_clean.append(line_clean)
    return contents_clean

with open("eng_stop_words.txt", "r") as f:  # read the stop word list
    stopwords = [i.strip() for i in f.readlines()]
content_words_list = drop_stopwords(all_content, stopwords)

We can also save an intermediate checkpoint here:

import pandas as pd

data = pd.DataFrame(content_words_list)
data.to_csv('data.csv', header=None, index=None)
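
If you restart from this checkpoint later, a small sketch (assuming the data.csv produced above) for reading it back into a list of token lists:

import pandas as pd

# read the checkpoint back; shorter rows come back padded with NaN, so keep only the string cells
rows = pd.read_csv('data.csv', header=None)
content_words_list = [[w for w in row if isinstance(w, str)] for row in rows.values.tolist()]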

6. Word frequency statistics

from sklearn.feature_extraction.text import CountVectorizer

corpus = []
for content in content_words_list:
    str_content = " ".join(content)
    corpus.append(str_content)

vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform([corpus[0]])  # count the word frequencies of corpus[0] here
# vocabulary_ only maps words to column indices, so build the word -> count dict explicitly
word_count = dict(zip(vectorizer.get_feature_names(), word_counts.toarray()[0]))

We can now plot the word frequencies:

import matplotlib.pyplot as plt
import seaborn as sns

word_sort = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
word_sort = word_sort[:16]
x = []
y = []
for word in word_sort:
    x.append(word[0])
    y.append(word[1])

plt.style.use({'figure.figsize': (17, 8)})
sns.barplot(x=x, y=y, palette="BuPu_r")  # visualize as a bar chart
plt.ylabel('Word count')
sns.despine(bottom=True)
plt.show()

The result is a bar chart of the 16 most frequent words.

Alternatively, we can pick some words at random and plot their frequencies:

import matplotlib.pyplot as plt
import seaborn as sns
import random

word_sort = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
word_sort = random.sample(word_sort, 16)
x = []
y = []
for word in word_sort:
    x.append(word[0])
    y.append(word[1])

plt.style.use({'figure.figsize': (17, 8)})
sns.barplot(x=x, y=y, palette="BuPu_r")  # visualize as a bar chart
plt.ylabel('Word count')
sns.despine(bottom=True)
plt.show()

Or we can build the familiar word cloud:

import numpy as np
import wordcloud  # word cloud library
from PIL import Image  # image processing library

plt.style.use({'figure.figsize': (10, 16)})
mask = np.array(Image.open('amvw4-olmxk.jpg'))  # background image that defines the word cloud shape
wc = wordcloud.WordCloud(
    mask=mask,  # set the background mask
    max_words=50,  # maximum number of words to display
    max_font_size=100  # maximum font size
)
word_cloud = dict(word_sort)
wc.generate_from_frequencies(word_cloud)  # generate the word cloud from the frequency dict
image_colors = wordcloud.ImageColorGenerator(mask)  # build a color scheme from the background image
wc.recolor(color_func=image_colors)  # recolor the word cloud with that scheme
plt.imshow(wc)  # display the word cloud
plt.axis('off')  # hide the axes
plt.show()  # show the image

(figure: the generated word cloud)
The original mask image looks like this:
(figure: the mask image)

Data Analysis

1. TF-IDF — text similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import heapq

# fit TF-IDF on the full corpus this time
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

word = vectorizer.get_feature_names()  # all words in the bag-of-words model
weight = tfidf.toarray()  # the TF-IDF matrix: weight[i][j] is the TF-IDF weight of word j in document i
# get the topic words of the k-th paper from its TF-IDF weights
k = 3
paper_weight = list(weight[k])
max_num_index = list(map(paper_weight.index, heapq.nlargest(10, paper_weight)))
# these are the 10 representative words with the largest TF-IDF values
tfidf_topic_word = []
for index in max_num_index:
    tfidf_topic_word.append(word[index])
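
Since the section title mentions text similarity, here is a minimal sketch that ranks the other papers by cosine similarity to paper k, reusing the tfidf matrix and index k from the block above:

from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity between paper k and every paper, based on the TF-IDF vectors
similarities = cosine_similarity(tfidf[k], tfidf).ravel()
most_similar = similarities.argsort()[::-1][1:6]  # top-5 most similar papers (skip the paper itself)
print(most_similar, similarities[most_similar])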

2. LDA modeling — topic extraction

First save the corpus to disk, which also makes it easy to move to a server for training; the training time depends heavily on the amount of data.

A Python implementation of LDA: https://blog.csdn.net/wind_blast/article/details/53815757

import pandas as pd

data = pd.DataFrame(corpus)
data.to_csv('data.txt', header=None, index=None)

This is the core code for LDA modeling:

from gensim import corpora
import gensim  # pip install gensim

def get_topic(all_contents, num_topic=10):
    # num_topic is the number of topics the LDA model is trained with
    try:
        def lda_analyze(all_contents, num_topic=10):
            """The core routine that trains the LDA model."""
            dictionary = corpora.Dictionary(all_contents)
            corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
            lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)  # the key call
            return lda

        # all_contents is a list of token lists
        lda = lda_analyze(all_contents, num_topic=num_topic)
        print("------------------------- " + str(num_topic) + " ---------------")
        for topic in lda.print_topics(num_words=10):  # print the LDA topics
            print(topic[1])
        # save model
        # lda.save('lda_' + str(num_topic) + '.model')
    except Exception as e:
        print(e)

# load the saved corpus and split it back into token lists
data = list(iter(open('data.txt')))
data = [content.split() for content in data]
for i in range(16):
    get_topic(data, i + 1)  # try every topic count from 1 to 16 and save the results

The results look like this (possibly because there is too little training data):

Since LDA is an unsupervised method, the number on each divider line is the number of topics the texts were split into. A sketch for comparing the different runs follows after the sample output.

------------------------- 1 ---------------
0.010*"time" + 0.009*"þ" + 0.009*"covid" + 0.008*"infected" + 0.007*"disease" + 0.007*"data" + 0.007*"model" + 0.007*"à" + 0.006*"population" + 0.006*"system"
------------------------- 2 ---------------
0.010*"infected" + 0.010*"time" + 0.008*"data" + 0.008*"covid" + 0.007*"model" + 0.007*"population" + 0.006*"þ" + 0.006*"rate" + 0.006*"disease" + 0.005*"system"
0.011*"time" + 0.011*"þ" + 0.009*"covid" + 0.008*"disease" + 0.008*"à" + 0.007*"infected" + 0.006*"system" + 0.006*"model" + 0.006*"infection" + 0.006*"data"
------------------------- 3 ---------------
0.012*"covid" + 0.009*"infected" + 0.009*"þ" + 0.009*"time" + 0.007*"disease" + 0.007*"à" + 0.006*"infection" + 0.006*"model" + 0.006*"data" + 0.005*"rate"
0.014*"þ" + 0.014*"time" + 0.010*"à" + 0.009*"disease" + 0.009*"system" + 0.008*"infected" + 0.008*"data" + 0.007*"model" + 0.007*"rate" + 0.006*"population"
0.011*"covid" + 0.008*"time" + 0.008*"data" + 0.007*"infected" + 0.007*"model" + 0.006*"population" + 0.006*"virus" + 0.005*"individuals" + 0.005*"infection" + 0.005*"disease"
------------------------- 4 ---------------
0.014*"time" + 0.012*"infected" + 0.007*"disease" + 0.007*"model" + 0.006*"covid" + 0.006*"data" + 0.006*"þ" + 0.006*"rate" + 0.006*"population" + 0.005*"virus"
0.016*"þ" + 0.013*"à" + 0.011*"covid" + 0.009*"disease" + 0.008*"system" + 0.007*"infected" + 0.007*"time" + 0.006*"population" + 0.006*"model" + 0.006*"infection"
0.011*"þ" + 0.009*"time" + 0.009*"covid" + 0.008*"disease" + 0.007*"data" + 0.007*"à" + 0.007*"population" + 0.006*"infected" + 0.006*"infection" + 0.006*"system"
0.010*"time" + 0.010*"covid" + 0.009*"data" + 0.008*"model" + 0.007*"infected" + 0.005*"population" + 0.005*"individuals" + 0.005*"rate" + 0.005*"infection" + 0.004*"disease"

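To compare these runs more systematically, a minimal sketch using gensim's CoherenceModel (the c_v measure is just one common choice; higher scores usually indicate more interpretable topics):

from gensim import corpora
from gensim.models import CoherenceModel
import gensim

data = [content.split() for content in open('data.txt')]
dictionary = corpora.Dictionary(data)
bow_corpus = [dictionary.doc2bow(doc) for doc in data]

# train one model per candidate topic count and compare their c_v coherence
for num_topic in range(1, 17):
    lda = gensim.models.ldamodel.LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=num_topic)
    coherence = CoherenceModel(model=lda, texts=data, dictionary=dictionary, coherence='c_v').get_coherence()
    print(num_topic, coherence)
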
3. Word2vec model — topic word similarity

from gensim.models import Word2Vec

data = list(iter(open('data.txt')))
data = [content.split() for content in data]

# data is a list of token lists
word2vec_model = Word2Vec(data, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)  # in gensim 4.x the size parameter is called vector_size
wv = word2vec_model.wv  # the trained word vectors

# saving and loading the word2vec word vectors
wv.save('word_vector.model.wv')  # save the word vectors
from gensim.models import KeyedVectors

wv = KeyedVectors.load('word_vector.model.wv', mmap='r')  # load the saved word vectors

# get the vector of a single word
print(wv['water'])

# get the full vocabulary (wv.key_to_index in gensim 4.x)
wv.vocab.keys()

# cosine similarity between two words
wv.similarity('culture', 'cardboard')

# analogy-style query: find the words whose vectors are closest to v("surfaces") + v("survival") - v("steel")
wv.most_similar(positive=["surfaces", "survival"], negative=["steel"], topn=10)  # topn is the number of nearest words to return (default 10)

# the words most similar to "surfaces", with their similarities
wv.similar_by_word("surfaces", topn=10)

# find the word that does not fit with the others
word2vec_model.doesnt_match("culture cardboard substrates surfaces".split())
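
Since this section is about topic-word similarity, a small sketch that compares each pair of topic words from the TF-IDF step (assuming the tfidf_topic_word list computed earlier and the wv vectors trained above):

from itertools import combinations

# pairwise cosine similarity of the extracted topic words
# (words missing from the Word2vec vocabulary are skipped)
for w1, w2 in combinations(tfidf_topic_word, 2):
    if w1 in wv and w2 in wv:
        print(w1, w2, wv.similarity(w1, w2))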

4. LSI modeling — text similarity

Although TF-IDF can also be used for text similarity, LSI usually works better.

from gensim import corpora, models, similarities

data = list(iter(open('data.txt')))
data = [content.split() for content in data]

# build the bag-of-words model
dictionary = corpora.Dictionary(data)
doc_vectors = [dictionary.doc2bow(text) for text in data]

tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]

# build the LSI model, assuming 5 topics here
lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=5)
lsi.print_topics(2)
lsi_vector = lsi[tfidf_vectors]
for vec in lsi_vector:
    print(vec)

# use the first paper as a test query
query = data[0]
query_bow = dictionary.doc2bow(query)
print(query_bow)

query_lsi = lsi[query_bow]

index = similarities.MatrixSimilarity(lsi_vector)
sims = index[query_lsi]
print(list(enumerate(sims)))
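
To turn the raw scores into a ranking, a short follow-up sketch that sorts the similarities and prints the papers most similar to the query:

# rank all papers by their similarity to the query (the query itself comes first with a score near 1.0)
ranked = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
for doc_index, score in ranked[:5]:
    print(doc_index, score)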