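# Co-occurrence matrix
# Counts how often words appear near a given center word, which provides a
# probability distribution for predicting the surrounding words from that center word.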
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import offsetbox
from sklearn.decomposition import TruncatedSVD
def read_corpus(category='crude'):
    """
    Load the documents of one category as lists of lower-cased tokens.

    Every file id returned by ``reuters.fileids(category)`` becomes one token
    list: ``START_TOKEN``, then each word from ``reuters.words(file_id)``
    lower-cased, then ``END_TOKEN``.

    NOTE(review): ``reuters``, ``START_TOKEN`` and ``END_TOKEN`` are defined
    outside this block; ``reuters`` is presumably the NLTK Reuters corpus
    reader -- confirm.

    :param category: category name passed to ``reuters.fileids``
    :return: list of token lists, one per file
    """
    documents = []
    for file_id in reuters.fileids(category):
        # Wrap each document in the boundary markers.
        tokens = [START_TOKEN]
        tokens.extend(word.lower() for word in reuters.words(file_id))
        tokens.append(END_TOKEN)
        documents.append(tokens)
    return documents
def distinct_words(corpus):
    """
    Collect the vocabulary of a corpus.

    :param corpus: iterable of sentences, each an iterable of words
    :return: tuple ``(vocabulary, vocabulary_size, all_words)`` where
        ``vocabulary`` is the sorted list of distinct words,
        ``vocabulary_size`` is its length, and ``all_words`` holds every word
        of every sentence in original order, duplicates included
    """
    # Flatten the sentences into one list, keeping order and duplicates.
    flattened = [word for sentence in corpus for word in sentence]
    # A set removes duplicates; sorting gives the result a stable order.
    vocabulary = sorted(set(flattened))
    return vocabulary, len(vocabulary), flattened
def compute_co_occurrence_matrix(corpus, window_size=4):
"""
共现矩阵
:param corpus: 句子
:param window_size: 中心词,例如:i like nlp. lik