def compute_co_occurrence_matrix(corpus, window_size=4):
    """Compute the co-occurrence matrix for the given corpus and window_size (default of 4).

    Each word in a document is treated as the center of a window that reaches
    up to ``window_size`` positions to the left and to the right. Words near
    the edges of a document simply have fewer co-occurring words.

    For example, in the document "START All that glitters is not gold END"
    with a window size of 4, "All" co-occurs with "START", "that",
    "glitters", "is", and "not".

    Params:
        corpus (list of list of strings): corpus of documents
        window_size (int): number of positions considered on each side of
            the center word
    Return:
        M (numpy array of shape (number of corpus words, number of corpus words)):
            Co-occurrence matrix of word counts (float dtype, the
            ``np.zeros`` default). ``M[a][b]`` is the number of times word
            ``b`` appears inside a window centered on word ``a``.
            Rows/columns follow the order of the word list returned by
            ``distinct_words``.
        word2Ind (dict): maps each word to its row/column index in M.
    """
    words, _ = distinct_words(corpus)
    word2Ind = {word: idx for idx, word in enumerate(words)}
    M = np.zeros([len(words), len(words)])

    for document in corpus:
        doc_len = len(document)
        for center_pos, center_word in enumerate(document):
            row = word2Ind[center_word]
            # Clamp the window to the document so that short documents and
            # edge positions never index out of range.
            first = max(0, center_pos - window_size)
            last = min(doc_len, center_pos + window_size + 1)
            for neighbour_pos in range(first, last):
                if neighbour_pos == center_pos:
                    continue
                # Count (not just flag) the co-occurrence. Only the center's
                # row is updated here: the neighbour will be a center itself
                # later, which fills in the symmetric cell exactly once.
                M[row, word2Ind[document[neighbour_pos]]] += 1
    return M, word2Ind
希望大佬给出一个简单办法,觉得for循环太蠢!
被自己蠢到,脑子里只有索引,看了大佬代码,才意识到可以直接遍历元素…
找到了,转自大佬
def compute_co_occurrence_matrix(corpus, window_size=4):
    """Build a word-by-word co-occurrence count matrix for ``corpus``.

    Every token of every sentence acts as a center word; each token found
    up to ``window_size`` positions before or after it adds one to the cell
    (center word, context word).

    Params:
        corpus (list of list of strings): corpus of documents
        window_size (int): number of positions considered on each side of
            the center word
    Return:
        M (numpy int32 array of shape (num_words, num_words)): co-occurrence
            counts, indexed in the order of the word list returned by
            ``distinct_words``.
        word2Ind (dict): maps each word to its row/column index in M.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {words[position]: position for position in range(num_words)}
    M = np.zeros(shape=(num_words, num_words), dtype=np.int32)

    for sentence in corpus:
        for center, token in enumerate(sentence):
            row = word2Ind[token]
            # Slicing clamps at the end of the sentence automatically; only
            # the left bound needs max() to avoid wrapping around to the tail.
            preceding = sentence[max(0, center - window_size):center]
            following = sentence[center + 1:center + 1 + window_size]
            for context in (preceding, following):
                for neighbour in context:
                    M[row, word2Ind[neighbour]] += 1
    return M, word2Ind