# 文档信息的向量化：词袋模型、gensim 实现与词条分布
from gensim import corpora

# Toy corpus: a single document that has already been split into tokens.
texts = [
    ['human', 'interface', 'computer'],
]

# Fit the dictionary on the corpus so each distinct token gets an integer id.
dct = corpora.Dictionary(documents=texts)

# token2id holds the fitted token -> id mapping.
print(dct.token2id)
# 词典的内容通过“.属性”的方式访问，例如 dct.token2id
from gensim import corpora

# One-document corpus used to build the initial dictionary.
texts = [['human', 'interface', 'computer']]

# Fit the dictionary and show the token -> id mapping it learned.
dct = corpora.Dictionary(documents=texts)
print(dct.token2id)

# Add the tokens of two further documents to the existing dictionary,
# then print the mapping again to show the new entries.
dct.add_documents(
    [
        ['cat', 'say', 'meow'],
        ['dog'],
    ]
)
print(dct.token2id)
from gensim import corpora

# One-document corpus used to build the initial dictionary.
texts = [['human', 'interface', 'computer']]

# Fit the dictionary and show the token -> id mapping it learned.
dct = corpora.Dictionary(documents=texts)
print(dct.token2id)

# Add the tokens of two further documents to the existing dictionary.
dct.add_documents(
    [
        ['cat', 'say', 'meow'],
        ['dog'],
    ]
)
print(dct.token2id)

# Bag-of-words encoding of a new document that mixes known tokens
# ('cat', 'dog') with tokens the dictionary has never seen.
obj1 = dct.doc2bow(document=['this', 'is', 'cat', 'not', 'a', 'dog'])
print(obj1)

# Same document again, this time also asking for the tokens that are
# missing from the dictionary.
obj2 = dct.doc2bow(
    document=['this', 'is', 'cat', 'not', 'a', 'dog'],
    return_missing=True,
)
print(obj2)
from gensim import corpora

# One-document corpus used to build the initial dictionary.
texts = [['human', 'interface', 'computer']]

# Fit the dictionary and show the token -> id mapping it learned.
dct = corpora.Dictionary(documents=texts)
print(dct.token2id)

# Add the tokens of two further documents to the existing dictionary.
dct.add_documents(
    [
        ['cat', 'say', 'meow'],
        ['dog'],
    ]
)
print(dct.token2id)

# Bag-of-words encoding of a document that mixes known and unknown tokens.
obj1 = dct.doc2bow(document=['this', 'is', 'cat', 'not', 'a', 'dog'])
print(obj1)

# Same document, additionally requesting the tokens missing from the
# dictionary.
obj2 = dct.doc2bow(
    document=['this', 'is', 'cat', 'not', 'a', 'dog'],
    return_missing=True,
)
print(obj2)

# Map each token of a document to its dictionary index, position by position.
obj3 = dct.doc2idx(document=['this', 'is', 'a', 'dog', 'not', 'cat'])
print(obj3)
import pandas as pd
import jieba
# Set up tokenization and stop-word removal.
# Load the stop-word list used by m_cut: one word per line, UTF-8 encoded.
# The file is read as plain text rather than through pd.read_csv with a dummy
# separator: the CSV parser turns entries such as "NA" or "null" into NaN,
# gives quote characters a special meaning, splits any word that contains the
# dummy separator, and needs the slow regex engine for a multi-character
# separator.
# 'utf-8-sig' drops a leading byte-order mark if the file has one, so the
# first stop word is not polluted by it; files without a BOM read the same.
with open('../data/停用词.txt', encoding='utf-8-sig') as stop_file:
    # Blank lines carry no stop word and are skipped.
    stoplist = [word for word in stop_file.read().split('\n') if word]
def m_cut(text):
    """Tokenize ``text`` with jieba, dropping stop words and short tokens.

    Args:
        text: String to segment.

    Returns:
        list: Tokens in the order jieba yields them, keeping only those that
        are longer than one character and not in the module-level
        ``stoplist``.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a scan over the whole stop-word list.
    stopwords = set(stoplist)
    return [
        word
        for word in jieba.cut(text)
        if len(word) > 1 and word not in stopwords
    ]
# Convert one chapter into a long-format data frame of its tokens.
def m_appdf(chapnum):
    """Return a data frame with one row per token of a single chapter.

    Args:
        chapnum: Integer offset of the chapter. The text is looked up as
            ``chapter.txt[chapnum + 1]`` and the label as
            ``chapter.index[chapnum]``.
            NOTE(review): this presumably relies on ``chapter`` having a
            1-based integer index -- confirm against where it is built.

    Returns:
        pandas.DataFrame: Column ``word`` holds the tokens produced by
        ``m_cut``; column ``chap`` repeats the chapter's index label.
    """
    tokens = m_cut(chapter.txt[chapnum + 1])
    chap_label = chapter.index[chapnum]
    return pd.DataFrame(tokens, columns=['word']).assign(chap=chap_label)
# Tokenize every chapter and stack the per-chapter frames into one long
# (word, chap) data frame.
# pd.concat is called once on the complete list: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copies the accumulated
# frame on every iteration.
chapter_frames = [m_appdf(i) for i in range(len(chapter))]
if chapter_frames:
    df0 = pd.concat(chapter_frames)
else:
    # No chapters at all: keep an empty frame with the expected columns.
    df0 = pd.DataFrame(columns=['word', 'chap'])
df0.head()
# Token counts per (word, chapter) pair as a Series; look at the last ten.
df0.groupby(['word', 'chap']).size().tail(10)

# The same counts laid out as a word-by-chapter frequency table.
t2d = pd.crosstab(index=df0['word'], columns=df0['chap'])
t2d
# Total frequency of each word across all chapters, in preparation for
# dropping low-frequency words.
totnum = t2d.sum(axis=1)
totnum

# Keep only the rows (words) whose total frequency is at least 10.
totclear = t2d.loc[totnum >= 10]
totclear