


import jieba
import pandas as pd
from gensim.models.word2vec import Word2Vec
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel

#  Load the GBK-encoded novel text into a DataFrame with a single column named 'txt'.
#  NOTE(review): read_table splits on tabs and honours double-quote quoting by default —
#  confirm the text contains neither, otherwise rows may not map 1:1 to lines.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding="GBK")

#  章节判断用变量预处理
def m_head(tmpstr):
    """Return the leading character of *tmpstr* (an empty string stays empty)."""
    leading = tmpstr[0:1]
    return leading

def m_mid(tmpstr):
    """Return the index of the first "回 " in *tmpstr*, or -1 when it is absent."""
    marker = "回 "
    return tmpstr.find(marker)

#  Chapter detection: tag every line of the novel with its chapter number.
#  A heading line starts with "第", contains "回 " after its first character
#  and is shorter than 30 characters.
chapnum = 0
chap_ids = []
for line in raw.txt:
    if m_head(line) == "第" and m_mid(line) > 0 and len(line) < 30:
        chapnum += 1
    # Once at least 40 chapters were seen, the appendix heading resets the
    # counter so that everything from there on is tagged as chapter 0.
    if chapnum >= 40 and line == "附录一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)

# One column assignment instead of a per-row .loc write; no temporary
# head/mid/len columns are needed.  Float labels (1.0, 2.0, ...) are used so
# the chapter index keeps a floating-point dtype.
raw['chap'] = pd.Series(chap_ids, index=raw.index, dtype=float)

#  Merge the lines of each chapter into one string per chapter.
rawgrp = raw.groupby('chap')
# The string alias 'sum' is used instead of the builtin `sum`: passing the
# builtin callable to agg() is deprecated in recent pandas.  On a string
# column the sum is string concatenation.
chapter = rawgrp.agg('sum')
# Label 0 collects the lines before the first heading and from the appendix on.
chapter = chapter[chapter.index != 0]

#  Load the stop-word list.
#  sep='aaa' is presumably a separator that never occurs in the file, so each
#  line is read as a single field.  A multi-character separator is only
#  supported by the python parser engine, so it is requested explicitly
#  instead of relying on the fallback (which emits a ParserWarning).
stop_list = list(
    pd.read_csv('./停用词.txt', names=['w'], sep='aaa', encoding='utf-8', engine='python').w
)

#  Tokenise each chapter with jieba; every row of 'cut' holds a list of tokens
#  (list-of-lists overall).  Stop words are not removed at this step.
chapter['cut'] = chapter.txt.apply(jieba.lcut)

#  Initialise the word2vec model and its vocabulary
#  NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it to
#  `vector_size` — confirm against the installed version.
n_dim = 300  # vector dimensionality; 300~500 tends to work well for large samples

w2v_model = Word2Vec(size=n_dim, min_count=10)
w2v_model.build_vocab(chapter.cut)  # build the vocabulary from the tokenised chapters

#  Train on the tokenised chapters (may take a few minutes on large datasets)
w2v_model.train(chapter.cut, total_examples=w2v_model.corpus_count, epochs=10)

#  Query the trained model through its word vectors.
word_vectors = w2v_model.wv

#  Nearest neighbours of a single word.
print(word_vectors.most_similar('黄蓉', topn=20))

#  Relation queries: close to the positive words, far from the negative ones.
print(word_vectors.most_similar(positive=['郭靖', '小红马'], negative=['黄药师'], topn=5))
print(word_vectors.most_similar(positive=['郭靖', '黄蓉'], negative=['杨康'], topn=10))

#  Similarity between one word and each of three others.
for other in ('黄蓉', '杨康', '杨铁心'):
    print(word_vectors.similarity('郭靖', other))

#  Odd-one-out queries, one per space-separated group.
for group in ('小红马 黄药师 鲁有脚', '杨铁心 黄药师 黄蓉 洪七公', '郭靖 黄药师 黄蓉 洪七公'):
    print(word_vectors.doesnt_match(group.split()))


                                                    txt                                                cut
1.0   第一回 风雪惊变    钱塘江浩浩江水,日日夜夜无穷无休的从两浙西路临安府牛家村边绕过,东流...  [第一回,  , 风雪, 惊变,  ,  ,  ,  , 钱塘江, 浩浩, 江水, ,, 日...
2.0   第二回 江南七怪    颜烈跨出房门,过道中一个中年士人拖着鞋皮,踢跶踢跶的直响,一路打着哈...  [第二回,  , 江南七怪,  ,  ,  ,  , 颜烈, 跨出, 房门, ,, 过道, ...
3.0   第三回 黄沙莽莽    寺里僧众见焦木圆寂,尽皆悲哭。有的便为伤者包扎伤处,抬入客舍。  忽...  [第三回,  , 黄沙, 莽莽,  ,  ,  ,  , 寺里, 僧众, 见, 焦木, 圆寂...
4.0   第四回 黑风双煞    完颜洪熙笑道:“好,再打他个痛快。”蒙古兵前哨报来:“王罕亲自前来迎...  [第四回,  , 黑风双, 煞,  ,  ,  ,  , 完颜洪熙, 笑, 道, :, “,...
5.0   第五回 弯弓射雕    一行人下得山来,走不多时,忽听前面猛兽大吼声一阵阵传来。韩宝驹一提缰...  [第五回,  , 弯弓, 射雕,  ,  ,  ,  , 一行, 人下, 得, 山来, ,,...
Word2Vec(vocab=5459, size=300, alpha=0.025)
[('黄蓉', 0.9228439331054688), ('欧阳克', 0.8506240844726562), ('欧阳锋', 0.7657182216644287), ('梅超风', 0.7550132274627686), ('裘千仞', 0.7529821395874023), ('穆念慈', 0.74937903881073), ('程瑶迦', 0.7446237206459045), ('黄药师', 0.7445610165596008), ('完颜康', 0.7358383536338806), ('周伯通', 0.7228418588638306)]
[('郭靖', 0.9228439927101135), ('欧阳克', 0.8548903465270996), ('穆念慈', 0.8016418218612671), ('周伯通', 0.797595739364624), ('完颜康', 0.7936385869979858), ('程瑶迦', 0.7841259241104126), ('陆冠英', 0.7631279826164246), ('洪七公', 0.7594529986381531), ('裘千仞', 0.7572054266929626), ('杨康', 0.7420238256454468), ('李萍', 0.7414523363113403), ('那道人', 0.7407932281494141), ('柯镇恶', 0.737557590007782), ('一灯', 0.7374000549316406), ('欧阳锋', 0.7373932003974915), ('黄药师', 0.7257256507873535), ('华筝', 0.7247843742370605), ('那公子', 0.7243456840515137), ('鲁有脚', 0.721243679523468), ('穆易', 0.721138060092926)]
[('郭靖道', 0.9680619239807129), ('杨康道', 0.9004895091056824), ('朱聪道', 0.8976290225982666), ('傻姑道', 0.8683680295944214), ('头道', 0.8035480976104736), ('马钰道', 0.7973171472549438), ('那人道', 0.7740610241889954), ('郭靖摇', 0.7600215673446655), ('怒道', 0.7592462301254272), ('欧阳克笑', 0.7311669588088989)]
[('奔', 0.8548181056976318), ('晕', 0.8207278251647949), ('退', 0.8112013936042786), ('茶', 0.8089604377746582), ('远远', 0.8086404204368591)]
[('欧阳克', 0.75826096534729), ('欧阳锋', 0.7300347089767456), ('梅超风', 0.6927435398101807), ('洪七公', 0.6642888784408569), ('她', 0.6387393474578857), ('黄药师', 0.6353062391281128), ('周伯通', 0.6277655363082886), ('那道人', 0.6228901743888855), ('当下', 0.6063475012779236), ('主人', 0.5766028165817261)]


import jieba
import pandas as pd
from gensim.models.word2vec import Word2Vec
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfTransformer

#  Load the GBK-encoded novel text into a DataFrame with a single column named 'txt'.
#  NOTE(review): read_table splits on tabs and honours double-quote quoting by default —
#  confirm the text contains neither, otherwise rows may not map 1:1 to lines.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding="GBK")

#  章节判断用变量预处理
def m_head(tmpstr):
    """Return the leading character of *tmpstr* (an empty string stays empty)."""
    leading = tmpstr[0:1]
    return leading

def m_mid(tmpstr):
    """Return the index of the first "回 " in *tmpstr*, or -1 when it is absent."""
    marker = "回 "
    return tmpstr.find(marker)

#  Chapter detection: tag every line of the novel with its chapter number.
#  A heading line starts with "第", contains "回 " after its first character
#  and is shorter than 30 characters.
chapnum = 0
chap_ids = []
for line in raw.txt:
    if m_head(line) == "第" and m_mid(line) > 0 and len(line) < 30:
        chapnum += 1
    # Once at least 40 chapters were seen, the appendix heading resets the
    # counter so that everything from there on is tagged as chapter 0.
    if chapnum >= 40 and line == "附录一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)

# One column assignment instead of a per-row .loc write; no temporary
# head/mid/len columns are needed.  Float labels (1.0, 2.0, ...) are used so
# the chapter index keeps a floating-point dtype.
raw['chap'] = pd.Series(chap_ids, index=raw.index, dtype=float)

#  Merge the lines of each chapter into one string per chapter.
rawgrp = raw.groupby('chap')
# The string alias 'sum' is used instead of the builtin `sum`: passing the
# builtin callable to agg() is deprecated in recent pandas.  On a string
# column the sum is string concatenation.
chapter = rawgrp.agg('sum')
# Label 0 collects the lines before the first heading and from the appendix on.
chapter = chapter[chapter.index != 0]

#  Load the stop-word list.
#  sep='aaa' is presumably a separator that never occurs in the file, so each
#  line is read as a single field.  A multi-character separator is only
#  supported by the python parser engine, so it is requested explicitly
#  instead of relying on the fallback (which emits a ParserWarning).
stop_list = list(
    pd.read_csv('./停用词.txt', names=['w'], sep='aaa', encoding='utf-8', engine='python').w
)

#  jieba tokenisation
def m_cut(intxt):
    """Tokenise *intxt* with jieba and filter the tokens.

    Tokens that appear in the module-level ``stop_list`` and tokens of a
    single character are dropped.

    Args:
        intxt: the text to tokenise.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build a set once per call: membership is then O(1) per token instead of
    # a linear scan of the stop-word list for every token.
    stop_words = set(stop_list)
    return [w for w in jieba.cut(intxt) if len(w) > 1 and w not in stop_words]

#  Space-joined token strings for the first five chapters.
first_chapters = chapter.txt.iloc[:5]
clean_chap = [" ".join(m_cut(text)) for text in first_chapters]

#  Term-count matrix built from the token strings.
count_vec = CountVectorizer()
resmtx = count_vec.fit_transform(clean_chap)

#  TF-IDF values computed from the term-count matrix.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(resmtx)

#  Pairwise chapter distances on the raw counts: default metric (Euclidean), then cosine.
default_dist = pairwise_distances(resmtx)
cosine_dist = pairwise_distances(resmtx, metric='cosine')
print(default_dist)
print(cosine_dist)

#  Cosine distances on the TF-IDF matrix.
print(pairwise_distances(tfidf[:5], metric='cosine'))


[[  0.         295.77356204 317.52637686 320.33576135 316.85170033]
 [295.77356204   0.         266.95130642 265.77622166 277.24898557]
 [317.52637686 266.95130642   0.         233.9615353  226.09290126]
 [320.33576135 265.77622166 233.9615353    0.         202.57344347]
 [316.85170033 277.24898557 226.09290126 202.57344347   0.        ]]
[[0.         0.63250402 0.77528382 0.78540047 0.82880469]
 [0.63250402 0.         0.62572437 0.61666388 0.73192845]
 [0.77528382 0.62572437 0.         0.51645443 0.5299046 ]
 [0.78540047 0.61666388 0.51645443 0.         0.42108002]
 [0.82880469 0.73192845 0.5299046  0.42108002 0.        ]]
[[0.         0.69200348 0.84643282 0.85601472 0.89124575]
 [0.69200348 0.         0.7438766  0.70590455 0.81767486]
 [0.84643282 0.7438766  0.         0.60106637 0.63537168]
 [0.85601472 0.70590455 0.60106637 0.         0.54121177]
 [0.89124575 0.81767486 0.63537168 0.54121177 0.        ]]


import jieba
import pandas as pd
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim import similarities

#  Load the GBK-encoded novel text into a DataFrame with a single column named 'txt'.
#  NOTE(review): read_table splits on tabs and honours double-quote quoting by default —
#  confirm the text contains neither, otherwise rows may not map 1:1 to lines.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding="GBK")

#  章节判断用变量预处理
def m_head(tmpstr):
    """Return the leading character of *tmpstr* (an empty string stays empty)."""
    leading = tmpstr[0:1]
    return leading

def m_mid(tmpstr):
    """Return the index of the first "回 " in *tmpstr*, or -1 when it is absent."""
    marker = "回 "
    return tmpstr.find(marker)

#  Chapter detection: tag every line of the novel with its chapter number.
#  A heading line starts with "第", contains "回 " after its first character
#  and is shorter than 30 characters.
chapnum = 0
chap_ids = []
for line in raw.txt:
    if m_head(line) == "第" and m_mid(line) > 0 and len(line) < 30:
        chapnum += 1
    # Once at least 40 chapters were seen, the appendix heading resets the
    # counter so that everything from there on is tagged as chapter 0.
    if chapnum >= 40 and line == "附录一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)

# One column assignment instead of a per-row .loc write; no temporary
# head/mid/len columns are needed.  Float labels (1.0, 2.0, ...) are used so
# the chapter index keeps a floating-point dtype.
raw['chap'] = pd.Series(chap_ids, index=raw.index, dtype=float)

#  Merge the lines of each chapter into one string per chapter.
rawgrp = raw.groupby('chap')
# The string alias 'sum' is used instead of the builtin `sum`: passing the
# builtin callable to agg() is deprecated in recent pandas.  On a string
# column the sum is string concatenation.
chapter = rawgrp.agg('sum')
# Label 0 collects the lines before the first heading and from the appendix on.
chapter = chapter[chapter.index != 0]

#  Load the stop-word list.
#  sep='aaa' is presumably a separator that never occurs in the file, so each
#  line is read as a single field.  A multi-character separator is only
#  supported by the python parser engine, so it is requested explicitly
#  instead of relying on the fallback (which emits a ParserWarning).
stop_list = list(
    pd.read_csv('./停用词.txt', names=['w'], sep='aaa', encoding='utf-8', engine='python').w
)

#  jieba tokenisation
def m_cut(intxt):
    """Tokenise *intxt* with jieba and filter the tokens.

    Tokens that appear in the module-level ``stop_list`` and tokens of a
    single character are dropped.

    Args:
        intxt: the text to tokenise.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build a set once per call: membership is then O(1) per token instead of
    # a linear scan of the stop-word list for every token.
    stop_words = set(stop_list)
    return [w for w in jieba.cut(intxt) if len(w) > 1 and w not in stop_words]

#  Tokenise every chapter with m_cut.
chap_list = list(map(m_cut, chapter.txt))

#  Vocabulary of the chapters plus one sparse bag-of-words vector per chapter.
dictionary = corpora.Dictionary(chap_list)
corpus = list(map(dictionary.doc2bow, chap_list))

#  Fit a TF-IDF model on the counts and apply it to the same corpus.
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

#  LDA with 10 topics and 5 passes, fitted on the TF-IDF corpus.
ldamodel = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10, passes=5)

#  Retrieve the chapters most similar to chapter 1 in LDA topic space.
#  The index and the query must live in the same vector space, so both go
#  through the pipeline the model was fitted with: bow -> TF-IDF -> LDA.
#  (Indexing the bow corpus and querying it with a topic vector would make the
#  topic ids be read as word ids and yield meaningless scores.)
corpus_lda = ldamodel[corpus_tfidf]
# num_features is pinned to the topic count: sparse topic vectors may omit
# low-probability topics, so it should not be inferred from the corpus.
simmtx = similarities.MatrixSimilarity(corpus_lda, num_features=ldamodel.num_topics)

#  Demonstration query: the text of chapter 1.
query = chapter.txt.loc[1]
query_bow = dictionary.doc2bow(m_cut(query))

lda_evc = ldamodel[tfidf_model[query_bow]]  # query as a topic-space vector
sims = simmtx[lda_evc]  # cosine similarity against every indexed chapter
# (position in the index, similarity) pairs, most similar first
sims = sorted(enumerate(sims), key=lambda item: -item[1])


MatrixSimilarity<40 docs, 43955 features>
[[0.00360987 0.11551594 0.00360987 ... 0.         0.         0.        ]
 [0.         0.15784042 0.         ... 0.         0.         0.        ]]
[(0, 0.0058326684), (6, 0.0038030709), (3, 0.0035722235), (30, 0.002921846), (1, 0.0028936525), (7, 0.0028846266), (38, 0.002834554), (23, 0.0027586762), (4, 0.0027273756), (35, 0.0026802267), (8, 0.002584503), (31, 0.0024519921), (2, 0.0023968564), (29, 0.0022414625), (36, 0.0022174662), (24, 0.0020882194), (27, 0.0020224026), (21, 0.0019537606), (25, 0.0019245804), (5, 0.0018208353), (28, 0.001775686), (15, 0.0017177735), (39, 0.0016619969), (16, 0.001606744), (12, 0.0015473039), (33, 0.0015236269), (26, 0.0015073677), (10, 0.0015002472), (22, 0.0013651266), (11, 0.0013272304), (9, 0.001299706), (34, 0.0012887274), (37, 0.0011597403), (17, 0.0010317966), (14, 0.0009737435), (32, 0.00091980596), (18, 0.0008978047), (20, 0.00086604763), (19, 0.00065769325), (13, 0.0006388487)]


