Python: Text Similarity Analysis with gensim and jieba Word Segmentation

The script below pulls document descriptions out of MongoDB, extracts keywords from each document with jieba, and uses gensim to train a TF-IDF + LSI model and build a similarity index, all of which are saved to disk for later queries.

# -*- coding: utf-8 -*-

from pymongo import MongoClient

import jieba
import jieba.analyse

from gensim import corpora, models, similarities

from pprint import pprint  # pretty-printer

kickpath = ""  # e.g. "/root/python/"

courses = []  # raw document texts
uuids = []    # uuid of each document, kept parallel to courses

# Generate terms from the documents via jieba Chinese word segmentation
def jieba_preprocess_cn(courses, low_freq_filter=True):
    # Optional custom stop-word / IDF dictionaries:
    # jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
    # jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")

    # Represent each document by its top TF-IDF keywords; extract_tags
    # tokenizes internally, so a separate jieba.cut pass is not needed
    texts_tokenized = []
    for document in courses:
        tags = jieba.analyse.extract_tags(document, topK=500)
        texts_tokenized.append(tags)

    texts_filtered_stopwords = texts_tokenized
    pprint(texts_filtered_stopwords)

    # Remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations]
                      for document in texts_filtered_stopwords]

    # Remove very low-frequency words
    if low_freq_filter:
        # remove words that appear only once across the whole corpus
        from collections import defaultdict
        frequency = defaultdict(int)
        for text in texts_filtered:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts_filtered]
    else:
        texts = texts_filtered

    pprint(texts)
    return texts
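To see what this preprocessing step actually produces, jieba.analyse.extract_tags can be run on a single sentence: it returns the topK keywords ranked by TF-IDF weight, already tokenized. A minimal sketch (the sample sentence is invented for illustration, not taken from the original corpus):

import jieba.analyse

sample = "我最近经常头痛,还伴有轻微发烧和咳嗽"  # hypothetical symptom description
print(jieba.analyse.extract_tags(sample, topK=5))
# prints up to five keywords such as '头痛', '发烧', '咳嗽', ranked by TF-IDF weight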

def train_by_lsi(lib_texts):
    # Enable logging to watch training progress:
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)

    # doc2bow(): convert each document (a list of words) into a bag-of-words,
    # i.e. a list of (word_id, word_frequency) 2-tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # num_topics is left at gensim's default (200); pass e.g. num_topics=10
    # for a smaller, rougher model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)

    # index is a gensim.similarities.docsim.MatrixSimilarity instance; index the
    # TF-IDF-weighted corpus for consistency with how the LSI model was trained
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

    dictionary.save(kickpath + "kick.dict")
    tfidf.save(kickpath + "kick.tfidf")  # saved so query-time vectors can be weighted the same way
    lsi.save(kickpath + "kick.lsi")
    index.save(kickpath + "kick.index")

    return (index, dictionary, lsi)
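To make the doc2bow comment above concrete, here is a toy run of the same dictionary -> bag-of-words -> TF-IDF -> LSI pipeline; the three hand-made word lists stand in for the preprocessed corpus and are invented purely for illustration:

from gensim import corpora, models

toy_texts = [["感冒", "发烧", "咳嗽"],
             ["感冒", "头痛"],
             ["发烧", "咳嗽", "咳嗽"]]

dictionary = corpora.Dictionary(toy_texts)
print(dictionary.token2id)       # token -> integer id mapping

bow = [dictionary.doc2bow(t) for t in toy_texts]
print(bow[2])                    # (word_id, count) pairs; "咳嗽" appears twice

tfidf = models.TfidfModel(bow)
lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)
print(lsi[tfidf[bow[0]]])        # document 0 projected into the 2-topic LSI space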

if __name__ == '__main__':
    conn = MongoClient("xxx", 27017)
    db = conn.health
    db.authenticate("xx", "xxx")  # pymongo < 4.0; newer versions take credentials in the URI
    content = db.kickchufang.find({'doctorId': 'huanghuang'})

    doc_count = 0
    for i in content:
        line = str(i['desc'])
        uuids.append(i['uuid'])
        courses.append(line)
        print(doc_count)
        doc_count += 1
        # if doc_count > 10:
        #     break  # debug: limit corpus size

    # Persist the uuids (one per line) so query results can be mapped back to documents
    with open(kickpath + "kick.uuids", 'w') as man_file:
        man_file.write("\n".join(str(u) for u in uuids))

    courses_name = courses

    # The corpus is now built -- this part can involve a lot of data,
    # so it can be preprocessed ahead of time and stored
    lib_texts = jieba_preprocess_cn(courses)

    (index, dictionary, lsi) = train_by_lsi(lib_texts)
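The script saves the dictionary, models, index, and uuids but never shows a query against them. Below is a minimal sketch of how the persisted artifacts could be loaded to find documents similar to a new description; the query text and variable names are assumptions, and the file paths assume kickpath = "" as above:

from gensim import corpora, models, similarities
import jieba.analyse

# Load the artifacts saved by train_by_lsi
dictionary = corpora.Dictionary.load("kick.dict")
tfidf = models.TfidfModel.load("kick.tfidf")
lsi = models.LsiModel.load("kick.lsi")
index = similarities.MatrixSimilarity.load("kick.index")
with open("kick.uuids") as f:
    uuids = f.read().splitlines()

# Tokenize the query the same way the corpus was preprocessed
query = "头痛发烧怎么办"  # hypothetical query text
query_bow = dictionary.doc2bow(jieba.analyse.extract_tags(query, topK=500))
query_lsi = lsi[tfidf[query_bow]]

# Rank all indexed documents by cosine similarity in LSI space
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
for doc_pos, score in sims[:5]:
    print(uuids[doc_pos], score)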
