#! -*- coding:utf-8 -*-
"""Build an LSI similarity index over prescription descriptions from MongoDB.

Pipeline: fetch documents -> jieba keyword extraction -> gensim TF-IDF/LSI ->
persisted dictionary / model / similarity-index files (kick.dict, kick.lsi,
kick.index, kick.uuids).
"""
import codecs
import sys
from pprint import pprint  # pretty-printer

import jieba
import jieba.analyse
import nltk
import pymongo
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize
from pymongo import MongoClient

# Python 2 only: force UTF-8 as the default encoding so str() on the Chinese
# 'desc' fields below does not raise UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

kickpath = ""  # "/root/python/" -- directory prefix for the saved model files

# Module-level accumulators used by the script body below.
dics = []
dits = {}
labels = {}
count = 1
mydoclist = []
courses = []    # raw 'desc' text of each fetched document
questions = []
uuids = []      # uuid of each fetched document, parallel to `courses`
# Generate term lists from Chinese text via jieba word segmentation.
def jieba_preprocess_cn(courses, low_freq_filter=True):
    """Turn each document in `courses` into a list of keyword tokens.

    Each document is reduced to its top-500 keywords with jieba's TF-IDF
    keyword extraction, English punctuation tokens are dropped, and
    (optionally) tokens occurring only once across the whole corpus are
    removed.

    :param courses: iterable of document strings (Chinese text)
    :param low_freq_filter: when True, drop corpus-wide single-occurrence tokens
    :return: list of token lists, one per input document
    """
    # jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
    # jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")
    texts_tokenized = []
    for document in courses:
        # extract_tags segments the text itself and ranks terms by TF-IDF;
        # keep the 500 highest-scoring terms per document.  (The original
        # also ran jieba.cut() here but never consumed the result.)
        tags = jieba.analyse.extract_tags(document, 500)
        texts_tokenized.append(tags)
    texts_filtered_stopwords = texts_tokenized
    pprint(texts_filtered_stopwords)

    # Remove punctuation tokens.
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document
                       if word not in english_punctuations]
                      for document in texts_filtered_stopwords]

    # Remove very-low-frequency words.
    if low_freq_filter:
        # Remove words that appear only once in the whole corpus.
        from collections import defaultdict
        frequency = defaultdict(int)
        for text in texts_filtered:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts_filtered]
    else:
        texts = texts_filtered
    pprint(texts)
    return texts
def train_by_lsi(lib_texts):
    """Train a TF-IDF + LSI model over tokenized texts and persist it.

    Saves three artifacts under the `kickpath` prefix: kick.dict (the
    dictionary), kick.lsi (the LSI model) and kick.index (the similarity
    index).

    :param lib_texts: list of token lists (output of jieba_preprocess_cn)
    :return: (index, dictionary, lsi) tuple
    """
    # To see progress logs:
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): convert a token list into a bag of words represented as
    # (word_id, word_frequency) pairs.
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # num_topics left at the gensim default (10 was considered originally).
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)  # , num_topics=10)
    # index is a gensim.similarities.docsim.MatrixSimilarity instance.
    index = similarities.MatrixSimilarity(lsi[corpus])
    dictionary.save(kickpath + "kick.dict")
    lsi.save(kickpath + "kick.lsi")
    index.save(kickpath + "kick.index")
    return (index, dictionary, lsi)
if __name__ == '__main__':
    # Fetch prescription descriptions for one doctor from MongoDB.
    conn = MongoClient("xxx", 27017)
    db = conn.health
    db.authenticate("xx", "xxx")
    content = db.kickchufang.find({'doctorId': 'huanghuang'})

    # Renamed loop counter from `index` to avoid shadowing the similarity
    # index returned by train_by_lsi() at the bottom of the script.
    doc_count = 0
    for doc in content:
        line = str(doc['desc'])  # .decode("utf-8") #.encode("GB18030")
        uuids.append(doc['uuid'])
        courses.append(line)
        print(str(doc_count))  # progress indicator
        doc_count = doc_count + 1
        # if doc_count > 10:
        #     break

    # Persist the uuid list so similarity hits can be mapped back to documents.
    # BUG FIX: the original `print(uuids, man_file)` printed a tuple to stdout
    # (Python 2 print statement) and left kick.uuids empty; write to the file.
    man_file = open(kickpath + "kick.uuids", 'w')
    try:
        man_file.write(str(uuids) + "\n")
    finally:
        man_file.close()

    courses_name = courses
    # Corpus construction is complete -- this part may involve a lot of data,
    # so it can be preprocessed ahead of time and stored.
    lib_texts = jieba_preprocess_cn(courses)
    (index, dictionary, lsi) = train_by_lsi(lib_texts)