# -*- coding: utf-8 -*-
import sys
import os
import re
import math
import time
import json
import codecs
import logging
import urllib

import numpy as np
import jieba
from gensim.corpora import Dictionary

import utils

reload(sys)
sys.setdefaultencoding("utf-8")

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
print time.ctime(),"开始"
# 加载字典
dict_path="dict/"
dict_name=["entity_syn", "action_syn","location_syn","time_syn","stopwords.dat"]
for fname in dict_name:
with codecs.open(os.path.join(dict_path,fname),"r","utf-8") as f:
for line in f.readlines():
word = line.replace("\n","").split()
for w in word:
jieba.add_word(w,1000)
print time.ctime(),"字典加载完成"
# Load stopwords
stopwords_path = "stopwords.dat"
stopwords = {}
with codecs.open(os.path.join(dict_path, stopwords_path), "r", "utf-8") as f:
    for line in f:
        word = line.replace("\n", "").strip()
        if len(word) < 1: continue  # skip empty lines
        stopwords[word] = 1
print time.ctime(), "stopwords loaded"
print time.ctime(),"开始构建索引 bid index"
# 资源文件 待处理文件
fname="pool.dat"
index_bid={}
index_bid_path="index_bid.json"
if os.path.exists(index_bid_path):
with codecs.open(index_bid_path,"r","utf-8") as f:
index_bid = json.load(f)
print time.ctime(),"现有index_bid,",len(index_bid)
new_add=0
new_adds={}
documents=[]
with codecs.open(fname,"r","utf-8") as f:
lines = f.readlines()
for number,line in enumerate(lines):
tmp = line.replace("\n","").split("\t")
if len(tmp)==4:#5:
bid = tmp[0]+"_"+tmp[1]
title=utils.dataclean(tmp[2])
content = "N"#utils.dataclean(tmp[3])
create_at = "N"#tmp[4]
if len(title)<1:continue ###若标题不存在 下一条
if bid not in index_bid:
tmppp=[]
for word in list(jieba.cut(title)):
if word not in stopwords:
tmppp.append(word)
documents.append(tmppp)
# all_words.append(jieba.cut())
##########
new_add+=1
new_adds[bid]=1
index_bid[bid]={"number":number,"title":title,"content":content,"create_at":create_at} ## bid索引时 title不作任何处理
print time.ctime(),"新增:",new_add,"条数据"
print time.ctime(),"len(documents):",len(documents)
# Build the word dictionary from the tokenized titles
dict_words = Dictionary(documents)
# Keep tokens that appear in at least 5 documents and in at most 50% of them, capped at 10000 tokens
dict_words.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)
dict_words.save_as_text("word_dict.dat")
print time.ctime(), "len(dict_words):", len(dict_words)
token2id = dict_words.token2id
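# token2id maps each kept token to the integer column index used below.
# A gensim-native alternative sketch (not used by this script) would be doc2bow,
# which returns sparse (token_id, count) pairs for a tokenized title:
#   bow = dict_words.doc2bow(list(jieba.cut(title)))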
with codecs.open(index_bid_path,"w","utf-8") as f:
json.dump(index_bid,f)
print time.ctime(),"现有index_bid,",len(index_bid)
print time.ctime(),"构建index_word,bid完成"
# Build a binary matrix of shape size_bid * size_word
all_metric = []
for bid in index_bid:
    tmp = np.zeros(len(dict_words))
    title = index_bid[bid]["title"]
    for word in list(jieba.cut(title)):
        if word not in stopwords and word in token2id:
            tmp[token2id[word]] = 1
    all_metric.append(tmp)
all_metric = np.array(all_metric)
print time.ctime(), "all_metric.shape:", all_metric.shape