2

#-*-encoding:utf-8-*-
import sys
import urllib
import json
import numpy as np
import math
import jieba
import codecs
import json
import os
import re
import time
import utils
from gensim.corpora import Dictionary
import numpy as np

# Process-wide text-encoding setup.
# On Python 2, site.py removes sys.setdefaultencoding at startup; reload(sys)
# brings it back so implicit str <-> unicode conversions use UTF-8 instead of
# ASCII. Python 3 has no reload() builtin and no setdefaultencoding, and its
# default encoding is already UTF-8, so the hack is skipped there instead of
# failing with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

import logging
# Timestamped INFO-level logging on the root logger.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# Progress marker: script start. A single pre-formatted argument prints the
# same text on Python 2 and Python 3.
print("%s %s" % (time.ctime(), "开始"))


# Load the custom dictionaries: every whitespace-separated token found in the
# listed files is registered with jieba at frequency 1000.
dict_path = "dict/"
dict_name = ["entity_syn", "action_syn", "location_syn", "time_syn", "stopwords.dat"]
# NOTE: the loop variable is deliberately still called `fname`; the stop-word
# loading code further down reads it after this loop has finished.
for fname in dict_name:
    with codecs.open(os.path.join(dict_path, fname), "r", "utf-8") as f:
        for raw_line in f:
            # split() with no argument already discards the trailing newline.
            for token in raw_line.split():
                jieba.add_word(token, 1000)
print("%s %s" % (time.ctime(), "字典加载完成"))

# Load stop words into a dict used purely for membership tests
# (key = word, value = 1).
stopwords_path = "stopwords.dat"
stopwords = {}
# Open the file named by stopwords_path explicitly instead of relying on
# whatever value `fname` still holds from an earlier loop.
with codecs.open(os.path.join(dict_path, stopwords_path), "r", "utf-8") as f:
    for line in f:
        word = line.strip()
        # NOTE(review): this keeps only entries of at most one character and
        # drops every multi-character stop word (blank lines end up as "").
        # It may have been meant as `len(word) < 1` -- confirm before changing;
        # the original behaviour is preserved here.
        if len(word) > 1:
            continue
        stopwords[word] = 1
print("%s %s" % (time.ctime(), "停用词加载完成"))




print("%s %s" % (time.ctime(), "开始构建索引 bid index"))

# Resource file: the raw records that still have to be processed.
fname = "pool.dat"

# bid -> record mapping. Starts empty and is replaced by the JSON index
# written by a previous run when that file exists.
index_bid_path = "index_bid.json"
index_bid = {}
if os.path.exists(index_bid_path):
    with codecs.open(index_bid_path, "r", "utf-8") as f:
        index_bid = json.load(f)
    print("%s %s %s" % (time.ctime(), "现有index_bid,", len(index_bid)))

# Per-run bookkeeping.
new_add = 0      # number of records added to the index during this run
new_adds = {}    # bids added during this run (bid -> 1)
documents = []   # one token list per newly added title

# Scan the pool file: one tab-separated record per line. Only lines with
# exactly four fields are considered; records whose bid is not indexed yet are
# tokenized and added to the index.
with codecs.open(fname, "r", "utf-8") as f:
    for number, line in enumerate(f):
        fields = line.replace("\n", "").split("\t")
        if len(fields) != 4:
            continue

        # The bid is the first two columns joined by an underscore.
        bid = fields[0] + "_" + fields[1]
        title = utils.dataclean(fields[2])
        # Body text and creation time are not stored; "N" is a placeholder.
        content = "N"
        create_at = "N"

        # Skip records whose title is empty after cleaning.
        if len(title) < 1:
            continue
        # Skip records that are already in the index.
        if bid in index_bid:
            continue

        # Tokenize the title and drop stop words.
        documents.append([word for word in jieba.cut(title) if word not in stopwords])
        new_add += 1
        new_adds[bid] = 1
        # The title is stored as cleaned, without any further processing;
        # `number` is the 0-based line index of the record in the pool file.
        index_bid[bid] = {
            "number": number,
            "title": title,
            "content": content,
            "create_at": create_at,
        }

print("%s %s %s %s" % (time.ctime(), "新增:", new_add, "条数据"))
print("%s %s %s" % (time.ctime(), "len(documents):", len(documents)))

# Build the vocabulary from the tokenized titles.
# NOTE(review): `documents` only holds titles added during this run, while the
# matrix built later covers every entry of index_bid (including entries loaded
# from a previous run) -- confirm that a vocabulary based on new titles only
# is intended.
dict_words = Dictionary(documents)
# Keep tokens that occur in at least 5 documents and in no more than 50% of
# them, capped at the 10000 most frequent.
dict_words.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)
dict_words.save_as_text("word_dict.dat")
# Token -> column id mapping, taken after filtering.
token2id = dict_words.token2id
print("%s %s %s" % (time.ctime(), "len(dict_words):", len(dict_words)))
# dict_words.save("words_id.dict")
# n=0
# for k in dict_words:
#     print k,dict_words[k]
#     n+=1
#     if n>10:
#         break



# Write the bid index (previously stored entries plus the ones added in this
# run) back to disk as JSON.
with codecs.open(index_bid_path, "w", "utf-8") as f:
    json.dump(index_bid, f)
print("%s %s %s" % (time.ctime(), "现有index_bid,", len(index_bid)))
print("%s %s" % (time.ctime(), "构建index_word,bid完成"))

# Build the binary title/word matrix of shape (len(index_bid), len(dict_words)).
# Row i belongs to the i-th bid in index_bid's iteration order; column j is set
# to 1 when the word with vocabulary id j occurs in that title.
#
# The matrix is allocated once and filled in place. This keeps the result 2-D
# even when index_bid is empty (stacking an empty list of rows with np.array
# gives shape (0,)), and avoids holding a list of row arrays plus the stacked
# copy at the same time.
all_metric = np.zeros((len(index_bid), len(dict_words)))
for row, bid in enumerate(index_bid):
    title = index_bid[bid]["title"]
    for word in jieba.cut(title):
        # Only words that are not stop words and survived vocabulary
        # filtering have a column.
        if word not in stopwords and word in token2id:
            all_metric[row, token2id[word]] = 1

print("%s %s %s" % (time.ctime(), "all_metric.shape:", all_metric.shape))



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值