2

最新推荐文章于 2021-08-30 15:27:43 发布

erinapple

最新推荐文章于 2021-08-30 15:27:43 发布

阅读量180

点赞数

分类专栏：机器学习实战

本文链接：https://blog.csdn.net/erinapple/article/details/80684334

版权

机器学习实战专栏收录该内容

30 篇文章 0 订阅

订阅专栏

#-*-encoding:utf-8-*-
import sys
import urllib
import json
import numpy as np
import math
import jieba
import codecs
import json
import os
import re
import time
import utils
from gensim.corpora import Dictionary
import numpy as np

reload(sys)
sys.setdefaultencoding( "utf-8" ) 

import logging 
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

print time.ctime(),"开始"


# 加载字典
dict_path="dict/"
dict_name=["entity_syn", "action_syn","location_syn","time_syn","stopwords.dat"]
for fname in dict_name:
    with codecs.open(os.path.join(dict_path,fname),"r","utf-8") as f:
        for line in f.readlines():
            word = line.replace("\n","").split()
            for w in word:
                jieba.add_word(w,1000)
print time.ctime(),"字典加载完成"

# 加载停用词
stopwords_path="stopwords.dat"
stopwords={}
with codecs.open(os.path.join(dict_path,fname),"r","utf-8") as f:
    for line in f.readlines():
        word = line.replace("\n","").strip()
        if len(word)>1:continue
        stopwords[word]=1
print time.ctime(),"停用词加载完成"




print time.ctime(),"开始构建索引 bid index"

# 资源文件 待处理文件
fname="pool.dat"


index_bid={}
index_bid_path="index_bid.json"
if os.path.exists(index_bid_path):
    with codecs.open(index_bid_path,"r","utf-8") as f:
        index_bid = json.load(f)
        print time.ctime(),"现有index_bid,",len(index_bid)

new_add=0
new_adds={}

documents=[]

with codecs.open(fname,"r","utf-8") as f:
    lines = f.readlines()
    for number,line in enumerate(lines):
        tmp = line.replace("\n","").split("\t")
        if len(tmp)==4:#5:
            bid = tmp[0]+"_"+tmp[1]
            title=utils.dataclean(tmp[2])
            content = "N"#utils.dataclean(tmp[3])
            create_at = "N"#tmp[4]

            if len(title)<1:continue  ###若标题不存在　下一条

            if bid not in index_bid:
                tmppp=[]
                for word in list(jieba.cut(title)):
                    if word not in stopwords:
                        tmppp.append(word)
                documents.append(tmppp)
                # all_words.append(jieba.cut())
                ##########
                new_add+=1
                new_adds[bid]=1
                index_bid[bid]={"number":number,"title":title,"content":content,"create_at":create_at}  ## bid索引时 title不作任何处理

print time.ctime(),"新增:",new_add,"条数据"
print time.ctime(),"len(documents):",len(documents)

# 构建字典 
dict_words=Dictionary(documents)
dict_words.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)
dict_words.save_as_text("word_dict.dat")
print time.ctime(),"len(dict_words):",len(dict_words)
token2id = dict_words.token2id
# dict_words.save("words_id.dict")
# n=0
# for k in dict_words:
#     print k,dict_words[k]
#     n+=1
#     if n>10:
#         break



with codecs.open(index_bid_path,"w","utf-8") as f:
    json.dump(index_bid,f)
    print time.ctime(),"现有index_bid,",len(index_bid)
print time.ctime(),"构建index_word,bid完成"

# 构建矩阵 size_bid* size_word
all_metric=[]
for bid in index_bid:
    tmp = np.zeros(len(dict_words))
    title = index_bid[bid]["title"]

    for word in list(jieba.cut(title)):
        if word not in stopwords and word in token2id:
            tmp[token2id[word]]=1
    all_metric.append(tmp)
all_metric = np.array(all_metric)

print time.ctime(),"all_metric.shape:",all_metric.shape