Standard TF-IDF bag-of-words for pure Chinese text

import re, jieba
import numpy as np, pandas as pd
from tqdm import tqdm

# Static configuration
stop_word_path = "/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/stop_word_for_chinese.txt"
corpus = "南方网 3.csv"
get_news_paper(url, corpus)  # fetches the articles into the CSV; `get_news_paper` and `url` come from an earlier section

# Strip the given characters from a string (used to drop newlines before tokenizing)
def del_element(strings, symbols):
    srcrep = {i: '' for i in symbols}
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)
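
del_element compiles one alternation pattern out of all the characters to strip, so removal happens in a single regex pass. A minimal check (the literals below are illustrative, not corpus data):

del_element('南方网\n新闻', '\n')  # -> '南方网新闻'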

# Load the stop-word list (one word per line); '\n' itself is kept as a stop word
stop_words = open(stop_word_path, 'r', encoding='utf-8').read().split('\n') + ['\n']

# Tokenize with jieba and drop stop words
def filter_stop_word(paper, stop_words):
    return np.array(list(filter(lambda x: x not in stop_words, jieba.cut(del_element(paper, '\n')))))
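
As a hedged illustration with jieba's stock demo sentence and a toy stop list (not the real stop_word_for_chinese.txt): default-mode segmentation of 我爱北京天安门 is 我 / 爱 / 北京 / 天安门, so

filter_stop_word('我爱北京天安门', ['我', '爱'])  # -> array(['北京', '天安门'], dtype='<U3')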

# Read the local news CSV; each row's `text` column is one article
def read_txt(corpus):
    return np.array([re.sub('\n', '', str(word)) for word in tqdm(pd.read_csv(corpus).text, desc='loading articles')])

# Keep Chinese characters only
def just_chinese(strings):
    expr = ''.join(re.findall(r"[\u4E00-\u9FA5]+", strings))
    if expr:
        return expr
    return '\n'
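
just_chinese concatenates every run of CJK characters; the '\n' fallback lets articles with no Chinese text flow through the pipeline, where '\n' is later filtered out as a stop word. For example:

just_chinese('ABC中文123测试')  # -> '中文测试'
just_chinese('12345')          # -> '\n'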

# Tokenize every article: keep Chinese characters only, then filter stop words
def split_word(original, stop_words=stop_words):
    result = []
    for paper in tqdm(original, desc='tokenizing articles'):
        chinese = just_chinese(paper)
        temp_split_words = filter_stop_word(chinese, stop_words)
        result.append(temp_split_words)
    return np.array(result, dtype=object)  # ragged rows: one word array per article

# Sort a {word: frequency} dict by key, in descending order
def sort_dict(dict_items):
    return dict(sorted(dict_items.items(), key=lambda x: x[0], reverse=True))
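
Sorting by key in descending order only fixes a deterministic vocabulary order, so word_dict and the word_vector derived from it line up across runs. For example:

sort_dict({'a': 0.1, 'b': 0.3})  # -> {'b': 0.3, 'a': 0.1}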

'''Data preprocessing'''
def data_preprocessing(corpus):
    # Read the raw articles
    read_original = read_txt(corpus)
    # Tokenize each article
    init_paper = split_word(read_original, stop_words)
    # Flatten every article's word list into one 1-D array
    all_words = np.array([j for i in tqdm(init_paper, desc='flattening word lists') for j in i])
    # Deduplicate to get the vocabulary
    word_vector = np.unique(all_words)
    # Total token count across the corpus
    m = all_words.size
    # Relative frequency of each vocabulary word over the whole corpus
    init_word_dict = {word: (all_words == word).sum() / m for word in tqdm(word_vector, desc='building frequency dict')}
    # Build the sorted dictionary and the feature vector
    word_dict = sort_dict(init_word_dict)
    word_vector = np.array(list(word_dict))
    return word_dict, word_vector, read_original, init_paper
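
data_preprocessing is the entry point for everything above; its four return values stay aligned with one another. A typical call:

word_dict, word_vector, read_original, init_paper = data_preprocessing(corpus)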


'''Standard TF-IDF bag-of-words'''
def TF(paper_words, word_vector):
    # Raw term counts of a single article over the global vocabulary
    m = word_vector.size
    init_TF = np.zeros(m)
    for word in paper_words:
        if word in word_vector:
            index_ = np.argwhere(word_vector == word)[0][0]
            init_TF[index_] += 1
    return init_TF

def IDF(paper_words_list, word_vector):
    # Document frequency of every vocabulary word, smoothed and log-scaled
    m = word_vector.size
    init_IDF = np.zeros(m)
    N = paper_words_list.shape[0]  # number of documents
    n = -1
    for word in tqdm(word_vector, desc='IDF vocabulary'):
        n += 1
        for paper_arr in paper_words_list:
            if word in paper_arr:
                init_IDF[n] += 1
    return np.log(N / (init_IDF + 1))
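
Concretely, this is the smoothed form IDF(w) = ln(N / (df(w) + 1)), where N is the number of documents and df(w) counts the documents containing w. With N = 100 and a word found in 9 documents, IDF = ln(100 / 10) = ln 10 ≈ 2.303; a word present in all 100 documents gets ln(100 / 101) ≈ -0.01, slightly negative because of the +1 smoothing.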

def TFIDF(paper_words_list, word_vector):
    # Weight each article's term counts by the corpus-wide IDF vector
    IDF_arr = IDF(paper_words_list, word_vector)
    TF_arr = np.array([TF(paper, word_vector) for paper in tqdm(paper_words_list, desc='TF matrix')])
    return TF_arr * IDF_arr
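
With the outputs of data_preprocessing, the corpus-level matrix follows in one call (a sketch, assuming the CSV from the static configuration exists locally); each row is one article's TF vector scaled elementwise by the shared IDF vector:

tfidf_matrix = TFIDF(init_paper, word_vector)  # shape: (number of articles, vocabulary size)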