# 同义词替换,停词去除 (synonym replacement and stop-word removal)

# -*- coding: utf-8 -*-
# 去重:①同义词去重,停顿词去掉 ②删除特殊符号  ③ 删除重复
import re
from typing import Iterable
import time
import jieba

# Load jieba's dictionary eagerly at import time instead of lazily on the
# first segmentation call.
jieba.initialize()
# NOTE(review): the stop-word file is registered as a jieba user dictionary,
# presumably so that each stop word is segmented as one token -- confirm.
# The path is resolved relative to the current working directory.
jieba.load_userdict('./stopword.txt')

# Synonym table handed to replace_word() by the script entry point: each key
# is the canonical word, each list holds the variants rewritten to that key.
tongyici = {
    '梦见': ['梦到', '做梦'],
    '怎么': ['怎么样', '如何'],
    '男孩': ['男孩子', '男宝宝', '男生'],
    '女孩': ['女孩子', '女宝宝', '女生', '闺女'],
}


# Strip special symbols and ASCII letters.
def remove_char(word_text: str) -> str:
    """Return *word_text* with symbol characters and ASCII letters removed.

    Two passes are applied in order:

    1. Every character in the punctuation/symbol class below is deleted.
       The class contains the plain space but not the newline, so line
       boundaries survive.
    2. Every character matched by ``[a-z]`` (case-insensitively) is deleted.

    NOTE(review): the original comment described step 2 as "delete single
    letters", but its pattern removed *every* letter because the groups
    around the letter were allowed to match the empty string. That behaviour
    is preserved here; confirm whether only isolated letters were intended.

    Args:
        word_text: Raw text, possibly spanning several lines.

    Returns:
        The cleaned text.
    """
    word_text = re.sub(r"[! ?:;$#^&*()@+\-\\|=_—…%¥!《》.,<>?。,:;’“【】、]+", "", word_text)
    # The earlier version captured the text on both sides of the letter and
    # re-inserted it through a non-raw replacement template, which contains
    # an invalid escape sequence (a SyntaxWarning on recent Python versions).
    # Deleting the letter directly yields exactly the same output.
    return re.sub(r'[a-z]', '', word_text, flags=re.I)


# Normalise synonyms to their canonical word.
def replace_word(tyc: dict, word: str) -> list:
    """Rewrite every synonym in *word* to its canonical form, then split lines.

    Args:
        tyc: Mapping of canonical word -> iterable of synonym strings. The
            entries are applied one after another in the mapping's iteration
            order.
        word: Text to process; individual entries are separated by newline
            characters.

    Returns:
        The rewritten text split on newline characters, one list item per
        line (an empty input yields ``['']``).
    """
    new_word = word
    for canonical, synonyms in tyc.items():
        # Longest first, so a short synonym can never pre-empt a longer one
        # that starts with it. Empty strings are dropped for the same reason
        # as empty lists below.
        literals = sorted((s for s in synonyms if s), key=len, reverse=True)
        if not literals:
            # An empty alternation matches between every character and would
            # insert the canonical word everywhere.
            continue
        # Synonyms are plain text, not regular expressions.
        pattern = '|'.join(re.escape(s) for s in literals)
        # A callable replacement inserts the canonical word verbatim, with no
        # backslash or group-reference processing.
        new_word = re.sub(pattern, lambda _m, rep=canonical: rep, new_word)
    return new_word.split('\n')



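# Remove stop words. The nested-loop version below builds the dict very slowly when the vocabulary is large, so it is kept only as a commented-out reference.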
# def repalce_stopword(stop_words: Iterable, text: Iterable):
#     result = {}
#     for word in text:
#         new_word = word
#         for item in stop_words:
#             new_word = re.sub(item, '', new_word)
#         result[word] = new_word
    # 去除重复
    # return result

def repalce_stopword(stop_words, text):
    """Map every phrase in *text* to the same phrase without stop words.

    Each phrase is segmented with jieba and only whole tokens that appear in
    *stop_words* are dropped; the remaining tokens are concatenated in their
    original order. Working on tokens (rather than substituting substrings)
    keeps a stop word from being cut out of the middle of a longer word.

    Args:
        stop_words: Set of stop-word strings (it is intersected with the
            token set of each phrase using ``&``).
        text: Iterable of phrases.

    Returns:
        Dict of original phrase -> phrase with stop-word tokens removed.
    """
    def _strip(phrase):
        tokens = jieba.lcut(phrase)
        # Tokens of this phrase that are also stop words.
        hits = set(tokens) & stop_words
        return ''.join(tok for tok in tokens if tok not in hits)

    return {phrase: _strip(phrase) for phrase in text}

def quchong(word):
    """Invert *word*, grouping its keys by their value.

    Args:
        word: Dict whose values are hashable.

    Returns:
        Dict of value -> list of the keys that mapped to it, with both the
        groups and the keys inside each group in first-seen order.
    """
    grouped = {}
    for original, cleaned in word.items():
        # Create the bucket on first sight, then append to it.
        grouped.setdefault(cleaned, []).append(original)
    return grouped


if __name__ == '__main__':
    # Read the raw keyword list and strip symbols/letters from it. Timing
    # starts once the input file is open.
    with open('keywords.txt', encoding='utf-8') as fd:
        start_time = time.time()
        words = remove_char(fd.read())
    # Normalise synonyms and split the text into one keyword per line.
    next_text = replace_word(tongyici, words)

    with open('stopword.txt', encoding='utf-8') as fs:
        # One stop word per line; surrounding whitespace is stripped.
        stop_word = {w.strip() for w in fs}

    # keyword -> keyword without stop words, then group the keywords that
    # collapse to the same cleaned form.
    next_stop = repalce_stopword(stop_word, next_text)
    qc_word = quchong(next_stop)

    # Keep the longest original keyword of every group; ties keep the one
    # seen first. (The previous loop stored the list index instead of the
    # keyword, which then broke the next len() comparison.)
    result = {key: max(val, key=len) for key, val in qc_word.items()}

    with open('quchong.txt', "w", encoding='utf-8') as fd:
        for key, val in result.items():
            fd.write(f"[{key}]\t[{val}]\n")
    # Original selection goal: keep the word with the most characters, the
    # highest search volume and the lowest competition. Only the length
    # criterion is implemented.

    print(f'处理耗时:{time.time() - start_time:.3f}s')

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值