Python: listing candidate proper nouns

1. Statistics

Count how often two words appear next to each other and score each adjacent pair with pointwise mutual information, PMI(w1, w2) = log2(P(w1, w2) / (P(w1) * P(w2))); pairs with a high score and enough occurrences are likely to form a proper noun.
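
For intuition, here is a toy PMI calculation with made-up counts (the numbers are invented purely for illustration); the full script follows.

import math

# Toy corpus of 1000 tokens: "机器" occurs 20 times, "学习" occurs 30 times,
# and the adjacent pair ("机器", "学习") occurs 15 times (made-up numbers).
total = 1000
p_w1, p_w2, p_pair = 20 / total, 30 / total, 15 / total
pmi = math.log(p_pair / (p_w1 * p_w2), 2)
print(pmi)  # about 4.64 -- a strong association, so "机器学习" is a good candidate term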

import jieba
from collections import Counter
from itertools import chain
import math
import pandas as pd
import os
import re

base_dir = os.path.dirname(__file__)
knowledge_path = os.path.join(base_dir, 'local_data', 'knowledge.json')
stop_words_path = os.path.join(base_dir, 'utils', 'stop_words_fuhao')
calc_user_words_path = os.path.join(base_dir, 'utils', 'calc_user_words')
df_knowledge_ = pd.read_json(knowledge_path)

with open(stop_words_path, 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')


def load_documents():
    # Load the documents; in a real application they would come from a file or database.
    # regex=True is needed so the HTML tags and entities are stripped as patterns.
    return df_knowledge_.description.str.replace(r'<.*?>|&nbsp;|\r|\n', '', regex=True).str.lower().to_list()


def get_words(documents, user_words=None):
    # Tokenise with jieba; previously mined terms are registered first so they stay whole.
    for i in user_words or ():
        jieba.add_word(i)
    # use_paddle=True requires the paddlepaddle package; drop it to use jieba's default mode.
    words = [[w for w in jieba.cut(doc, use_paddle=True) if (w not in stop_words) and (not w.isdigit())]
             for doc in documents]
    return list(chain(*words))


def get_bigrams(words):
    # Build the list of adjacent word pairs (bigrams).
    return [(words[i], words[i + 1]) for i in range(len(words) - 1)]


def calculate_pmi(bigrams, word_count, min_count=5):
    # Compute the PMI score for every bigram.
    # min_count: a candidate term must appear at least this many times.
    bigram_count = Counter(bigrams)
    total = sum(word_count.values())
    pmi = {}
    for bigram, freq in bigram_count.items():
        if freq >= min_count:  # drop low-frequency bigrams
            word1, word2 = bigram
            prob_word1 = word_count[word1] / total
            prob_word2 = word_count[word2] / total
            prob_bigram = freq / total
            pmi[bigram] = math.log(prob_bigram / (prob_word1 * prob_word2), 2)
    return pmi


def main(user_words):
    documents = load_documents()
    words = get_words(documents, user_words)
    word_count = Counter(words)
    bigrams = get_bigrams(words)
    pmi = calculate_pmi(bigrams, word_count)
    maybe_words = []
    # walk the bigrams from highest to lowest PMI and turn them into candidate terms
    for bigram, value in sorted(pmi.items(), key=lambda x: x[1], reverse=True):
        # print(f"{bigram}: {value}")
        concat_words = ''.join(bigram)
        if re.match(r'[a-zA-Z]*', concat_words).group() == concat_words:
            # purely English pairs are joined with a space
            maybe_words.append(' '.join(bigram))
        else:
            maybe_words.append(concat_words)
    return maybe_words


if __name__ == "__main__":
    ll = []
    count = 5  # number of passes: pass 1 finds 2-word terms, pass 2 finds 3-word terms, and so on
    while count:
        # feed the terms found so far back in as jieba user words so longer terms can merge further
        ll.extend(main(tuple(ll)))
        count -= 1
    sort_ll = sorted(set(ll))
    with open(calc_user_words_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join([f"{i} owner" for i in sort_ll]))
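
The output file follows jieba's user-dictionary format (one entry per line: word, optional frequency, optional part-of-speech tag; here the tag is "owner"), so a later run can register the mined terms directly, for example:

import jieba

# Register the mined terms (the same file written above) so jieba keeps them as single tokens;
# adjust the path to wherever the file was written.
jieba.load_userdict('utils/calc_user_words')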




2. transformers

Train a model on historical data: feed in the first word to predict a likely second word, then feed the first and second words together to predict a likely third word, and so on, building up candidate terms word by word.

...
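
The original post elides this part; below is a minimal sketch of the idea using the Hugging Face transformers library. The checkpoint name ('uer/gpt2-chinese-cluecorpussmall') and the greedy next-token loop are illustrative assumptions; in the setup described above, the model would first be trained or fine-tuned on the historical data.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Illustrative pretrained Chinese causal LM; the post implies training your own model instead.
model_name = 'uer/gpt2-chinese-cluecorpussmall'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()


def extend_term(first_word, steps=2):
    # Start from the first word, then repeatedly predict the most likely next token and
    # append it: first word -> second word -> third word, as described above.
    text = first_word
    for _ in range(steps):
        inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
        with torch.no_grad():
            logits = model(**inputs).logits
        next_id = int(logits[0, -1].argmax())
        text += tokenizer.decode([next_id]).strip()
    return text


print(extend_term('机器'))  # appends the two most likely continuations to the seed word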