Discovering New Words | Unsupervised Vocabulary Construction in NLP (Part 4)

Preface

  This article is a performance optimization of Discovering New Words | Unsupervised Vocabulary Construction in NLP (Part 3). The main changes are the following two:

1. The ngram counting now uses the count_ngrams program from the language-model toolkit KenLM. Since KenLM is written in C++, speed is guaranteed, and it is also well optimized, so it is memory-friendly.
2. In the second pass over the corpus to extract candidate words, a Trie is used to speed up checking whether a string contains one of the ngrams. A Trie (or one of its variants) is standard equipment in virtually every dictionary-based tokenizer, precisely because it makes it fast to look up whether any dictionary word occurs in a string.

I. Data Description

  The experiments in this article are run on a corpus of over ten thousand product names, formatted as follows:
(Figure: sample of the product-name corpus)

II. Experiment Code

  For the code, see: 更好更快的新词发现 ("Better, Faster New Word Discovery").
  First, define a few variables and a class for displaying progress.

#! -*- coding: utf-8 -*-
import re
import struct
import os
import six
import codecs
import math
import logging

min_count = 2
order = 4
Entropy_Threshold = [0, 2, 4, 6]  # PMI thresholds, one per ngram order
corpus_file = 'data/thucnews.corpus'  # file name for the exported corpus
vocab_file = 'data/thucnews.chars'  # file name for the character set
ngram_file = 'data/thucnews.ngrams'  # file name for the ngram counts
output_file = 'data/thucnews_min.vocab'  # file name for the final exported vocabulary
memory = 0.5  # fraction of memory to use; must not exceed what is actually available

logging.basicConfig(level=logging.INFO, format=u'%(asctime)s - %(levelname)s - %(message)s')


class Progress:
    """显示进度,自己简单封装,比tqdm更可控一些
    iterator: 可迭代的对象;
    period: 显示进度的周期;
    steps: iterator可迭代的总步数,相当于len(iterator)
    """

    def __init__(self, iterator, period=1, steps=None, desc=None):
        self.iterator = iterator
        self.period = period
        if hasattr(iterator, '__len__'):
            self.steps = len(iterator)
        else:
            self.steps = steps
        self.desc = desc
        if self.steps:
            self._format_ = u'%s/%s passed' % ('%s', self.steps)
        else:
            self._format_ = u'%s passed'
        if self.desc:
            self._format_ = self.desc + ' - ' + self._format_
        self.logger = logging.getLogger()

    def __iter__(self):
        for i, j in enumerate(self.iterator):
            if (i + 1) % self.period == 0:
                self.logger.info(self._format_ % (i + 1))
            yield j
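
A minimal usage sketch of the Progress class above; it logs "demo - 25/100 passed", then 50, 75 and 100, while iterating:

for _ in Progress(range(100), period=25, desc='demo'):
    pass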

1. Export the corpus as plain text

  The main operation in this step is to separate the tokens of the corpus with spaces, because KenLM expects a plain-text corpus tokenized by spaces as its input.

def write_corpus(texts, filename):
    """将语料写到文件中,词与词(字与字)之间用空格隔开
    """
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for s in Progress(texts, 10000, desc=u'exporting corpus'):
            s = ' '.join(s) + '\n'
            f.write(s)


# Corpus generator, with some initial preprocessing
def text_generator():
    d = codecs.open("data/file_corpus_min.txt", encoding='utf-8').read()
    # d = d.replace(u'\u3000', ' ').strip()
    # keep CJK characters, digits, letters and spaces; collapse everything else into line breaks
    yield re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', d)


# 1. Export the corpus as plain text
write_corpus(text_generator(), corpus_file)
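
Note that ' '.join(s) iterates over a string character by character, so the exported corpus is separated per character rather than per word; KenLM then treats each character as a "word". A quick check:

print(' '.join(u'高级电热水壶'))  # -> 高 级 电 热 水 壶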

The generated thucnews.corpus file looks like this:
(Figure: sample lines of thucnews.corpus)

2. Count ngrams with KenLM

  This step calls KenLM's count_ngrams program to count the ngrams. You therefore need to compile KenLM yourself (see: ubuntu20.04 | 安装编译kenlm) and place its count_ngrams binary in the same directory as word_discovery.py. Before using it, grant count_ngrams execute permission.

(base) liujie@liujie-ThinkPad-L490:~/projects/PycharmProjects/word-discovery$ chmod +x count_ngrams 

An example invocation:

./count_ngrams -S 50% -o 4 --write_vocab_list output/test2.chars <output/test2.corpus >output/test2.ngrams
  • -S [ --memory ] arg (=80%): memory budget for sorting
  • -o n: highest ngram order to collect
  • --write_vocab_list path1 <path2 >path3: the character-set (vocab) file, the input corpus, and the output ngram file, respectively

The Python wrapper around this call:
def count_ngrams(corpus_file, order, vocab_file, ngram_file, memory=0.5):
    """通过os.system调用Kenlm的count_ngrams来统计频数
    其中,memory是占用内存比例,理论上不能超过可用内存比例。
    """
    done = os.system(
        './count_ngrams -o %s --memory=%d%% --write_vocab_list %s <%s >%s'
        % (order, memory * 100, vocab_file, corpus_file, ngram_file)
    )
    if done != 0:
        raise ValueError('Failed to count ngrams by KenLM.')


# 2. Count ngrams with KenLM
count_ngrams(corpus_file, order, vocab_file, ngram_file, memory)

The generated files are binary; they are decoded in the next step.
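
As read_ngrams below shows, each record in the .ngrams file is a fixed-size block: order 32-bit word ids followed by one 64-bit count, i.e. order * 4 + 8 bytes. A minimal sketch decoding a single record, assuming the same native struct layout as the class below (64-bit Linux):

import struct

ORDER = 4
RECORD_SIZE = ORDER * 4 + 8  # four uint32 word ids + one 64-bit count = 24 bytes

with open(ngram_file, 'rb') as f:
    record = f.read(RECORD_SIZE)
    word_ids = struct.unpack('4i', record[:16])  # indices into the character list
    count = struct.unpack('l', record[16:])[0]   # frequency of this 4-gram
    print(word_ids, count)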

3. Load KenLM's ngram counts

  This step decodes the binary file format and loads the counts into memory.

class KenlmNgrams:
    """加载Kenlm的ngram统计结果
    vocab_file: Kenlm统计出来的词(字)表;
    ngram_file: Kenlm统计出来的ngram表;
    order: 统计ngram时设置的n,必须跟ngram_file对应;
    min_count: 自行设置的截断频数。
    """

    def __init__(self, vocab_file, ngram_file, order, min_count):
        self.vocab_file = vocab_file
        self.ngram_file = ngram_file
        self.order = order
        self.min_count = min_count
        self.read_chars()  # read the word (character) list
        self.read_ngrams()  # read the ngram table

    def read_chars(self):
        f = open(self.vocab_file)
        chars = f.read()
        f.close()
        chars = chars.split('\x00')
        # six.PY2 is a boolean: True when running under Python 2
        self.chars = [i.decode('utf-8') if six.PY2 else i for i in chars]

    def read_ngrams(self):
        """读取思路参考https://github.com/kpu/kenlm/issues/201
        """
        self.ngrams = [{} for _ in range(self.order)]
        self.total = 0
        size_per_item = self.order * 4 + 8  # order uint32 word ids + one 64-bit count

        def ngrams():
            with open(self.ngram_file, 'rb') as f:
                while True:
                    s = f.read(size_per_item)
                    if len(s) == size_per_item:
                        n = self.unpack('l', s[-8:])
                        yield s, n
                    else:
                        break

        for s, n in Progress(ngrams(), 100000, desc=u'loading ngrams'):
            if n >= self.min_count:
                self.total += n
                c = [self.unpack('i', s[j * 4: (j + 1) * 4]) for j in range(self.order)]
                # ids 0-2 are KenLM's special tokens (presumably <unk>, <s>, </s>), so skip them
                c = ''.join([self.chars[j] for j in c if j > 2])
                for j in range(len(c)):
                    self.ngrams[j][c[:j + 1]] = self.ngrams[j].get(c[:j + 1], 0) + n

    def unpack(self, t, s):
        # struct.unpack converts bytes into the corresponding Python value
        # t='l': native long (8 bytes on 64-bit Linux) -> int
        # t='i': native int (4 bytes) -> int
        return struct.unpack(t, s)[0]


# 3. Load the ngrams
ngrams = KenlmNgrams(vocab_file, ngram_file, order, min_count)

4. Filter ngrams

  Filter the ngrams by (pointwise) mutual information. [0, 2, 4, 6] are the PMI thresholds: the leading 0 is meaningless and only pads the list, while 2, 4, 6 are the thresholds for 2grams, 3grams, and 4grams respectively; values that increase with the order generally work well.
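
Concretely, for an ngram w the code below takes the minimum, over every way of splitting w into a prefix and a suffix, of log(total * count(w) / (count(prefix) * count(suffix))), and keeps w only if that minimum clears the threshold for its order. A toy calculation with hypothetical counts:

import math

# Hypothetical counts, for illustration only: in a corpus of total = 10000
# character tokens, suppose count('水壶') = 50, count('水') = 200, count('壶') = 60.
total, c_ab, c_a, c_b = 10000.0, 50, 200, 60
print(math.log(total * c_ab / (c_a * c_b)))  # ~3.73 > 2, so '水壶' survives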

# Filter ngrams by PMI
def filter_ngrams(ngrams, total, min_pmi=1):
    """Filter ngrams by mutual information, keeping only the "solid" ones.
    :param ngrams: [{1gram counts}, {2gram counts}, {3gram counts}, {4gram counts}]
    :param total: total token count
    :param min_pmi: PMI threshold(s), one per ngram order
    :return: the set of ngrams that pass their threshold
    """
    order = len(ngrams)
    if hasattr(min_pmi, '__iter__'):
        min_pmi = list(min_pmi)
    else:
        min_pmi = [min_pmi] * order
    output_ngrams = set()
    total = float(total)
    for i in range(order - 1, 0, -1):
        for w, v in ngrams[i].items():
            pmi = min([
                total * v / (ngrams[j].get(w[:j + 1], total) * ngrams[i - j - 1].get(w[j + 1:], total))
                for j in range(i)
            ])
            if math.log(pmi) >= min_pmi[i]:
                output_ngrams.add(w)
    return output_ngrams


# 4. Filter the ngrams
ngrams = filter_ngrams(ngrams.ngrams, ngrams.total, Entropy_Threshold)

5. Build a trie, pre-tokenize, and collect candidate words

  First, build a trie of the ngrams; with this trie we can do a basic "pre-tokenization". During this second pass over the corpus to collect candidate words, the trie speeds up checking whether a string contains any of the ngrams. A trie (or one of its variants) is standard in virtually every dictionary-based tokenizer for exactly this reason: it makes it fast to find dictionary words inside a string. A toy usage example follows the class below.

# 5. Build a trie of the ngrams, then use it for a basic pre-tokenization:
#    any fragment that appears in the trie is kept unsplit
class SimpleTrie:
    """Search for contiguous fragments composed of the ngrams, via a trie
    """

    def __init__(self):
        self.dic = {}
        self.end = True

    def add_word(self, word):
        _ = self.dic
        # walk through the characters of the word
        for c in word:
            # create a child node if the character is not present yet
            if c not in _:
                _[c] = {}
            _ = _[c]
        # mark the end of a complete word
        _[self.end] = word

    def tokenize(self, sent):
        """
        通过最长联接的方式来对句子进行分词
        只要一个片段出现在字典树中,这个片段就不切分
        :param sent: 句子
        :return:
        """
        result = []
        start, end = 0, 1
        for i, c1 in enumerate(sent):
            _ = self.dic
            if i == end:
                result.append(sent[start: end])
                start, end = i, i + 1
            for j, c2 in enumerate(sent[i:]):
                if c2 in _:
                    _ = _[c2]
                    if self.end in _:
                        if i + j + 1 > end:
                            end = i + j + 1
                else:
                    break
        result.append(sent[start: end])
        return result
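
A toy illustration of the trie's longest-match behavior, with two hypothetical ngrams:

toy = SimpleTrie()
toy.add_word(u'电热')
toy.add_word(u'水壶')
print(toy.tokenize(u'高级电热水壶'))  # -> ['高', '级', '电热', '水壶']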


# Build the trie of ngrams
ngtrie = SimpleTrie()
for w in Progress(ngrams, 10000, desc=u'build ngram trie'):
    ngtrie.add_word(w)

# Collect candidate words
candidates = {}
for t in Progress(text_generator()):
    for w in ngtrie.tokenize(t):  # pre-tokenization
        candidates[w] = candidates.get(w, 0) + 1

6. Candidate backtracking

# 6. Candidate filtering (backtracking):
# if a candidate has at most order characters, check whether it is in the filtered
# ngram set G; if not, it is out. If it has more than order characters, check every
# order-character window; if even one window is not in G, it is out.
def filter_vocab(candidates, ngrams, order):
    """通过与ngrams对比,排除可能出来的不牢固的词汇(回溯)
    """
    result = {}
    for i, j in candidates.items():
        if len(i) < 3:
            # 1- and 2-character candidates are kept as-is
            result[i] = j
        elif len(i) <= order and i in ngrams:
            result[i] = j
        elif len(i) > order:
            flag = True
            for k in range(len(i) + 1 - order):
                if i[k: k + order] not in ngrams:
                    flag = False
            if flag:
                result[i] = j
    return result


# frequency cutoff
candidates = {i: j for i, j in candidates.items() if j >= min_count}
# PMI filtering (backtracking)
candidates = filter_vocab(candidates, ngrams, order)
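
To make the windowing rule concrete, a hypothetical check with order = 4: a 5-character candidate survives only if both of its 4-character windows are in the filtered ngram set G.

G = {u'小家电水', u'家电水壶'}  # hypothetical surviving 4grams
cand = u'小家电水壶'
windows = [cand[k:k + 4] for k in range(len(cand) + 1 - 4)]
print(windows)                       # ['小家电水', '家电水壶']
print(all(w in G for w in windows))  # True -> the candidate is kept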

7. Write the result file

# 7. Write the result file
with codecs.open(output_file, 'w', encoding='utf-8') as f:
    for i, j in sorted(candidates.items(), key=lambda s: -s[1]):
        if len(i) != 1:  # skip single characters
            s = '%s %s\n' % (i, j)
            f.write(s)

A sample of the results:
(Figure: extracted vocabulary entries with their frequencies)

