jieba库:Tokenizer()类详解(一)初始化

2021SC@SDUSC


下面来看代码:

class Tokenizer(object):
    """Holds a dictionary path, its prefix-frequency table and cache settings.

    The dictionary is loaded lazily: ``__init__`` only records settings, and
    ``initialize`` (triggered by ``check_initialized`` when needed) fills
    ``FREQ`` and ``total``.
    """

    def __init__(self, dictionary=DEFAULT_DICT):
        """Record which dictionary to use without loading it.

        Args:
            dictionary: Dictionary path. A value equal to ``DEFAULT_DICT`` is
                stored unchanged; anything else is converted with
                ``_get_abs_path``.
        """
        # Re-entrant *thread* lock (threading.RLock) guarding initialization
        # of this instance.
        self.lock = threading.RLock()
        # Per the original note, the upstream module defines
        # DEFAULT_DICT = None and DEFAULT_DICT_NAME = "dict.txt".
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        # Word (and word-prefix) -> frequency table; filled by initialize().
        self.FREQ = {}
        # Sum of all word frequencies; filled by initialize().
        self.total = 0
        # Word -> part-of-speech tag table (per the original note; it is not
        # populated anywhere in this excerpt).
        self.user_word_tag_tab = {}
        # Loading is deferred until first use; initialize() flips this flag.
        self.initialized = False
        # Optional overrides for the cache directory and cache file name,
        # read by initialize().
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        """Return a debug representation showing the configured dictionary."""
        return '<Tokenizer dictionary=%r>' % self.dictionary

    @staticmethod
    def gen_pfdict(f):
        """Build the prefix-frequency table from an open dictionary file.

        Every line is decoded as UTF-8 and split on single spaces; the first
        two fields are taken as the word and its integer frequency. Each
        proper prefix of a word that is not yet in the table is added with
        frequency 0.

        Args:
            f: Iterable of byte lines that also provides ``close()`` (for
                example a file opened in binary mode). It is always closed
                before this function returns or raises.

        Returns:
            A ``(lfreq, ltotal)`` tuple: the word/prefix -> frequency dict
            and the sum of all frequencies read.

        Raises:
            ValueError: If a line cannot be decoded or parsed; the message
                names the file, the line number and the offending line.
        """
        lfreq = {}
        ltotal = 0
        # Per the original note, resolve_filename returns f.name and falls
        # back to repr(f) when the object has no such attribute.
        f_name = resolve_filename(f)
        try:
            for lineno, line in enumerate(f, 1):
                try:
                    line = line.strip().decode('utf-8')
                    # Only the first two fields (word, frequency) are used.
                    word, freq = line.split(' ')[:2]
                    freq = int(freq)
                    lfreq[word] = freq
                    ltotal += freq
                    # Register every prefix of the word so that lookups can
                    # tell "is a prefix of some word" from "unknown".
                    for ch in xrange(len(word)):
                        wfrag = word[:ch + 1]
                        if wfrag not in lfreq:
                            lfreq[wfrag] = 0
                except ValueError:
                    raise ValueError(
                        'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        finally:
            # Close the file even when a malformed entry aborts the loop;
            # previously the handle leaked on the error path.
            f.close()
        return lfreq, ltotal

    def initialize(self, dictionary=None):
        """Load the dictionary into ``FREQ``/``total``, using a cache if possible.

        Args:
            dictionary: Optional dictionary path. When given it is resolved
                with ``_get_abs_path``; if it matches the current dictionary
                and the instance is already initialized the call returns
                immediately, otherwise the instance switches to it and is
                reloaded. When omitted, ``self.dictionary`` is used.

        The cache file is ``self.cache_file`` if set, ``"jieba.cache"`` for
        the default dictionary, or ``"jieba.u<md5 of path>.cache"`` for a
        custom one, located in ``self.tmp_dir`` or the system temp directory.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary
        with self.lock:
            # If a writer lock is registered for this dictionary, acquire and
            # release it so we wait until that writer has finished.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Nothing to do if initialization has already completed.
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # Use the cache only if it exists and, for a custom dictionary,
            # is newer than the dictionary file itself.
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                                               os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any failure reading the cache falls back to a rebuild.
                    load_from_cache_fail = True
            # Cache missing, stale or unreadable: rebuild from the dictionary.
            if load_from_cache_fail:
                # Register (or reuse) the writer lock for this dictionary.
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    # Writing the cache is best-effort: a failure is logged
                    # and the freshly built table is still used.
                    # NOTE(review): if dumping or moving fails, the temp file
                    # created by mkstemp is not removed here.
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                # Unregister the writer lock now that the build is finished.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass
            # Initialization finished.
            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        """Run ``initialize()`` if this instance has not been initialized yet."""
        if not self.initialized:
            self.initialize()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值