2021SC@SDUSC
Let's look at the code:
class Tokenizer(object):
    # Initialize the tokenizer's state when an instance is created.
def __init__(self, dictionary=DEFAULT_DICT):
        # Reentrant thread lock, used to prevent concurrent threads from
        # operating on the same object at once, which could cause resource
        # contention, deadlocks, or corrupted reads and writes.
self.lock = threading.RLock()
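        # (General Python background, not jieba-specific: threading.RLock is
        # reentrant, so the thread already holding it may acquire it again
        # without deadlocking, unlike a plain threading.Lock.)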
        '''
        Dictionary selection. The default is dict.txt in the same folder as
        this module. From the source:
            DEFAULT_DICT = None
            DEFAULT_DICT_NAME = "dict.txt"
        '''
if dictionary == DEFAULT_DICT:
self.dictionary = dictionary
else:
self.dictionary = _get_abs_path(dictionary)
        # FREQ is the prefix dictionary, mapping each word (and each of its prefixes) to a frequency.
self.FREQ = {}
        # Total frequency (the sum of all word frequencies), used for weight ranking in the keyword-extraction algorithm.
self.total = 0
        # user_word_tag_tab stores the part-of-speech tags of user-added words.
self.user_word_tag_tab = {}
        # The jieba dictionary is lazily loaded, i.e. built on first use; to load it eagerly, call jieba.initialize().
self.initialized = False
self.tmp_dir = None
self.cache_file = None
    '''
    By overriding the class's __repr__() method, printing an instance of this
    class invokes __repr__() and outputs that method's return value.
    '''
def __repr__(self):
return '<Tokenizer dictionary=%r>' % self.dictionary
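    # Illustrative: repr(Tokenizer()) evaluates to "<Tokenizer dictionary=None>",
    # since the default dictionary constant DEFAULT_DICT is None (see above).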
    # Static method: read the dictionary file f and return the frequency of
    # every word (a dict) together with the total frequency of all words (an int).
    @staticmethod
    def gen_pfdict(f):
lfreq = {}
ltotal = 0
        # Resolve a printable name for f, for use in error messages. From the source:
'''
def resolve_filename(f):
try:
return f.name
except AttributeError:
return repr(f)
'''
f_name = resolve_filename(f)
        # Read the dictionary line by line and tally every word.
for lineno, line in enumerate(f, 1):
try:
line = line.strip().decode('utf-8')
                # Read the first two fields: the word and its frequency.
word, freq = line.split(' ')[:2]
freq = int(freq)
                # Add word -> freq to the dict lfreq, and accumulate freq into ltotal.
lfreq[word] = freq
ltotal += freq
                # Register every prefix of the word in lfreq with frequency 0
                # (unless the prefix is already a word in its own right); this
                # is what turns lfreq into a prefix dictionary.
                for ch in xrange(len(word)):  # xrange is provided by jieba's _compat Py2/3 shim
wfrag = word[:ch + 1]
if wfrag not in lfreq:
lfreq[wfrag] = 0
except ValueError:
raise ValueError(
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
f.close()
return lfreq, ltotal
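    # To make the prefix dictionary concrete, a hypothetical two-line dictionary
    # (entries invented for illustration) such as
    #     北京大学 2053 nt
    #     大学 1500 n
    # would yield
    #     lfreq  = {'北京大学': 2053, '北': 0, '北京': 0, '北京大': 0,
    #               '大学': 1500, '大': 0}
    #     ltotal = 3553
    # Every prefix is present, so later lookups can distinguish "not a word at
    # all" (absent) from "prefix of a longer word" (present with frequency 0).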
    # Initialize the tokenizer: load the dictionary and build the prefix dict.
def initialize(self, dictionary=None):
        # Dictionary selection: defaults to dict.txt in this package, or a user-supplied file.
if dictionary:
abs_path = _get_abs_path(dictionary)
if self.dictionary == abs_path and self.initialized:
return
else:
self.dictionary = abs_path
self.initialized = False
else:
abs_path = self.dictionary
        # Take the lock so that only one thread initializes this tokenizer at a time.
        with self.lock:
            # If another thread is currently (re)writing this dictionary's
            # cache, wait here until it releases the per-dictionary write lock.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Another thread may have finished initialization while we waited;
            # if so, there is nothing left to do.
if self.initialized:
return
default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
t1 = time.time()
if self.cache_file:
cache_file = self.cache_file
# default dictionary
elif abs_path == DEFAULT_DICT:
cache_file = "jieba.cache"
# custom dictionary
else:
cache_file = "jieba.u%s.cache" % md5(
abs_path.encode('utf-8', 'replace')).hexdigest()
cache_file = os.path.join(
self.tmp_dir or tempfile.gettempdir(), cache_file)
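            # Illustrative (paths hypothetical): the default dictionary caches
            # to e.g. /tmp/jieba.cache, while a custom dictionary embeds the
            # md5 of its absolute path, e.g. /tmp/jieba.u<md5>.cache.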
# prevent absolute path in self.cache_file
tmpdir = os.path.dirname(cache_file)
load_from_cache_fail = True
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
default_logger.debug(
"Loading model from cache %s" % cache_file)
                # The cache file exists and is newer than the dictionary: try to
                # load FREQ and total from it, and clear load_from_cache_fail on
                # success to record that loading from the cache worked.
try:
with open(cache_file, 'rb') as cf:
self.FREQ, self.total = marshal.load(cf)
load_from_cache_fail = False
except Exception:
load_from_cache_fail = True
            # Loading from the cache failed or was skipped: rebuild the prefix
            # dict from the dictionary file and rewrite the cache.
if load_from_cache_fail:
wlock = DICT_WRITING.get(abs_path, threading.RLock())
DICT_WRITING[abs_path] = wlock
with wlock:
self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
default_logger.debug(
"Dumping model to file cache %s" % cache_file)
try:
# prevent moving across different filesystems
fd, fpath = tempfile.mkstemp(dir=tmpdir)
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump(
(self.FREQ, self.total), temp_cache_file)
_replace_file(fpath, cache_file)
except Exception:
default_logger.exception("Dump cache file failed.")
try:
del DICT_WRITING[abs_path]
except KeyError:
pass
            # Initialization of the tokenizer is complete; set initialized to True.
self.initialized = True
default_logger.debug(
"Loading model cost %.3f seconds." % (time.time() - t1))
default_logger.debug("Prefix dict has been built successfully.")
    # Check whether the tokenizer has been initialized, and lazily initialize
    # it if not; the segmentation APIs call this before doing any real work.
def check_initialized(self):
if not self.initialized:
self.initialize()
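check_initialized is what makes the lazy loading visible in practice: the prefix dict is only built on the first call that actually needs it. A minimal sketch from the caller's side, using jieba's standard module-level API (output illustrative):

import jieba

# Nothing is loaded at import time; the prefix dict does not exist yet.
jieba.initialize()  # optional: force eager loading instead of lazy loading
print(jieba.lcut('我来到北京清华大学'))  # a first cut would equally trigger initialization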