2021SC@SDUSC
Let's look at the code:
class Tokenizer(object):
    # Initialize the tokenizer's state when an instance is created.
def __init__(self, dictionary=DEFAULT_DICT):
        # Reentrant thread lock, used to prevent concurrent threads from
        # operating on the same object at once, which could cause resource
        # contention, deadlocks, or corrupted reads and writes.
self.lock = threading.RLock()
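        # (General Python background, not jieba-specific: threading.RLock is
        # reentrant, so the thread already holding it may acquire it again
        # without deadlocking, unlike a plain threading.Lock.)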
        '''
        Dictionary selection. The default is dict.txt in the same folder as
        this module. From the source:
            DEFAULT_DICT = None
            DEFAULT_DICT_NAME = "dict.txt"
        '''
if dictionary == DEFAULT_DICT:
self.dictionary = dictionary
else:
self.dictionary = _get_abs_path(dictionary)
        # FREQ is the prefix dictionary, mapping each word (and each of its prefixes) to a frequency.
self.FREQ = {}
        # Total frequency (the sum of all word frequencies), used for weight ranking in the keyword-extraction algorithm.
self.total = 0
        # user_word_tag_tab stores the part-of-speech tags of user-added words.
self.user_word_tag_tab = {}
        # The jieba dictionary is lazily loaded, i.e. built on first use; to load it eagerly, call jieba.initialize().
self.initialized = False
self.tmp_dir = None
self.cache_file = None
    '''
    By overriding the class's __repr__() method, printing an instance of this
    class invokes __repr__() and outputs that method's return value.
    '''
def __repr__(self):
return '<Tokenizer dictionary=%r>' % self.dictionary
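    # Illustrative: repr(Tokenizer()) evaluates to "<Tokenizer dictionary=None>",
    # since the default dictionary constant DEFAULT_DICT is None (see above).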
    # Static method: read the dictionary file f and return the frequency of
    # every word (a dict) together with the total frequency of all words (an int).
    @staticmethod
    def gen_pfdict(f):
lfreq = {}
ltotal = 0
        # Resolve a printable name for f, for use in error messages. From the source:
'''
def resolve_filename(f):
try:
return f.name
except AttributeError:
return repr(f)
'''
f_name = resolve_filename(f)
        # Read the dictionary line by line and tally every word.
for lineno, line in enumerate(f, 1):
try:
line = line.strip().decode('utf-8')
                # Read the first two fields: the word and its frequency.
word, freq = line.split(' ')[:2]
freq = int(freq)
                # Add word -> freq to the dict lfreq, and accumulate freq into ltotal.
lfreq[word] = freq
ltotal += freq
                # Register every prefix of the word in lfreq with frequency 0
                # (unless the prefix is already a word in its own right); this
                # is what turns lfreq into a prefix dictionary.
                for ch in xrange(len(word)):  # xrange is provided by jieba's _compat Py2/3 shim
wfrag = word[:ch + 1]
if wfrag not in lfreq:
lfreq[wfrag] = 0
except ValueError:
raise ValueError(
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
f.close()
return lfreq, ltotal
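    # To make the prefix dictionary concrete, a hypothetical two-line dictionary
    # (entries invented for illustration) such as
    #     北京大学 2053 nt
    #     大学 1500 n
    # would yield
    #     lfreq  = {'北京大学': 2053, '北': 0, '北京': 0, '北京大': 0,
    #               '大学': 1500, '大': 0}
    #     ltotal = 3553
    # Every prefix is present, so later lookups can distinguish "not a word at
    # all" (absent) from "prefix of a longer word" (present with frequency 0).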
    # Initialize the tokenizer: load the dictionary and build the prefix dict.
def initialize(self, dictionary=None):
        # Dictionary selection: defaults to dict.txt in this package, or a user-supplied file.
if dictionary:
abs_path = _get_abs_path(dictionary)
if self.dictionary == abs_path and self.initialized:
return
else:
self.dictionary = abs_path
self.initialized = False
else:
abs_path = self.dictionary
        # Take the lock so that only one thread initializes this tokenizer at a time.
        with self.lock:
            # If another thread is currently (re)writing this dictionary's
            # cache, wait here until it releases the per-dictionary write lock.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Another thread may have finished initialization while we waited;
            # if so, there is nothing left to do.
if self.initialized:
return
default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
t1 = time.time()
if self.cache_file:
cache_file = self.cache_file
# default dictionary
elif abs_path == DEFAULT_DICT:
cache_file = "jieba.cache"
# custom dictionary
else:
cache_file = "jieba.u%s.cache" % md5(
abs_path.encode('utf-8', 'replace')).hexdigest()
cache_file = os.path.join(
self.tmp_dir or tempfile.gettempdir(), cache_file)
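            # Illustrative (paths hypothetical): the default dictionary caches
            # to e.g. /tmp/jieba.cache, while a custom dictionary embeds the
            # md5 of its absolute path, e.g. /tmp/jieba.u<md5>.cache.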
# prevent absolute path in self.cache_file
tmpdir = os.path.dirname(cache_file)
load_from_cache_fail = True
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
default_logger.debug(
"Loading model from cache %s" % cache_file)
                # The cache file exists and is newer than the dictionary: try to
                # load FREQ and total from it, and clear load_from_cache_fail on
                # success to record that loading from the cache worked.
try:
with open(cache_file, 'rb') as cf:
self.FREQ, self.total = marshal.load(cf)
load_from_cache_fail = False
except Exception:
load_from_cache_fail = True
            # Loading from the cache failed or was skipped: rebuild the prefix
            # dict from the dictionary file and rewrite the cache.
if load_from_cache_fail:
wlock = DICT_WRITING.get(abs_path, threading.RLock())
DICT_WRITING[abs_path] = wlock
with wlock:
self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
default_logger.debug(
"Dumping model to file cache %s" % cache_file)
try:
# prevent moving across different filesystems
fd, fpath = tempfile.mkstemp(dir=tmpdir)
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump(
(self.FREQ, self.total), temp_cache_file)
_replace_file(fpath, cache_file)
except Exception:
default_logger.exception("Dump cache file failed.")
try:
del DICT_WRITING[abs_path]
except KeyError:
pass
            # Initialization of the tokenizer is complete; set initialized to True.
self.initialized = True
default_logger.debug(
"Loading model cost %.3f seconds." % (time.time() - t1))
default_logger.debug("Prefix dict has been built successfully.")
    # Check whether the tokenizer has been initialized, and lazily initialize
    # it if not; the segmentation APIs call this before doing any real work.
def check_initialized(self):
if not self.initialized:
self.initialize()
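check_initialized is what makes the lazy loading visible in practice: the prefix dict is only built on the first call that actually needs it. A minimal sketch from the caller's side, using jieba's standard module-level API (output illustrative):

import jieba

# Nothing is loaded at import time; the prefix dict does not exist yet.
jieba.initialize()  # optional: force eager loading instead of lazy loading
print(jieba.lcut('我来到北京清华大学'))  # a first cut would equally trigger initialization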