from __future__ import absolute_import, unicode_literals
__version__ = '0.39'
__license__ = 'MIT'
import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg
# Pick the callable used to move a file onto its final path: shutil.move on
# Windows, os.rename everywhere else.
# NOTE(review): presumably because os.rename refuses to overwrite an existing
# destination on Windows -- confirm against the call sites.
if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename


def _get_abs_path(path):
    """Return *path* joined onto the current working directory and normalized."""
    return os.path.normpath(os.path.join(os.getcwd(), path))
# File name of the stock dictionary (presumably shipped next to this module).
DEFAULT_DICT_NAME = "dict.txt"
# Sentinel dictionary path: None stands for "the default dictionary".
DEFAULT_DICT = None

# Module logger: emits DEBUG and above to stderr.
default_logger = logging.getLogger(__name__)
log_console = logging.StreamHandler(sys.stderr)
default_logger.addHandler(log_console)
default_logger.setLevel(logging.DEBUG)

# Not referenced in the visible part of the file.
pool = None
# Keyed by dictionary path; Tokenizer.initialize enters the stored value as a
# context manager (presumably a lock held while a cache file is written).
DICT_WRITING = {}
# One user-dictionary line: a word, then an optional " <digits>" group and an
# optional " <lowercase letters>" group (presumably frequency and tag).
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

# A single ASCII letter or digit.
re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._% : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
# Regex backslashes are doubled so the string literals contain no invalid
# escape sequences; the \u escapes stay single so they still denote the CJK
# code points themselves. The compiled patterns are unchanged.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\\._%]+)", re.U)
re_skip_default = re.compile("(\r\n|\\s)", re.U)

# Full-mode variants: runs of CJK characters, and any single character that is
# not an ASCII letter, digit, '+', '#' or newline.
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
def setLogLevel(log_level):
    """Set the threshold of the module-level ``default_logger`` to *log_level*."""
    # No ``global`` statement is needed: the logger is only read, never rebound.
    default_logger.setLevel(log_level)
class Tokenizer(object):
def __init__(self, dictionary=DEFAULT_DICT):
    """Create an uninitialized tokenizer bound to *dictionary*.

    A value equal to ``DEFAULT_DICT`` is stored unchanged; any other value
    is resolved through ``_get_abs_path``. Nothing is loaded here: the
    frequency table starts empty and ``initialized`` is False.
    """
    self.lock = threading.RLock()
    self.dictionary = (
        dictionary if dictionary == DEFAULT_DICT
        else _get_abs_path(dictionary))
    # Empty state.
    self.FREQ, self.total = {}, 0
    self.user_word_tag_tab = {}
    self.initialized = False
    # Unset by default; ``initialize`` uses ``cache_file`` when it is truthy.
    self.tmp_dir = None
    self.cache_file = None
def __repr__(self):
    """Return ``<Tokenizer dictionary=...>`` showing the repr of ``self.dictionary``."""
    # Wrap the operand in a 1-tuple so a tuple-valued attribute is shown
    # as-is instead of being unpacked as several format arguments.
    return '<Tokenizer dictionary=%r>' % (self.dictionary,)
def gen_pfdict(self, f):
    """Build the word/prefix frequency table from an open dictionary file.

    Each line of *f* is stripped, decoded as UTF-8 and split on single
    spaces; the first field is the word and the second its integer
    frequency. Any further fields are ignored.

    Returns a ``(lfreq, ltotal)`` tuple. ``lfreq`` maps each word to its
    frequency, and maps every leading fragment of a word that has no entry
    yet to 0. ``ltotal`` is the sum of all frequencies read.

    Raises ValueError, naming the file and line number, when a line cannot
    be decoded or parsed. *f* is closed before this method returns or
    raises.
    """
    lfreq = {}
    ltotal = 0
    f_name = resolve_filename(f)
    try:
        for lineno, line in enumerate(f, 1):
            # Only decoding and parsing can raise ValueError; keep the try
            # body limited to those steps.
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
            lfreq[word] = freq
            ltotal += freq
            # Register every prefix of the word; prefixes that are not
            # known yet get a zero frequency.
            for ch in xrange(len(word)):
                wfrag = word[:ch + 1]
                if wfrag not in lfreq:
                    lfreq[wfrag] = 0
    finally:
        # Close the file on the error path too.
        f.close()
    return lfreq, ltotal
def initialize(self, dictionary=None):
if dictionary:
abs_path = _get_abs_path(dictionary)
if self.dictionary == abs_path and self.initialized:
return
else:
self.dictionary = abs_path
self.initialized = False
else:
abs_path = self.dictionary
with self.lock:
try:
with DICT_WRITING[abs_path]:
pass
except KeyError:
pass
if self.initialized:
return
default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
t1 = time.time()
if self.cache_file:
cache_file = self.cache_file
# default dictionary
elif abs_path == DEFAULT_DICT:
cache_file = "jieba.cache"
# custom dictionary
else:
cache_file = "jieba.u%s.cache" % md5(
abs_path.encode('u
结巴分词源码解读(一)
最新推荐文章于 2024-09-29 15:20:22 发布
本文主要探讨了结巴分词的源码实现,从基础的分词原理到关键算法的详细步骤,为读者提供了深入理解结巴分词工作方式的入口。
摘要由CSDN通过智能技术生成