II. Character Splitting and Text Cleaning
This part covers tokenization.py.
1. The CharacterRecognition class
This class holds character-recognition helpers that test whether a character is whitespace, a control character, punctuation, or a Chinese character.
import collections
import unicodedata

import tensorflow as tf


class CharacterRecognition:
    @staticmethod
    def is_whitespace(char):
        """\t, \n, \r are all treated as whitespace."""
        if char == " " or char == "\t" or char == "\n" or char == "\r":
            return True
        cat = unicodedata.category(char)
        if cat == "Zs":  # Unicode "Separator, space"
            return True
        return False

    @staticmethod
    def is_control(char):
        """Control characters (\t, \n, \r count as whitespace, not control)."""
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat in ("Cc", "Cf"):  # Unicode "Control" and "Format" categories
            return True
        return False

    @staticmethod
    def is_punctuation(char):
        """Every non-alphanumeric ASCII character is treated as punctuation."""
        # This includes characters such as "^", "$", and "`",
        # which Unicode itself does not classify as punctuation.
        cp = ord(char)
        if ((33 <= cp <= 47) or (58 <= cp <= 64) or
                (91 <= cp <= 96) or (123 <= cp <= 126)):
            return True
        cat = unicodedata.category(char)
        if cat.startswith("P"):  # Unicode punctuation categories
            return True
        return False

    @staticmethod
    def is_chinese_char(char):
        """True if the code point falls in a CJK Unicode block."""
        cp = ord(char)
        if ((0x4E00 <= cp <= 0x9FFF) or    # CJK Unified Ideographs
                (0x3400 <= cp <= 0x4DBF) or    # Extension A
                (0x20000 <= cp <= 0x2A6DF) or  # Extension B
                (0x2A700 <= cp <= 0x2B73F) or  # Extension C
                (0x2B740 <= cp <= 0x2B81F) or  # Extension D
                (0x2B820 <= cp <= 0x2CEAF) or  # Extension E
                (0xF900 <= cp <= 0xFAFF) or    # Compatibility Ideographs
                (0x2F800 <= cp <= 0x2FA1F)):   # Compatibility Ideographs Supplement
            return True
        return False
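A quick sanity check of the four predicates (a minimal sketch; the sample characters are chosen only for illustration):

print(CharacterRecognition.is_whitespace("\u00A0"))  # True: no-break space, category "Zs"
print(CharacterRecognition.is_control("\u200e"))     # True: left-to-right mark, category "Cf"
print(CharacterRecognition.is_punctuation("$"))      # True: non-alphanumeric ASCII (cp 36)
print(CharacterRecognition.is_chinese_char("中"))    # True: U+4E2D, CJK Unified Ideographs
print(CharacterRecognition.is_chinese_char("a"))     # False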
2. The BasicTokenizer class
This class implements the following steps (a worked trace follows the list):
- Text cleaning: remove invalid characters and replace each whitespace character with a plain space.
- Chinese handling: add a space before and after every Chinese character in the text.
- Whitespace tokenization: after this split, tokens like "hello," may still remain.
- Punctuation splitting: break such tokens apart at punctuation.
- " ".join() the pieces and run whitespace tokenization once more.
The final result is a list of tokens; for Chinese text this means one token per character.
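To make the pipeline concrete, here is a sketch of the intermediate values for one mixed Chinese/English input (the spacing shown in the comments is illustrative):

# text = "中文BERT, Hello!"
# _clean_text:             "中文BERT, Hello!"      (nothing to remove here)
# _tokenize_chinese_chars: " 中  文 BERT, Hello!"
# _whitespace_tokenize:    ["中", "文", "BERT,", "Hello!"]
# lowercase + _run_split_on_punc:
#                          ["中", "文", "bert", ",", "hello", "!"]
# final output:            ["中", "文", "bert", ",", "hello", "!"]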
class BasicTokenizer(object):
    def __init__(self, do_lower_case=True):
        """do_lower_case: whether English text should be lowercased."""
        self.character_recognition = CharacterRecognition()
        self.do_lower_case = do_lower_case

    def tokenizer(self, text):
        """Clean, preprocess, and tokenize the text."""
        text = self._clean_text(text)
        text = self._tokenize_chinese_chars(text)
        orig_tokens = self._whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
            split_tokens.extend(self._run_split_on_punc(token))
        output_tokens = self._whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_split_on_punc(self, text):
        """Split punctuation off a small fragment, e.g. "hello," -> ["hello", ","]."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # Each punctuation character becomes a token of its own.
            if self.character_recognition.is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]

    @classmethod
    def _whitespace_tokenize(cls, text):
        """Strip surrounding whitespace, collapse runs of it, and split into tokens."""
        text = text.strip()
        if not text:
            return []
        tokens = text.split()
        return tokens

    def _tokenize_chinese_chars(self, text):
        """Chinese handling: add a separator space around every Chinese character."""
        output = []
        for char in text:
            if self.character_recognition.is_chinese_char(char):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    # A leading underscore marks these methods as private by convention only.
    def _clean_text(self, text):
        """Remove invalid characters and replace each whitespace character with a space."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or \
                    self.character_recognition.is_control(char):
                continue
            if self.character_recognition.is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
3. The FullTokenizer class
This class implements the following:
- Build the vocabulary (word2id).
- Build the inverse vocabulary (id2word).
- Tokenize: delegates to BasicTokenizer and returns the resulting token list (one character per token for Chinese text).
class FullTokenizer(object):
    """End-to-end tokenizer."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        # inv_vocab is the inverse of vocab: {id: word, ...}
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        # Handles text cleaning and tokenization.
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenizer(text):
            split_tokens.append(token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
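A round-trip sketch, assuming a vocabulary file vocab.txt (hypothetical path) that lists every token used below, one per line:

tokenizer = FullTokenizer("vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize("中文 hello")      # ['中', '文', 'hello']
ids = tokenizer.convert_tokens_to_ids(tokens)  # e.g. [12, 34, 56]
back = tokenizer.convert_ids_to_tokens(ids)    # ['中', '文', 'hello']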
4. Utility functions
a. Load the vocabulary
Reads one token per line; loading stops at end of file, where readline() returns an empty string.
def load_vocab(vocab_file):
    vocab = collections.OrderedDict()
    index = 0
    with tf.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = reader.readline()
            if not token:
                # An empty string means end of file.
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab
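For example, given a three-line vocabulary file, the loader would produce (a sketch; the file contents are illustrative):

# vocab.txt (one token per line):
#   [UNK]
#   中
#   hello
vocab = load_vocab("vocab.txt")
print(vocab)  # OrderedDict([('[UNK]', 0), ('中', 1), ('hello', 2)])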
b. Convert items via a lookup table
Called by the token2id and id2token helpers below.
def convert_by_vocab(vocab, items):
    # Called by convert_tokens_to_ids and convert_ids_to_tokens.
    # Note: an item missing from the table raises KeyError.
    output = []
    for item in items:
        output.append(vocab[item])
    return output
c. token2id
def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)
d. id2token
def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)
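A minimal sketch of the two module-level wrappers, reusing the vocab dictionary loaded in the earlier example:

ids = convert_tokens_to_ids(vocab, ["中", "hello"])  # [1, 2]
inv_vocab = {v: k for k, v in vocab.items()}
tokens = convert_ids_to_tokens(inv_vocab, [1, 2])    # ['中', 'hello']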