I was recently reading https://github.com/google-research/bert and found a few snippets in tokenization.py quite useful, so I'm noting them down here in case they come in handy later.
import unicodedata

def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters, but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":  # [Zs] Separator, Space
        return True
    return False
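A quick sanity check I ran on my side (not part of the repo), mainly to confirm that the Zs branch also catches non-ASCII spaces:

print(_is_whitespace(" "))       # True  -- plain ASCII space
print(_is_whitespace("\t"))      # True  -- tab, special-cased above
print(_is_whitespace("\u00a0"))  # True  -- NO-BREAK SPACE, category Zs
print(_is_whitespace("a"))       # False -- category Ll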
def _is_control(char):
    """Checks whether `char` is a control character."""
    # \t (tab), \n (line feed), and \r (carriage return) are technically
    # control characters, but we count them as whitespace instead.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat in ("Cc", "Cf"):  # [Cc] Other, Control  [Cf] Other, Format
        return True
    return False
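Another small check of my own, assuming _is_control is defined as above. Tab and newline deliberately fall through to the whitespace side, while invisible format characters count as control:

print(_is_control("\t"))    # False -- explicitly excluded, treated as whitespace
print(_is_control("\x07"))  # True  -- BEL, category Cc
print(_is_control("\xad"))  # True  -- SOFT HYPHEN, category Cf
print(_is_control("A"))     # False -- category Lu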
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyway, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    # Unicode punctuation categories:
    # [Pc] Connector  [Pd] Dash  [Ps] Open  [Pe] Close
    # [Pi] Initial quote  [Pf] Final quote  [Po] Other
    if cat.startswith("P"):
        return True
    return False
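Worth noting: because of the ASCII ranges above, symbols like "$" and "^" come back True even though Unicode classifies them as symbols rather than punctuation, while non-ASCII symbols do not. A quick check of my own:

print(_is_punctuation(","))       # True  -- ASCII 44, also category Po
print(_is_punctuation("$"))       # True  -- ASCII 36 is in the 33-47 range (Unicode calls it Sc)
print(_is_punctuation("\u3002"))  # True  -- ideographic full stop, category Po
print(_is_punctuation("\u20ac"))  # False -- EURO SIGN, category Sc and not ASCII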
A quick note on unicodedata.category(chr): it returns the Unicode general category of a single character. A full list of the category codes is here: https://blog.csdn.net/xc_zhou/article/details/82079753
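To see what those category codes look like in practice (my own quick check, not from the repo):

import unicodedata

print(unicodedata.category("A"))   # Lu -- Letter, uppercase
print(unicodedata.category("7"))   # Nd -- Number, decimal digit
print(unicodedata.category(" "))   # Zs -- Separator, space
print(unicodedata.category(","))   # Po -- Punctuation, other
print(unicodedata.category("\n"))  # Cc -- Other, control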
def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as are Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and are handled
    # like all of the other languages.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or      # CJK Unified Ideographs
            (cp >= 0x3400 and cp <= 0x4DBF) or    # CJK Unified Ideographs Extension A
            (cp >= 0x20000 and cp <= 0x2A6DF) or  # Extension B
            (cp >= 0x2A700 and cp <= 0x2B73F) or  # Extension C
            (cp >= 0x2B740 and cp <= 0x2B81F) or  # Extension D
            (cp >= 0x2B820 and cp <= 0x2CEAF) or  # Extension E
            (cp >= 0xF900 and cp <= 0xFAFF) or    # CJK Compatibility Ideographs
            (cp >= 0x2F800 and cp <= 0x2FA1F)):   # CJK Compatibility Ideographs Supplement
        return True
    return False
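In the repo this is a method of BasicTokenizer (hence the self argument). If you lift it out as a plain function and drop self, a quick check of my own looks like this:

print(_is_chinese_char(ord("中")))  # True  -- U+4E2D, CJK Unified Ideographs
print(_is_chinese_char(ord("あ")))  # False -- Hiragana lives in a separate block
print(_is_chinese_char(ord("한")))  # False -- Hangul lives in a separate block
print(_is_chinese_char(ord("A")))   # False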
def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
        cp = ord(char)
        # Drop NUL (0), the Unicode replacement character (0xFFFD), and any
        # control characters; normalize every whitespace character to a space.
        if cp == 0 or cp == 0xfffd or _is_control(char):
            continue
        if _is_whitespace(char):
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)