# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
"""Tokenization classes for Bert."""

# Imports
import collections  # OrderedDict for the vocabulary
import os
import unicodedata  # Unicode normalization and character categories
from typing import List, Optional, Tuple

# Slow-tokenizer base class and character-classification helpers from tokenization_utils.py
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
# Logging utilities
from ...utils import logging

# Logger for this module
logger = logging.get_logger(__name__)

# Name of the vocabulary file: a plain-text file with one token per line
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Mapping from pretrained checkpoint names to the URL of their vocabulary file
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-multilingual-uncased": "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt",
        "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt",
        "google-bert/bert-large-cased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
        "google-bert/bert-base-cased-finetuned-mrpc": "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt",
        "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-german-dbmdz-uncased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt",
        "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt",
        "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt",
        "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt",
    }
}
# Maximum input length (size of the learned position embeddings) for each pretrained checkpoint
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google-bert/bert-base-uncased": 512, "google-bert/bert-large-uncased": 512,
    "google-bert/bert-base-cased": 512, "google-bert/bert-large-cased": 512,
    "google-bert/bert-base-multilingual-uncased": 512, "google-bert/bert-base-multilingual-cased": 512,
    "google-bert/bert-base-chinese": 512, "google-bert/bert-base-german-cased": 512,
    "google-bert/bert-large-uncased-whole-word-masking": 512, "google-bert/bert-large-cased-whole-word-masking": 512,
    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": 512,
    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": 512,
    "google-bert/bert-base-cased-finetuned-mrpc": 512,
    "google-bert/bert-base-german-dbmdz-cased": 512, "google-bert/bert-base-german-dbmdz-uncased": 512,
    "TurkuNLP/bert-base-finnish-cased-v1": 512, "TurkuNLP/bert-base-finnish-uncased-v1": 512,
    "wietsedv/bert-base-dutch-cased": 512,
}
# Default initialization settings (here: whether to lowercase) for each pretrained checkpoint
PRETRAINED_INIT_CONFIGURATION = {
    "google-bert/bert-base-uncased": {"do_lower_case": True},
    "google-bert/bert-large-uncased": {"do_lower_case": True},
    "google-bert/bert-base-cased": {"do_lower_case": False},
    "google-bert/bert-large-cased": {"do_lower_case": False},
    "google-bert/bert-base-multilingual-uncased": {"do_lower_case": True},
    "google-bert/bert-base-multilingual-cased": {"do_lower_case": False},
    "google-bert/bert-base-chinese": {"do_lower_case": False},
    "google-bert/bert-base-german-cased": {"do_lower_case": False},
    "google-bert/bert-large-uncased-whole-word-masking": {"do_lower_case": True},
    "google-bert/bert-large-cased-whole-word-masking": {"do_lower_case": False},
    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
    "google-bert/bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
    "google-bert/bert-base-german-dbmdz-cased": {"do_lower_case": False},
    "google-bert/bert-base-german-dbmdz-uncased": {"do_lower_case": True},
    "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
    "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
    "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()  # ordered mapping token -> index
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()  # one token per line
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")  # strip the trailing newline
        vocab[token] = index  # the token's line number is its id
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()  # strip leading/trailing whitespace
    if not text:
        return []  # empty input yields an empty token list
    tokens = text.split()  # split on whitespace
    return tokens


class BertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a BERT tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or a text and a question for question answering. It is also the last token of a
            sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token, used for sequence classification (classification of the whole sequence instead of
            per-token classification). It is the first token of a sequence built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling; the model will try to predict it.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, it will be determined by the value
            of `lowercase` (as in the original BERT).
    """
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
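    # --- Usage sketch (illustration, not part of the original file) -----------------------------
    # Assuming the `transformers` package is installed and the checkpoint can be downloaded; the
    # exact token splits depend on the checkpoint's vocab.txt.
    #
    #     from transformers import BertTokenizer
    #
    #     tok = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    #     print(tok.tokenize("Hello, how are you?"))   # BasicTokenizer + WordPiece pieces
    #     print(tok.encode("Hello, how are you?"))     # ids with [CLS] ... [SEP] added automatically
    # ---------------------------------------------------------------------------------------------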
    # Initializer: build a BertTokenizer from a vocabulary file and tokenization options
    def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # The vocabulary file must exist; otherwise raise a helpful error
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # Load the vocabulary file into self.vocab (token -> id)
        self.vocab = load_vocab(vocab_file)
        # Build the reverse mapping (id -> token) from the loaded vocabulary
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # Whether to run basic tokenization (punctuation splitting, lowercasing, ...) before WordPiece
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        # WordPiece tokenizer built on top of the loaded vocabulary and the unknown token
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
        # Forward all options (plus any extra kwargs) to the PreTrainedTokenizer base class
        super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,**kwargs,)# 属性方法,返回是否进行小写处理的标志位@propertydefdo_lower_case(self):return self.basic_tokenizer.do_lower_case
# 属性方法,返回词汇表的大小@propertydefvocab_size(self):returnlen(self.vocab)# 方法,返回包含所有词汇和特殊token编码的字典defget_vocab(self):returndict(self.vocab,**self.added_tokens_encoder)# 方法,对文本进行tokenize操作,返回token列表def_tokenize(self, text, split_special_tokens=False):
split_tokens =[]# 如果需要进行基本tokenize操作if self.do_basic_tokenize:# 遍历基本tokenizer的tokenize结果for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens ifnot split_special_tokens elseNone):# 如果token在不分割集合中,则直接加入split_tokens列表if token in self.basic_tokenizer.never_split:
split_tokens.append(token)else:# 否则,使用WordpieceTokenizer对token进行进一步的分词处理,并加入split_tokens列表
split_tokens += self.wordpiece_tokenizer.tokenize(token)else:# 否则,直接使用WordpieceTokenizer对整个text进行tokenize操作
split_tokens = self.wordpiece_tokenizer.tokenize(text)return split_tokens
# 方法,根据token获取对应的iddef_convert_token_to_id(self, token):"""Converts a token (str) in an id using the vocab."""return self.vocab.get(token, self.vocab.get(self.unk_token))# 方法,根据id获取对应的tokendef_convert_id_to_token(self, index):"""Converts an index (integer) in a token (str) using the vocab."""return self.ids_to_tokens.get(index, self.unk_token)defconvert_tokens_to_string(self, tokens):"""
Converts a sequence of tokens (string) into a single string by joining them,
removing '##' and stripping leading/trailing whitespace.
Args:
tokens (List[str]): List of tokens to be converted.
Returns:
str: The concatenated string of tokens.
"""
out_string =" ".join(tokens).replace(" ##","").strip()return out_string
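# --- Illustration of the "##" handling above (standalone sketch, not from the original file) -----
# WordPiece marks word-internal pieces with "##", so joining on spaces and deleting " ##" restores
# the surface form.
pieces = ["un", "##aff", "##able", "indeed"]
print(" ".join(pieces).replace(" ##", "").strip())  # -> "unaffable indeed"
# -------------------------------------------------------------------------------------------------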
defbuild_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None)-> List[int]:"""
Builds model inputs from a sequence or a pair of sequences for sequence classification tasks
by adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (List[int]): List of token IDs for the first sequence.
token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence.
Returns:
List[int]: List of input IDs with the appropriate special tokens added.
"""if token_ids_1 isNone:return[self.cls_token_id]+ token_ids_0 +[self.sep_token_id]
cls =[self.cls_token_id]
sep =[self.sep_token_id]return cls + token_ids_0 + sep + token_ids_1 + sep
defget_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None, already_has_special_tokens:bool=False)-> List[int]:"""
Retrieves a mask indicating whether each token in the input list is a special token
(1 for special token, 0 for sequence token). This is used when preparing tokens for a model.
Args:
token_ids_0 (List[int]): List of token IDs for the first sequence.
token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence.
already_has_special_tokens (bool, optional): Whether the input token lists already include special tokens.
Returns:
List[int]: A list of integers representing the mask.
"""if already_has_special_tokens:returnsuper().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True)if token_ids_1 isnotNone:return[1]+([0]*len(token_ids_0))+[1]+([0]*len(token_ids_1))+[1]return[1]+([0]*len(token_ids_0))+[1]defcreate_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List representing the token type IDs for the given sequence(s).
"""# Define separator and classification tokens
sep =[self.sep_token_id]# Separator token ID
cls =[self.cls_token_id]# Classification token ID# If token_ids_1 is None, return a mask with zeros corresponding to the first sequence onlyif token_ids_1 isNone:returnlen(cls + token_ids_0 + sep)*[0]# Create and return mask with zeros# If token_ids_1 is provided, return a mask with zeros for the first sequence and ones for the second sequencereturnlen(cls + token_ids_0 + sep)*[0]+len(token_ids_1 + sep)*[1]defsave_vocabulary(self, save_directory:str, filename_prefix: Optional[str]=None)-> Tuple[str]:# Initialize index counter
index =0# Determine vocabulary file pathif os.path.isdir(save_directory):# If save_directory is a directory, construct file path inside the directory
vocab_file = os.path.join(
save_directory,(filename_prefix +"-"if filename_prefix else"")+ VOCAB_FILES_NAMES["vocab_file"])else:# Otherwise, treat save_directory as the full file path
vocab_file =(filename_prefix +"-"if filename_prefix else"")+ save_directory
# Write vocabulary to the specified filewithopen(vocab_file,"w", encoding="utf-8")as writer:# Iterate through vocabulary items sorted by indexfor token, token_index insorted(self.vocab.items(), key=lambda kv: kv[1]):# Check for non-consecutive indices in the vocabularyif index != token_index:
logger.warning(f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."" Please check that the vocabulary is not corrupted!")
index = token_index # Update index to current token's index
writer.write(token +"\n")# Write token to file
index +=1# Increment index for the next token# Return the path to the saved vocabulary filereturn(vocab_file,)# 定义一个名为 BasicTokenizer 的类,用于执行基本的分词(如分割标点符号、转换为小写等)。classBasicTokenizer(object):"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""# 初始化方法,设置类的属性def__init__(
self,
do_lower_case=True,# 是否将输入转换为小写,默认为True
never_split=None,# 永远不分割的 token 集合,默认为 None
tokenize_chinese_chars=True,# 是否分割中文字符,默认为 True
strip_accents=None,# 是否去除所有重音符号,默认根据 lowercase 决定
do_split_on_punc=True,# 是否在基本标点符号处分割,默认为 True):# 如果 never_split 为 None,则设为一个空列表if never_split isNone:
never_split =[]# 设置实例的属性值
self.do_lower_case = do_lower_case # 是否小写化输入
self.never_split =set(never_split)# 永远不分割的 token 集合,转为集合类型
self.tokenize_chinese_chars = tokenize_chinese_chars # 是否分割中文字符
self.strip_accents = strip_accents # 是否去除重音符号
self.do_split_on_punc = do_split_on_punc # 是否在基本标点符号处分割deftokenize(self, text, never_split=None):"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""# 使用 never_split 参数更新当前对象的 never_split 集合(若提供的话)
never_split = self.never_split.union(set(never_split))if never_split else self.never_split
# 清理文本,如去除无用空白等
text = self._clean_text(text)# 以下部分是为了支持多语言和中文模型而添加的代码(2018 年 11 月 1 日起)# 现在英语模型也应用了这一代码,但由于英语模型未经过中文数据的训练,# 这段代码对英语模型基本没有影响(尽管英语词汇表中包含了一些中文单词,# 这是因为英语维基百科中包含了一些中文词汇)。if self.tokenize_chinese_chars:# 对包含中文字符的文本进行特殊处理,分词
text = self._tokenize_chinese_chars(text)# 将文本中的 Unicode 标准化为 NFC 格式(避免同一字符的不同 Unicode 编码被视为不同字符)
unicode_normalized_text = unicodedata.normalize("NFC", text)# 使用空白符分割文本,得到原始 token 列表
orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        # Process each whitespace-separated token
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    # Unless explicitly disabled, lowercasing also strips accents (as in the original BERT)
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    # Accent stripping was requested explicitly, without lowercasing
                    token = self._run_strip_accents(token)
            # Further split the (possibly normalized) token on punctuation
            split_tokens.extend(self._run_split_on_punc(token, never_split))
        # Re-join and re-split on whitespace to obtain the final flat token list
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
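# --- BasicTokenizer on its own (a sketch, assuming transformers is installed) --------------------
# The class is importable from this module; with do_lower_case=True it lowercases, strips accents,
# splits punctuation, and puts spaces around CJK characters.
from transformers.models.bert.tokenization_bert import BasicTokenizer as _BasicTokenizer

_basic = _BasicTokenizer(do_lower_case=True)
print(_basic.tokenize("Hello, 世界! Don't do that."))
# e.g. ['hello', ',', '世', '界', '!', 'don', "'", 't', 'do', 'that', '.']
# -------------------------------------------------------------------------------------------------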
def_run_strip_accents(self, text):"""Strips accents from a piece of text."""# 将文本中的字符标准化为 NFD 格式
text = unicodedata.normalize("NFD", text)
output =[]# 遍历文本中的每个字符for char in text:# 获取字符的 Unicode 分类
cat = unicodedata.category(char)# 如果字符是非组合型记号(Mn),则跳过if cat =="Mn":continue# 否则将字符添加到输出列表中
output.append(char)# 将输出列表中的字符连接成字符串并返回return"".join(output)def_run_split_on_punc(self, text, never_split=None):"""按照标点符号分割文本。
Args:
text (str): 要分割的文本。
never_split (set): 不应该被分割的文本集合。
Returns:
list: 分割后的文本列表。
"""# 如果不需要按标点符号分割,或者文本在不分割的集合中,则直接返回原文本列表ifnot self.do_split_on_punc or(never_split isnotNoneand text in never_split):return[text]# 将文本转换为字符列表
chars =list(text)
i =0
start_new_word =True
output =[]while i <len(chars):
char = chars[i]# 如果是标点符号,则作为新词开始if _is_punctuation(char):
output.append([char])
start_new_word =Trueelse:# 如果不是标点符号,根据start_new_word标记将字符添加到当前词列表中if start_new_word:
output.append([])
start_new_word =False
output[-1].append(char)
i +=1# 将列表中的字符列表连接为字符串,并返回分割后的文本列表return["".join(x)for x in output]def_tokenize_chinese_chars(self, text):"""在每个CJK字符周围添加空格。
Args:
text (str): 要处理的文本。
Returns:
str: 处理后的文本。
"""
output =[]for char in text:
cp =ord(char)# 如果是CJK字符,添加空格前后包裹该字符if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")else:
output.append(char)# 将列表中的字符连接为一个字符串,并返回处理后的文本return"".join(output)def_is_chinese_char(self, cp):"""检查CP是否是CJK字符的码点。
Args:
cp (int): 要检查的字符的Unicode码点。
Returns:
bool: 如果是CJK字符则返回True,否则返回False。
"""# 这里的CJK字符定义来自于CJK统一表意文字块的Unicode范围if((cp >=0x4E00and cp <=0x9FFF)or(cp >=0x3400and cp <=0x4DBF)or(cp >=0x20000and cp <=0x2A6DF)or(cp >=0x2A700and cp <=0x2B73F)or(cp >=0x2B740and cp <=0x2B81F)or(cp >=0x2B820and cp <=0x2CEAF)or(cp >=0xF900and cp <=0xFAFF)or(cp >=0x2F800and cp <=0x2FA1F)):returnTruereturnFalsedef_clean_text(self, text):"""对文本进行无效字符移除和空白字符清理。
Args:
text (str): 要清理的文本。
Returns:
str: 清理后的文本。
"""
output =[]for char in text:
cp =ord(char)# 移除无效字符和控制字符,以及替换空白字符为单个空格if cp ==0or cp ==0xFFFDor _is_control(char):continueif _is_whitespace(char):
output.append(" ")else:
output.append(char)# 将列表中的字符连接为一个字符串,并返回清理后的文本return"".join(output)classWordpieceTokenizer(object):"""Runs WordPiece tokenization."""def__init__(self, vocab, unk_token, max_input_chars_per_word=100):# 初始化WordpieceTokenizer对象,设置词汇表、未知标记和单词的最大字符数
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
deftokenize(self, text):"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""# 初始化输出token列表
output_tokens =[]# 使用whitespace_tokenize函数将文本分割成单词或标记for token in whitespace_tokenize(text):# 将token转换为字符列表
chars =list(token)# 如果token的长度超过最大输入字符数,则将未知标记添加到输出token列表中iflen(chars)> self.max_input_chars_per_word:
output_tokens.append(self.unk_token)continue# 初始化标志变量和起始位置
is_bad =False
start =0
sub_tokens =[]# 循环直到处理完所有字符while start <len(chars):
end =len(chars)
cur_substr =None# 使用最长匹配算法找到合适的子串while start < end:
substr ="".join(chars[start:end])if start >0:
substr ="##"+ substr
# 如果找到了匹配词汇表的子串,则更新当前子串并跳出循环if substr in self.vocab:
cur_substr = substr
break
end -=1# 如果未找到合适的子串,则标记为无效if cur_substr isNone:
is_bad =Truebreak# 将找到的子串添加到sub_tokens列表中
sub_tokens.append(cur_substr)
start = end
# 如果标记为无效,则将未知标记添加到输出token列表中;否则将sub_tokens列表中的token添加到输出token列表中if is_bad:
output_tokens.append(self.unk_token)else:
output_tokens.extend(sub_tokens)# 返回最终的token列表return output_tokens
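# --- Greedy longest-match-first WordPiece in action (toy vocabulary, illustration only) ----------
# The tokenizer only needs a dict-like vocab and an unknown token, so a tiny made-up vocabulary is
# enough to show the behaviour described above.
from transformers.models.bert.tokenization_bert import WordpieceTokenizer as _WordpieceTokenizer

_toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "runn": 3, "##ing": 4, "[UNK]": 5}
_wp = _WordpieceTokenizer(vocab=_toy_vocab, unk_token="[UNK]")
print(_wp.tokenize("unaffable"))  # -> ['un', '##aff', '##able']
print(_wp.tokenize("running"))    # -> ['runn', '##ing']
print(_wp.tokenize("xyz"))        # -> ['[UNK]']  (no piece of "xyz" is in the toy vocab)
# -------------------------------------------------------------------------------------------------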
.\models\bert\tokenization_bert_fast.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
"""Fast Tokenization classes for Bert."""

import json
from typing import List, Optional, Tuple

from tokenizers import normalizers

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_bert import BertTokenizer

# Logger for this module
logger = logging.get_logger(__name__)

# Files that make up a saved tokenizer: the plain vocabulary and the serialized fast tokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
# Mapping from pretrained checkpoint names to the URLs of their vocabulary and tokenizer files
PRETRAINED_VOCAB_FILES_MAP = {
    # Plain-text vocabulary (vocab.txt) for each checkpoint
    "vocab_file": {
        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-multilingual-uncased": "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt",
        "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt",
        "google-bert/bert-large-cased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt",
        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
        "google-bert/bert-base-cased-finetuned-mrpc": "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt",
        "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
        "google-bert/bert-base-german-dbmdz-uncased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt",
        "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt",
        "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt",
        "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt",
    },
    # Serialized fast tokenizer (tokenizer.json) for each checkpoint
    "tokenizer_file": {
        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json",
        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/tokenizer.json",
        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json",
        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/tokenizer.json",
        "google-bert/bert-base-multilingual-uncased": "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/tokenizer.json",
        "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/tokenizer.json",
        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/tokenizer.json",
        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/tokenizer.json",
        "google-bert/bert-large-uncased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/tokenizer.json",
        "google-bert/bert-large-cased-whole-word-masking": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/tokenizer.json",
        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json",
        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json",
        "google-bert/bert-base-cased-finetuned-mrpc": "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/tokenizer.json",
        "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/tokenizer.json",
        "google-bert/bert-base-german-dbmdz-uncased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/tokenizer.json",
        "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer.json",
        "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/tokenizer.json",
        "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/tokenizer.json",
    },
}


class BertTokenizerFast(PreTrainedTokenizerFast):
    r"""
Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""# 定义类 BertTokenizer,用于处理 BERT 模型的分词器功能classBertTokenizer:# 类的初始化方法,用于设置分词器的各种参数和选项def__init__(
self,
vocab_file=None,# 词汇表文件路径,用于加载模型的词汇表
tokenizer_file=None,# 分词器文件路径,可选,用于加载预训练的分词器模型
do_lower_case=True,# 是否将输入转换为小写
unk_token="[UNK]",# 未知标记,当词汇表中不存在某个词时使用
sep_token="[SEP]",# 分隔符标记,在构建多序列时使用
pad_token="[PAD]",# 填充标记,在对不同长度的序列进行批处理时使用
cls_token="[CLS]",# 分类器标记,用于序列分类任务中
mask_token="[MASK]",# 掩码标记,用于掩码语言模型任务中
tokenize_chinese_chars=True,# 是否分词中文字符
strip_accents=None,# 是否去除所有重音符号**kwargs,# 其他参数,用于兼容未来可能添加的参数):# 调用父类的构造函数,初始化模型的tokenizersuper().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,**kwargs,)# 获取当前tokenizer的规范化器状态并转换为JSON格式
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())# 检查是否有用户设置的规范化器状态与当前初始化参数不匹配,如果不匹配则进行更新if(
normalizer_state.get("lowercase", do_lower_case)!= do_lower_case
or normalizer_state.get("strip_accents", strip_accents)!= strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars)!= tokenize_chinese_chars
):# 获取当前规范化器的类并进行实例化
normalizer_class =getattr(normalizers, normalizer_state.pop("type"))# 更新规范化器的参数
normalizer_state["lowercase"]= do_lower_case
normalizer_state["strip_accents"]= strip_accents
normalizer_state["handle_chinese_chars"]= tokenize_chinese_chars
# 将更新后的规范化器应用于当前的tokenizer对象
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)# 更新当前对象的小写处理标志
self.do_lower_case = do_lower_case
defbuild_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""# 构建带有特殊标记的模型输入序列,用于序列分类任务
output =[self.cls_token_id]+ token_ids_0 +[self.sep_token_id]# 如果存在第二个序列token_ids_1,则连接第二个序列的特殊标记if token_ids_1 isnotNone:
output += token_ids_1 +[self.sep_token_id]return output
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of token IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of token IDs representing the second sequence in sequence-pair tasks.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""# Define the separator token ID and the classification token ID
sep =[self.sep_token_id]
cls =[self.cls_token_id]# If only one sequence is provided, return a mask with 0s for the first sequenceif token_ids_1 isNone:returnlen(cls + token_ids_0 + sep)*[0]# If both sequences are provided, concatenate their lengths with separator and classification tokens# Return a mask with 0s for the first sequence and 1s for the second sequencereturnlen(cls + token_ids_0 + sep)*[0]+len(token_ids_1 + sep)*[1]defsave_vocabulary(self, save_directory:str, filename_prefix: Optional[str]=None)-> Tuple[str]:"""
Save the vocabulary files associated with the tokenizer's model to a specified directory.
Args:
save_directory (str):
Directory where the vocabulary files will be saved.
filename_prefix (Optional[str]):
Optional prefix to prepend to the saved vocabulary file names.
Returns:
Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
"""# Call the model's save method to save the vocabulary files to the specified directory
files = self._tokenizer.model.save(save_directory, name=filename_prefix)# Return the filenames as a tuplereturntuple(files)
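# --- Usage sketch for the fast tokenizer (illustration, not part of the original file) -----------
# Assuming transformers and tokenizers are installed; calling the tokenizer with two texts encodes
# them as a pair, with token_type_ids distinguishing the two segments.
from transformers import BertTokenizerFast as _BertTokenizerFast

_fast = _BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
_enc = _fast("Hello world", "How are you?")
print(_enc["input_ids"])
print(_enc["token_type_ids"])   # 0s for the first segment, 1s for the second
# -------------------------------------------------------------------------------------------------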
.\models\bert\tokenization_bert_tf.py
# Standard-library imports
import os
from typing import List, Union

# TensorFlow and the TensorFlow Text BERT tokenization ops
import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs

# Keras re-export used by transformers' TF models
from ...modeling_tf_utils import keras

# The (slow) Python BERT tokenizer, used to pull vocab and settings in from_pretrained
from .tokenization_bert import BertTokenizer
# 定义一个 Keras 层,用于在图中进行 BERT 分词classTFBertTokenizer(keras.layers.Layer):"""
This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
`from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
from an existing standard tokenizer object.
In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
straight from `tf.string` inputs to outputs.
"""# 初始化函数,用于创建一个 Tokenizer 对象def__init__(
self,
vocab_list: List,# 词汇表列表,包含了 Tokenizer 所需的词汇
do_lower_case:bool,# 是否将输入文本转换为小写进行分词
cls_token_id:int=None,# 分类器标记的 ID,在序列分类中用作序列的第一个标记
sep_token_id:int=None,# 分隔符标记的 ID,在构建序列时用于多序列的分隔
pad_token_id:int=None,# 填充标记的 ID,在批处理不同长度的序列时使用
padding:str="longest",# 填充类型,可以是"longest"或"max_length"
truncation:bool=True,# 是否对序列进行截断,使其不超过最大长度
max_length:int=512,# 序列的最大长度,用于填充和截断
pad_to_multiple_of:int=None,# 如果设置,序列将填充到此值的倍数
return_token_type_ids:bool=True,# 是否返回 token_type_ids
return_attention_mask:bool=True,# 是否返回 attention_mask
use_fast_bert_tokenizer:bool=True,# 是否使用 FastBertTokenizer 类(Tensorflow Text)进行分词**tokenizer_kwargs,# 其他可能传递给 tokenizer 的参数):super().__init__()# 调用父类的初始化方法if use_fast_bert_tokenizer:# 如果使用快速的 BERT 分词器
self.tf_tokenizer = FastBertTokenizer(
vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case,**tokenizer_kwargs
)else:# 否则使用静态词汇表创建查找表
lookup_table = tf.lookup.StaticVocabularyTable(
tf.lookup.KeyValueTensorInitializer(
keys=vocab_list,
key_dtype=tf.string,
values=tf.range(tf.size(vocab_list, out_type=tf.int64), dtype=tf.int64),
value_dtype=tf.int64,),
num_oov_buckets=1,)# 使用查找表创建 BERT 分词器层
self.tf_tokenizer = BertTokenizerLayer(
lookup_table, token_out_type=tf.int64, lower_case=do_lower_case,**tokenizer_kwargs
)
self.vocab_list = vocab_list
self.do_lower_case = do_lower_case
# 设置特殊 token 的索引,如果未提供则从 vocab_list 中获取
self.cls_token_id = vocab_list.index("[CLS]")if cls_token_id isNoneelse cls_token_id
self.sep_token_id = vocab_list.index("[SEP]")if sep_token_id isNoneelse sep_token_id
self.pad_token_id = vocab_list.index("[PAD]")if pad_token_id isNoneelse pad_token_id
# 初始化用于截断最长序列的 paired_trimmer
self.paired_trimmer = ShrinkLongestTrimmer(max_length -3, axis=1)# Allow room for special tokens
self.max_length = max_length
self.padding = padding
self.truncation = truncation
self.pad_to_multiple_of = pad_to_multiple_of
self.return_token_type_ids = return_token_type_ids
self.return_attention_mask = return_attention_mask
    @classmethod
    def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs):  # noqa: F821
        """
Initialize a `TFBertTokenizer` from an existing `Tokenizer`.
Args:
tokenizer (`PreTrainedTokenizerBase`):
The tokenizer to use to initialize the `TFBertTokenizer`.
Examples:
```
from transformers import AutoTokenizer, TFBertTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
```
"""# Retrieve the 'do_lower_case' parameter from kwargs; if not provided, use tokenizer's setting
do_lower_case = kwargs.pop("do_lower_case",None)
do_lower_case = tokenizer.do_lower_case if do_lower_case isNoneelse do_lower_case
# Retrieve the 'cls_token_id' parameter from kwargs; if not provided, use tokenizer's setting
cls_token_id = kwargs.pop("cls_token_id",None)
cls_token_id = tokenizer.cls_token_id if cls_token_id isNoneelse cls_token_id
# Retrieve the 'sep_token_id' parameter from kwargs; if not provided, use tokenizer's setting
sep_token_id = kwargs.pop("sep_token_id",None)
sep_token_id = tokenizer.sep_token_id if sep_token_id isNoneelse sep_token_id
# Retrieve the 'pad_token_id' parameter from kwargs; if not provided, use tokenizer's setting
pad_token_id = kwargs.pop("pad_token_id",None)
pad_token_id = tokenizer.pad_token_id if pad_token_id isNoneelse pad_token_id
# Get the vocabulary dictionary from the tokenizer and sort it by indices
vocab = tokenizer.get_vocab()
vocab =sorted(vocab.items(), key=lambda x: x[1])# Extract just the vocabulary tokens into a list
vocab_list =[entry[0]for entry in vocab]# Instantiate a new TFBertTokenizer using the retrieved parameters and vocab_listreturn cls(
vocab_list=vocab_list,
do_lower_case=do_lower_case,
cls_token_id=cls_token_id,
sep_token_id=sep_token_id,
pad_token_id=pad_token_id,**kwargs,)@classmethoddeffrom_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],*init_inputs,**kwargs):"""
Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
The name or path to the pre-trained tokenizer.
Examples:
```
from transformers import TFBertTokenizer
tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
```
"""try:# Attempt to create a BertTokenizer instance from the provided pretrained_model_name_or_path
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path,*init_inputs,**kwargs)except:# noqa: E722# If the above fails, fall back to using BertTokenizerFastfrom.tokenization_bert_fast import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path,*init_inputs,**kwargs)# Call from_tokenizer to create a TFBertTokenizer instance using the obtained tokenizerreturn cls.from_tokenizer(tokenizer,**kwargs)defunpaired_tokenize(self, texts):# If do_lower_case is True, convert texts to lowercase using case_fold_utf8if self.do_lower_case:
texts = case_fold_utf8(texts)# Tokenize texts using tf_tokenizer's tokenize method
tokens = self.tf_tokenizer.tokenize(texts)# Merge dimensions from 1 to -1 in tokensreturn tokens.merge_dims(1,-1)defcall(
self,
text,
text_pair=None,
padding=None,
truncation=None,
max_length=None,
pad_to_multiple_of=None,
return_token_type_ids=None,
        return_attention_mask=None,
    ):
        ...  # (body of call() omitted in this walkthrough)

    # Return this layer's configuration as a dict (used by Keras serialization)
    def get_config(self):
        return {
            "vocab_list": self.vocab_list,          # the vocabulary list
            "do_lower_case": self.do_lower_case,    # whether inputs are lower-cased before tokenizing
            "cls_token_id": self.cls_token_id,      # id of the [CLS] token
            "sep_token_id": self.sep_token_id,      # id of the [SEP] token
            "pad_token_id": self.pad_token_id,      # id of the [PAD] token
        }
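# --- In-graph usage sketch (illustration, assuming tensorflow, tensorflow_text and transformers) --
# Because TFBertTokenizer is a Keras layer, raw tf.string tensors can be tokenized inside the graph
# and fed straight into a TF model.
import tensorflow as tf
from transformers import TFBertTokenizer as _TFBertTokenizer

_tf_tok = _TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
_batch = tf.constant(["Hello world", "In-graph tokenization"])
_outputs = _tf_tok(_batch)   # dict with input_ids / token_type_ids / attention_mask tensors
print(_outputs["input_ids"])
# -------------------------------------------------------------------------------------------------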
.\models\bertweet\tokenization_bertweet.py
# Standard-library and third-party imports
import html  # HTML entity encoding/decoding
import os  # filesystem interaction
import re  # regular expressions (stdlib)
from shutil import copyfile  # used to copy vocab/merges files on save
from typing import List, Optional, Tuple  # type hints

import regex  # the third-party `regex` package, needed for the tweet patterns below

# Tokenizer base class and logging utilities
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)# 定义词汇文件和合并文件的名称映射
VOCAB_FILES_NAMES ={"vocab_file":"vocab.txt","merges_file":"bpe.codes",}# 预训练模型的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP ={"vocab_file":{"vinai/bertweet-base":"https://huggingface.co/vinai/bertweet-base/resolve/main/vocab.txt",},"merges_file":{"vinai/bertweet-base":"https://huggingface.co/vinai/bertweet-base/resolve/main/bpe.codes",},}# 预训练模型的位置编码大小映射
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES ={"vinai/bertweet-base":128,}defget_pairs(word):"""
返回单词中的符号对集合。
单词被表示为符号元组(符号是长度可变的字符串)。
"""
pairs =set()
prev_char = word[0]for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
pairs =set(pairs)return pairs
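# --- Quick illustration of get_pairs (uses the function defined just above) ----------------------
# A word is represented as a tuple of symbols; get_pairs returns the set of adjacent symbol pairs,
# which is what the BPE merge loop below operates on.
print(get_pairs(("l", "o", "w", "</w>")))
# -> {('l', 'o'), ('o', 'w'), ('w', '</w>')}  (an unordered set)
# -------------------------------------------------------------------------------------------------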
classBertweetTokenizer(PreTrainedTokenizer):"""
构造一个 BERTweet 分词器,使用字节对编码。
此分词器继承自 PreTrainedTokenizer,该类包含大多数主要方法。用户应参考这个超类以获取更多关于这些方法的信息。
"""# 定义一个 Transformer 模型的配置类,用于管理与模型相关的参数和配置
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 初始化函数,用于设置模型配置参数def__init__(
self,
vocab_file,# 词汇表文件的路径
merges_file,# 合并文件的路径
normalization=False,# 是否进行标准化预处理,默认为False
bos_token="<s>",# 预训练期间用于序列开始的特殊符号,默认为"<s>"
eos_token="</s>",# 序列结束的特殊符号,默认为"</s>"
sep_token="</s>",# 用于多个序列构建时的分隔符,默认为"</s>"
cls_token="<s>",# 序列分类时使用的特殊符号,构建时是序列的第一个符号,默认为"<s>"
unk_token="<unk>",# 未知符号,词汇表中没有时的替代符号,默认为"<unk>"
pad_token="<pad>",# 填充符号,用于处理不同长度序列时的填充,默认为"<pad>"
mask_token="<mask>",# 掩码符号,用于掩码语言建模训练中的标记,默认为"<mask>"**kwargs,# 其他可选参数):try:from emoji import demojize # 尝试导入 demojize 函数从 emoji 模块
self.demojizer = demojize # 如果成功导入,将 demojize 函数赋值给 self.demojizerexcept ImportError:
logger.warning("emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3"" install emoji==0.6.0")
self.demojizer =None# 如果导入失败,记录警告信息,并将 self.demojizer 设为 None
self.vocab_file = vocab_file # 初始化词汇表文件路径
self.merges_file = merges_file # 初始化合并文件路径
self.encoder ={}# 初始化编码器字典
self.encoder[str(bos_token)]=0# 将特殊标记 bos_token 编码为 0
self.encoder[str(pad_token)]=1# 将特殊标记 pad_token 编码为 1
self.encoder[str(eos_token)]=2# 将特殊标记 eos_token 编码为 2
self.encoder[str(unk_token)]=3# 将特殊标记 unk_token 编码为 3
self.add_from_file(vocab_file)# 调用 add_from_file 方法,从 vocab_file 添加更多词汇到编码器
self.decoder ={v: k for k, v in self.encoder.items()}# 创建解码器,将编码器的键值对颠倒withopen(merges_file, encoding="utf-8")as merges_handle:
merges = merges_handle.read().split("\n")[:-1]# 读取并处理合并文件的内容
merges =[tuple(merge.split()[:-1])for merge in merges]# 将每行合并内容转换为元组列表
self.bpe_ranks =dict(zip(merges,range(len(merges))))# 创建 BPE 合并的排名字典
self.cache ={}# 初始化缓存字典
self.normalization = normalization # 设置文本规范化选项
self.tweetPreprocessor = TweetTokenizer()# 初始化 TweetTokenizer 作为 tweetPreprocessor
self.special_puncts ={"’":"'","…":"..."}# 定义特殊标点符号映射super().__init__(# 调用父类的初始化方法,传递相应参数和关键字参数
normalization=normalization,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,**kwargs,)defget_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None,
already_has_special_tokens:bool=False)-> List[int]:"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""# If the token list already has special tokens, delegate to the superclass methodif already_has_special_tokens:returnsuper().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True)# If there are no sequence pairs (token_ids_1 is None), add special tokens around token_ids_0if token_ids_1 isNone:return[1]+([0]*len(token_ids_0))+[1]# For sequence pairs, add special tokens around both token_ids_0 and token_ids_1return[1]+([0]*len(token_ids_0))+[1,1]+([0]*len(token_ids_1))+[1]defcreate_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None)-> List[int]:"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""# Define special tokens for separation and classification
sep =[self.sep_token_id]
cls =[self.cls_token_id]# If there are no sequence pairs, return a list of zeros of length equal to cls + token_ids_0 + sepif token_ids_1 isNone:returnlen(cls + token_ids_0 + sep)*[0]# For sequence pairs, return a list of zeros of length equal to cls + token_ids_0 + sep + sep + token_ids_1 + sepreturnlen(cls + token_ids_0 + sep + sep + token_ids_1 + sep)*[0]@propertydefvocab_size(self):# Return the size of the vocabulary, which is the length of the encoder dictionaryreturnlen(self.encoder)defget_vocab(self):# Return the combined dictionary of encoder and added_tokens_encoderreturndict(self.encoder,**self.added_tokens_encoder)defbpe(self, token):# 如果 token 已经在缓存中,直接返回缓存中的结果if token in self.cache:return self.cache[token]# 将 token 转换为元组形式
word =tuple(token)# 在 token 的末尾添加 "</w>",表示单词结束
word =tuple(list(word[:-1])+[word[-1]+"</w>"])# 获取单词中的所有字符对,并进行 BPE 算法处理
pairs = get_pairs(word)# 如果没有字符对,直接返回原始 tokenifnot pairs:return token
# 循环处理字符对,直到无法再合并为止whileTrue:# 找到优先级最高的字符对
bigram =min(pairs, key=lambda pair: self.bpe_ranks.get(pair,float("inf")))# 如果该字符对不在预定义的 BPE 优先级中,停止处理if bigram notin self.bpe_ranks:break
first, second = bigram
new_word =[]
i =0# 遍历单词中的字符while i <len(word):try:
j = word.index(first, i)except ValueError:# 如果找不到字符对的第一个字符,直接将剩余部分添加到新单词中
new_word.extend(word[i:])breakelse:# 将当前位置到字符对第一个字符位置之间的部分添加到新单词中
new_word.extend(word[i:j])
i = j
# 如果当前位置的字符与字符对的第一个字符相同,并且下一个字符与字符对的第二个字符相同,则合并为一个新的字符if word[i]== first and i <len(word)-1and word[i +1]== second:
new_word.append(first + second)
i +=2else:# 否则,将当前位置的字符添加到新单词中,并移动到下一个位置
new_word.append(word[i])
i +=1# 将新单词转换为元组形式,并更新 word 变量为新单词
new_word =tuple(new_word)
word = new_word
# 如果新单词长度为1,停止循环iflen(word)==1:breakelse:# 否则,继续获取新的字符对
pairs = get_pairs(word)# 将处理后的单词以 "@@ " 连接起来,并去掉末尾的特殊标记 "</w>"
word ="@@ ".join(word)
word = word[:-4]# 将处理后的结果缓存起来,并返回
self.cache[token]= word
return word
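# --- Standalone sketch of the greedy BPE merge above (toy merge table, illustration only) --------
# The real ranks come from bpe.codes; here a hypothetical two-entry table is enough to trace the
# loop. A lower rank means the pair is merged earlier.
_toy_ranks = {("l", "o"): 0, ("lo", "w</w>"): 1}

def _toy_bpe(token):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)   # append the end-of-word marker
    pairs = get_pairs(word)
    while pairs:
        bigram = min(pairs, key=lambda p: _toy_ranks.get(p, float("inf")))
        if bigram not in _toy_ranks:
            break                                      # no known merge left
        first, second = bigram
        new_word, i = [], 0
        while i < len(word):
            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)        # merge the highest-priority pair
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        pairs = get_pairs(word) if len(word) > 1 else set()
    return "@@ ".join(word)[:-4]                       # drop the trailing "</w>", as in bpe() above

print(_toy_bpe("low"))   # merges l+o, then lo+w</w> -> "low"
# -------------------------------------------------------------------------------------------------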
def_tokenize(self, text):"""Tokenize a string."""# 如果启用了 Tweet 规范化,则在进行 BPE 处理之前先对文本进行规范化if self.normalization:
text = self.normalizeTweet(text)
split_tokens =[]# 使用正则表达式将文本分割成单词列表
words = re.findall(r"\S+\n?", text)for token in words:# 对每个单词进行 BPE 处理,并将处理结果按空格分割后添加到 split_tokens 列表中
split_tokens.extend(list(self.bpe(token).split(" ")))return split_tokens
defnormalizeTweet(self, tweet):"""
Normalize a raw Tweet
"""# 替换 Tweet 中的特殊标点符号for punct in self.special_puncts:
tweet = tweet.replace(punct, self.special_puncts[punct])# 使用 Tweet 预处理器对 Tweet 进行分词
tokens = self.tweetPreprocessor.tokenize(tweet)# 对每个 token 进行规范化处理,并用空格连接起来
normTweet =" ".join([self.normalizeToken(token)for token in tokens])# 进行特定的单词规范化处理,替换常见的缩写和缩略语
normTweet =(
normTweet.replace("cannot ","can not ").replace("n't "," n't ").replace("n 't "," n't ").replace("ca n't","can't").replace("ai n't","ain't"))
normTweet =(
normTweet.replace("'m "," 'm ").replace("'re "," 're ").replace("'s "," 's ").replace("'ll "," 'll ").replace("'d "," 'd ").replace("'ve "," 've "))
normTweet =(
normTweet.replace(" p . m ."," p.m.").replace(" p . m "," p.m ").replace(" a . m ."," a.m.").replace(" a . m "," a.m "))return" ".join(normTweet.split())# 将给定的 token 标准化为小写形式defnormalizeToken(self, token):
lowercased_token = token.lower()# 如果 token 以 "@" 开头,则返回 "@USER"if token.startswith("@"):return"@USER"# 如果 token 的小写形式以 "http" 或 "www" 开头,则返回 "HTTPURL"elif lowercased_token.startswith("http")or lowercased_token.startswith("www"):return"HTTPURL"# 如果 token 的长度为 1eliflen(token)==1:# 如果 token 是特殊标点符号中的一种,则返回其对应的值if token in self.special_puncts:return self.special_puncts[token]# 如果存在表情解析器,则用表情解析器处理 token,否则返回原 tokenif self.demojizer isnotNone:return self.demojizer(token)else:return token
# 对于其他情况,直接返回 tokenelse:return token
# 根据 token 转换为对应的 id,使用给定的词汇表def_convert_token_to_id(self, token):return self.encoder.get(token, self.encoder.get(self.unk_token))# 根据 id 转换为对应的 token,使用给定的词汇表def_convert_id_to_token(self, index):return self.decoder.get(index, self.unk_token)# 将一系列 tokens 转换为单个字符串defconvert_tokens_to_string(self, tokens):
out_string =" ".join(tokens).replace("@@ ","").strip()return out_string
# 保存词汇表到指定目录defsave_vocabulary(self, save_directory:str, filename_prefix: Optional[str]=None)-> Tuple[str]:# 如果保存目录不存在,记录错误并返回ifnot os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")return# 构造词汇表文件路径和合并文件路径
out_vocab_file = os.path.join(
save_directory,(filename_prefix +"-"if filename_prefix else"")+ VOCAB_FILES_NAMES["vocab_file"])
out_merge_file = os.path.join(
save_directory,(filename_prefix +"-"if filename_prefix else"")+ VOCAB_FILES_NAMES["merges_file"])# 如果当前词汇表文件路径与目标路径不同且当前路径下存在词汇表文件,则复制词汇表文件到目标路径if os.path.abspath(self.vocab_file)!= os.path.abspath(out_vocab_file)and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)# 如果当前路径下不存在词汇表文件,则将当前模型的序列化词汇表模型写入目标路径elifnot os.path.isfile(self.vocab_file):withopen(out_vocab_file,"wb")as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)# 如果当前合并文件路径与目标路径不同,则复制合并文件到目标路径if os.path.abspath(self.merges_file)!= os.path.abspath(out_merge_file):
copyfile(self.merges_file, out_merge_file)return out_vocab_file, out_merge_file
defadd_from_file(self, f):"""
从文本文件中加载一个预先存在的字典,并将其符号添加到当前实例中。
"""# 如果输入参数 f 是字符串类型,则尝试打开该文件ifisinstance(f,str):try:withopen(f,"r", encoding="utf-8")as fd:# 递归调用 add_from_file 方法,加载文件内容
self.add_from_file(fd)except FileNotFoundError as fnfe:# 如果文件不存在,则抛出 FileNotFound 异常raise fnfe
except UnicodeError:# 如果在文件中检测到不正确的编码,则抛出异常raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")# 返回,结束当前函数调用return# 读取文件中的所有行
lines = f.readlines()# 遍历每一行内容for lineTmp in lines:# 去除行首尾空白符
line = lineTmp.strip()# 查找行中最后一个空格的位置
idx = line.rfind(" ")# 如果找不到空格,则抛出数值错误异常if idx ==-1:raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")# 提取空格之前的部分作为单词
word = line[:idx]# 将单词作为键,将当前编码器长度作为值存入编码器字典中
self.encoder[word]=len(self.encoder)# Natural Language Toolkit: Twitter Tokenizer## Copyright (C) 2001-2020 NLTK Project# Author: Christopher Potts <cgpotts@stanford.edu># Ewan Klein <ewan@inf.ed.ac.uk> (modifications)# Pierpaolo Pantone <> (modifications)# URL: http://nltk.org/# For license information, see LICENSE.TXT#"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression strings.
2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.
3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
the class Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will lowercase everything except for emoticons.
"""######################################################################## import regex # https://github.com/nltk/nltk/issues/2409# import html######################################################################## The following strings are components in the regular expression# that is used for tokenizing. It's important that phone_number# appears first in the final regex (since it can contain whitespace).# It also could matter that tags comes after emoticons, due to the# possibility of having text like## <:| and some text >:)## Most importantly, the final element should always be last, since it# does a last ditch whitespace-based tokenization of whatever is left.# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?# This particular element is used in a couple ways, so we define it# with a name:# docstyle-ignore
EMOTICONS =r"""
(?:
[<>]? # optional opening angle bracket
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]? # optional closing angle bracket
|
<3 # heart
)"""# URL pattern due to John Gruber, modified by Tom Winzig. See# https://gist.github.com/winzig/8894715# docstyle-ignore
URLS =r""" # Capture 1: entire matched URL
(?:
https?: # URL protocol and colon
(?:
/{1,3} # 1-3 slashes
| # or
[a-z0-9%] # Single letter or digit or '%'
# (Trying not to match e.g. "URI::Escape")
)
| # or
# looks like domain name followed by a slash:
[a-z0-9.\-]+[.]
(?:[a-z]{2,13})
/
)
(?: # One or more:
[^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
| # or
\(
[^\s()<>{}\[\]]+
\)
)+
  (?:                                  # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                        # balanced parens, non-recursive: (...)
    |                                  # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]     # not a space or one of these punctuation characters
  )
  |                                    # OR, the following to match naked domains:
  (?:
    (?<!@)                             # not preceded by a @ (avoid matching e.g. "foo@_gmail.com_")
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                              # not succeeded by a @ (avoid matching "foo.na" in "foo.na@example.com")
  )
"""
# The URLS pattern above matches URLs for tokenization purposes: a protocol- or domain-prefixed form
# with an optional parenthesized tail, or a bare domain name that is not part of an e-mail address.
# 定义正则表达式模式以识别不同类型的标记# 包括 URL、电话号码、ASCII 表情、HTML 标签、ASCII 箭头、Twitter 用户名、Twitter 主题标签、电子邮件地址等
REGEXPS =(
URLS,# 匹配 URLr"""
(?:
(?: # (国际)
\+?[01]
[ *\-.\)]*
)?
(?: # (区号)
[\(]?
\d{3}
[ *\-.\)]*
)?
\d{3} # 交换机
[ *\-.\)]*
\d{4} # 基站
)""",# 匹配电话号码
EMOTICONS,# 匹配 ASCII 表情r"""<[^>\s]+>""",# 匹配 HTML 标签r"""[\-]+>|<[\-]+""",# 匹配 ASCII 箭头r"""(?:@[\w_]+)""",# 匹配 Twitter 用户名r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",# 匹配 Twitter 主题标签r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",# 匹配电子邮件地址r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # 带有撇号或破折号的单词
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # 数字,包括分数、小数点
|
(?:[\w_]+) # 没有撇号或破折号的单词
|
(?:\.(?:\s*\.){1,}) # 省略号
|
(?:\S) # 其他非空白字符
""",# 匹配剩余的词类)####################################################################### 这是核心的分词正则表达式:# 将 REGEXPS 中的所有模式组合成一个大的正则表达式
WORD_RE = regex.compile(r"""(%s)"""%"|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)# HANG_RE 用于识别连续字符的模式
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")# EMOTICON_RE 用于识别表情符号的模式
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)# ENT_RE 用于将 HTML 实体转换为 Unicode 字符的模式
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")# 导入HTML实体替换函数from nltk.tokenize.casual import _replace_html_entities
# Example: replacing an HTML entity in a byte string returns the decoded Unicode text
_replace_html_entities(b"Price: &pound;100")         # -> 'Price: \xa3100'
print(_replace_html_entities(b"Price: &pound;100"))  # prints: Price: £100


class TweetTokenizer:
    r"""
Examples:
```
>>> # Tokenizer for tweets.
>>> from nltk.tokenize import TweetTokenizer
>>> tknzr = TweetTokenizer()
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
>>> tknzr.tokenize(s0)
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
>>> # Examples using *strip_handles* and *reduce_len parameters*:
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
>>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
>>> tknzr.tokenize(s1)
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
```"""def__init__(self, preserve_case=True, reduce_len=False, strip_handles=False):# Initialize the TweetTokenizer with options to preserve case, reduce elongated words, and strip handles.
self.preserve_case = preserve_case
self.reduce_len = reduce_len
self.strip_handles = strip_handles
deftokenize(self, text):"""
Tokenize a given text into a list of words.
Args:
text: str
Returns:
list(str): A list of tokens extracted from the text.
"""# Fix HTML character entities before tokenization
text = _replace_html_entities(text)# Remove Twitter handles if strip_handles is enabledif self.strip_handles:
text = remove_handles(text)# Reduce elongated words to their base form if reduce_len is enabledif self.reduce_len:
text = reduce_lengthening(text)# Replace problematic sequences of characters for safe tokenization
safe_text = HANG_RE.sub(r"\1\1\1", text)# Tokenize the text using a regular expression for word boundaries
words = WORD_RE.findall(safe_text)# Adjust word case unless it is part of an emoticon to preserve emoticon capitalizationifnot self.preserve_case:
words =[x if EMOTICON_RE.search(x)else x.lower()for x in words]return words
####################################################################### Normalization Functions######################################################################defreduce_lengthening(text):"""
Reduce repeated character sequences of length 3 or greater to sequences of length 3.
Args:
text: str
Returns:
str: Text with reduced elongations.
"""
pattern = regex.compile(r"(.)\1{2,}")return pattern.sub(r"\1\1\1", text)defremove_handles(text):"""
Remove Twitter username handles from text.
Args:
text: str
Returns:
str: Text with removed handles replaced by spaces.
"""
pattern = regex.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")# Substitute handles with ' ' to ensure correct tokenization around removed handlesreturn pattern.sub(" ", text)####################################################################### Tokenization Function######################################################################defcasual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):"""
Tokenize a text string using casual tokenization rules.
Args:
text: str
preserve_case: bool, optional (default=True)
Whether to preserve case in tokens.
reduce_len: bool, optional (default=False)
Whether to reduce elongated words.
strip_handles: bool, optional (default=False)
Whether to remove Twitter handles.
Returns:
list(str): A list of tokens extracted from the text based on specified rules.
"""# 创建一个TweetTokenizer对象,用于分词化处理,根据参数设置保留大小写、缩短长度和去除句柄"""
Convenience function for wrapping the tokenizer.
"""# 返回通过TweetTokenizer对象对文本进行分词化处理得到的结果return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
text
    )