tokenizer
from transformers import BertTokenizer
Load a pretrained tokenizer with .from_pretrained()
sents = [
    '1孩子想了解一下省移动网络部.',
    '2因为加班么? 不过现在还有不加班的地方么',
    '3对,楼上说的在理,省电信好像确实加班少',
    '4省电信好像只招大数据,不招开发',
    '5没好工作,投就行!先找份工作再说!',
]
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese',
vocab_size=21128,
model_max_len=512, is_fast=False,
padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
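The special tokens listed in the repr above can also be queried directly as attributes; a quick sketch (the ids match the 101/102/0 values that appear in the encodings below):
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 102
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0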
Basic encoding: tokenizer.encode()
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
)
print(out)
print(tokenizer.decode(out))
'''
[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0]
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
'''
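encode() bundles two lower-level steps that can also be run separately; a minimal sketch:
# Step 1: split the text into wordpiece tokens; step 2: map each token to its vocab id
tokens = tokenizer.tokenize(sents[0])
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)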
Enhanced encoding: tokenizer.encode_plus()
out = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
'''
input_ids              the encoded token ids
token_type_ids         0 for the first sentence and special tokens, 1 for the second sentence
special_tokens_mask    1 at special-token positions, 0 elsewhere
attention_mask         1 at real tokens, 0 at pad positions
length                 the length of the encoded sequence
'''
for k, v in out.items():
    print(k, ':', v)
print(tokenizer.decode(out['input_ids']))
'''
input_ids : [101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
length : 40
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
'''
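Since the pad token's id is 0 and no real token shares that id, the attention_mask for this example can be reproduced from input_ids; a small consistency check:
# 1 wherever the id is a real token, 0 wherever it is [PAD]
mask = [int(i != tokenizer.pad_token_id) for i in out['input_ids']]
assert mask == out['attention_mask']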
Batch encoding: tokenizer.batch_encode_plus()
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
for k, v in out.items():
    print(k, ':', v)
print('=' * 50)
print(tokenizer.decode(out['input_ids'][0]), '\n', tokenizer.decode(out['input_ids'][1]))
'''
input_ids : [[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
length : [17, 22]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
==================================================
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
'''
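Passing return_tensors='pt' instead of None returns PyTorch tensors directly, which is what a model's forward pass expects; a sketch:
out_pt = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    max_length=40,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
print(out_pt['input_ids'].shape)  # torch.Size([2, 40])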
Batch pair encoding: tokenizer.batch_encode_plus()
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
for k, v in out.items():
    print(k, ':', v)
print('=' * 50)
print(tokenizer.decode(out['input_ids'][0]), '\n', tokenizer.decode(out['input_ids'][1]))
'''
input_ids : [[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0], [101, 124, 2190, 8024, 3517, 677, 6432, 4638, 1762, 4415, 8024, 4689, 4510, 928, 1962, 1008, 4802, 2141, 1217, 4408, 2208, 102, 125, 4689, 4510, 928, 1962, 1008, 1372, 2875, 1920, 3144, 2945, 8024, 679, 2875, 2458, 1355, 102, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]
length : [38, 39]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]
==================================================
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
[CLS] 3 对 , 楼 上 说 的 在 理 , 省 电 信 好 像 确 实 加 班 少 [SEP] 4 省 电 信 好 像 只 招 大 数 据 , 不 招 开 发 [SEP] [PAD]
'''
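With sentence pairs, truncation can also target one side only via the 'only_first' / 'only_second' strategies; a sketch (assuming max_length=30) that keeps the first sentence intact:
out_pair = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    max_length=30,
    truncation='only_second',  # only the second sentence is shortened
    padding='max_length',
)
print(tokenizer.decode(out_pair['input_ids']))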
Vocabulary operations
zidian = tokenizer.get_vocab()
type(zidian), len(zidian), '你好' in zidian, '我好' in zidian
'''
(dict, 21128, False, False)
'''
tokenizer.add_tokens(new_tokens=['你好', '我好'])
tokenizer.add_special_tokens({'eos_token': '[EOS]'})
zidian = tokenizer.get_vocab()
type(zidian), len(zidian), '你好' in zidian, zidian['你好']
'''
(dict, 21131, True, 21128)
'''
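If the enlarged vocabulary is to be used with a model, the model's token-embedding matrix must be resized to match; a sketch, assuming a freshly loaded BertModel:
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-chinese')
model.resize_token_embeddings(len(tokenizer))  # grow the embedding table to 21131 rows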
datasets
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
'''
Dataset operations (from datasets import load_dataset)
1. Load data: .load_dataset(path=...)
from datasets import load_dataset
dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train')
dataset, dataset[0]
'''
(Dataset({
features: ['text', 'label'],
num_rows: 9600
}),
{'text': '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般',
'label': 1})
'''
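Omitting split returns a DatasetDict containing every split (here train/validation/test, per the arrow files listed earlier), and slice syntax selects a subset of one split; a sketch:
all_splits = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp')  # DatasetDict
small = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train[:100]')  # first 100 rows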
2. Sort: .sort('column name')
print('before sort', dataset['label'][:20])
sorted_dataset = dataset.sort('label')
print('after sort', sorted_dataset['label'][:20])
'''
before sort [1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]
after sort [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
'''
3. Shuffle: .shuffle()
print('before shuffle', sorted_dataset['label'][:20])
shuffled_dataset = sorted_dataset.shuffle(seed=12)
print('after shuffle', shuffled_dataset['label'][:20])
'''
before shuffle [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
after shuffle [1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0]
'''
4. Select: .select()
dataset.select([0, 10, 20])
'''
Dataset({
features: ['text', 'label'],
num_rows: 3
})
'''
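A related selection helper is train_test_split(), which splits one Dataset into two; a sketch:
splits = dataset.train_test_split(test_size=0.1, seed=12)
print(splits['train'].num_rows, splits['test'].num_rows)  # 8640 960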
5. Filter: .filter(function name)
print(dataset)
def fun(data):
    return data['text'].startswith('选择')
start_with_ar = dataset.filter(fun)
print(len(start_with_ar), start_with_ar['text'])
'''
Dataset({
features: ['text', 'label'],
num_rows: 9600
})
2 ['选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。
服务吗,一般', '选择的事例太离奇了,夸大了心理咨询的现实意义,让人失去了信任感!
如果说这样写的效果能在一开始抓住读者的眼球,但是看到案例主人公心理问题的原因解释时就逐渐失去了兴趣,
反正有点拣了芝麻丢了西瓜的感觉。']
'''
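.filter() drops rows; its companion .map() transforms them. A typical use is tokenizing every text with the BERT tokenizer from above; a sketch:
def tokenize_fn(batch):
    # with batched=True, batch['text'] is a list, so the tokenizer encodes many texts at once
    return tokenizer(batch['text'], truncation=True, max_length=128)
tokenized = dataset.map(tokenize_fn, batched=True)
print(tokenized.column_names)  # original columns plus input_ids, token_type_ids, attention_mask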
6. Column operations: rename, remove, set data type
print(dataset.rename_column('text', 'renamed_text'))
print(dataset.remove_columns(['text']))
dataset.set_format(type='torch', columns=['label'])
print(dataset[0])
'''
Dataset({
features: ['renamed_text', 'label'],
num_rows: 9600
})
Dataset({
features: ['label'],
num_rows: 9600
})
{'label': tensor(1)}
'''
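set_format() can be undone with reset_format(), which restores plain Python objects; a sketch:
dataset.reset_format()
print(dataset[0]['label'])  # back to a plain int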
7. Save and load
from datasets import load_from_disk
dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train')
dataset.save_to_disk('/kaggle/output/1')
dataset = load_from_disk('/kaggle/output/1')
dataset
'''
Dataset({
features: ['text', 'label'],
num_rows: 9600
})
'''
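Besides the arrow format used by save_to_disk(), a Dataset can export to common file formats; a sketch (the output path is illustrative):
dataset.to_csv('/kaggle/output/train.csv')  # related: to_json(), to_pandas()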
Metrics (from datasets import load_metric)
from datasets import list_metrics
metrics_list = list_metrics()
print(len(metrics_list))
print(metrics_list)
'''
84
['accuracy', 'bertscore', 'bleu', 'bleurt',
'brier_score', 'cer', 'chrf', 'code_eval', 'comet',
'competition_math', 'coval', 'cuad', 'exact_match', 'f1',
'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae',
'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou',
'abidlabs/mean_iou2', 'angelina-wang/directional_bias_amplification', 'cakiki/ndcg'
... ...]
'''
from datasets import load_metric
metric = load_metric('glue', 'mrpc')
print(metric)
'''
Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
predictions: list of predictions to score.
Each translation should be tokenized into a list of tokens.
references: list of lists of references for each translation.
Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
"accuracy": Accuracy
"f1": F1 score
"pearson": Pearson Correlation
"spearmanr": Spearman Correlation
"matthews_correlation": Matthew Correlation
Examples:
>>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0, 'f1': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'stsb')
>>> references = [0., 1., 2., 3., 4., 5.]
>>> predictions = [0., 1., 2., 3., 4., 5.]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)})
{'pearson': 1.0, 'spearmanr': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'cola')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'matthews_correlation': 1.0}
""", stored examples: 0)
'''
pre = [0, 1, 0]
ref = [0, 1, 1]
metric.compute(predictions=pre, references=ref)
'''
{'accuracy': 0.6666666666666666, 'f1': 0.6666666666666666}
'''
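Instead of scoring everything in one call, predictions can be accumulated batch by batch with add_batch() and scored once at the end; a sketch:
metric.add_batch(predictions=[0, 1], references=[0, 1])
metric.add_batch(predictions=[0], references=[1])
print(metric.compute())  # scores over all three accumulated examples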
Pipelines (from transformers import pipeline)
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
print(classifier('I hate you'))
print(classifier('I love you'))
'''
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
[{'label': 'POSITIVE', 'score': 0.9998656511306763}]
'''
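The warning above appears because no model was pinned; passing the model name explicitly (here the same default the pipeline chose) silences it and makes the run reproducible:
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)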