tokenizer
from transformers import BertTokenizer
Load a pretrained tokenizer with .from_pretrained()
sents = [
    '1孩子想了解一下省移动网络部.',
    '2因为加班么? 不过现在还有不加班的地方么',
    '3对,楼上说的在理,省电信好像确实加班少',
    '4省电信好像只招大数据,不招开发',
    '5没好工作,投就行!先找份工作再说!',
]
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese',
vocab_size=21128,
model_max_len=512, is_fast=False,
padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
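The special tokens listed in the repr above can also be queried directly as attributes; a quick sketch (the ids match the 101/102/0 values that appear in the encodings below):
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 102
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0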
Basic encoding: tokenizer.encode()
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
)
print(out)
print(tokenizer.decode(out))
'''
[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0]
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
'''
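encode() bundles two lower-level steps that can also be run separately; a minimal sketch:
# Step 1: split the text into wordpiece tokens; step 2: map each token to its vocab id
tokens = tokenizer.tokenize(sents[0])
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)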
Enhanced encoding: tokenizer.encode_plus()
out = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
'''
input_ids              the encoded token ids
token_type_ids         0 for the first sentence and special tokens, 1 for the second sentence
special_tokens_mask    1 at special-token positions, 0 elsewhere
attention_mask         1 at real tokens, 0 at pad positions
length                 the length of the encoded sequence
'''
for k, v in out.items():
    print(k, ':', v)
print(tokenizer.decode(out['input_ids']))
'''
input_ids : [101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
length : 40
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
'''
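Since the pad token's id is 0 and no real token shares that id, the attention_mask for this example can be reproduced from input_ids; a small consistency check:
# 1 wherever the id is a real token, 0 wherever it is [PAD]
mask = [int(i != tokenizer.pad_token_id) for i in out['input_ids']]
assert mask == out['attention_mask']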
Batch encoding: tokenizer.batch_encode_plus()
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
for k, v in out.items():
    print(k, ':', v)
print('=' * 50)
print(tokenizer.decode(out['input_ids'][0]), '\n', tokenizer.decode(out['input_ids'][1]))
'''
input_ids : [[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
length : [17, 22]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
==================================================
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
'''
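Passing return_tensors='pt' instead of None returns PyTorch tensors directly, which is what a model's forward pass expects; a sketch:
out_pt = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    max_length=40,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
print(out_pt['input_ids'].shape)  # torch.Size([2, 40])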
Batch pair encoding: tokenizer.batch_encode_plus()
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],
    max_length=40,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
for k, v in out.items():
    print(k, ':', v)
print('=' * 50)
print(tokenizer.decode(out['input_ids'][0]), '\n', tokenizer.decode(out['input_ids'][1]))
'''
input_ids : [[101, 122, 2111, 2094, 2682, 749, 6237, 671, 678, 4689, 4919, 1220, 5381, 5317, 6956, 119, 102, 123, 1728, 711, 1217, 4408, 720, 8043, 679, 6814, 4385, 1762, 6820, 3300, 679, 1217, 4408, 4638, 1765, 3175, 720, 102, 0, 0], [101, 124, 2190, 8024, 3517, 677, 6432, 4638, 1762, 4415, 8024, 4689, 4510, 928, 1962, 1008, 4802, 2141, 1217, 4408, 2208, 102, 125, 4689, 4510, 928, 1962, 1008, 1372, 2875, 1920, 3144, 2945, 8024, 679, 2875, 2458, 1355, 102, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]
length : [38, 39]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]
==================================================
[CLS] 1 孩 子 想 了 解 一 下 省 移 动 网 络 部. [SEP] 2 因 为 加 班 么 ? 不 过 现 在 还 有 不 加 班 的 地 方 么 [SEP] [PAD] [PAD]
[CLS] 3 对 , 楼 上 说 的 在 理 , 省 电 信 好 像 确 实 加 班 少 [SEP] 4 省 电 信 好 像 只 招 大 数 据 , 不 招 开 发 [SEP] [PAD]
'''
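With sentence pairs, truncation can also target one side only via the 'only_first' / 'only_second' strategies; a sketch (assuming max_length=30) that keeps the first sentence intact:
out_pair = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    max_length=30,
    truncation='only_second',  # only the second sentence is shortened
    padding='max_length',
)
print(tokenizer.decode(out_pair['input_ids']))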
Vocabulary operations
zidian = tokenizer.get_vocab()
type(zidian), len(zidian), '你好' in zidian, '我好' in zidian
'''
(dict, 21128, False, False)
'''
tokenizer.add_tokens(new_tokens=['你好', '我好'])
tokenizer.add_special_tokens({'eos_token': '[EOS]'})
zidian = tokenizer.get_vocab()
type(zidian), len(zidian), '你好' in zidian, zidian['你好']
'''
(dict, 21131, True, 21128)
'''
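If the enlarged vocabulary is to be used with a model, the model's token-embedding matrix must be resized to match; a sketch, assuming a freshly loaded BertModel:
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-chinese')
model.resize_token_embeddings(len(tokenizer))  # grow the embedding table to 21131 rows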
datasets
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
'''
Dataset operations (from datasets import load_dataset)
1. Load data: .load_dataset(path=...)
from datasets import load_dataset
dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train')
dataset, dataset[0]
'''
(Dataset({
features: ['text', 'label'],
num_rows: 9600
}),
{'text': '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般',
'label': 1})
'''
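Omitting split returns a DatasetDict containing every split (here train/validation/test, per the arrow files listed earlier), and slice syntax selects a subset of one split; a sketch:
all_splits = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp')  # DatasetDict
small = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train[:100]')  # first 100 rows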
2. Sort: .sort('column name')
print('before sort', dataset['label'][:20])
sorted_dataset = dataset.sort('label')
print('after sort', sorted_dataset['label'][:20])
'''
before sort [1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]
after sort [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
'''
3. Shuffle: .shuffle()
print('before shuffle', sorted_dataset['label'][:20])
shuffled_dataset = sorted_dataset.shuffle(seed=12)
print('after shuffle', shuffled_dataset['label'][:20])
'''
before shuffle [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
after shuffle [1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0]
'''
4. Select: .select()
dataset.select([0, 10, 20])
'''
Dataset({
features: ['text', 'label'],
num_rows: 3
})
'''
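A related selection helper is train_test_split(), which splits one Dataset into two; a sketch:
splits = dataset.train_test_split(test_size=0.1, seed=12)
print(splits['train'].num_rows, splits['test'].num_rows)  # 8640 960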
5. Filter: .filter(function name)
print(dataset)
def fun(data):
    return data['text'].startswith('选择')
start_with_ar = dataset.filter(fun)
print(len(start_with_ar), start_with_ar['text'])
'''
Dataset({
features: ['text', 'label'],
num_rows: 9600
})
2 ['选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。
服务吗,一般', '选择的事例太离奇了,夸大了心理咨询的现实意义,让人失去了信任感!
如果说这样写的效果能在一开始抓住读者的眼球,但是看到案例主人公心理问题的原因解释时就逐渐失去了兴趣,
反正有点拣了芝麻丢了西瓜的感觉。']
'''
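.filter() drops rows; its companion .map() transforms them. A typical use is tokenizing every text with the BERT tokenizer from above; a sketch:
def tokenize_fn(batch):
    # with batched=True, batch['text'] is a list, so the tokenizer encodes many texts at once
    return tokenizer(batch['text'], truncation=True, max_length=128)
tokenized = dataset.map(tokenize_fn, batched=True)
print(tokenized.column_names)  # original columns plus input_ids, token_type_ids, attention_mask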
6. Column operations: rename, remove, set data type
print(dataset.rename_column('text', 'renamed_text'))
print(dataset.remove_columns(['text']))
dataset.set_format(type='torch', columns=['label'])
print(dataset[0])
'''
Dataset({
features: ['renamed_text', 'label'],
num_rows: 9600
})
Dataset({
features: ['label'],
num_rows: 9600
})
{'label': tensor(1)}
'''
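set_format() can be undone with reset_format(), which restores plain Python objects; a sketch:
dataset.reset_format()
print(dataset[0]['label'])  # back to a plain int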
7. Save and load
from datasets import load_from_disk
dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split='train')
dataset.save_to_disk('/kaggle/output/1')
dataset = load_from_disk('/kaggle/output/1')
dataset
'''
Dataset({
features: ['text', 'label'],
num_rows: 9600
})
'''
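Besides the arrow format used by save_to_disk(), a Dataset can export to common file formats; a sketch (the output path is illustrative):
dataset.to_csv('/kaggle/output/train.csv')  # related: to_json(), to_pandas()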
Metrics (from datasets import load_metric)
from datasets import list_metrics
metrics_list = list_metrics()
print(len(metrics_list))
print(metrics_list)
'''
84
['accuracy', 'bertscore', 'bleu', 'bleurt',
'brier_score', 'cer', 'chrf', 'code_eval', 'comet',
'competition_math', 'coval', 'cuad', 'exact_match', 'f1',
'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae',
'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou',
'abidlabs/mean_iou2', 'angelina-wang/directional_bias_amplification', 'cakiki/ndcg'
... ...]
'''
from datasets import load_metric
metric = load_metric('glue', 'mrpc')
print(metric)
'''
Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
predictions: list of predictions to score.
Each translation should be tokenized into a list of tokens.
references: list of lists of references for each translation.
Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
"accuracy": Accuracy
"f1": F1 score
"pearson": Pearson Correlation
"spearmanr": Spearman Correlation
"matthews_correlation": Matthew Correlation
Examples:
>>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0, 'f1': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'stsb')
>>> references = [0., 1., 2., 3., 4., 5.]
>>> predictions = [0., 1., 2., 3., 4., 5.]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)})
{'pearson': 1.0, 'spearmanr': 1.0}
>>> glue_metric = datasets.load_metric('glue', 'cola')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'matthews_correlation': 1.0}
""", stored examples: 0)
'''
pre = [0, 1, 0]
ref = [0, 1, 1]
metric.compute(predictions=pre, references=ref)
'''
{'accuracy': 0.6666666666666666, 'f1': 0.6666666666666666}
'''
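Instead of scoring everything in one call, predictions can be accumulated batch by batch with add_batch() and scored once at the end; a sketch:
metric.add_batch(predictions=[0, 1], references=[0, 1])
metric.add_batch(predictions=[0], references=[1])
print(metric.compute())  # scores over all three accumulated examples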
Pipelines (from transformers import pipeline)
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
print(classifier('I hate you'))
print(classifier('I love you'))
'''
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
[{'label': 'POSITIVE', 'score': 0.9998656511306763}]
'''
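The warning above appears because no model was pinned; passing the model name explicitly (here the same default the pipeline chose) silences it and makes the run reproducible:
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)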