NLP Fundamentals and Code: Tokenizers

Background

NLP-分词器:SentencePiece【参考Chinese-LLaMA-Alpaca在通用中文语料上训练的20K中文词表并与原版LLaMA模型的32K词表进行合并的代码】_sentencepiece 中文训练-CSDN博客

【OpenLLM 008】大模型基础组件之分词器-万字长文全面解读LLM中的分词算法与分词器(tokenization & tokenizers):BPE/WordPiece/ULM & beyond - 知乎 (zhihu.com)

BPE

Byte Pair Encoding

Steps:

1. Prepare the corpus.
2. Decide on the target vocabulary size.
3. Append an end-of-word separator (</w>) to each word.
4. Starting from single characters, iteratively merge the most frequent adjacent symbol pair.

"""
2024/9/9
bpe.py
wang_yi
"""
import re, collections

def get_vocab(filename):
    vocab = collections.defaultdict(int)  # with defaultdict(int), any missing key is created with value 0 on first access
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))  # re.escape escapes special characters so the pair is matched literally in the regex
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')  # (?<!\S)/(?!\S): negative lookbehind/lookahead, i.e. the pair must not be preceded or followed by a non-whitespace character (\S)
    for word in v_in:
        w_out = p.sub(''.join(pair), word)  # replace the matched symbol pair with its concatenation, leaving the rest of the string unchanged
        v_out[w_out] = v_in[word]  # use the rewritten string w_out as the key in v_out, keeping the frequency that v_in stored for word
    return v_out

def get_tokens(vocab):
    tokens = collections.defaultdict(int)
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
    return tokens

# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

# Get free book from Gutenberg
# wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt
vocab = get_vocab('pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print('Tokens: {}'.format(tokens))
print('Number of tokens: {}'.format(len(tokens)))
print('==========')

num_merges = 1000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens = get_tokens(vocab)
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')

Each word ends with the </w> marker; every merge produces a new token.

Printed results:

Tokens Before BPE
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 26103, 'e': 59190, '</w>': 101849, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'c': 13900, 't': 44238, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 103
==========
Iter: 0
Best pair: ('e', '</w>')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 26103, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't': 44238, '</w>': 84091, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 104
==========
Iter: 1
Best pair: ('t', 'h')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 12062, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't': 30197, '</w>': 84091, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'th': 14041, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 105
==========
Iter: 2
Best pair: ('t', '</w>')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 12062, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't</w>': 9280, 'G': 300, 'u': 13723, 't': 20917, 'n': 32498, 'b': 7426, 'g': 8752, '</w>': 74811, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'th': 14041, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 106
==========

The learned tokens can be saved, and input text can then be tokenized against this token list.

"""
2024/9/9
bpe_code.py
wang_yi
"""
import re, collections


def get_vocab(filename):
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1

    return vocab


def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs


def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out


def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization


def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)


def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token)  # re.escape already escapes '.' and other special characters for a literal match

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i + 1:],
                                           unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i + 1:],
                                       unknown_token=unknown_token)
        break
    return string_tokens


# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

vocab = get_vocab('pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

num_merges = 100
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')

# Let's check how tokenization will be for a known word
word_given_known = 'mountains</w>'
word_given_unknown = 'Ilikeeatingapples!</w>'

sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]),
                             reverse=True)  # sort primarily by measure_token_length(item[0]); ties are broken by item[1] (the frequency); reverse=True gives descending order
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print(sorted_tokens)

word_given = word_given_known

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

word_given = word_given_unknown

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

字节对编码 - 毛雷日志 (leimao.github.io) 
彻底搞懂BPE(Byte Pair Encode)原理(附代码实现)_bpe实现-CSDN博客

Problem:

A word may admit more than one segmentation, and the algorithm has no good way to judge which segmentation is more reasonable: the priority among the candidate combinations is undetermined, so in practice the first one is usually taken. For example:

When encoding the sentence "linear algebra", the following segmentations exist:

linear = li + near, or li + n + ea + r

algebra = al + ge + bra, or al + g + e + bra

In this particular case each word has two segmentations, so the whole sentence has four (the sketch below enumerates them).
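To make the ambiguity concrete, here is a minimal sketch (not part of the BPE code above; the merged subword vocabulary is hypothetical, chosen only to reproduce the example):

def all_segmentations(word, vocab):
    # Return every way of splitting `word` into subwords that all belong to `vocab`.
    if word == "":
        return [[]]
    results = []
    for i in range(1, len(word) + 1):
        prefix = word[:i]
        if prefix in vocab:
            for rest in all_segmentations(word[i:], vocab):
                results.append([prefix] + rest)
    return results

# Hypothetical subword vocabulary for the "linear algebra" example.
vocab = {"li", "near", "n", "ea", "r", "al", "ge", "bra", "g", "e"}
print(all_segmentations("linear", vocab))   # [['li', 'n', 'ea', 'r'], ['li', 'near']]
print(all_segmentations("algebra", vocab))  # [['al', 'g', 'e', 'bra'], ['al', 'ge', 'bra']]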

Solution: when merging, take into account how large the impact of the merge actually is (this is the idea behind WordPiece, below).

WordPiece

WordPiece generates new subwords based on probability rather than on the most frequent byte pair.

The difference between WordPiece and BPE lies in the merge step: BPE merges the most frequent symbol pair, whereas WordPiece adds to the vocabulary the adjacent subword pair whose merge most increases the likelihood of the language model.

Assume a sentence S = (t1, t2, ..., tn) consists of n subwords ti, and assume the subwords are mutually independent. The log-likelihood of S is then the sum of the log-probabilities of its subwords:

\log P(S) = \sum_{i=1}^n \log P(t_i)

If two adjacent subwords x and y are merged into a new subword z, the change in the likelihood of S can be expressed as

\Delta = \log P(t_z) - \big(\log P(t_x) + \log P(t_y)\big) = \log \frac{P(t_z)}{P(t_x)\, P(t_y)}

The merge with the largest likelihood gain is chosen. Concretely, the probability of the merged subword is divided by the product of the probabilities of the subwords before the merge: when considering merging "e" and "s", not only the probability of "es" but also the probabilities of "e" and "s" are taken into account. In other words, "es" is merged (or not) according to the value the merge brings.
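In code (see compute_pair_scores below), P is estimated by relative frequency over the current segmentation; writing N for the total number of symbol occurrences (N is introduced here only for the derivation), the gain reduces to a count-based score:

\Delta = \log \frac{P(t_z)}{P(t_x) P(t_y)} \approx \log \frac{freq(x, y)/N}{(freq(x)/N)\,(freq(y)/N)} = \log N + \log \frac{freq(x, y)}{freq(x)\, freq(y)}

Since \log N is the same for every candidate pair, ranking merges by \Delta is equivalent to ranking them by

score(x, y) = \frac{freq(x, y)}{freq(x) \times freq(y)}

which is exactly what compute_pair_scores computes.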

Steps:

1. Prepare the corpus.
2. Decide on the target vocabulary size.
3. Split each word into characters, prefixing every non-initial character with ## as the separator.
4. Starting from single characters, iteratively merge the pair whose merge gives the largest likelihood gain.

At encoding time, the longest matching subword is searched for greedily from the beginning of the word.

"""
2024/9/10
wordpiece.py
wang_yi
"""
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
from transformers import AutoTokenizer
from collections import defaultdict
# Compute the frequency of each word in the corpus during pre-tokenization:
tokenizer = AutoTokenizer.from_pretrained(r"G:\llm_model\bert-base-chinese")

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)
# The alphabet is the unique set of first letters of all words, plus all other letters, which are prefixed with ##:
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
print(alphabet)
# Add the special tokens used by the model at the start of the vocabulary; for BERT these are ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
# Split each word; every letter other than the first is prefixed with ##:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

# Compute the score of each pair
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores


pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)
# The first merge learned is ('a', '##b') -> 'ab'; add 'ab' to the vocabulary:
vocab.append("ab")


def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits


splits = merge_pair("a", "##b", splits)
print(splits["about"])

vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)
print(vocab)


def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens


print(encode_word("Hugging"))
print(encode_word("HOgging"))


def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])


tokenize("This is the Hugging Face course!")

Normalization and pre-tokenization - Hugging Face NLP Course

Output:

defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})
['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']
('T', '##h'): 0.125
('##h', '##i'): 0.03409090909090909
('##i', '##s'): 0.02727272727272727
('i', '##s'): 0.1
('t', '##h'): 0.03571428571428571
('##h', '##e'): 0.011904761904761904
('a', '##b') 0.2
['ab', '##o', '##u', '##t']
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', '##hm', '##thm', 'Hu', 'Hug', 'Hugg', 'ch', 'cha', 'chap', 'chapt', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut', '##ta']
['Hugg', '##i', '##n', '##g']
['[UNK]']
['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']

The first character of a word is left unchanged; tokens inside a word are prefixed with ##.

Unigram Language Model (ULM)

Compared with BPE and WordPiece, Unigram works in the opposite direction: it starts from a large vocabulary and removes tokens from it until the desired vocabulary size is reached. Note that base characters are never removed, so that any word can still be tokenized.

At each training step, the Unigram algorithm computes the loss of the corpus given the current vocabulary. Then, for every token in the vocabulary, it computes how much the overall loss would increase if that token were removed. The tokens whose removal increases the loss the least have little effect on the corpus loss; in that sense they are the "least needed" and are the best candidates for removal.

Steps:

1. Build a sufficiently large initial vocabulary.
2. Compute the probability of each token in the current vocabulary over the corpus.
3. Using the vocabulary, compute the loss of the corpus under its best segmentation.
4. For each token, compute the loss of the corpus under its best segmentation after removing that token.
5. Sort tokens by how much their removal increases the loss, and drop a given proportion of the tokens that affect the loss least. Single characters are never dropped, to avoid OOV cases.
6. Repeat steps 2-5 until the vocabulary shrinks to the target size.

An example:

Corpus (word: count): "hug": 10, "pug": 5, "pun": 12, "bun": 4, "hugs": 5

Initial tokens: every single character plus every substring of length >= 2, together with their occurrence counts in the corpus.

The probability of a given token is its frequency in the corpus (the number of times it is found) divided by the sum of the frequencies of all tokens in the vocabulary (so that the probabilities sum to 1). For example, "ug" occurs in "hug", "pug", and "hugs", so its frequency in the corpus is 20; the sum of all frequencies is 210, so the probability of the subword "ug" is 20/210.

"pug" 的标记化 ["p", "u", "g"] 的概率为:

标记化 ["pu", "g"] 的概率为:

将一个单词分成尽可能少的分词拥有更高的概率

整体损失函数为-\sum num(x)logp(x)
其中num(x)为当前单词在语料中出现的次数,p(x)为当前单词按某种分词方式分词的概率。

 对于下面的分词方式:

所以损失是:
10 * (-log(0.071428)) + 5 * (-log(0.007710)) + 12 * (-log(0.006168)) + 4 * (-log(0.001451)) + 5 * (-log(0.001701)) = 169.8

For each token, compute how the overall loss (under the new best segmentations) changes when that token is removed.

For example, if the token "hug" is removed, the best segmentation of the word "hug" becomes ["hu", "g"], with probability ≈ 0.006802; the word "hugs" re-tokenizes as ["hu", "gs"], whose probability is unchanged, so it does not contribute. The change in the overall loss is therefore

(count of the word "hug") * ( -(-log(0.071428)) + (-log(0.006802)) ) = 10 * (log(0.071428) - log(0.006802)) = 23.5
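As a quick numerical check of the two figures above, here is a minimal sketch (toy corpus and per-word probabilities taken from the example; natural logarithms are used, which reproduces the numbers):

from math import log

# Word counts of the toy corpus and the probability of each word's best segmentation (from the example above).
word_counts = {"hug": 10, "pug": 5, "pun": 12, "bun": 4, "hugs": 5}
p_best = {"hug": 0.071428, "pug": 0.007710, "pun": 0.006168, "bun": 0.001451, "hugs": 0.001701}

# Overall loss: -sum over words of num(x) * log p(x)
loss = sum(n * -log(p_best[w]) for w, n in word_counts.items())
print(round(loss, 1))  # 169.8

# Removing the token "hug": only the word "hug" gets a worse segmentation (["hu", "g"], p ~= 0.006802);
# "hugs" re-tokenizes as ["hu", "gs"] with the same probability, so it does not change the loss.
delta = word_counts["hug"] * (-log(0.006802) + log(0.071428))
print(round(delta, 1))  # 23.5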

"""
2024/9/10
ulm.py
wang_yi
"""
# import os
# os.environ['ALL_PROXY'] = 'http://127.0.0.1:7890'

corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
from transformers import AutoTokenizer
from collections import defaultdict

tokenizer = AutoTokenizer.from_pretrained(r"G:\llm_model\xlnet-base-cased")

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)
# single-character tokens
char_freqs = defaultdict(int)
# all other candidate tokens (substrings of length >= 2)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for i in range(len(word)):
        char_freqs[word[i]] += freq
        # Loop through the subwords of length at least 2
        for j in range(i + 2, len(word) + 1):
            subwords_freqs[word[i:j]] += freq

# Sort subwords by frequency
sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)  # items() yields (subword, frequency) pairs; sort by frequency, descending
print(sorted_subwords[:10])

token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]
token_freqs = {token: freq for token, freq in token_freqs}

from math import log

total_sum = sum([freq for token, freq in token_freqs.items()])
# negative log-probability of each token (lower means more probable)
model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

def encode_word(word, model):
    best_segmentations = [{"start": 0, "score": 1}] + [
        {"start": None, "score": None} for _ in range(len(word))  # hopefully This
    ]
    for start_idx in range(len(word)):
        # This should be properly filled by the previous steps of the loop
        best_score_at_start = best_segmentations[start_idx]["score"]
        for end_idx in range(start_idx + 1, len(word) + 1):
            token = word[start_idx:end_idx]
            if token in model and best_score_at_start is not None:
                score = model[token] + best_score_at_start
                # If we have found a better segmentation ending at end_idx, we update
                if (
                    best_segmentations[end_idx]["score"] is None
                    or best_segmentations[end_idx]["score"] > score
                ):
                    best_segmentations[end_idx] = {"start": start_idx, "score": score}

    segmentation = best_segmentations[-1]
    if segmentation["score"] is None:
        # We did not find a tokenization of the word -> unknown
        return ["<unk>"], None

    score = segmentation["score"]
    start = segmentation["start"]
    end = len(word)
    tokens = []
    while start != 0:
        tokens.insert(0, word[start:end])
        next_start = best_segmentations[start]["start"]
        end = start
        start = next_start
    tokens.insert(0, word[start:end])
    return tokens, score


print(encode_word("Hopefully", model))
print(encode_word("This", model))

def compute_loss(model):
    loss = 0
    for word, freq in word_freqs.items():
        _, word_loss = encode_word(word, model)
        loss += freq * word_loss
    return loss

print(compute_loss(model))

import copy


def compute_scores(model):
    scores = {}
    model_loss = compute_loss(model)
    for token, score in model.items():
        # We always keep tokens of length 1
        if len(token) == 1:
            continue
        model_without_token = copy.deepcopy(model)
        _ = model_without_token.pop(token)
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores

scores = compute_scores(model)
print(scores["ll"])
print(scores["his"])

percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Remove percent_to_remove tokens with the lowest scores.
    for i in range(int(len(model) * percent_to_remove)):
        _ = token_freqs.pop(sorted_scores[i][0])

    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

def tokenize(text, model):
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in words_with_offsets]
    encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
    return sum(encoded_words, [])


print(tokenize("This is the Hugging Face course.", model))

A ▁ marks the beginning of each word.

defaultdict(<class 'int'>, {'▁This': 3, '▁is': 2, '▁the': 1, '▁Hugging': 1, '▁Face': 1, '▁Course.': 1, '▁chapter': 1, '▁about': 1, '▁tokenization.': 1, '▁section': 1, '▁shows': 1, '▁several': 1, '▁tokenizer': 1, '▁algorithms.': 1, '▁Hopefully,': 1, '▁you': 1, '▁will': 1, '▁be': 1, '▁able': 1, '▁to': 1, '▁understand': 1, '▁how': 1, '▁they': 1, '▁are': 1, '▁trained': 1, '▁and': 1, '▁generate': 1, '▁tokens.': 1})
[('▁t', 7), ('is', 5), ('er', 5), ('▁a', 5), ('▁to', 4), ('to', 4), ('en', 4), ('▁T', 3), ('▁Th', 3), ('▁Thi', 3)]
(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)
(['This'], 6.288267030694535)
413.10377642940875
6.376412403623874
0.0
['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁', 'c', 'ou', 'r', 's', 'e', '.']