【LLM】mllm: A Read-Through of the On-Device LLM Inference Framework

Code: mllm

1. Overall Flow

(Figure omitted: the overall inference flow, i.e. model export → model definition in the framework → op load → op reshape → op execute → sampling.)

2. Model Structure Organization

// TODO:
/*
 * ┌───────┬──────┬───────┬────────┬───────────┬─────────┬─────────┬──────┬──────────────────────┬─────────────────────────┐
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │Index │       │        │           │         │         │      │                      │                         │
 * │       │ Len  │       │        │           │         │         │      │                      │                         │
 * │ Magic │ INT  │ Name  │Name    │ Weights   │ Offset  │ DataType│....  │   Weights Contents   │   Weights Contents      │
 * │       │      │ Length│String  │ Length    │  INT    │  INT    │      │                      │                         │
 * │       │      │ INT   │        │  INT      │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * │       │      │       │        │           │         │         │      │                      │                         │
 * └───────┴──────┴───────┴────────┴───────────┴─────────┴─────────┴──────┴──────────────────────┴─────────────────────────┘
 * Weights File Structure
 */
The overall file layout is: int(20012) + u64(index table size) + [4 + len(name) + 8 + 8 + 4] * n + [weight bytes] * n. The conversion script below serializes a PyTorch / safetensors checkpoint into this format:
import argparse
import json
import struct
from functools import reduce
from io import BufferedWriter
import os
import torch

MAGIC_NUMBER = 20012
file_map = {}


class Tensor:
    name: str
    offset: int
    size: int
    dtype: int

    def __init__(self, name: str, dtype: int):
        self.name = name
        self.dtype = dtype


# One Tensor Index Item Contains: Name_Len(Int)+Name(str)+Weights_Len(UInt64)+Offset(UInt64)+DataType(Int)
def calc_tensors_index_table_size(name: str):
    return 4 + len(name) + 8 + 8 + 4


class Writer:
    writer: BufferedWriter
    tensors_map: [str, Tensor]
    tensors_name: [str]

    def __init__(self, path: str):
        self.tensors_map = {}
        self.tensors_name = []
        self.writer = open(path, "wb+")
        self.writer.seek(0)
        self.write_int(MAGIC_NUMBER)

    def __torch_dtype_to_int(self, dtype: torch.dtype) -> int:
        if dtype == torch.float32 or dtype == torch.bfloat16:
            return 0
        elif dtype == torch.float16:
            return 1
        elif dtype == torch.int8 or dtype == torch.bool:
            return 16
        elif dtype == torch.int32:
            return 18
        else:
            raise Exception(f"Unknown dtype: {dtype}")

    def write_int(self, val: int):
        self.writer.write(struct.pack("<i", val))

    def write_float(self, val: float):
        self.writer.write(struct.pack("<f", val))

    def write_u64(self, val: int):
        self.writer.write(struct.pack("<Q", val))

    def write_str(self, val: str):
        self.writer.write(struct.pack("<i", len(val)))
        self.writer.write(val.encode("utf-8"))

    def write_tensor(self, tensor: torch.Tensor, name: str) -> [int, int]:
        tensor_idx = Tensor(name=name, dtype=self.__torch_dtype_to_int(tensor.dtype))
        self.tensors_map[name] = tensor_idx
        offset = self.writer.tell()
        if tensor.dtype == torch.bfloat16:  # bfloat16 is stored as float32
            tensor_numpy = tensor.detach().to(torch.float32).numpy()
        elif tensor.dtype == torch.bool or tensor.dtype == torch.int8:  # exported model for QNN int8
            tensor_numpy = tensor.detach().to(torch.int8).numpy()
        else:
            tensor_numpy = tensor.numpy()
        tensor_numpy.tofile(self.writer)
        size = self.writer.tell() - offset
        tensor_idx.size = size
        tensor_idx.offset = offset
        return offset, size

    def write_tensor_index(
            self,
    ):
        self.writer.seek(4 + 8)
        for tensor_name in self.tensors_name:
            tensor = self.tensors_map[tensor_name]
            # self.write_int(len(tensor.name))
            tensor.name = tensor.name.replace("_weight", ".weight")
            tensor.name = tensor.name.replace("_bias", ".bias")
            # todo: not used in GTEST
            # tensor.name = key_map(tensor.name, args.type)
            self.write_str(tensor.name)
            self.write_u64(tensor.size)
            self.write_u64(tensor.offset)
            self.write_int(tensor.dtype)
            print(f"Write tensor {tensor.name} to {tensor.offset} with size {tensor.size}")

    def write_tensor_index_padding(self, tensors_name: [str]):
        if len(tensors_name) > 0:
            self.tensors_name = tensors_name
            padding_size = reduce(
                lambda x, y: x + y, map(calc_tensors_index_table_size, tensors_name)
            )
            self.writer.seek(4)             # skip the 4-byte magic number
            self.write_u64(padding_size)    # byte size of the tensor index table
            print(f"Padding size: {padding_size}")
            self.writer.write(b"\x00" * padding_size)
            self.writer.flush()
            return
        else:
            raise Exception("No tensors to write")

    def close(self):
        self.writer.close()


def get_tensor(model: dict, key: str, index_: dict):
    if index_ is not None and isinstance(index_, dict) and "weight_map" in index_.keys():
        if key in index_["weight_map"].keys():
            model_ = file_map[index_["weight_map"][key]]
            if args.type == "torch":
                return model_[key]
            if args.type == "safetensor":
                return model_.get_tensor(key)
        else:
            raise Exception(f"Tensor {key} not found in index")
    if key in model.keys():
        if args.type == "torch":
            return model[key]
        if args.type == "safetensor":
            return model.get_tensor(key)
    else:
        raise Exception(f"Tensor {key} not found in model")


def all_keys(model: dict, index_: dict):
    global file_map
    all_keys_name = []
    if index_ is not None and isinstance(index_, dict) and "weight_map" in index_.keys():
        json_pwd = os.path.dirname(args.input_model.name)
        for (key, val) in index_["weight_map"].items():
            all_keys_name.append(key)
            if val is not None and val not in file_map.keys():
                # JOIN PATH
                val_path = os.path.join(json_pwd, val)
                print(val_path)
                if args.type == "torch":
                    file_map[val] = torch.load(val_path, weights_only=True)
                else:
                    file_map[val] = safe_open(val_path, framework="pt")
    else:
        for key in model.keys():
            if not key.startswith("_"):
                if args.type == "torch":
                    val = model[key]
                if args.type == "safetensor":
                    val = model.get_tensor(key)
                if isinstance(val, torch.Tensor):
                    all_keys_name.append(key)
                elif isinstance(val, dict):
                    all_keys_name.extend(all_keys(val, None))
                else:
                    pass
    return all_keys_name


def process_str(name: str, type: str='dense'):
    if type == 'dense' or ('down_proj.weight' not in name):
        return name
    return name.replace('weight', 'weight_T')

def process(name: str, ten: torch.Tensor, type: str='dense'):
    if type == 'dense' or ('down_proj.weight' not in name):
        return name, ten

    new_name = name.replace('weight', 'weight_T')
    transposed_tensor = ten.transpose(-2, -1).contiguous()
    return new_name, transposed_tensor


if __name__ == "__main__":
    global args
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_model", type=argparse.FileType("r"), default="/home/hub/Qwen1.5-0.5B/model.safetensors"
    )
    parser.add_argument("--output_model", type=str, default="output/qwen1.5-0.5b-fp32.mllm")
    parser.add_argument(
        "--type",
        choices=["torch", "safetensor"],
        default="safetensor",
    )
    parser.add_argument(
        "--model_type",
        choices=["dense", "sparse"],
        default="dense",
    )
    model = None
    index_ = None
    args = parser.parse_args()
    if args.type == "torch":
        if args.input_model.name.endswith(".json"):
            if os.path.basename(args.input_model.name) != "pytorch_model.bin.index.json":
                raise Exception("Only support pytorch_model.bin.index.json")
            index_ = json.load(args.input_model)
        else:
            model = torch.load(args.input_model.name)
            if isinstance(model, dict) and "model" in model.keys():
                model = model["model"]
    elif args.type == "safetensor":
        from safetensors import safe_open

        if args.input_model.name.endswith(".json"):
            index_ = json.load(args.input_model)
        else:
            tensors = {}
            args.input_model.close()
            model = safe_open(args.input_model.name, framework="pt")
            for key in model.keys():
                tensors[key] = model.get_tensor(key)
    else:
        raise Exception("Unknown type")
    writer = Writer(args.output_model)
    model_keys = all_keys(model, index_)
    writer.write_tensor_index_padding([process_str(name, args.model_type) for name in model_keys])

    for key in model_keys:
        tensor = get_tensor(model, key, index_)
        key, tensor = process(key, tensor, args.model_type)
        if tensor.dtype != torch.bool and tensor.dtype != torch.int8:  # keep bool/int8 (QNN export) untouched
            tensor = tensor.float()
        offset, size = writer.write_tensor(tensor, key)
        print(f"Get tensor {key} to {offset} with size {size}")

    writer.write_tensor_index()

1 Model Export

● Define the model file format.
● Write the model parameters into the .mllm bin file (a minimal read-back sketch follows this list).
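
To sanity-check an exported file, the header and index table can be read back directly. The following is a minimal standalone sketch, not the framework's actual loader; it only assumes the little-endian layout described above (int32 magic, uint64 index-table size, then per-tensor entries of int32 name length, name bytes, uint64 weight size, uint64 offset, int32 dtype) and a little-endian host.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// Dumps the index table of a .mllm file produced by the script above.
int main(int argc, char **argv) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " model.mllm\n";
        return 1;
    }
    std::ifstream f(argv[1], std::ios::binary);
    int32_t magic = 0;
    uint64_t index_size = 0;
    f.read(reinterpret_cast<char *>(&magic), sizeof(magic));       // should be 20012
    f.read(reinterpret_cast<char *>(&index_size), sizeof(index_size));
    std::cout << "magic=" << magic << " index_bytes=" << index_size << "\n";

    uint64_t consumed = 0;
    while (consumed < index_size && f) {
        int32_t name_len = 0, dtype = 0;
        uint64_t size = 0, offset = 0;
        f.read(reinterpret_cast<char *>(&name_len), sizeof(name_len));
        std::string name(name_len, '\0');
        f.read(&name[0], name_len);
        f.read(reinterpret_cast<char *>(&size), sizeof(size));
        f.read(reinterpret_cast<char *>(&offset), sizeof(offset));
        f.read(reinterpret_cast<char *>(&dtype), sizeof(dtype));
        std::cout << name << "  offset=" << offset << "  bytes=" << size
                  << "  dtype=" << dtype << "\n";
        // matches calc_tensors_index_table_size: 4 + len(name) + 8 + 8 + 4
        consumed += 4 + static_cast<uint64_t>(name_len) + 8 + 8 + 4;
    }
    return 0;
}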

2 Model Definition in the Inference Framework

● The graph is built with the framework's own Module and Layer classes. The exported model file contains no compute graph, so the graph has to be defined again, by hand, inside the inference framework.

class QWenForCausalLM final : public Module {
public:
    QWenForCausalLM(QWenConfig &config) {
        auto names = config.names_config;
        hidden_size = config.hidden_size;
        tie_embedding_words = config.tie_embedding_words;
        embedding = Embedding(config.vocab_size, config.hidden_size, names.token_embd_name);
        model = QWenModel(config, names, names.blk_name);

        // Qwen-0.5 use tied embedding
        // Others use nn.Linear()
        if (tie_embedding_words) {
            lm_head = Parameter(1, config.vocab_size, 1, config.hidden_size,
                                names.token_embd_name + ".weight");
        } else {
            lm_head_layer =
                Linear(config.hidden_size, config.vocab_size, false, names.lm_head_name);
        }
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto x = embedding(inputs[0]);

        // go through model
        auto outputs = model({x})[0];
        if (tie_embedding_words) {
            outputs = Tensor::mm(outputs, lm_head().transpose(Chl::SEQUENCE, Chl::DIMENSION));
        } else {
            outputs = lm_head_layer(outputs);
        }
        return {outputs};
    }
    void clear_kvcache() override {
        model.clear_kvcache();
    }

private:
    int hidden_size;
    bool tie_embedding_words;
    Layer embedding;
    Parameter lm_head;
    Layer lm_head_layer;
    QWenModel model;
};

Three core classes sit at the center of the framework (a simplified usage sketch follows this list):
● Module: analogous to a torch module; owns sub-layers and defines Forward.
● Layer + Op: analogous to a torch layer; the Layer carries the structural parameters, creates the corresponding Op, and loads that layer's weight parameters.
● Tensor: analogous to torch.Tensor, with many operator overloads so the usage style is similar to torch.
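
To make the division of labor concrete, here is a deliberately small sketch in the same style as QWenForCausalLM above. The Module / Layer / Linear / Tensor names and the Forward signature are taken from that snippet; the TinyMLP module itself and its weight names are made up for illustration, and the mllm headers are assumed to be included.

class TinyMLP final : public Module {
public:
    TinyMLP(int hidden_size, int intermediate_size, const std::string &base_name) {
        // Each Layer records the weight name it will later look up in the .mllm file.
        up_proj = Linear(hidden_size, intermediate_size, false, base_name + "up_proj");
        down_proj = Linear(intermediate_size, hidden_size, false, base_name + "down_proj");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto x = up_proj(inputs[0]); // a Layer is callable on a Tensor, like a torch layer
        x = down_proj(x);
        return {x};
    }

private:
    Layer up_proj;
    Layer down_proj;
};

At this point the Layers only hold their configuration and weight names; the actual weights are attached later, in the op-load step described next.
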
3 Op Load
● Using the name each Layer was given at construction, locate the matching weights in the model bin file and load them.
● Every layer reads its weights from the weights file this way.

ErrorCode CPUEmbedding::load(AbstructLoader &loader) {
    weight_.setName(name() + ".weight");
    weight_.reshape(1, 1, vocabSize_, hiddenSize_);
    if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
        weight_.setDtype(loader.getDataType(weight_.name()));
        weight_.alloc();
        loader.load(&weight_);
    } else {
        weight_.setDtype(MLLM_TYPE_F32);
        weight_.alloc();
    }
    return Op::load(loader);
}

4 Op Reshape
● The input sequence length differs between the prefill and decode stages, so before execution a reshape pass propagates shapes: given the shape of the input tensor, the output tensor shape of every layer is computed.

ErrorCode CPUEmbedding::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
    assert(inputs.size() == 1);
    assert(outputs.size() == 1);
    auto input = inputs[0];
    auto output = outputs[0];
    // Input: [batch, 1, sequence, 1]
    output->reshape(input->batch(), 1, input->sequence(), hiddenSize_);
    // outputs[0]->setDtype(activationDtype());
    return Op::reshape(inputs, outputs);
}

5 Op Execute
Each op runs its computation and writes the result into that layer's output tensor.

ErrorCode CPUEmbedding::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
    assert(inputs.size() == 1);
    assert(outputs.size() == 1);
    auto &input = inputs[0];
    auto &output = outputs[0];
    switch (weight_.dtype()) {
    case MLLM_TYPE_F32: {
        for (int batch = 0; batch < input->batch(); ++batch) {
            for (int head = 0; head < input->head(); ++head) { // NOLINT(*-use-default-none)
#pragma omp parallel for num_threads(thread_count)
                for (int seq = 0; seq < input->sequence(); ++seq) {
#ifdef USE_QNN
                    if ((int)input->dataAt<float>(batch, head, seq, 0) == vocabSize_) {
                        memset(output->hostPtr<float>() + output->offset(batch, head, seq, 0), 0, output->dimension() * sizeof(float));
                        continue;
                    }
#endif
                    auto seq__ = input->dataAt<float>(batch, head, seq, 0);
                    if (seq__ >= 0) {
                        memcpy(output->hostPtr<float>() + output->offset(batch, head, seq, 0),
                               weight_.hostPtr<float>() + weight_.offset(0, 0, (int)seq__, 0),
                               weight_.dtypeSize() * hiddenSize_);
                    }
                }
            }
        }
        break;
    }
    case MLLM_TYPE_Q4_0: {
        for (int batch = 0; batch < input->batch(); ++batch) {
            for (int head = 0; head < input->head(); ++head) {
#pragma omp parallel for num_threads(thread_count)
                for (int seq = 0; seq < input->sequence(); ++seq) {
                    auto seq__ = input->dataAt<float>(batch, head, seq, 0);
                    if (seq__ >= 0) {
                        dequantize_row_q4_0(weight_.hostPtr<block_q4_0>() + weight_.offset(0, 0, (int)seq__, 0) / (QK4_0),
                                            output->hostPtr<float>() + output->offset(batch, head, seq, 0),
                                            hiddenSize_);
                    }
                }
            }
        }
        break;
    }
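    // ... (remaining weight dtype branches and the end of the switch/function are omitted in this excerpt)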

6 Sampling
● Take the output of the last sequence position and sample a token from it with top-k, top-p, or greedy search.

unsigned int _LlmTextGenerateTopkSamplingMethod::generate(Tensor &t) {
    auto argmax = [](const std::vector<float> &vec) -> unsigned int {
        return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
    };

    if (m_k == 0 || m_k == 1) {
        std::vector<float> scores;
        this->_tensor_to_vec(t, scores);
        return argmax(scores);
    }

    std::vector<std::pair<float, unsigned int>> scores;
    this->_tensor_to_vec_with_idx(t, scores);

    // find top k
    std::partial_sort(scores.begin(), scores.begin() + m_k, scores.end(),
                      [](std::pair<float, unsigned int> a, std::pair<float, unsigned int> b) { return a.first > b.first; });
    std::vector<float> top_k_elements(m_k, 0.f);
    std::vector<unsigned int> top_k_elements_idx(m_k, 0);
    for (int i = 0; i < m_k; ++i) {
        top_k_elements[i] = scores[i].first;
        top_k_elements_idx[i] = scores[i].second;
    }

    // softmax with temperature
    std::vector<float> softmax(top_k_elements.size(), 0.f);
    double max_logit = top_k_elements[argmax(top_k_elements)];
    double sum_exp = 0.f;

    for (size_t i = 0; i < top_k_elements.size(); ++i) {
        softmax[i] = exp((top_k_elements[i] - max_logit) / m_temperature);
        sum_exp += softmax[i];
    }

    for (float &value : softmax) {
        value /= sum_exp;
    }

    // sampling
    float _sum = std::accumulate(softmax.begin(), softmax.end(), 0.0);
    for (float &value : softmax) {
        value /= _sum;
    }

    auto idx = _sample_element(top_k_elements_idx, softmax);
    return idx;
}
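
_sample_element itself is not included in the excerpt above; conceptually it draws one token id from top_k_elements_idx with probability given by the corresponding softmax weight. A plausible standalone sketch, whose signature is an assumption rather than the framework's actual helper, could look like this:

#include <random>
#include <vector>

// Hypothetical sketch: draws one candidate from `elements`, where probabilities[i]
// is the (already normalized) weight of elements[i].
unsigned int _sample_element(const std::vector<unsigned int> &elements,
                             const std::vector<float> &probabilities) {
    static std::mt19937 rng{std::random_device{}()};
    std::discrete_distribution<size_t> dist(probabilities.begin(), probabilities.end());
    return elements[dist(rng)];
}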