GPU inference code

import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import init_empty_weights, infer_auto_device_map
from accelerate.utils import set_module_tensor_to_device

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

# Recursively print the directory tree rooted at `path` (files prefixed with '-', directories with '+')
def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

printFile(folder_path, 1)
print("hello...............................")

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")

    # Initialize the model with empty weights for sharded loading
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")

    # Use infer_auto_device_map to infer the model placement automatically, with a memory cap
    device_map = infer_auto_device_map(
        model,
        max_memory={i: "70GB" for i in range(device_count)}  # assume a 70GB cap per GPU
    )

    # Print the device_map to verify the mapping
    print("Device map:", device_map)

    # Walk the model's modules and move each to its mapped device, falling back to a default device when a module has no entry
    default_device = "cuda:0"  # default device to fall back to
    max_split_size_mb = 128  # use smaller allocation blocks to reduce memory fragmentation
    for name, module in model.named_modules():
        device = device_map.get(name, default_device)  # fall back to the default device if name is not in device_map
        set_module_tensor_to_device(module, device, max_split_size_mb=max_split_size_mb)

    # Determine the device that holds the first model shard
    first_device = next(iter(device_map.values()))

    # Disable gradient computation to save memory
    model.eval()
    torch.set_grad_enabled(False)

    # Build the pipeline and make sure it runs on the right device
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=first_device, max_length=50)

else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Start the generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms

    # When a GPU is used, make sure the inputs live on the correct device
    if torch.cuda.is_available():
        inputs = tokenizer(messages, return_tensors="pt").to(first_device)  # move the inputs to the same device
    else:
        inputs = tokenizer(messages, return_tensors="pt")

    # Generate output with the model
    output = model.generate(**inputs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))




Error 1 (after modification):
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
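
This happens when the weights are sharded across several GPUs but the input tensors sit on a different card than the first shard (typically the embedding layer). A minimal sketch of one common fix, reading the first shard's device from hf_device_map, the map transformers attaches to models loaded with device_map; the float16 dtype is an added assumption:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/app/model/")
model = AutoModelForCausalLM.from_pretrained(
    "/app/model/", device_map="auto", torch_dtype=torch.float16
)

# hf_device_map records which device each submodule was placed on;
# the first entry is usually the embedding layer, so the inputs go there.
first_device = next(iter(model.hf_device_map.values()))
inputs = tokenizer("Who are you?", return_tensors="pt").to(first_device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))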

Error 2:
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 896.00 MiB (GPU 0; 79.15 GiB total capacity; 78.67 GiB already allocated; 69.25 MiB free; 78.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
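
The max_split_size_mb knob mentioned in this message belongs to PyTorch's CUDA caching allocator; it is configured through the PYTORCH_CUDA_ALLOC_CONF environment variable, set before the first CUDA allocation, not through any model-loading or accelerate call. A minimal sketch:

import os

# max_split_size_mb is an allocator setting and must be exported
# before the first CUDA allocation for it to take effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # imported after the allocator config on purpose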

Error 3:
set_module_tensor_to_device(module, device_map[name], max_split_size_mb=max_split_size_mb)
KeyError: ''

Error 4:
set_module_tensor_to_device(module, device, max_split_size_mb=max_split_size_mb)
TypeError: set_module_tensor_to_device() got an unexpected keyword argument 'max_split_size_mb'
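
Errors 3 and 4 both come from calling set_module_tensor_to_device with the wrong arguments: in accelerate its signature is roughly set_module_tensor_to_device(module, tensor_name, device, value=None, ...), it moves one named tensor rather than a whole module, and it has no max_split_size_mb parameter. The KeyError: '' appears because named_modules() yields the root module under the empty name, which is not a key of the device map. If the goal is only to apply the map produced by infer_auto_device_map, accelerate's dispatch_model does that in one call; a hedged sketch, assuming model and device_map are the objects built in the script above:

from accelerate import dispatch_model

# Place each submodule on the device chosen by infer_auto_device_map,
# instead of iterating over named_modules() by hand.
model = dispatch_model(model, device_map=device_map)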

import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

printFile(folder_path, 1)
print("hello...............................")

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")

    # Initialize the model with empty weights for sharded loading
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")

    # Use infer_auto_device_map to infer the model placement automatically, with a memory cap
    device_map = infer_auto_device_map(
        model,
        max_memory={i: "70GB" for i in range(device_count)}  # assume a 70GB cap per GPU
    )

    # Print the device_map to verify the mapping
    print("Device map:", device_map)

    # Walk the model's modules and move each to its mapped device
    for name, module in model.named_modules():
        device = device_map.get(name, "cpu")  # fall back to CPU if name is not in device_map
        module.to(device)  # manually move the module to the chosen device

    # Disable gradient computation to save memory
    model.eval()
    torch.set_grad_enabled(False)

    # Build the pipeline and make sure it runs on the right device
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, max_length=50)

else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Start the generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms

    # When a GPU is used, make sure the inputs live on the correct device
    if torch.cuda.is_available():
        inputs = tokenizer(messages, return_tensors="pt").to("cuda:0")  # move the inputs to the GPU
    else:
        inputs = tokenizer(messages, return_tensors="pt")

    # Generate output with the model
    output = model.generate(**inputs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under the given folder
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")

    # Lazily load the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")

    # Use infer_auto_device_map to manage device placement
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})

    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)

    # Run inference through a pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=accelerator.device, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Start the generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)

Error: ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.
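
The error says that a model sharded by accelerate already owns its device placement, so the pipeline must not be given a device argument, and accelerator.prepare is not needed for inference either. A minimal sketch of that pattern, assuming the same /app/model/ checkpoint; torch_dtype=torch.float16 is an added assumption to keep a large checkpoint within memory:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("/app/model/")
# device_map="auto" shards the weights across the visible GPUs at load time.
model = AutoModelForCausalLM.from_pretrained(
    "/app/model/", device_map="auto", torch_dtype=torch.float16
)

# No device argument: the pipeline reuses the devices already assigned to the model.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)
print(pipe("Who are you?"))
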
Modified again:

import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under the given folder
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")

    # Lazily load the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")

    # Use infer_auto_device_map to manage device placement
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})

    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)

    # Run inference through a pipeline, dropping the device argument
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Start the generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)

Warning:
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length.

To avoid the truncation warning, explicitly set truncation=True for the tokenizer when creating the pipeline. The modified code is below:

import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under the given folder
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")

    # Lazily load the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")

    # Use infer_auto_device_map to manage device placement
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})

    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)

    # Run inference through a pipeline, with truncation=True set explicitly
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50, truncation=True)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50, truncation=True)

messages = "Who are you?"

# Start the generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)
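
A closing note on the generation settings: in a text-generation pipeline, max_length counts the prompt tokens as well, so max_new_tokens is usually the clearer way to express 'generate up to 50 new tokens', and it avoids the truncation interplay altogether. A small sketch, assuming model and tokenizer are the objects built in the listing above:

from transformers import pipeline

# max_new_tokens bounds only the generated continuation, not prompt + continuation.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)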
