import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import init_empty_weights, infer_auto_device_map
from accelerate.utils import set_module_tensor_to_device

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

printFile(folder_path, 1)
print("hello...............................")

# Check for available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    # Initialize the model with empty weights for distributed loading
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")
    # Use infer_auto_device_map to infer how to shard the model, capping memory per GPU
    device_map = infer_auto_device_map(
        model,
        max_memory={i: "70GB" for i in range(device_count)}  # assume each GPU is capped at 70GB
    )
    # Print the device_map to verify the mapping
    print("Device map:", device_map)
    # Walk the model's modules and place each on its mapped device;
    # fall back to a default device when no mapping is found
    default_device = "cuda:0"  # pick a default device here
    max_split_size_mb = 128  # use smaller allocation blocks to reduce memory fragmentation
    for name, module in model.named_modules():
        device = device_map.get(name, default_device)  # fall back to the default device when name is not in device_map
        set_module_tensor_to_device(module, device, max_split_size_mb=max_split_size_mb)
    # Find the device holding the first part of the model
    first_device = next(iter(device_map.values()))
    # Disable gradient computation to save memory
    model.eval()
    torch.set_grad_enabled(False)
    # Build the pipeline, making sure it targets the right device
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=first_device, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # When using a GPU, make sure the input tensors are on the right device
    if torch.cuda.is_available():
        inputs = tokenizer(messages, return_tensors="pt").to(first_device)  # move the inputs to the same device
    else:
        inputs = tokenizer(messages, return_tensors="pt")
    # Generate output from the model
    output = model.generate(**inputs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
Error (from the code above):
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
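The mismatch happens because the inputs are sent to a fixed device while the first shard of the model may live on a different GPU. A minimal sketch of the usual fix, assuming the model was loaded with a device_map (transformers then exposes the final placement as model.hf_device_map):

# Sketch: route inputs to the device that holds the first shard; once the
# model is dispatched, accelerate's hooks move activations between shards
# automatically during forward.
first_device = next(iter(model.hf_device_map.values()))
inputs = tokenizer(messages, return_tensors="pt").to(first_device)
output = model.generate(**inputs)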
Error 2:
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 896.00 MiB (GPU 0; 79.15 GiB total capacity; 78.67 GiB already allocated; 69.25 MiB free; 78.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
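The message itself points at the fix: max_split_size_mb is an option of PyTorch's CUDA caching allocator, configured through the PYTORCH_CUDA_ALLOC_CONF environment variable before any GPU memory is allocated. It is not a parameter of any accelerate function, which is exactly what errors 3 and 4 below run into. A sketch:

import os

# Sketch: set the allocator option via the environment variable, at the very
# top of the script, before torch touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"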
Error 3:
set_module_tensor_to_device(module, device_map[name], max_split_size_mb=max_split_size_mb)
KeyError: ''
Error 4:
set_module_tensor_to_device(module, device, max_split_size_mb=max_split_size_mb)
TypeError: set_module_tensor_to_device() got an unexpected keyword argument 'max_split_size_mb'
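Both errors come from misusing set_module_tensor_to_device: model.named_modules() yields the root module under the empty name '' (hence the KeyError), and the function's actual signature is set_module_tensor_to_device(module, tensor_name, device, ...), with no allocator-related arguments. The supported way to apply a computed device map is accelerate's dispatch_model; a minimal sketch:

from accelerate import dispatch_model

# Sketch: dispatch_model moves each submodule to the device named in
# device_map and installs hooks that shuttle activations between devices
# during the forward pass.
model = dispatch_model(model, device_map=device_map)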
Revised code:
import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

printFile(folder_path, 1)
print("hello...............................")

# Check for available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    # Initialize the model with empty weights for distributed loading
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")
    # Use infer_auto_device_map to infer how to shard the model, capping memory per GPU
    device_map = infer_auto_device_map(
        model,
        max_memory={i: "70GB" for i in range(device_count)}  # assume each GPU is capped at 70GB
    )
    # Print the device_map to verify the mapping
    print("Device map:", device_map)
    # Walk the model's modules and place each on its mapped device
    for name, module in model.named_modules():
        device = device_map.get(name, "cpu")  # fall back to the CPU when name is not in device_map
        module.to(device)  # move the module to its device by hand
    # Disable gradient computation to save memory
    model.eval()
    torch.set_grad_enabled(False)
    # Build the pipeline, making sure it targets the right device
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # When using a GPU, make sure the input tensors are on the right device
    if torch.cuda.is_available():
        inputs = tokenizer(messages, return_tensors="pt").to("cuda:0")  # move the inputs to the GPU
    else:
        inputs = tokenizer(messages, return_tensors="pt")
    # Generate output from the model
    output = model.generate(**inputs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
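Note that from_pretrained(..., device_map="auto") already shards and places the weights, so the manual named_modules() loop above is redundant, and its "cpu" fallback can silently drag submodules off the GPU. A sketch of the simpler loading pattern (torch_dtype=torch.float16 is an assumption here, added only to halve the footprint of an fp32 checkpoint):

# Sketch: let from_pretrained handle sharding; no manual placement loop.
model = AutoModelForCausalLM.from_pretrained(
    "/app/model/",
    device_map="auto",
    max_memory={i: "70GB" for i in range(device_count)},
    torch_dtype=torch.float16,
)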
Next revision:
import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under folder_path
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check for available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    # Lazily initialize the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")
    # Manage device placement with infer_auto_device_map
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})
    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)
    # Run inference through a pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=accelerator.device, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)
Error:
ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.
Revised again:
import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under folder_path
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check for available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    # Lazily initialize the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")
    # Manage device placement with infer_auto_device_map
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})
    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)
    # Run inference through a pipeline, with the device argument removed
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50)

messages = "Who are you?"

# Generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)
Warning:
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length.
To avoid the truncation warning, set truncation=True explicitly when creating the pipeline. Here is the revised code:
import os
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

folder_path = '/app'

def getFileStr(level):
    return ' ' * level + '- '

def getDicStr(level):
    return ' ' * level + '+'

def printFile(path, level):
    if os.path.exists(path):
        files = os.listdir(path)
        for f in files:
            subpath = os.path.join(path, f)
            if os.path.isfile(subpath):
                print(getFileStr(level) + os.path.basename(subpath))
            else:
                leveli = level + 1
                print(getDicStr(level) + os.path.basename(subpath))
                printFile(subpath, leveli)

# Print the files and directories under folder_path
printFile(folder_path, 1)
print("hello...............................")

# Initialize the Accelerator
accelerator = Accelerator()

# Check for available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    # Lazily initialize the model weights with init_empty_weights
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained("/app/model/", device_map="auto")
    # Manage device placement with infer_auto_device_map
    device_map = infer_auto_device_map(model, max_memory={i: "70GB" for i in range(device_count)})
    # Prepare the model and tokenizer with the accelerator
    model = accelerator.prepare(model)
    tokenizer = accelerator.prepare(tokenizer)
    # Run inference through a pipeline, with truncation=True set explicitly
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50, truncation=True)
else:
    print("No GPU available, using CPU instead.")
    tokenizer = AutoTokenizer.from_pretrained("/app/model/")
    model = AutoModelForCausalLM.from_pretrained("/app/model/")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=50, truncation=True)

messages = "Who are you?"

# Generation loop
while True:
    time.sleep(0.05)  # sleep for 50 ms
    # Generate output through the pipeline
    output = pipe(messages)
    print(output)
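For reference, a minimal end-to-end sketch with the above fixes folded in (the path and generation settings are taken from the code above; torch_dtype=torch.float16 is an assumption, added only to reduce memory pressure):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("/app/model/")
model = AutoModelForCausalLM.from_pretrained(
    "/app/model/",
    device_map="auto",          # accelerate shards the model across the GPUs
    torch_dtype=torch.float16,  # assumption: fp16 to reduce memory pressure
)
model.eval()

# No device argument: the model is already dispatched by accelerate.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                max_length=50, truncation=True)
print(pipe("Who are you?"))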