1. vLLM deployment
conda create -n myenv python=3.9 -y
conda activate myenv
pip install vllm
Download the model from Hugging Face and serve it:
python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2-7B --dtype auto --api-key token-abc123 --host 0.0.0.0 --trust-remote-code
# Endpoint info
{
  "model": "Qwen/Qwen2-7B",
  "base_url": "http://64.247.196.36:8000/v1",
  "api_key": "token-abc123"
}
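A quick way to verify the server is up is to list the served models through the OpenAI-compatible API (a minimal sketch; the host, port, and API key are the values from the endpoint info above):
from openai import OpenAI

# List the models served by the vLLM endpoint above.
client = OpenAI(
    api_key="token-abc123",
    base_url="http://64.247.196.36:8000/v1",
)
for m in client.models.list():
    print(m.id)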
Serving a local model:
# Cap the allocator's split size to reduce CUDA memory fragmentation
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:86
python -m vllm.entrypoints.openai.api_server --model /home/cxh/Qwen-7B --dtype auto --api-key token-abc123 --host 0.0.0.0 --trust-remote-code --max_model_len=1024
If you hit CUDA out-of-memory errors between runs, you can first free cached GPU memory from a Python session:
import torch
torch.cuda.empty_cache()
For a model fine-tuned with Unsloth (saved as a bitsandbytes 4-bit checkpoint), serve it with the following command:
python3 -m vllm.entrypoints.openai.api_server --model /home/cxh/Meta-Llama-3.1-8B-Instruct-bnb-4bit --load-format bitsandbytes --quantization bitsandbytes --enforce-eager --gpu-memory-utilization=0.85 --max_model_len=1024
Query the server with the OpenAI Python client:
from openai import OpenAI

# Point the client at vLLM's OpenAI-compatible server; the key must match
# the --api-key passed when starting the server.
openai_api_key = "token-abc123"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

chat_response = client.chat.completions.create(
    model="/home/cxh/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ],
)

content = chat_response.choices[0].message.content
print(content)
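The same endpoint also supports streaming; a minimal variant, reusing the client above:
# Stream tokens as they are generated instead of waiting for the full reply.
stream = client.chat.completions.create(
    model="/home/cxh/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()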
2. AutoGen Studio installation and startup
pip install autogenstudio
autogenstudio ui --port 8081 --host 0.0.0.0
In the UI, register the model you just deployed, using the endpoint info above (model name, base URL, and API key). The same endpoint can also be used from AutoGen code directly, as sketched below.
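A minimal sketch of pointing pyautogen agents at the vLLM server via a config list (the model name and endpoint are the values from section 1; the agent names and message are illustrative):
import autogen

# Point AutoGen at the vLLM server instead of the real OpenAI API.
config_list = [
    {
        "model": "Qwen/Qwen2-7B",
        "base_url": "http://64.247.196.36:8000/v1",
        "api_key": "token-abc123",
    }
]

assistant = autogen.AssistantAgent("assistant", llm_config={"config_list": config_list})
user = autogen.UserProxyAgent("user", human_input_mode="NEVER", code_execution_config=False)
user.initiate_chat(assistant, message="Summarize what vLLM does in one sentence.")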
3. Generating a dataset with AutoGen + Marker
pip install marker-pdf -i https://pypi.tuna.tsinghua.edu.cn/simple
Convert the PDF to Markdown:
marker_single GPT.pdf ./folder --batch_multiplier 2 --max_pages 52 --langs English
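Before feeding the Markdown to the agents, it helps to split it into chunks; a minimal sketch (the output path depends on marker's layout, and the chunk size is illustrative):
from pathlib import Path

# marker_single typically writes <output_dir>/<pdf_name>/<pdf_name>.md
text = Path("./folder/GPT/GPT.md").read_text(encoding="utf-8")
chunk_size = 2000  # characters per chunk, illustrative
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
print(f"{len(chunks)} chunks")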
pip install pyautogen
pip install "pyautogen[graph]"
Generate the dataset with this notebook:
https://colab.research.google.com/drive/1xEqhuVUC89ZCeOn82mSo6qQfqc40xznD
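The notebook drives AutoGen agents over the Markdown chunks to produce instruction/response pairs; what matters for the next step is the output format. Axolotl's alpaca loader expects records with instruction, input, and output fields, like the following hand-written sketch (the example content is illustrative, not the notebook's exact code):
import json

# One Alpaca-format record per generated QA pair; Axolotl's `type: alpaca`
# loader reads the `instruction`, `input` (optional), and `output` fields.
records = [
    {
        "instruction": "What problem does the GPT paper address?",
        "input": "",
        "output": "It shows that generative pre-training followed by task-specific fine-tuning ...",
    }
]
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)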
4. Fine-tuning Qwen with Axolotl and the generated dataset
# Clone the axolotl repo from GitHub
git clone https://github.com/OpenAccess-AI-Collective/axolotl
# Enter the axolotl directory
cd axolotl
# Start the prebuilt axolotl Docker container (run the training command inside it)
docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
# Launch axolotl training with Accelerate, using the examples/qwen2/qlora-fsdp.yaml config
accelerate launch -m axolotl.cli.train examples/qwen2/qlora-fsdp.yaml
Notes:
In the config file, set fp16 to false and bf16 to true.
If the dataset is small, set eval_sample_packing to false.
qlora-fsdp.yaml configuration:
base_model: Qwen/Qwen2-7B
trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: false
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
special_tokens:
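Once training finishes, the adapter in ./outputs/out can be smoke-tested with PEFT before serving it (a sketch; assumes peft and transformers are installed and that axolotl saved the tokenizer alongside the adapter):
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Load the QLoRA adapter produced by the run above; the base model
# referenced in the adapter config is downloaded automatically.
model = AutoPeftModelForCausalLM.from_pretrained(
    "./outputs/out",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./outputs/out")

prompt = "What problem does the GPT paper address?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))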