前提:任务要求使用 3 万(30,000)篇 PDF 文献进行预训练。先用 MinerU 将 PDF 转换为 Markdown 格式,再对 Markdown 文本做初步清洗,用于预训练测试。
选用 Qwen 作为基座模型,采用 LoRA 方式进行预训练。以下是预训练代码。
from modelscope import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import TaskType, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
import torch
import os
import markdown
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt
# Ensure the NLTK "punkt" sentence tokenizer is available, downloading it once
# if missing.
# BUGFIX: the original downloaded into ~/.nltk_data, but NLTK's default search
# path is ~/nltk_data (no leading dot) — so nltk.data.find() would never see
# the downloaded data and the download would repeat on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Download into NLTK's default user data directory so subsequent
    # nltk.data.find() lookups succeed without extra path configuration.
    nltk.download('punkt', download_dir=os.path.expanduser('~/nltk_data'))
# Restrict which CUDA devices are visible to this process (left disabled here;
# uncomment and set the index to pin training to a specific GPU).
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Download (or reuse a cached copy of) the Qwen2-0.5B-Instruct checkpoint from
# ModelScope; returns the local directory containing the model files.
model_dir = snapshot_download('qwen/Qwen2-0.5B-Instruct')
# Load the tokenizer shipped with the checkpoint. trust_remote_code=True allows
# executing the custom tokenizer code bundled in the model repository —
# acceptable here because the source is the official Qwen repo.
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_dir