import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Disable some CUDA scaled-dot-product attention kernels to save memory
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Run on GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_token_len = 1024   # maximum number of input tokens fed to the model
output_token_len = 100   # maximum number of tokens to generate
# 读取测试集数据 test_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv') # 定义基础模型和lora适配器模型的路径 base_model_name = "/kaggle/input/mistral-7b-it-v02" adapter_model_name = "/kaggle/input/mistral-7b/mistral_7b_099999_lr0.0001_5ep_lora(r16,a32,d0.05,default)"
# 加载预训练模型的tokenizer tokenizer = AutoTokenizer.from_pretrained(base_model_name,trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
# 设置BitsAndBytesConfig 4bit量化配置 bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16 )
# Load the pretrained base model with 4-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,  # apply the 4-bit quantization config
    trust_remote_code=True,          # trust custom model code from the checkpoint
    device_map="auto",               # let accelerate place modules on devices
    use_auth_token=True
)

# Attach the LoRA adapter on top of the quantized base model.
# (The original also passed quantization_config here, but that is not a
# PeftModel.from_pretrained argument — quantization is already applied above.)
model = PeftModel.from_pretrained(
    model,
    adapter_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# NOTE(review): the original called model.to(device) here. With
# device_map="auto" the model is already dispatched across devices, and
# `.to()` is rejected for bitsandbytes 4-bit models in recent transformers
# releases, so the call is removed.
model.eval()
def text_generate(ori_text, rew_text, model, tokenizer, input_max_len=512, output_len=20, device='cuda'):
    '''
    Ask the model to recover the prompt that turned ori_text into rew_text.

    Args:
        ori_text: the original text.
        rew_text: the rewritten text produced by an LLM.
        model: causal LM used for generation.
        tokenizer: tokenizer matching the model.
        input_max_len: maximum number of input tokens (prompt is truncated).
        output_len: maximum number of new tokens to generate.
        device: device the input tensors are moved to.

    Returns:
        The generated prompt text (everything after the "Output:" marker).
    '''
    # Build the instruction prompt
    prompt = f"Instruct: Original Text:{ori_text}\nRewritten Text:{rew_text}\nWrite a prompt that was likely given to the LLM to rewrite original text into rewritten text.\nOutput:"
    # Tokenize with truncation, returning PyTorch tensors; the attention
    # mask is omitted for this single-sequence greedy generation.
    inputs = tokenizer(prompt,
                       max_length=input_max_len,
                       truncation=True,
                       return_tensors="pt",
                       return_attention_mask=False)
    # Move tensors to the target device. NOTE: this turns the BatchEncoding
    # into a plain dict, so key access (not attribute access) is required below.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # BUG FIX: the original used `inputs.input_ids`, which raises
    # AttributeError on a plain dict — the caller's bare `except` silently
    # swallowed it, forcing the mean-prompt fallback for every row.
    n_input_tokens = len(inputs['input_ids'][0])
    # Total length budget: prompt tokens plus the requested generation length
    max_len = n_input_tokens + output_len
    # Greedy (deterministic) generation
    outputs = model.generate(**inputs,
                             do_sample=False,
                             max_length=max_len,
                             pad_token_id=tokenizer.pad_token_id,
                             )
    # Decode the generated token ids back to text
    text = tokenizer.batch_decode(outputs,
                                  skip_special_tokens=True,
                                  clean_up_tokenization_spaces=False)[0]
    # Everything after the "Output:" marker is the generated prompt
    marker = 'Output:'
    start_index = text.find(marker)
    if start_index == -1:
        # Marker unexpectedly missing from the decoded text: return it all
        return text.strip()
    return text[start_index + len(marker):].strip()
# 通用平均prompt mean_prompt = "improve phrasing text lucrarea tone lucrarea rewrite this creatively formalize discours involving lucrarea anyone emulate lucrarea description send casual perspective information alter it lucrarea ss plotline speaker recommend doing if elegy tone lucrarea more com n paraphrase ss forward this st text redesign poem above etc possible llm clear lucrarea"
rewrite_prompts = []
# Iterate over the test set and generate one rewrite prompt per row
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        # Generate a rewrite prompt with the model
        prompt = text_generate(row['original_text'],
                               row['rewritten_text'],
                               model,
                               tokenizer,
                               input_token_len,
                               output_token_len,
                               device,
                               )
        # Append the generic mean prompt to the generated one
        prompt += mean_prompt
    # BUG FIX: the original bare `except:` also swallowed
    # KeyboardInterrupt/SystemExit and hid real errors (including the
    # AttributeError inside text_generate). Narrow to Exception and log.
    except Exception as e:
        print(f"generation failed for row {i}: {e}")
        # Fall back to the generic mean prompt on failure
        prompt = mean_prompt
    rewrite_prompts.append(prompt)
# 将生成的rewrite prompt添加到DataFrame中 test_df['rewrite_prompt'] = rewrite_prompts sub_df = test_df[['id', 'rewrite_prompt']] # # 只保留id和rewrite_prompt两列 sub_df.to_csv('submission.csv', index=False)