A SIMPLE AND EFFECTIVE PRUNING APPROACH FOR LARGE LANGUAGE MODELS
github: https://github.com/locuslab/wanda
- Core idea: compute weight importance from both the weights and the input activations
Here we take pruning opt-125m as an example and walk through the code involved in main_opt.py in detail.
Steps of the Wanda pruning algorithm
What gets pruned are mainly the linear weights inside the model's decoder layers (nn.Linear modules, which account for the vast majority of the parameters); here we use opt-125m as the example.
You can see that opt-125m is built around an OPTDecoder containing (this can be verified by printing the sub-modules, as sketched after this list):
- embed_tokens --> Embedding(50272, 768) --> the vocabulary size is 50272
- embed_positions --> (2050, 768) --> seq_len plus 2 extra positions added for special tokens (e.g. the beginning-of-sentence token BOS and the end-of-sentence token EOS)
- final_layer_norm --> LayerNorm((768,))
- layers, consisting of 12 OPTDecoderLayer blocks
- lm_head --> nn.Linear()
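If you want to verify this structure yourself, printing the relevant sub-modules is enough; a minimal sketch, assuming the model has already been loaded as in Step 1 below:
# Assumes `model` has been loaded as in Step 1 below.
# Printing the decoder shows embed_tokens, embed_positions, the 12 OPTDecoderLayer
# blocks and final_layer_norm; lm_head sits on the top-level OPTForCausalLM module.
print(model.model.decoder)
print(model.lm_head)
print(model.config.hidden_size)         # 768
print(model.config.num_hidden_layers)   # 12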
Step 1: load the pretrained model and the tokenizer
# import the required libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from importlib.metadata import version
print('torch', version('torch'))
print('transformers', version('transformers'))
print('# of gpus: ', torch.cuda.device_count())
torch 1.13.1+cu116
transformers 4.42.4
# of gpus: 1
def get_llm(model_name, cache_dir="llm_weights"):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # default is torch.float32; half precision is enough and roughly halves GPU memory
        cache_dir=cache_dir,        # model cache path; if not specified, huggingface downloads to ~/.cache/huggingface/hub
        low_cpu_mem_usage=True,     # reduce CPU memory usage while loading (default is False)
        device_map="auto"           # place the model automatically; if a GPU is available it is loaded onto "cuda"
    )
    model.seqlen = model.config.max_position_embeddings
    return model
model = get_llm('llm_weights/models--facebook--opt-125m/snapshots/opt-125m', cache_dir="llm_weights") # load directly from the already-cached model path
model.eval() # run in inference/evaluation mode; otherwise training-only behaviour such as dropout stays active
tokenizer = AutoTokenizer.from_pretrained('llm_weights/models--facebook--opt-125m/snapshots/opt-125m', use_fast=False)
model.device # without setting device_map this would be cpu
device(type='cuda', index=0)
- Note that once the model has been cached, you can later load it directly from the cached path by pointing at the folder under snapshots (by default the cached snapshot folder name is a hash string); here it has been renamed to opt-125m
- use_fast in the tokenizer controls whether the fast (Rust-based) tokenizer implementation is used
Step 2: run a forward pass on the calibration data to obtain the inputs to the decoder layers
import numpy as np
import random
import torch
import torch.nn as nn
from datasets import load_dataset
- The calibration set for pruning is typically batch_size = 128 samples
- The model's seq_len = 2048 (usually taken from the pretrained model's config as the maximum position embedding, i.e. model.config.max_position_embeddings); in other words, seq_len is fixed: whichever model you prune, the seq_len it was pretrained with stays the same
- Next we need the hidden dimension (since large language models are Transformer-based, this is simply d_model; it differs between models, and for opt-125m it is 768, i.e. model.config.hidden_size)
So we already know that the input to the OPTDecoder's layers should have shape [128, 2048, 768]. How do we obtain this input? This is where the calibration data comes in: we run a forward pass through the layers that sit before the OPTDecoder's layers (the embedding layers built from embed_tokens, embed_positions, etc.), and their output is exactly the input to the OPTDecoder's layers.
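As a quick sanity check, the target shape can be assembled directly from the config; a small sketch (nothing Wanda-specific here):
# The shape that the inputs to model.model.decoder.layers should have,
# built from the calibration set size and the model config.
nsamples = 128
seq_len = model.config.max_position_embeddings  # 2048 for opt-125m
d_model = model.config.hidden_size              # 768 for opt-125m
print((nsamples, seq_len, d_model))             # (128, 2048, 768)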
Obtaining the calibration data
- A portion of C4 (the Colossal Clean Crawled Corpus, a large-scale, cleaned text dataset crawled from the web) is usually chosen as calibration data, mainly because C4 is large and diverse
- Using it as calibration data therefore generalizes well
- How many C4 samples should be selected for calibration?
- At the very least, a chosen sample must tokenize to a length greater than seq_len (2048), so that a complete token sequence of that length can be extracted from it
Now an important question may come to mind: why use calibration data at all? Why not prune based on the weights alone? Because Wanda takes activations into account, calibration data is needed to obtain the activations X (for contrast, a magnitude-only criterion is sketched below).
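The sketch below is a hypothetical contrast, not part of the Wanda code; some_linear is just a placeholder nn.Linear so the snippet is self-contained:
# Hypothetical contrast: magnitude pruning scores every weight by |W| alone,
# so no forward passes and no calibration inputs X are needed.
import torch
import torch.nn as nn

some_linear = nn.Linear(768, 3072)                       # placeholder module, not taken from the model
W_metric_magnitude = torch.abs(some_linear.weight.data)  # score = |W|
# Wanda instead scores each weight as |W_ij| * ||X_j||_2, which is exactly why
# the calibration inputs X have to be collected first.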
def get_c4(nsamples=128, seed=0, seq_len=2048, tokenizer=tokenizer,
           data_path=None):
    '''
    We slightly modified the original code here, but the differences are small.
    return: train_loader --> list --> length: 128
            valenc: Tensor --> [1, num_tokens]
    '''
    # obtain the c4 dataset
    # download directly
    if not data_path:  # needs network access to download the data
        print('you need open the network to download data')
        train_data = load_dataset('allenai/c4',
                                  data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
                                  split='train')
        val_data = load_dataset('allenai/c4',
                                data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
                                split='validation')
    else:
        print('load data from the local directory')
        train_data = load_dataset(data_path, split='train')
        val_data = load_dataset(data_path, split='validation')
    # draw nsamples calibration samples
    random.seed(seed)
    train_loader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(train_data) - 1)
            # tokenize the randomly chosen sample; returns a dict with keys 'input_ids' and 'attention_mask', each of shape [1, num_tokens]
            trainenc = tokenizer(train_data[i]['text'], return_tensors='pt')
            # if the tokenized length is greater than seq_len, keep trainenc and break; otherwise keep looking
            if trainenc['input_ids'].shape[1] > seq_len:
                break
            else:
                continue
        # from the kept trainenc, take a random subsequence of length seq_len
        j = random.randint(0, trainenc['input_ids'].shape[1] - seq_len - 1)
        inp = trainenc.input_ids[:, j:(j + seq_len)]  # the token sequence
        # build the target sequence for the prediction task
        tar = inp.clone()
        tar[:, :-1] = -100
        # append (inp, tar) to train_loader
        train_loader.append((inp, tar))
    # build the evaluation/validation set
    valenc = tokenizer(' '.join(val_data[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seq_len)]
    print("data loading complete")
    return train_loader, valenc
data_loader, _ = get_c4(nsamples=128, seed=0, seq_len=2048, tokenizer=tokenizer,
                        data_path='./data/c4')
load data from the local directory
data loading complete
Obtaining the inputs to the layers
- Points to note:
- The forward pass must not track gradients --> it has to run under with torch.no_grad(); otherwise the in-place writes inps[cache['i']] = inp put inps onto the autograd graph (demonstrated by the requires_grad check below)
- The KV cache is not needed, since every layer's output is recomputed; so use_cache is set to False at the start and restored once the forward passes are done
def prepare_layers_input(model, data_loader, device='cuda'):
    """
    return inps, outs, attention_mask
    """
    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.decoder.layers  # 12 layers in total
    if "model.embed_tokens" in model.hf_device_map:  # model.hf_device_map --> dict mapping modules to devices, here {"": 0}
        device = model.hf_device_map["model.embed_tokens"]
    dtype = next(iter(model.parameters())).dtype  # torch.float16
    inps = torch.zeros((128, model.seqlen, model.config.hidden_size), dtype=dtype, device=device)
    inps.requires_grad = False  # a manually created tensor needs its device set and gradients disabled
    cache = {'i': 0, 'attention_mask': None, "position_ids": None}  # bookkeeping for the forward passes
    # capture the input to layers[0]; once it is captured, raise an exception so the rest of the model is skipped
    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module
        def forward(self, inp, **kwargs):
            # forward of the wrapped layer module
            # inp of shape [1, 2048, 768] is the input to the layer module
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            raise ValueError
    layers[0] = Catcher(layers[0])
    # capture the inputs to layers[0]
    for batch in data_loader:
        # batch --> (inp, tar): inp --> [1, 2048] --> the model input
        try:
            # run the forward pass; when layers[0] receives its input inp --> [1, 2048, 768], Catcher stores it via inps[cache['i']] = inp
            model(batch[0].to(device))
            # once layers[0] has captured inp, the exception is raised
        except ValueError:
            continue  # move on to the next batch and keep capturing
    # after all batches have been captured, restore layers[0]
    layers[0] = layers[0].module
    # initialize outs
    outs = torch.zeros_like(inps)  # [128, 2048, 768]
    # fetch the attention_mask
    attention_mask = cache['attention_mask']
    # restore use_cache
    model.config.use_cache = use_cache
    return inps, outs, attention_mask
inps, outs, attention_mask = prepare_layers_input(model, data_loader, device='cuda')
inps.requires_grad
True
with torch.no_grad():
    inps, outs, attention_mask = prepare_layers_input(model, data_loader, device='cuda')
inps.requires_grad
False
Step 3: prune the layers
layers = model.model.decoder.layers
Getting the prunable modules of a single layer
- i.e. the module names paired with their corresponding linear weight modules
layers[0]
OPTDecoderLayer(
(self_attn): OPTAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(activation_fn): ReLU()
(self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
In layers[0], for instance, we need to prune, under self_attn:
- k_proj --> Linear(in_features=768, out_features=768, bias=True)
- v_proj …
- q_proj …
- out_proj …
So we use a dictionary whose keys are the names of the modules to prune, such as self_attn.k_proj,
and whose values are the corresponding modules, e.g. Linear(in_features=768, out_features=768, bias=True).
The same has to be done for every layer.
def find_layers(module, layers=[nn.Linear], name=''):
    """
    Recursively find the layers of a certain type in a module.
    Args:
        module (nn.Module): PyTorch module.
        layers (list): List of layer types to find.
        name (str): Name of the module.
    Returns:
        dict: Dictionary of layers of the given type(s) within the module.
    """
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(
            child, layers=layers, name=name + '.' + name1 if name != '' else name1
        ))
    return res
Below we use layers[0] as an example.
- Get the prunable modules of layers[0], stored in a dict whose keys are the linear modules' names.
layer = layers[0]
subset = find_layers(layer)
subset
{'self_attn.k_proj': Linear(in_features=768, out_features=768, bias=True),
'self_attn.v_proj': Linear(in_features=768, out_features=768, bias=True),
'self_attn.q_proj': Linear(in_features=768, out_features=768, bias=True),
'self_attn.out_proj': Linear(in_features=768, out_features=768, bias=True),
'fc1': Linear(in_features=768, out_features=3072, bias=True),
'fc2': Linear(in_features=3072, out_features=768, bias=True)}
Pruning the weight of a single prunable module within a single layer
For example, the 'fc1' module of layers[0] is
- Linear(in_features=768, out_features=3072, bias=True)
- wrapped as WrappedGPT(subset['fc1'])
Note that a PyTorch Linear weight has shape [out_features, in_features] --> [3072, 768] (see the quick check below).
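This layout convention can be checked with a tiny self-contained snippet (independent of the model being pruned):
# PyTorch stores nn.Linear weights as [out_features, in_features]:
import torch.nn as nn
fc = nn.Linear(in_features=768, out_features=3072)
print(fc.weight.shape)  # torch.Size([3072, 768])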
class WrappedGPT:
    """
    This class wraps a GPT layer for specific operations.
    # accumulates the per-input-feature statistic scaler_row for the wrapped layer
    """
    def __init__(self, layer, layer_id=0, layer_name="none"):
        self.layer = layer
        self.dev = self.layer.weight.device
        self.rows = layer.weight.data.shape[0]     # 3072
        self.columns = layer.weight.data.shape[1]  # 768
        # the layer computes x @ W^T + b
        self.scaler_row = torch.zeros((self.columns), device=self.dev)  # [768]
        self.nsamples = 0
        self.layer_id = layer_id
        self.layer_name = layer_name
    def add_batch(self, inp, out):
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        self.scaler_row *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        inp = inp.type(torch.float32)
        self.scaler_row += torch.norm(inp, p=2, dim=1) ** 2 / self.nsamples
wrapped_layers = {}
for name in subset:
    wrapped_layers[name] = WrappedGPT(subset[name])  # wrap every prunable module of the layer
def add_batch(name):  # hook factory: calls wrapped_layers[name].add_batch, i.e. updates scaler_row
    def tmp(_, inp, out):
        wrapped_layers[name].add_batch(inp[0].data, out.data)
    return tmp
handles = []
for name in wrapped_layers:
    handles.append(subset[name].register_forward_hook(add_batch(name)))  # register a forward hook (tmp) on each module of the layer --> scaler_row update
for j in range(128):
    with torch.no_grad():  # forward pass on each [1, 2048, 768] calibration input
        outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
outs.shape
torch.Size([128, 2048, 768])
for h in handles:  # the hooks fire during the forward passes above; remove them afterwards so they are not triggered again
    h.remove()
Starting the pruning
scaler_row is essentially a running average of the squared l2 norm of the input X over the calibration samples:
scaler_row = ||X||_2^2 / n
and the importance metric used in the code below is
W_metric = |W| * sqrt(scaler_row) = |W| * sqrt(||X||_2^2 / n)
i.e. each weight W_ij is scored by |W_ij| times the (averaged) l2 norm of the j-th input feature, which is the Wanda criterion.
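To make the running average concrete, here is a tiny standalone sketch that mirrors WrappedGPT.add_batch with made-up tensors (3 input features instead of 768; the values are random and purely illustrative):
# Toy illustration of how scaler_row accumulates ||X||_2^2 / n across calls.
import torch

scaler_row = torch.zeros(3)   # one entry per input feature (per column of W)
nsamples = 0
for _ in range(2):                               # two calibration inputs
    inp = torch.randn(1, 2048, 3)                # [batch, tokens, features]
    tmp = inp.shape[0]                           # 1
    inp = inp.reshape(-1, inp.shape[-1]).t()     # [features, tokens]
    scaler_row *= nsamples / (nsamples + tmp)    # rescale the previous average
    nsamples += tmp
    scaler_row += torch.norm(inp.float(), p=2, dim=1) ** 2 / nsamples
print(scaler_row)   # running average of the squared l2 norm of each input feature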
Take the pruning of the 'fc1' weights as an example:
w_fc1 = subset['fc1'].weight.data # the weight
fc1_scaler_raw = wrapped_layers['fc1'].scaler_row.reshape((1,-1)) # the accumulated ||X||_2^2 / n
W_metric = torch.abs(w_fc1) * torch.sqrt(fc1_scaler_raw) # the weight-importance metric
W_metric.shape
torch.Size([3072, 768])
W_mask = (torch.zeros_like(W_metric) == 1)
W_mask.shape
torch.Size([3072, 768])
Unstructured pruning, using sparsity 0.5 as the example
sort_res = torch.sort(W_metric, dim=-1, stable=True)
sort_res
torch.return_types.sort(
values=tensor([[3.1266e-04, 4.4838e-04, 5.2401e-04, ..., 6.0046e-01, 6.0655e-01,
6.0705e-01],
[6.1230e-05, 1.7848e-04, 2.1598e-04, ..., 9.9946e-01, 1.1328e+00,
1.3395e+00],
[3.2112e-05, 8.5624e-05, 2.8847e-04, ..., 5.1239e-01, 6.5296e-01,
1.4165e+00],
...,
[2.1328e-04, 2.5025e-04, 4.8324e-04, ..., 5.8667e-01, 2.0401e+00,
3.1894e+00],
[2.0609e-04, 2.2691e-04, 3.2290e-04, ..., 5.4380e-01, 5.5347e-01,
1.1568e+00],
[9.9054e-06, 4.6087e-04, 7.6572e-04, ..., 7.7114e-01, 2.8828e+00,
4.7223e+00]], device='cuda:0'),
indices=tensor([[506, 4, 420, ..., 43, 564, 429],
[703, 380, 75, ..., 355, 20, 526],
[366, 187, 643, ..., 667, 355, 20],
...,
[280, 580, 394, ..., 358, 20, 526],
[ 64, 472, 203, ..., 12, 20, 526],
[329, 349, 125, ..., 588, 526, 20]], device='cuda:0'))
indices = sort_res[1][:,:int(W_metric.shape[1]*0.5)]
W_mask.scatter_(1, indices, True)
tensor([[False, True, True, ..., False, True, True],
[ True, False, True, ..., False, False, False],
[ True, False, False, ..., True, False, False],
...,
[ True, False, False, ..., False, False, True],
[ True, True, False, ..., False, False, False],
[ True, False, True, ..., False, False, False]], device='cuda:0')
subset['fc1'].weight.data[W_mask] = 0
subset['fc1'].weight.data
tensor([[-0.0871, 0.0000, 0.0000, ..., 0.0425, 0.0000, 0.0000],
[ 0.0000, 0.0230, 0.0000, ..., 0.0206, 0.0000, 0.0000],
[ 0.0000, -0.0129, 0.0245, ..., 0.0000, 0.0000, 0.0462],
...,
[ 0.0000, 0.0394, 0.0232, ..., -0.0341, 0.0475, 0.0000],
[ 0.0000, 0.0000, -0.0399, ..., 0.0158, 0.0000, -0.0500],
[ 0.0000, 0.0334, 0.0000, ..., 0.0387, -0.0299, 0.0230]],
device='cuda:0', dtype=torch.float16)
Structured n:m pruning
- set prune_n = 4, prune_m = 8
- within every prune_m = 8 consecutive weights along the input dimension, the prune_n = 4 with the smallest metric are zeroed, so 4 out of every 8 remain nonzero
- note that in this demo W_mask still contains the unstructured mask built above, so the two masks accumulate (which is why the fc1 sparsity checked in Step 4, about 0.568, exceeds 0.5); in prune_wanda itself only one of the two branches is applied
prune_n, prune_m = 4, 8
for ii in range(W_metric.shape[1]):
    # process the block of columns [ii, ii+prune_m)
    if ii % prune_m == 0:
        tmp = W_metric[:, ii:(ii+prune_m)].float()
        W_mask.scatter_(1, ii + torch.topk(tmp, prune_n, dim=1, largest=False)[1], True)
subset['fc1'].weight.data[W_mask] = 0
subset['fc1'].weight.data
tensor([[-0.0871, 0.0000, 0.0000, ..., 0.0425, 0.0000, 0.0000],
[ 0.0000, 0.0230, 0.0000, ..., 0.0206, 0.0000, 0.0000],
[ 0.0000, -0.0129, 0.0245, ..., 0.0000, 0.0000, 0.0462],
...,
[ 0.0000, 0.0394, 0.0232, ..., -0.0341, 0.0475, 0.0000],
[ 0.0000, 0.0000, -0.0399, ..., 0.0158, 0.0000, -0.0500],
[ 0.0000, 0.0334, 0.0000, ..., 0.0387, -0.0299, 0.0230]],
device='cuda:0', dtype=torch.float16)
torch.topk(input, k, dim, largest)
- if largest=True, it returns the k largest elements along dim (values and their indices)
- if largest=False, it returns the k smallest elements along dim (values and their indices)
a = torch.tensor([[1,2,3],[4,5,6]])
torch.topk(a, 2, 1, largest=True)
torch.return_types.topk(
values=tensor([[3, 2],
[6, 5]]),
indices=tensor([[2, 1],
[2, 1]]))
torch.topk(a, 2, 1, largest=False)
torch.return_types.topk(
values=tensor([[1, 2],
[4, 5]]),
indices=tensor([[0, 1],
[0, 1]]))
The main code of prune_wanda
use_cache = model.config.use_cache
model.config.use_cache = False
prune_n, prune_m = 0, 0
for i in range(len(layers)):
    layer = layers[i]
    subset = find_layers(layer)
    if f"model.layers.{i}" in model.hf_device_map:  ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs;
        dev = model.hf_device_map[f"model.layers.{i}"]
        inps, outs, attention_mask = inps.to(dev), outs.to(dev), attention_mask.to(dev)
    wrapped_layers = {}
    for name in subset:
        wrapped_layers[name] = WrappedGPT(subset[name])  # wrap every prunable module of the layer
    def add_batch(name):  # hook factory: calls wrapped_layers[name].add_batch, i.e. updates scaler_row
        def tmp(_, inp, out):
            wrapped_layers[name].add_batch(inp[0].data, out.data)
        return tmp
    handles = []
    for name in wrapped_layers:
        handles.append(subset[name].register_forward_hook(add_batch(name)))  # hook the input/output of every module; each module gets its own hook (tmp)
    for j in range(128):
        with torch.no_grad():  # forward pass on each [1, 2048, 768] calibration input
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]  # [1, 2048, 768]
    for h in handles:
        h.remove()
    for name in subset:
        print(f"pruning layer {i} name {name}")
        W_metric = torch.abs(subset[name].weight.data) * torch.sqrt(wrapped_layers[name].scaler_row.reshape((1, -1)))
        W_mask = (torch.zeros_like(W_metric) == 1)  ## initialize a mask to be all False
        if prune_n != 0:
            # structured n:m sparsity
            for ii in range(W_metric.shape[1]):
                if ii % prune_m == 0:
                    tmp = W_metric[:, ii:(ii+prune_m)].float()
                    W_mask.scatter_(1, ii + torch.topk(tmp, prune_n, dim=1, largest=False)[1], True)
        else:
            sort_res = torch.sort(W_metric, dim=-1, stable=True)
            # unstructured pruning
            # indices = sort_res[1][:, :int(W_metric.shape[1]*args.sparsity_ratio)]  # the ratio is hard-coded to 0.5 here for the demo
            indices = sort_res[1][:, :int(W_metric.shape[1]*0.5)]
            W_mask.scatter_(1, indices, True)
        subset[name].weight.data[W_mask] = 0  ## set weights to zero
    for j in range(128):
        with torch.no_grad():
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
    inps, outs = outs, inps
model.config.use_cache = use_cache
torch.cuda.empty_cache()
pruning layer 0 name self_attn.k_proj
pruning layer 0 name self_attn.v_proj
pruning layer 0 name self_attn.q_proj
pruning layer 0 name self_attn.out_proj
pruning layer 0 name fc1
pruning layer 0 name fc2
pruning layer 1 name self_attn.k_proj
pruning layer 1 name self_attn.v_proj
pruning layer 1 name self_attn.q_proj
pruning layer 1 name self_attn.out_proj
pruning layer 1 name fc1
pruning layer 1 name fc2
pruning layer 2 name self_attn.k_proj
pruning layer 2 name self_attn.v_proj
pruning layer 2 name self_attn.q_proj
pruning layer 2 name self_attn.out_proj
pruning layer 2 name fc1
pruning layer 2 name fc2
pruning layer 3 name self_attn.k_proj
pruning layer 3 name self_attn.v_proj
pruning layer 3 name self_attn.q_proj
pruning layer 3 name self_attn.out_proj
pruning layer 3 name fc1
pruning layer 3 name fc2
pruning layer 4 name self_attn.k_proj
pruning layer 4 name self_attn.v_proj
pruning layer 4 name self_attn.q_proj
pruning layer 4 name self_attn.out_proj
pruning layer 4 name fc1
pruning layer 4 name fc2
pruning layer 5 name self_attn.k_proj
pruning layer 5 name self_attn.v_proj
pruning layer 5 name self_attn.q_proj
pruning layer 5 name self_attn.out_proj
pruning layer 5 name fc1
pruning layer 5 name fc2
pruning layer 6 name self_attn.k_proj
pruning layer 6 name self_attn.v_proj
pruning layer 6 name self_attn.q_proj
pruning layer 6 name self_attn.out_proj
pruning layer 6 name fc1
pruning layer 6 name fc2
pruning layer 7 name self_attn.k_proj
pruning layer 7 name self_attn.v_proj
pruning layer 7 name self_attn.q_proj
pruning layer 7 name self_attn.out_proj
pruning layer 7 name fc1
pruning layer 7 name fc2
pruning layer 8 name self_attn.k_proj
pruning layer 8 name self_attn.v_proj
pruning layer 8 name self_attn.q_proj
pruning layer 8 name self_attn.out_proj
pruning layer 8 name fc1
pruning layer 8 name fc2
pruning layer 9 name self_attn.k_proj
pruning layer 9 name self_attn.v_proj
pruning layer 9 name self_attn.q_proj
pruning layer 9 name self_attn.out_proj
pruning layer 9 name fc1
pruning layer 9 name fc2
pruning layer 10 name self_attn.k_proj
pruning layer 10 name self_attn.v_proj
pruning layer 10 name self_attn.q_proj
pruning layer 10 name self_attn.out_proj
pruning layer 10 name fc1
pruning layer 10 name fc2
pruning layer 11 name self_attn.k_proj
pruning layer 11 name self_attn.v_proj
pruning layer 11 name self_attn.q_proj
pruning layer 11 name self_attn.out_proj
pruning layer 11 name fc1
pruning layer 11 name fc2
With that, we have completed Wanda's 50% sparsity pruning of opt-125m.
Step 4: evaluate the pruned model
The complete evaluation code is as follows:
################################################################
print("*" * 30)
sparsity_ratio = check_sparsity(model)  # check the per-layer sparsity of the model
print(f"sparsity sanity check {sparsity_ratio:.4f}")
print("*" * 30)
################################################################
ppl_test = eval_ppl(args, model, tokenizer, device)  # compute the model's perplexity (ppl) on the wikitext2 dataset
print(f"wikitext perplexity {ppl_test}")
Checking the model's final sparsity
use_cache = model.config.use_cache
model.config.use_cache = False
layers = model.model.decoder.layers
count = 0
total_params = 0
Again take layers[0] as an example and compute that layer's sparsity:
layer = layers[0]
subset = find_layers(layer)
sub_count = 0
sub_params = 0
for name in subset:  # compute this layer's sparsity
    W = subset[name].weight.data  # the weight of each pruned module in this layer
    # count += (W==0).sum().item()  # (W==0).sum().item() is the number of zero weights, a scalar
    # total_params += W.numel()
    sub_count += (W==0).sum().item()
    sub_params += W.numel()
print(f"layer sparsity {float(sub_count)/sub_params:.6f}")
layer sparsity 0.522670
def check_sparsity(model):
    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.decoder.layers
    count = 0
    total_params = 0
    for i in range(len(layers)):
        layer = layers[i]
        subset = find_layers(layer)
        sub_count = 0
        sub_params = 0
        for name in subset:
            W = subset[name].weight.data
            count += (W==0).sum().item()
            total_params += W.numel()
            sub_count += (W==0).sum().item()
            sub_params += W.numel()
        print(f"layer {i} sparsity {float(sub_count)/sub_params:.6f}")
    model.config.use_cache = use_cache
    print(f"model total params:{total_params}, zero weight params: {count}")
    print(f"model sparsity {count}/{total_params} ={float(count)/total_params: .6f}")
    return float(count)/total_params  # overall sparsity over all pruned parameters
Take "fc1" as an example
W = subset['fc1'].weight.data
W.shape
torch.Size([3072, 768])
count += (W==0).sum().item()
(W==0).sum().item()
1340102
3072*768 # over 2 million parameters in total
2359296
1340102/2359296 # sparsity 0.568009270562066
0.568009270562066
(W==0).sum()
tensor(1340102, device='cuda:0')
Finally, compute the parameter sparsity of the whole model
sparsity_ratio = check_sparsity(model)
layer 0 sparsity 0.500000
layer 1 sparsity 0.500000
layer 2 sparsity 0.500000
layer 3 sparsity 0.500000
layer 4 sparsity 0.500000
layer 5 sparsity 0.500000
layer 6 sparsity 0.500000
layer 7 sparsity 0.500000
layer 8 sparsity 0.500000
layer 9 sparsity 0.500000
layer 10 sparsity 0.500000
layer 11 sparsity 0.500000
model total params:84934656, zero weight params: 42467328
model sparsity 42467328/84934656 = 0.500000
With each parameter stored as torch.float16 (2 bytes), these weights take up (in MB):
84934656*2/(10**6)
169.869312
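Note that the 84,934,656 figure above counts only the pruned decoder linear weights. For reference, the total parameter count of the model and its fp16 footprint can be checked directly; a small sketch:
# Total parameter count of the loaded model and its size in fp16 (2 bytes per parameter).
total = sum(p.numel() for p in model.parameters())
print(total, total * 2 / 10**6, 'MB')  # opt-125m has roughly 125M parameters in total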
Computing the model's perplexity on wikitext2
ppl_test = eval_ppl(args, model, tokenizer, device)
print(f"wikitext perplexity {ppl_test}")
device = 'cuda'
Obtaining the wikitext2 test set
def get_wikitext2(nsamples=128, seed=0, seq_len=2048, tokenizer=tokenizer,
                  data_path=None):
    '''
    We slightly modified the original code here, but the differences are small.
    return: train_loader --> list --> length: 128
            testenc: dict --> keys: input_ids, attention_mask
    Note that for wikitext2 the train_loader is built by tokenizing the whole corpus first and then slicing token sequences,
    whereas for c4 a sample (a complete document) is kept only if its tokenized length exceeds seq_len, and a subsequence is then taken from it.
    '''
    # obtain the wikitext2 dataset
    # download directly
    if not data_path:  # needs network access to download the data
        print('you need open the network to download data')
        train_data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
        test_data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    else:
        print('load data from the local directory')
        train_data = load_dataset(data_path, split='train')
        test_data = load_dataset(data_path, split='test')
    # draw nsamples calibration samples
    trainenc = tokenizer(" ".join(train_data['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(test_data['text']), return_tensors='pt')  # note: the test texts are joined with "\n\n" rather than " "
    random.seed(seed)
    train_loader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1)
        j = i + seq_len
        inp = trainenc.input_ids[:, i:j]  # the token sequence
        # build the target sequence for the prediction task
        tar = inp.clone()
        tar[:, :-1] = -100
        # append (inp, tar) to train_loader
        train_loader.append((inp, tar))
    print("data loading complete")
    return train_loader, testenc
_, test_loader = get_wikitext2(nsamples=128,
                               seed=0,
                               seq_len=2048,
                               tokenizer=tokenizer,
                               data_path='./data/wikitext2')
load data from the local directory
data loading complete
# with torch.no_grad():
# ppl_test = eval_ppl_wikitext(model, testloader, 1, device)
testenc = test_loader.input_ids
testenc.shape
torch.Size([1, 287645])
testenc.numel() // model.seqlen
140
Take the first token sequence as an example
inputs = testenc[:,(0 * model.seqlen):(1 * model.seqlen)].to(device)
inputs.shape
torch.Size([1, 2048])
inputs = inputs.reshape(-1, model.seqlen)
inputs.shape
torch.Size([1, 2048])
out = model(inputs)
lm_logits = out.logits # [batch_size, seq_len, vocab_size]
lm_logits.shape
torch.Size([1, 2048, 50272])
shift_logits = lm_logits[:, :-1, :].contiguous()
shift_labels = inputs[:, 1:]
shift_logits.shape
torch.Size([1, 2047, 50272])
shift_labels.shape
torch.Size([1, 2047])
loss_fct = nn.CrossEntropyLoss() # compute the cross-entropy loss for this sample
loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
loss
tensor(3.3789, device='cuda:0', dtype=torch.float16,
grad_fn=<NllLossBackward0>)
Compute the negative log-likelihood (the mean cross-entropy loss scaled by the sequence length)
neg_log_likelihood = loss.float() * model.seqlen
neg_log_likelihood
tensor(6920., device='cuda:0', grad_fn=<MulBackward0>)
ppl = exp(average cross-entropy loss per token)
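Written out, and consistent with eval_ppl_wikitext below, the quantity being computed is:
$$
\mathrm{PPL} = \exp\!\left(\frac{\sum_i \mathrm{NLL}_i}{n_{\text{samples}} \cdot \text{seqlen}}\right),
\qquad \mathrm{NLL}_i = \mathrm{CE}_i \cdot \text{seqlen} \cdot (j - i),
$$
where CE_i is the mean per-token cross-entropy loss of batch i and (j - i) is its batch size (1 here).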
def eval_ppl_wikitext(model, testenc, bs=1, device=None):
    # Get input IDs
    testenc = testenc.input_ids
    # Calculate number of samples
    nsamples = testenc.numel() // model.seqlen
    # List to store negative log likelihoods
    nlls = []
    print(f"nsamples {nsamples}")
    # Loop through each batch
    for i in range(0, nsamples, bs):
        if i % 50 == 0:
            print(f"sample {i}")
        # Calculate end index
        j = min(i + bs, nsamples)
        # Prepare inputs and move to device
        inputs = testenc[:, (i * model.seqlen):(j * model.seqlen)].to(device)
        inputs = inputs.reshape(j - i, model.seqlen)
        # Forward pass through the model
        lm_logits = model(inputs).logits
        # Shift logits and labels for next token prediction
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = inputs[:, 1:]
        # Compute loss
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
        # Calculate negative log likelihood
        neg_log_likelihood = loss.float() * model.seqlen * (j - i)
        # Append to list of negative log likelihoods
        nlls.append(neg_log_likelihood)
    # Compute perplexity
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    # Empty CUDA cache to save memory
    torch.cuda.empty_cache()
    return ppl.item()
with torch.no_grad():
    ppl_test = eval_ppl_wikitext(model, test_loader, 1, device)