Audio classification with the wav2vec2-base model

I first ran into audio classification in an iFLYTEK competition; because of dataset restrictions I switched to an eating-sound dataset here. There is also a runnable Kaggle notebook: https://www.kaggle.com/code/luo2323333/audio-classification — drop in your own classification dataset and change root to run it.
This project was originally plain PyTorch, but my PyTorch training code was not very tidy; the Hugging Face API is mature and well documented, and using it also spares writing an inference function. Running locally may fail (the model has to be downloaded from the Hub), so I recommend running it on Kaggle, or going through a proxy/VPN.

Generating a CSV index file for training

The datasets library can load local datasets from CSV, JSON, and other formats; here I load a CSV file.

import os
import csv
import pandas as pd

def read_csv():  # small helper to eyeball the CSV contents
    with open('total.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            print(row)


with open("total.csv", 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["datas", "labels"])
    writer.writeheader()  # write the header row
    root = "/kaggle/input/eating-sound-collection/clips_rd"  # root of your audio classification dataset
    labels = os.listdir(root)  # one subdirectory per class
    data_dict = {}  # reusable row buffer
    for label in labels:
        files = os.listdir(os.path.join(root, label))  # audio files for one class
        for file in files:
            data_dict['datas'] = os.path.join(root, label, file)  # 'datas' holds the audio file path
            data_dict['labels'] = label  # 'labels' holds the class name
            writer.writerow(data_dict)  # append one row to the CSV

df = pd.read_csv('total.csv')
print(df.head())
print(df.info())
print(df.info())
                                               datas     labels
0  /kaggle/input/eating-sound-collection/clips_rd...  chocolate
1  /kaggle/input/eating-sound-collection/clips_rd...  chocolate
2  /kaggle/input/eating-sound-collection/clips_rd...  chocolate
3  /kaggle/input/eating-sound-collection/clips_rd...  chocolate
4  /kaggle/input/eating-sound-collection/clips_rd...  chocolate
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11140 entries, 0 to 11139
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   datas   11140 non-null  object
 1   labels  11140 non-null  object
dtypes: object(2)
memory usage: 174.2+ KB
None
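
As an aside: since the dataset is already laid out one-subfolder-per-class, the datasets library's audiofolder builder can skip the CSV step entirely and infer labels from the folder names. A minimal sketch of that alternative route (assuming a reasonably recent datasets version; it is not what the rest of this post uses):

from datasets import load_dataset

ds = load_dataset("audiofolder", data_dir="/kaggle/input/eating-sound-collection/clips_rd")
print(ds["train"][0])  # sample with an 'audio' column and an inferred 'label'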

Loading and splitting train and test sets with the datasets library

from datasets import load_dataset
# from datasets import AudioFolder

dataset = load_dataset("csv", data_files="total.csv")  # load the dataset; the CSV must be comma-separated
print(dataset['train'][2])  # take a look
dataset = dataset.shuffle(seed=2023)  # shuffle the dataset
print(dataset['train'][2])  # take a look
dataset = dataset['train'].train_test_split(test_size=0.1)  # split the dataset
print(dataset['test'][2])  # take a look
train_dataset = dataset['train']
val_dataset = dataset['test']
print(val_dataset)
print(train_dataset)

input_column, output_column = train_dataset.features.keys()  # column names of the dataset
label_list = train_dataset.unique(output_column)  # collect the labels
label_list.sort()  # sort the labels
num_labels = len(label_list)  # number of classes
print(f"A classification problem with {num_labels} classes: {label_list}")

label2id, id2label = dict(), dict()
for i, label in enumerate(label_list):  # build the label mappings
    label2id[label] = str(i)
    id2label[str(i)] = label
{'datas': '/kaggle/input/eating-sound-collection/clips_rd/chocolate/chocolate_8_07.wav', 'labels': 'chocolate'}
{'datas': '/kaggle/input/eating-sound-collection/clips_rd/pizza/pizza_11_07.wav', 'labels': 'pizza'}
{'datas': '/kaggle/input/eating-sound-collection/clips_rd/pickles/pickles_8_85.wav', 'labels': 'pickles'}
Dataset({
    features: ['datas', 'labels'],
    num_rows: 1114
})
Dataset({
    features: ['datas', 'labels'],
    num_rows: 10026
})

A classification problem with 20 classes: ['aloe', 'burger', 'cabbage', 'candied_fruits', 'carrots', 'chips', 'chocolate', 'drinks', 'fries', 'grapes', 'gummies', 'ice-cream', 'jelly', 'noodles', 'pickles', 'pizza', 'ribs', 'salmon', 'soup', 'wings']
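
As a side note, datasets can build this mapping for you: class_encode_column turns the string labels column into a ClassLabel feature that carries the same id/label mapping that label2id and id2label encode by hand. A sketch of that equivalent approach:

encoded = dataset['train'].class_encode_column("labels")
print(encoded.features["labels"].names)             # sorted class names
print(encoded.features["labels"].str2int("pizza"))  # label -> id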

Loading the wav2vec2-base model and its feature extractor

from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
# print(feature_extractor.sampling_rate)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)
target_sampling_rate = feature_extractor.sampling_rate
max_duration = 6.0  # seconds (note: unused below; max_length is hard-coded to 16000 samples = 1 s)
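
An optional extra: wav2vec2 models in transformers expose freeze_feature_encoder(), which freezes the convolutional feature encoder so fine-tuning only updates the transformer layers and the new classification head. This is common practice on small datasets; a one-line sketch, assuming a recent transformers version:

model.freeze_feature_encoder()  # freeze the CNN feature encoder (optional)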

Resampling with torchaudio, and preprocessing the whole dataset with datasets' map method

import torch
import torchaudio
from torchaudio import functional as F
import random

random.seed(2023)

def resample_and_add_noise(path):
#     print(path)
    waveform, sr = torchaudio.load(path)
    speech = F.resample(waveform, sr, 16000)  # wav2vec2 expects a 16 kHz sampling rate
#     noise = os.path.dirname(path)
#     noise = os.path.join(noise, random.choice(os.listdir(noise)))
#     noise, sr = torchaudio.load(noise)
#     noise = F.resample(noise, sr, 16000)
#     waveform = F.add_noise(waveform, noise, snr=torch.tensor(15))
#     The noise-mixing code above does not run because the clips have different
#     lengths; it could be made to work by truncating, but I did not bother.
    speech = speech.squeeze().numpy()
    return speech

def label_to_id(label, label_list):
    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1
    return label

def preprocess_function(examples):
#     Non-batched variants (slower, and they seemed to error occasionally):
#     speech_list = [resample_and_add_noise(examples[input_column])]
#     target_list = [int(label2id[examples[output_column]])]
    speech_list = [resample_and_add_noise(path) for path in examples[input_column]]
    target_list = [int(label2id[label]) for label in examples[output_column]]

    # truncate each clip to max_length samples (16000 samples = 1 s at 16 kHz)
    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate, max_length=16000, truncation=True)
    result["labels"] = list(target_list)
    return result

# max_samples = 100
# train_dataset = train_dataset.select(range(max_samples))
# val_dataset = val_dataset.select(range(max_samples))
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

val_dataset = val_dataset.rename_column("labels", "label").remove_columns("datas")  # drop extra columns
train_dataset = train_dataset.rename_column("labels", "label").remove_columns("datas")  # drop extra columns
# print(val_dataset[0]['input_values'][0])  # take a look
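
For the noise mixing I gave up on above, the missing piece is just matching lengths before calling F.add_noise. A minimal sketch, assuming both clips are already resampled to 16 kHz and keeping the (channels, time) layout torchaudio.load returns (the 15 dB SNR is an arbitrary choice):

def mix_with_noise(speech, noise, snr_db=15.0):
    target_len = speech.shape[-1]
    if noise.shape[-1] < target_len:          # tile noise that is too short
        noise = noise.repeat(1, target_len // noise.shape[-1] + 1)
    noise = noise[..., :target_len]           # then truncate to the speech length
    return F.add_noise(speech, noise, snr=torch.tensor(snr_db))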

A simple evaluation function

import evaluate
import numpy as np
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
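
Accuracy alone can hide per-class imbalance across 20 classes; evaluate also ships an F1 metric, and macro averaging weights every class equally. A sketch of a drop-in replacement:

f1 = evaluate.load("f1")
def compute_metrics_with_f1(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return {
        **accuracy.compute(predictions=predictions, references=eval_pred.label_ids),
        **f1.compute(predictions=predictions, references=eval_pred.label_ids, average="macro"),
    }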

Training

from transformers import TrainingArguments,Trainer
# os.environ["WANDB_API_KEY"] = 'yourwandbapi' 
training_args = TrainingArguments(
    output_dir="model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=48,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=48,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()
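
With load_best_model_at_end=True, the model held by the trainer after training is already the best checkpoint, so it can be saved once to a clean directory instead of digging through checkpoint-XXX folders. A sketch (the path is an arbitrary choice):

trainer.save_model("model/best")  # persist the best model for inference
print(trainer.evaluate())         # metrics of the best model on the eval set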


Inference

from transformers import pipeline
audio_file = '/kaggle/input/eating-sound-collection/clips_rd/burger/burger_10_01.wav'
classifier = pipeline("audio-classification", model="/kaggle/working/model/checkpoint-470")
classifier(audio_file)
[{'score': 0.34858930110931396, 'label': 'burger'},
 {'score': 0.14217574894428253, 'label': 'pizza'},
 {'score': 0.12973767518997192, 'label': 'noodles'},
 {'score': 0.07574242353439331, 'label': 'wings'},
 {'score': 0.0667475089430809, 'label': 'ribs'}]
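
The same prediction can be made without pipeline, by calling the feature extractor and the trained model directly. A sketch reusing the objects defined above:

model.eval()
waveform = resample_and_add_noise(audio_file)  # load and resample to 16 kHz
inputs = feature_extractor(waveform, sampling_rate=target_sampling_rate, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = int(logits.argmax(-1))
print(model.config.id2label[pred_id])          # e.g. 'burger'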

References: the official Transformers documentation, and Mehrdad Farahani's GitHub project.
