音频token化

import torchaudio
from torchaudio.transforms import Resample, MelSpectrogram

class AudioProcessor:
    """
    Turn an audio file into a 2-D tensor of mel-spectrogram frames.

    Construction runs the whole pipeline eagerly: the file is loaded with
    ``torchaudio.load``, resampled to ``new_sample_rate`` when its native
    rate differs, and passed through a ``MelSpectrogram`` transform. The
    intermediate results stay available as attributes:

    * ``waveform`` -- the (possibly resampled) audio samples
    * ``original_sample_rate`` -- the rate reported by ``torchaudio.load``
    * ``mel_spectrogram`` -- output of the mel transform

    Example:
        file_path = 'path_to_your_audio_file.mp3'
        processor = AudioProcessor(file_path)
        processed_audio = processor.get_processed_audio()

        # processed_audio: Tensor of shape (n, d), where 'n' is the number of time frames,
        # and 'd' is the feature dimension (e.g., flattened mel frequency channels).
    """

    def __init__(self, file_path, new_sample_rate=16000):
        """
        Load ``file_path`` and immediately compute its mel spectrogram.

        Args:
            file_path: Location of the audio file, handed to ``torchaudio.load``.
            new_sample_rate: Sample rate the waveform is converted to before
                the mel transform is applied.
        """
        self.file_path = file_path
        self.new_sample_rate = new_sample_rate

        # Each stage reads the attributes set by the previous one, so the
        # order of these three steps matters.
        loaded = self.load_audio()
        self.waveform, self.original_sample_rate = loaded
        self.waveform = self.resample_audio()
        self.mel_spectrogram = self.compute_mel_spectrogram()

    def load_audio(self):
        """Read ``self.file_path`` and return ``(waveform, sample_rate)``."""
        audio, rate = torchaudio.load(self.file_path)
        return audio, rate

    def resample_audio(self):
        """
        Return ``self.waveform`` at ``self.new_sample_rate``.

        When the file is already at the requested rate the stored waveform
        is returned as-is (same object, no copy).
        """
        needs_resampling = self.original_sample_rate != self.new_sample_rate
        if not needs_resampling:
            return self.waveform
        converter = Resample(self.original_sample_rate, self.new_sample_rate)
        return converter(self.waveform)

    def compute_mel_spectrogram(self, n_fft=2048, hop_length=512, n_mels=128):
        """
        Apply a ``MelSpectrogram`` transform to ``self.waveform``.

        Args:
            n_fft: FFT size forwarded to ``MelSpectrogram``.
            hop_length: Hop length forwarded to ``MelSpectrogram``.
            n_mels: Number of mel bins forwarded to ``MelSpectrogram``.

        Returns:
            The transform's output for the current waveform.
        """
        transform = MelSpectrogram(
            sample_rate=self.new_sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        return transform(self.waveform)

    def get_processed_audio(self):
        """
        Return the mel spectrogram as a 2-D ``(n, d)`` tensor.

        Every axis except axis 2 is collapsed into one feature axis, then
        the result is transposed so axis 2 of the spectrogram comes first.
        Assumes ``self.mel_spectrogram`` is laid out as
        (channels, n_mels, frames), so ``n`` is the frame count and ``d``
        is channels * n_mels -- TODO confirm against the torchaudio version
        in use.
        """
        spec = self.mel_spectrogram
        frame_count = spec.size(2)
        return spec.reshape(-1, frame_count).transpose(0, 1)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值