import torchaudio
from torchaudio.transforms import Resample, MelSpectrogram
class AudioProcessor:
    """Load an audio file, resample it, and expose its mel spectrogram.

    All work happens eagerly in the constructor: the file is read with
    ``torchaudio.load``, resampled to ``new_sample_rate`` when the rates
    differ, and converted to a mel spectrogram that is cached on the
    instance as ``mel_spectrogram``.

    Example:
        file_path = 'path_to_your_audio_file.mp3'
        processor = AudioProcessor(file_path)
        processed_audio = processor.get_processed_audio()
        # processed_audio: Tensor of shape (n, d), where 'n' is the number
        # of time frames, and 'd' is the feature dimension (e.g., flattened
        # mel frequency channels).
    """

    def __init__(self, file_path, new_sample_rate=16000):
        """Read ``file_path`` and precompute the resampled mel spectrogram.

        Args:
            file_path: Location of the audio file handed to
                ``torchaudio.load``.
            new_sample_rate: Target sample rate (Hz) used for resampling
                and for the mel spectrogram transform.
        """
        self.file_path = file_path
        self.new_sample_rate = new_sample_rate
        # Order matters: resampling needs the loaded waveform and its
        # original rate, and the spectrogram needs the resampled waveform.
        self.waveform, self.original_sample_rate = self.load_audio()
        self.waveform = self.resample_audio()
        self.mel_spectrogram = self.compute_mel_spectrogram()

    def load_audio(self):
        """Return ``(waveform, sample_rate)`` as read by ``torchaudio.load``."""
        waveform, sample_rate = torchaudio.load(self.file_path)
        return waveform, sample_rate

    def resample_audio(self):
        """Return the waveform at ``new_sample_rate``.

        When the original rate already equals the target, the stored
        waveform object is returned as-is and no resampler is built.
        """
        if self.original_sample_rate == self.new_sample_rate:
            return self.waveform
        resampler = Resample(self.original_sample_rate, self.new_sample_rate)
        return resampler(self.waveform)

    def compute_mel_spectrogram(self, n_fft=2048, hop_length=512, n_mels=128):
        """Apply a ``MelSpectrogram`` transform to the current waveform.

        Args:
            n_fft: FFT size passed to ``MelSpectrogram``.
            hop_length: Hop length passed to ``MelSpectrogram``.
            n_mels: Number of mel bins passed to ``MelSpectrogram``.

        Returns:
            The transform's output for ``self.waveform``.
        """
        transform = MelSpectrogram(
            sample_rate=self.new_sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        return transform(self.waveform)

    def get_processed_audio(self):
        """Return the cached mel spectrogram as a 2-D ``(frames, features)`` tensor.

        The spectrogram must have at least three dimensions, since its third
        axis is read as the frame count. All leading axes are flattened into
        one feature axis, so for a (channels, n_mels, frames) input -- the
        presumed layout; confirm against torchaudio -- the result has shape
        (frames, channels * n_mels).
        """
        frame_count = self.mel_spectrogram.size(2)
        # Collapse everything except the frame axis into a single dimension.
        stacked = self.mel_spectrogram.reshape(-1, frame_count)
        # Put frames first so each row is one time step's feature vector.
        return stacked.transpose(0, 1)
# Audio tokenization
# (Source page footer: "latest recommended article published on 2025-04-25 13:49:49".)