1. ONNX 模型实现
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using NAudio.Wave;
/// <summary>
/// Extracts audio features by running an ONNX model over an audio file and
/// slices the resulting feature array into per-video-frame windows.
/// </summary>
/// <remarks>
/// The instance owns an <see cref="InferenceSession"/>; call <see cref="Dispose"/>
/// when done so the session is released deterministically.
/// </remarks>
public class Audio2Feature : IDisposable
{
    // Width of one embedding row in the model output (the output is reshaped to [n, 1, 384]).
    private const int EmbeddingSize = 384;

    // Video-frame indices are mapped to feature indices with the ratio 50 / fps.
    // NOTE(review): presumably the model emits 50 feature frames per second - confirm.
    private const int FeatureIndexScale = 50;

    // Name of the model input the waveform is bound to.
    private const string InputName = "input";

    private readonly InferenceSession inferenceSession;

    /// <summary>Creates the inference session from the model file at <paramref name="modelPath"/>.</summary>
    /// <param name="modelPath">Path passed to the <see cref="InferenceSession"/> constructor.</param>
    public Audio2Feature(string modelPath)
    {
        if (modelPath == null)
        {
            throw new ArgumentNullException(nameof(modelPath));
        }

        this.inferenceSession = new InferenceSession(modelPath);
    }

    /// <summary>
    /// Returns the window of feature frames centred on video frame <paramref name="vidIdx"/>.
    /// </summary>
    /// <param name="featureArray">Feature array; the first dimension is the frame index.</param>
    /// <param name="vidIdx">Video frame index; mapped to a feature index as <c>vidIdx * 50 / fps</c>.</param>
    /// <param name="audioFeatLength">
    /// Two elements: the window starts <c>audioFeatLength[0] * 2</c> frames before the centre and
    /// ends <c>(audioFeatLength[1] + 1) * 2</c> frames after it (exclusive).
    /// </param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <returns>
    /// One row per frame in the window, each row being that frame's last two dimensions flattened.
    /// Indices outside the array are clamped to the first/last frame. An empty window yields zero rows.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="audioFeatLength"/> has fewer than two elements, or <paramref name="featureArray"/> has no frames.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public float[,] GetSlicedFeature(float[,,] featureArray, int vidIdx, int[] audioFeatLength, int fps = 25)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (audioFeatLength == null)
        {
            throw new ArgumentNullException(nameof(audioFeatLength));
        }

        if (audioFeatLength.Length < 2)
        {
            throw new ArgumentException("Two window lengths (before, after) are required.", nameof(audioFeatLength));
        }

        if (fps <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }

        int frameCount = featureArray.GetLength(0);
        if (frameCount == 0)
        {
            // There is no frame to clamp out-of-range indices to.
            throw new ArgumentException("Feature array contains no frames.", nameof(featureArray));
        }

        int frameSize = featureArray.GetLength(1) * featureArray.GetLength(2);
        int frameBytes = frameSize * sizeof(float);

        int centerIdx = vidIdx * FeatureIndexScale / fps;
        int leftIdx = centerIdx - audioFeatLength[0] * 2;
        int rightIdx = centerIdx + (audioFeatLength[1] + 1) * 2;

        int rowCount = Math.Max(0, rightIdx - leftIdx);
        float[,] window = new float[rowCount, frameSize];

        for (int row = 0; row < rowCount; row++)
        {
            // Clamp so that windows overlapping the edges repeat the first/last frame.
            int boundedIdx = Math.Max(0, Math.Min(frameCount - 1, leftIdx + row));

            // Both arrays are row-major, so one frame is a contiguous run of frameSize floats.
            Buffer.BlockCopy(featureArray, boundedIdx * frameBytes, window, row * frameBytes, frameBytes);
        }

        return window;
    }

    /// <summary>
    /// Produces one feature window per video frame for as long as the frame's
    /// scaled index <c>(int)(i * 50.0 / fps)</c> lies inside <paramref name="featureArray"/>.
    /// </summary>
    /// <param name="featureArray">Feature array; the first dimension is the frame index.</param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <param name="audioFeatLength">Window lengths forwarded to <see cref="GetSlicedFeature"/>.</param>
    /// <returns>The windows in video-frame order; empty when the feature array has no frames.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="featureArray"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public List<float[,]> Feature2Chunks(float[,,] featureArray, int fps, int[] audioFeatLength)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (fps <= 0)
        {
            // A non-positive rate would otherwise divide by zero or never reach the end of the array.
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }

        int frameCount = featureArray.GetLength(0);
        double indexMultiplier = (double)FeatureIndexScale / fps;
        List<float[,]> chunks = new List<float[,]>();

        for (int i = 0; (int)(i * indexMultiplier) < frameCount; i++)
        {
            chunks.Add(GetSlicedFeature(featureArray, i, audioFeatLength, fps));
        }

        return chunks;
    }

    /// <summary>
    /// Reads the audio file, runs the model on its samples and returns the first
    /// model output reshaped to <c>[n, 1, 384]</c>.
    /// </summary>
    /// <param name="audioPath">Path of the audio file to read.</param>
    /// <returns>The model output as <c>n</c> rows of 384 values.</returns>
    /// <exception cref="InvalidOperationException">
    /// The model output length is not a multiple of 384.
    /// </exception>
    public float[,,] Audio2Feat(string audioPath)
    {
        float[] audioWaveform = LoadAudio(audioPath);

        // The waveform is fed as a single tensor of shape [1, 1, sampleCount].
        var inputTensor = new DenseTensor<float>(audioWaveform, new[] { 1, 1, audioWaveform.Length });
        var inputs = new List<NamedOnnxValue> { NamedOnnxValue.CreateFromTensor(InputName, inputTensor) };

        using (var results = inferenceSession.Run(inputs))
        {
            float[] embeddings = results.First().AsEnumerable<float>().ToArray();
            if (embeddings.Length % EmbeddingSize != 0)
            {
                throw new InvalidOperationException(
                    "Model output length " + embeddings.Length + " is not a multiple of " + EmbeddingSize + ".");
            }

            // The flat output is already row-major, so it can be copied straight into the 3-D array.
            float[,,] features = new float[embeddings.Length / EmbeddingSize, 1, EmbeddingSize];
            Buffer.BlockCopy(embeddings, 0, features, 0, embeddings.Length * sizeof(float));
            return features;
        }
    }

    /// <summary>Releases the underlying inference session.</summary>
    public void Dispose()
    {
        inferenceSession.Dispose();
    }

    /// <summary>
    /// Reads every sample the reader provides for <paramref name="audioPath"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): samples are returned exactly as the reader yields them; no resampling or
    /// channel mixing is done here - confirm this matches what the model expects.
    /// </remarks>
    private static float[] LoadAudio(string audioPath)
    {
        using (var reader = new AudioFileReader(audioPath))
        {
            // Divide before narrowing so large files do not overflow the int cast.
            long expectedSamples = reader.Length / sizeof(float);
            if (expectedSamples > int.MaxValue)
            {
                throw new NotSupportedException("Audio file is too large to load into a single array.");
            }

            float[] buffer = new float[(int)expectedSamples];
            int total = 0;

            // A single Read call may return fewer samples than requested; keep reading until done.
            while (total < buffer.Length)
            {
                int read = reader.Read(buffer, total, buffer.Length - total);
                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            if (total == buffer.Length)
            {
                return buffer;
            }

            // Drop the unread tail so trailing zeros are not fed to the model.
            float[] trimmed = new float[total];
            Buffer.BlockCopy(buffer, 0, trimmed, 0, total * sizeof(float));
            return trimmed;
        }
    }
}
2. 采用 Whisper tiny.bin 模型实现
using System;
using System.Collections.Generic;
using System.IO;
using Whisper.net;
using Whisper.net.Ggml;
using NAudio.Wave;
/// <summary>
/// Extracts audio features from the encoder embeddings of a Whisper transcription and
/// slices the resulting feature array into per-video-frame windows.
/// </summary>
/// <remarks>
/// NOTE(review): the Whisper calls used here (<c>WhisperModel.FromPath</c>, <c>Transcribe</c>,
/// <c>Segments</c>, <c>EncoderEmbeddings</c>) are kept exactly as found; confirm they exist in the
/// Whisper.net version this project references.
/// </remarks>
public class Audio2Feature
{
    // Video-frame indices are mapped to feature indices with the ratio 50 / fps.
    // NOTE(review): presumably the encoder emits 50 feature frames per second - confirm.
    private const int FeatureIndexScale = 50;

    private readonly string whisperModelType;
    private readonly WhisperModel whisperModel;

    /// <summary>Loads the Whisper model from <paramref name="modelPath"/>.</summary>
    /// <param name="whisperModelType">Model type label; stored but not otherwise used by this class.</param>
    /// <param name="modelPath">Path of the model file to load.</param>
    public Audio2Feature(string whisperModelType = "tiny", string modelPath = "./models/whisper/tiny.bin")
    {
        this.whisperModelType = whisperModelType;
        this.whisperModel = WhisperModel.FromPath(modelPath);
    }

    /// <summary>
    /// Returns the window of feature frames centred on video frame <paramref name="vidIdx"/>.
    /// </summary>
    /// <param name="featureArray">Feature array; the first dimension is the frame index.</param>
    /// <param name="vidIdx">Video frame index; mapped to a feature index as <c>vidIdx * 50 / fps</c>.</param>
    /// <param name="audioFeatLength">
    /// Two elements: the window starts <c>audioFeatLength[0] * 2</c> frames before the centre and
    /// ends <c>(audioFeatLength[1] + 1) * 2</c> frames after it (exclusive).
    /// </param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <returns>
    /// One row per frame in the window, each row being that frame's last two dimensions flattened.
    /// Indices outside the array are clamped to the first/last frame. An empty window yields zero rows.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="audioFeatLength"/> has fewer than two elements, or <paramref name="featureArray"/> has no frames.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public float[,] GetSlicedFeature(float[,,] featureArray, int vidIdx, int[] audioFeatLength, int fps = 25)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (audioFeatLength == null)
        {
            throw new ArgumentNullException(nameof(audioFeatLength));
        }

        if (audioFeatLength.Length < 2)
        {
            throw new ArgumentException("Two window lengths (before, after) are required.", nameof(audioFeatLength));
        }

        if (fps <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }

        int frameCount = featureArray.GetLength(0);
        if (frameCount == 0)
        {
            // There is no frame to clamp out-of-range indices to.
            throw new ArgumentException("Feature array contains no frames.", nameof(featureArray));
        }

        int frameSize = featureArray.GetLength(1) * featureArray.GetLength(2);
        int frameBytes = frameSize * sizeof(float);

        int centerIdx = vidIdx * FeatureIndexScale / fps;
        int leftIdx = centerIdx - audioFeatLength[0] * 2;
        int rightIdx = centerIdx + (audioFeatLength[1] + 1) * 2;

        int rowCount = Math.Max(0, rightIdx - leftIdx);
        float[,] window = new float[rowCount, frameSize];

        for (int row = 0; row < rowCount; row++)
        {
            // Clamp so that windows overlapping the edges repeat the first/last frame.
            int boundedIdx = Math.Max(0, Math.Min(frameCount - 1, leftIdx + row));

            // Each frame is appended as ONE row (the original called AddRange on a list of
            // rows, which does not compile). Both arrays are row-major, so a frame is contiguous.
            Buffer.BlockCopy(featureArray, boundedIdx * frameBytes, window, row * frameBytes, frameBytes);
        }

        return window;
    }

    /// <summary>
    /// Produces one feature window per video frame for as long as the frame's
    /// scaled index <c>(int)(i * 50.0 / fps)</c> lies inside <paramref name="featureArray"/>.
    /// </summary>
    /// <param name="featureArray">Feature array; the first dimension is the frame index.</param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <param name="audioFeatLength">Window lengths forwarded to <see cref="GetSlicedFeature"/>.</param>
    /// <returns>The windows in video-frame order; empty when the feature array has no frames.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="featureArray"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public List<float[,]> Feature2Chunks(float[,,] featureArray, int fps, int[] audioFeatLength)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (fps <= 0)
        {
            // A non-positive rate would otherwise divide by zero or never reach the end of the array.
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }

        int frameCount = featureArray.GetLength(0);
        double indexMultiplier = (double)FeatureIndexScale / fps;
        List<float[,]> chunks = new List<float[,]>();

        for (int i = 0; (int)(i * indexMultiplier) < frameCount; i++)
        {
            chunks.Add(GetSlicedFeature(featureArray, i, audioFeatLength, fps));
        }

        return chunks;
    }

    /// <summary>
    /// Transcribes the audio file and concatenates, per segment, the leading
    /// <c>(end - start) / 2</c> rows of that segment's encoder embeddings.
    /// </summary>
    /// <param name="audioPath">Path of the audio file to transcribe.</param>
    /// <returns>
    /// The concatenated rows as <c>[totalRows, 1, cols]</c>; an empty array when there are no segments.
    /// </returns>
    public float[,,] Audio2Feat(string audioPath)
    {
        var result = whisperModel.Transcribe(audioPath);
        List<float[,]> embedList = new List<float[,]>();

        foreach (var segment in result.Segments)
        {
            var encoderEmbeddings = segment.EncoderEmbeddings;

            // NOTE(review): Start/End are cast to int as found - confirm their type and unit.
            int startIdx = (int)segment.Start;
            int endIdx = (int)segment.End;
            int embEndIdx = (endIdx - startIdx) / 2;

            embedList.Add(GetEmbeddingsSlice(encoderEmbeddings, embEndIdx));
        }

        return ConcatenateEmbedList(embedList);
    }

    /// <summary>
    /// Stacks the rows of every matrix in <paramref name="embedList"/> into a single
    /// <c>[totalRows, 1, cols]</c> array, preserving order.
    /// </summary>
    /// <exception cref="InvalidOperationException">The matrices do not all have the same column count.</exception>
    private static float[,,] ConcatenateEmbedList(List<float[,]> embedList)
    {
        if (embedList.Count == 0)
        {
            // Nothing to stack; the column count is unknown, so report zero.
            return new float[0, 1, 0];
        }

        int cols = embedList[0].GetLength(1);
        int totalRows = 0;
        foreach (float[,] embed in embedList)
        {
            if (embed.GetLength(1) != cols)
            {
                throw new InvalidOperationException(
                    "Embedding width " + embed.GetLength(1) + " does not match expected width " + cols + ".");
            }

            totalRows += embed.GetLength(0);
        }

        float[,,] concatenated = new float[totalRows, 1, cols];
        int rowBytes = cols * sizeof(float);
        int currentRow = 0;

        foreach (float[,] embed in embedList)
        {
            int rows = embed.GetLength(0);

            // Row-major layout on both sides: a whole matrix is one contiguous copy.
            Buffer.BlockCopy(embed, 0, concatenated, currentRow * rowBytes, rows * rowBytes);
            currentRow += rows;
        }

        return concatenated;
    }

    /// <summary>
    /// Returns the first <paramref name="endIdx"/> rows of <paramref name="embeddings"/>,
    /// clamped to the range <c>[0, rowCount]</c> so an out-of-range count cannot over-read.
    /// </summary>
    private static float[,] GetEmbeddingsSlice(float[,] embeddings, int endIdx)
    {
        int rows = Math.Max(0, Math.Min(endIdx, embeddings.GetLength(0)));
        int cols = embeddings.GetLength(1);

        float[,] slice = new float[rows, cols];

        // The leading rows of a row-major matrix form a contiguous prefix.
        Buffer.BlockCopy(embeddings, 0, slice, 0, rows * cols * sizeof(float));
        return slice;
    }
}