数字人语音特征提取代码：采用 ONNX 模型实现

最新推荐文章于 2024-07-28 17:29:26 发布

好友hy93119

最新推荐文章于 2024-07-28 17:29:26 发布

阅读量319

点赞数 10

分类专栏：AI数字人 ｜ 文章标签：AIGC、人工智能

本文链接：https://blog.csdn.net/ebdiy3119/article/details/139162268

版权

AI数字人专栏收录该内容

15 篇文章 0 订阅

订阅专栏

1. 采用 ONNX 模型实现

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using NAudio.Wave;

/// <summary>
/// Runs an ONNX model over an audio file and slices the resulting feature rows
/// into fixed-size windows, one window per video frame.
/// </summary>
/// <remarks>
/// The instance owns an <see cref="InferenceSession"/>; dispose the instance when
/// it is no longer needed so the session's resources are released.
/// </remarks>
public class Audio2Feature : IDisposable
{
    // Width of one embedding row; the flat model output is reshaped with this value.
    private const int EmbeddingDim = 384;

    // Numerator of the frame-to-feature index mapping (vidIdx * 50 / fps).
    // Presumably the number of feature rows the model emits per second of audio - TODO confirm.
    private const int FeatureRate = 50;

    private readonly InferenceSession inferenceSession;

    /// <summary>Creates the extractor and loads the ONNX model.</summary>
    /// <param name="modelPath">Path of the ONNX model file handed to <see cref="InferenceSession"/>.</param>
    public Audio2Feature(string modelPath)
    {
        this.inferenceSession = new InferenceSession(modelPath);
    }

    /// <summary>
    /// Returns the window of feature rows centred on the given video frame.
    /// </summary>
    /// <param name="featureArray">Feature rows; the first dimension is the row index.</param>
    /// <param name="vidIdx">Video frame index; mapped to the feature row <c>vidIdx * 50 / fps</c>.</param>
    /// <param name="audioFeatLength">
    /// Two values: element 0 scales how far the window extends to the left of the centre,
    /// element 1 how far it extends to the right. Each is multiplied by two.
    /// </param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <returns>
    /// A 2-D array with <c>(audioFeatLength[0] + audioFeatLength[1] + 1) * 2</c> rows, each holding
    /// one flattened feature row. Row indices outside the array are clamped, so windows at the
    /// start or end repeat the first or last row.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="featureArray"/> has no rows or <paramref name="audioFeatLength"/> has fewer than two elements.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public float[,] GetSlicedFeature(float[,,] featureArray, int vidIdx, int[] audioFeatLength, int fps = 25)
    {
        ValidateArguments(featureArray, audioFeatLength, fps);

        int length = featureArray.GetLength(0);
        if (length == 0)
        {
            throw new ArgumentException("featureArray must contain at least one row.", nameof(featureArray));
        }

        int rowWidth = featureArray.GetLength(1) * featureArray.GetLength(2);
        int bytesPerRow = rowWidth * sizeof(float);

        int centerIdx = vidIdx * FeatureRate / fps;
        int leftIdx = centerIdx - audioFeatLength[0] * 2;
        int rightIdx = centerIdx + (audioFeatLength[1] + 1) * 2;

        // A non-positive window (negative lengths) yields an empty result instead of a crash.
        int rows = Math.Max(0, rightIdx - leftIdx);
        float[,] window = new float[rows, rowWidth];

        for (int row = 0; row < rows; row++)
        {
            // Clamp to the valid range so edge windows are padded with the first/last row.
            int boundedIdx = Math.Max(0, Math.Min(length - 1, leftIdx + row));

            // Both arrays are contiguous row-major float storage, so a feature row
            // can be copied as one block directly into the output row.
            Buffer.BlockCopy(featureArray, boundedIdx * bytesPerRow, window, row * bytesPerRow, bytesPerRow);
        }

        return window;
    }

    /// <summary>
    /// Produces one feature window per video frame for the whole feature array.
    /// </summary>
    /// <param name="featureArray">Feature rows; the first dimension is the row index.</param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <param name="audioFeatLength">Window extents, passed through to <see cref="GetSlicedFeature"/>.</param>
    /// <returns>
    /// The windows for frames 0, 1, 2, ... up to the last frame whose start index
    /// <c>(int)(frame * 50.0 / fps)</c> is still inside the array. Empty when the array has no rows.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="audioFeatLength"/> has fewer than two elements.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public List<float[,]> Feature2Chunks(float[,,] featureArray, int fps, int[] audioFeatLength)
    {
        // Without this check a negative fps never advances the start index past the
        // array length and the loop below would run until memory is exhausted.
        ValidateArguments(featureArray, audioFeatLength, fps);

        int length = featureArray.GetLength(0);
        double whisperIdxMultiplier = (double)FeatureRate / fps;
        var whisperChunks = new List<float[,]>();

        for (int frame = 0; (int)(frame * whisperIdxMultiplier) < length; frame++)
        {
            whisperChunks.Add(GetSlicedFeature(featureArray, frame, audioFeatLength, fps));
        }

        return whisperChunks;
    }

    /// <summary>
    /// Decodes an audio file, runs the ONNX model on it and returns the embeddings.
    /// </summary>
    /// <param name="audioPath">Path of the audio file to decode.</param>
    /// <returns>
    /// An array of shape <c>[rows, 1, 384]</c> where <c>rows</c> is the flat output length divided by 384.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The model output length is not a multiple of 384.
    /// </exception>
    public float[,,] Audio2Feat(string audioPath)
    {
        float[] audioWaveform = LoadAudio(audioPath);

        // The model is fed a single tensor named "input" of shape [1, 1, sampleCount].
        var inputTensor = new DenseTensor<float>(audioWaveform, new[] { 1, 1, audioWaveform.Length });
        var input = NamedOnnxValue.CreateFromTensor("input", inputTensor);

        using (var results = inferenceSession.Run(new List<NamedOnnxValue> { input }))
        {
            // Only the first output is used.
            float[] embeddings = results.First().AsEnumerable<float>().ToArray();

            if (embeddings.Length % EmbeddingDim != 0)
            {
                throw new InvalidOperationException(
                    "Model output length " + embeddings.Length + " is not a multiple of " + EmbeddingDim + ".");
            }

            // Reshape the flat output to [rows, 1, 384]. Both layouts are row-major,
            // so a single block copy is equivalent to copying row by row.
            int rows = embeddings.Length / EmbeddingDim;
            float[,,] features = new float[rows, 1, EmbeddingDim];
            Buffer.BlockCopy(embeddings, 0, features, 0, embeddings.Length * sizeof(float));
            return features;
        }
    }

    /// <summary>Releases the underlying ONNX inference session.</summary>
    public void Dispose()
    {
        inferenceSession.Dispose();
    }

    /// <summary>Shared argument checks for the public slicing methods.</summary>
    private static void ValidateArguments(float[,,] featureArray, int[] audioFeatLength, int fps)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (audioFeatLength == null)
        {
            throw new ArgumentNullException(nameof(audioFeatLength));
        }

        if (audioFeatLength.Length < 2)
        {
            throw new ArgumentException("audioFeatLength must contain two elements.", nameof(audioFeatLength));
        }

        if (fps <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }
    }

    /// <summary>
    /// Reads every sample of the audio file as 32-bit floats.
    /// </summary>
    /// <remarks>
    /// NOTE(review): samples are returned exactly as the reader decodes them; no resampling or
    /// channel mixing happens here. Confirm the model accepts the file's sample rate and channel layout.
    /// </remarks>
    private static float[] LoadAudio(string audioPath)
    {
        using (var reader = new AudioFileReader(audioPath))
        {
            // Divide as long before narrowing; casting the byte length to int first
            // would silently truncate for very large streams.
            long expectedSamples = reader.Length / sizeof(float);
            if (expectedSamples > int.MaxValue)
            {
                throw new InvalidOperationException("Audio file is too large to load into a single array.");
            }

            float[] buffer = new float[(int)expectedSamples];
            int totalRead = 0;

            // Read may return fewer samples than requested, so keep reading until
            // the buffer is full or the reader reports the end of the stream.
            while (totalRead < buffer.Length)
            {
                int read = reader.Read(buffer, totalRead, buffer.Length - totalRead);
                if (read == 0)
                {
                    break;
                }

                totalRead += read;
            }

            // Drop the unread tail so trailing zeros are not fed to the model as silence.
            if (totalRead < buffer.Length)
            {
                Array.Resize(ref buffer, totalRead);
            }

            return buffer;
        }
    }
}
2. 采用 Whisper tiny.bin 实现

using System;
using System.Collections.Generic;
using System.IO;
using Whisper.net;
using Whisper.net.Ggml;
using NAudio.Wave;

/// <summary>
/// Obtains encoder embeddings for an audio file from a Whisper model and slices
/// them into fixed-size windows, one window per video frame.
/// </summary>
/// <remarks>
/// NOTE(review): <c>WhisperModel.FromPath</c>, <c>Transcribe</c>, <c>Segments</c> and
/// <c>EncoderEmbeddings</c> are used exactly as in the original listing, but they could not be
/// matched to the public Whisper.net API (which is built around <c>WhisperFactory</c> and
/// <c>WhisperProcessor</c>). Verify these calls against the library version in use.
/// </remarks>
public class Audio2Feature
{
    // Numerator of the frame-to-feature index mapping (vidIdx * 50 / fps).
    // Presumably the number of feature rows produced per second of audio - TODO confirm.
    private const int FeatureRate = 50;

    // Stored for reference only; nothing in this class reads it.
    private readonly string whisperModelType;
    private readonly WhisperModel whisperModel;

    /// <summary>Creates the extractor and loads the Whisper model.</summary>
    /// <param name="whisperModelType">Model size label; only stored.</param>
    /// <param name="modelPath">Path of the model file to load.</param>
    public Audio2Feature(string whisperModelType = "tiny", string modelPath = "./models/whisper/tiny.bin")
    {
        this.whisperModelType = whisperModelType;
        this.whisperModel = WhisperModel.FromPath(modelPath);
    }

    /// <summary>
    /// Returns the window of feature rows centred on the given video frame.
    /// </summary>
    /// <param name="featureArray">Feature rows; the first dimension is the row index.</param>
    /// <param name="vidIdx">Video frame index; mapped to the feature row <c>vidIdx * 50 / fps</c>.</param>
    /// <param name="audioFeatLength">
    /// Two values: element 0 scales how far the window extends to the left of the centre,
    /// element 1 how far it extends to the right. Each is multiplied by two.
    /// </param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <returns>
    /// A 2-D array with <c>(audioFeatLength[0] + audioFeatLength[1] + 1) * 2</c> rows, each holding
    /// one flattened feature row. Row indices outside the array are clamped, so windows at the
    /// start or end repeat the first or last row.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="featureArray"/> has no rows or <paramref name="audioFeatLength"/> has fewer than two elements.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public float[,] GetSlicedFeature(float[,,] featureArray, int vidIdx, int[] audioFeatLength, int fps = 25)
    {
        ValidateArguments(featureArray, audioFeatLength, fps);

        int length = featureArray.GetLength(0);
        if (length == 0)
        {
            throw new ArgumentException("featureArray must contain at least one row.", nameof(featureArray));
        }

        int centerIdx = vidIdx * FeatureRate / fps;
        int leftIdx = centerIdx - audioFeatLength[0] * 2;
        int rightIdx = centerIdx + (audioFeatLength[1] + 1) * 2;

        List<float[]> selectedFeature = new List<float[]>();
        for (int idx = leftIdx; idx < rightIdx; idx++)
        {
            // Clamp to the valid range so edge windows are padded with the first/last row.
            int boundedIdx = Math.Max(0, Math.Min(length - 1, idx));

            // Add one row per index. The original called AddRange with a float[],
            // which does not compile against a List<float[]>.
            selectedFeature.Add(GetArraySlice(featureArray, boundedIdx));
        }

        int rowWidth = featureArray.GetLength(1) * featureArray.GetLength(2);
        return ListTo2DArray(selectedFeature, rowWidth);
    }

    /// <summary>
    /// Produces one feature window per video frame for the whole feature array.
    /// </summary>
    /// <param name="featureArray">Feature rows; the first dimension is the row index.</param>
    /// <param name="fps">Video frame rate; must be positive.</param>
    /// <param name="audioFeatLength">Window extents, passed through to <see cref="GetSlicedFeature"/>.</param>
    /// <returns>
    /// The windows for frames 0, 1, 2, ... up to the last frame whose start index
    /// <c>(int)(frame * 50.0 / fps)</c> is still inside the array. Empty when the array has no rows.
    /// </returns>
    /// <exception cref="ArgumentNullException">An array argument is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="audioFeatLength"/> has fewer than two elements.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="fps"/> is not positive.</exception>
    public List<float[,]> Feature2Chunks(float[,,] featureArray, int fps, int[] audioFeatLength)
    {
        // Without this check a negative fps never advances the start index past the
        // array length and the loop below would run until memory is exhausted.
        ValidateArguments(featureArray, audioFeatLength, fps);

        int length = featureArray.GetLength(0);
        double whisperIdxMultiplier = (double)FeatureRate / fps;
        List<float[,]> whisperChunks = new List<float[,]>();

        for (int frame = 0; (int)(frame * whisperIdxMultiplier) < length; frame++)
        {
            whisperChunks.Add(GetSlicedFeature(featureArray, frame, audioFeatLength, fps));
        }

        return whisperChunks;
    }

    /// <summary>
    /// Transcribes the audio file and stacks the leading encoder-embedding rows of every segment.
    /// </summary>
    /// <param name="audioPath">Path of the audio file handed to the model.</param>
    /// <returns>
    /// An array of shape <c>[totalRows, 1, cols]</c>. For each segment the first
    /// <c>(End - Start) / 2</c> embedding rows are kept (fewer if the segment has fewer rows).
    /// The result has zero rows when the transcription yields no segments.
    /// </returns>
    public float[,,] Audio2Feat(string audioPath)
    {
        var result = whisperModel.Transcribe(audioPath);
        List<float[,]> embedList = new List<float[,]>();

        foreach (var segment in result.Segments)
        {
            var encoderEmbeddings = segment.EncoderEmbeddings;

            // NOTE(review): assumes Start/End are numeric positions convertible to int - confirm their units.
            int startIdx = (int)segment.Start;
            int endIdx = (int)segment.End;
            int embEndIdx = (endIdx - startIdx) / 2;

            embedList.Add(GetEmbeddingsSlice(encoderEmbeddings, embEndIdx));
        }

        return ConcatenateEmbedList(embedList);
    }

    /// <summary>Shared argument checks for the public slicing methods.</summary>
    private static void ValidateArguments(float[,,] featureArray, int[] audioFeatLength, int fps)
    {
        if (featureArray == null)
        {
            throw new ArgumentNullException(nameof(featureArray));
        }

        if (audioFeatLength == null)
        {
            throw new ArgumentNullException(nameof(audioFeatLength));
        }

        if (audioFeatLength.Length < 2)
        {
            throw new ArgumentException("audioFeatLength must contain two elements.", nameof(audioFeatLength));
        }

        if (fps <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(fps), fps, "fps must be positive.");
        }
    }

    /// <summary>Copies the row at <paramref name="idx"/> (all of dimensions 1 and 2) into a flat array.</summary>
    private static float[] GetArraySlice(float[,,] array, int idx)
    {
        int size = array.GetLength(1) * array.GetLength(2);
        float[] slice = new float[size];

        // The 3-D array is contiguous row-major storage, so one row is one byte range.
        Buffer.BlockCopy(array, idx * size * sizeof(float), slice, 0, size * sizeof(float));
        return slice;
    }

    /// <summary>
    /// Packs equally sized rows into a 2-D array.
    /// </summary>
    /// <param name="list">Rows to pack; may be empty.</param>
    /// <param name="cols">Row width, supplied by the caller so an empty list still yields a well-formed array.</param>
    private static float[,] ListTo2DArray(List<float[]> list, int cols)
    {
        float[,] array = new float[list.Count, cols];
        int bytesPerRow = cols * sizeof(float);

        for (int i = 0; i < list.Count; i++)
        {
            Buffer.BlockCopy(list[i], 0, array, i * bytesPerRow, bytesPerRow);
        }

        return array;
    }

    /// <summary>
    /// Stacks the rows of all embeddings into one array of shape <c>[totalRows, 1, cols]</c>.
    /// </summary>
    /// <remarks>The column count is taken from the first entry; an empty list yields <c>[0, 1, 0]</c>.</remarks>
    private static float[,,] ConcatenateEmbedList(List<float[,]> embedList)
    {
        if (embedList.Count == 0)
        {
            // No segments: return an empty result instead of failing on embedList[0].
            return new float[0, 1, 0];
        }

        int cols = embedList[0].GetLength(1);
        int totalRows = 0;
        foreach (var embed in embedList)
        {
            totalRows += embed.GetLength(0);
        }

        float[,,] concatenatedArray = new float[totalRows, 1, cols];
        int currentRow = 0;
        foreach (var embed in embedList)
        {
            int rows = embed.GetLength(0);
            for (int i = 0; i < rows; i++)
            {
                for (int j = 0; j < cols; j++)
                {
                    concatenatedArray[currentRow, 0, j] = embed[i, j];
                }

                currentRow++;
            }
        }

        return concatenatedArray;
    }

    /// <summary>
    /// Returns the first <paramref name="endIdx"/> rows of <paramref name="embeddings"/>.
    /// </summary>
    /// <remarks>
    /// The row count is clamped to <c>[0, available rows]</c>, so a segment length that is
    /// negative or larger than the embedding yields a shorter slice rather than an exception.
    /// </remarks>
    private static float[,] GetEmbeddingsSlice(float[,] embeddings, int endIdx)
    {
        int rows = Math.Max(0, Math.Min(endIdx, embeddings.GetLength(0)));
        int cols = embeddings.GetLength(1);
        float[,] slice = new float[rows, cols];

        for (int i = 0; i < rows; i++)
        {
            for (int j = 0; j < cols; j++)
            {
                slice[i, j] = embeddings[i, j];
            }
        }

        return slice;
    }
}

在这里插入图片描述