gen_audio_ops.py

Lebhoryi

于 2020-05-11 09:35:52 发布

阅读量1k

点赞数

分类专栏： tensorflow

本文链接：https://blog.csdn.net/weixin_37598106/article/details/106047276

版权

tensorflow 专栏收录该内容

16 篇文章 1 订阅

订阅专栏

tensorlfow1.14 之前的实现audio 相关操作的代码

# 出处在这，但是在github已经不可见，1.14和1.13我找了个遍
from tensorflow.python.ops.gen_audio_ops import *

相关ops 的操作实际代码在于以下这行

_, _, _op = _op_def_lib._apply_op_helper()

然后我深入的了解了一下
在这里插入图片描述
大佬的世界非我所能理解，望其项背，仍需努力

gen_audio_ops.py 源码

"""Python wrappers around TensorFlow ops.

This file is MACHINE GENERATED! Do not edit.
Original C++ source file: audio_ops.cc
"""

import collections as _collections
import six as _six

from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
from tensorflow.python.eager import context as _context
from tensorflow.python.eager import core as _core
from tensorflow.python.eager import execute as _execute
from tensorflow.python.framework import dtypes as _dtypes
from tensorflow.python.framework import errors as _errors
from tensorflow.python.framework import tensor_shape as _tensor_shape

from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
# Needed to trigger the call to _set_call_cpp_shape_fn.
from tensorflow.python.framework import common_shapes as _common_shapes
from tensorflow.python.framework import op_def_registry as _op_def_registry
from tensorflow.python.framework import ops as _ops
from tensorflow.python.framework import op_def_library as _op_def_library
from tensorflow.python.util.tf_export import tf_export


@tf_export('audio_spectrogram')
def audio_spectrogram(input, window_size, stride, magnitude_squared=False, name=None):
  r"""Produces a visualization of audio data over time.

  Spectrograms are a standard way of representing audio information as a series of
  slices of frequency information, one slice for each window of time. By joining
  these together into a sequence, they form a distinctive fingerprint of the sound
  over time.

  This op expects to receive audio data as an input, stored as floats in the range
  -1 to 1, together with a window width in samples, and a stride specifying how
  far to move the window between slices. From this it generates a three
  dimensional output. The lowest dimension has an amplitude value for each
  frequency during that time slice. The next dimension is time, with successive
  frequency slices. The final dimension is for the channels in the input, so a
  stereo audio input would have two here for example.

  This means the layout when converted and saved as an image is rotated 90 degrees
  clockwise from a typical spectrogram. Time is descending down the Y axis, and
  the frequency decreases from left to right.

  Each value in the result represents the square root of the sum of the real and
  imaginary parts of an FFT on the current window of samples. In this way, the
  lowest dimension represents the power of each frequency in the current window,
  and adjacent windows are concatenated in the next dimension.

  To get a more intuitive and visual look at what this operation does, you can run
  tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
  resulting spectrogram as a PNG image.

  Args:
    input: A `Tensor` of type `float32`. Float representation of audio data.
    window_size: An `int`.
      How wide the input window is in samples. For the highest efficiency
      this should be a power of two, but other values are accepted.
    stride: An `int`.
      How widely apart the center of adjacent sample windows should be.
    magnitude_squared: An optional `bool`. Defaults to `False`.
      Whether to return the squared magnitude or just the
      magnitude. Using squared magnitude can avoid extra calculations.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of type `float32`.
  """
  _ctx = _context._context
  if _ctx is None or not _ctx._eager_context.is_eager:
    window_size = _execute.make_int(window_size, "window_size")
    stride = _execute.make_int(stride, "stride")
    if magnitude_squared is None:
      magnitude_squared = False
    magnitude_squared = _execute.make_bool(magnitude_squared, "magnitude_squared")
    _, _, _op = _op_def_lib._apply_op_helper(
        "AudioSpectrogram", input=input, window_size=window_size,
        stride=stride, magnitude_squared=magnitude_squared, name=name)
    _result = _op.outputs[:]
    _inputs_flat = _op.inputs
    _attrs = ("window_size", _op.get_attr("window_size"), "stride",
              _op.get_attr("stride"), "magnitude_squared",
              _op.get_attr("magnitude_squared"))
    _execute.record_gradient(
      "AudioSpectrogram", _inputs_flat, _attrs, _result, name)
    _result, = _result
    return _result

  else:
    try:
      _result = _pywrap_tensorflow.TFE_Py_FastPathExecute(
        _ctx._context_handle, _ctx._eager_context.device_name,
        "AudioSpectrogram", name, _ctx._post_execution_callbacks, input,
        "window_size", window_size, "stride", stride, "magnitude_squared",
        magnitude_squared)
      return _result
    except _core._FallbackException:
      return audio_spectrogram_eager_fallback(
          input, window_size=window_size, stride=stride,
          magnitude_squared=magnitude_squared, name=name, ctx=_ctx)
    except _core._NotOkStatusException as e:
      if name is not None:
        message = e.message + " name: " + name
      else:
        message = e.message
      _six.raise_from(_core._status_to_exception(e.code, message), None)


def audio_spectrogram_eager_fallback(input, window_size, stride, magnitude_squared=False, name=None, ctx=None):
  r"""This is the slowpath function for Eager mode.
  This is for function audio_spectrogram
  """
  _ctx = ctx if ctx else _context.context()
  window_size = _execute.make_int(window_size, "window_size")
  stride = _execute.make_int(stride, "stride")
  if magnitude_squared is None:
    magnitude_squared = False
  magnitude_squared = _execute.make_bool(magnitude_squared, "magnitude_squared")
  input = _ops.convert_to_tensor(input, _dtypes.float32)
  _inputs_flat = [input]
  _attrs = ("window_size", window_size, "stride", stride, "magnitude_squared",
  magnitude_squared)
  _result = _execute.execute(b"AudioSpectrogram", 1, inputs=_inputs_flat,
                             attrs=_attrs, ctx=_ctx, name=name)
  _execute.record_gradient(
      "AudioSpectrogram", _inputs_flat, _attrs, _result, name)
  _result, = _result
  return _result


_decode_wav_outputs = ["audio", "sample_rate"]
_DecodeWavOutput = _collections.namedtuple(
    "DecodeWav", _decode_wav_outputs)


@tf_export('decode_wav')
def decode_wav(contents, desired_channels=-1, desired_samples=-1, name=None):
  r"""Decode a 16-bit PCM WAV file to a float tensor.

  The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.

  When desired_channels is set, if the input contains fewer channels than this
  then the last channel will be duplicated to give the requested number, else if
  the input has more channels than requested then the additional channels will be
  ignored.

  If desired_samples is set, then the audio will be cropped or padded with zeroes
  to the requested length.

  The first output contains a Tensor with the content of the audio samples. The
  lowest dimension will be the number of channels, and the second will be the
  number of samples. For example, a ten-sample-long stereo WAV file should give an
  output shape of [10, 2].

  Args:
    contents: A `Tensor` of type `string`.
      The WAV-encoded audio, usually from a file.
    desired_channels: An optional `int`. Defaults to `-1`.
      Number of sample channels wanted.
    desired_samples: An optional `int`. Defaults to `-1`.
      Length of audio requested.
    name: A name for the operation (optional).

  Returns:
    A tuple of `Tensor` objects (audio, sample_rate).

    audio: A `Tensor` of type `float32`.
    sample_rate: A `Tensor` of type `int32`.
  """
  _ctx = _context._context
  if _ctx is None or not _ctx._eager_context.is_eager:
    if desired_channels is None:
      desired_channels = -1
    desired_channels = _execute.make_int(desired_channels, "desired_channels")
    if desired_samples is None:
      desired_samples = -1
    desired_samples = _execute.make_int(desired_samples, "desired_samples")
    _, _, _op = _op_def_lib._apply_op_helper(
        "DecodeWav", contents=contents, desired_channels=desired_channels,
        desired_samples=desired_samples, name=name)
    _result = _op.outputs[:]
    _inputs_flat = _op.inputs
    _attrs = ("desired_channels", _op.get_attr("desired_channels"),
              "desired_samples", _op.get_attr("desired_samples"))
    _execute.record_gradient(
      "DecodeWav", _inputs_flat, _attrs, _result, name)
    _result = _DecodeWavOutput._make(_result)
    return _result

  else:
    try:
      _result = _pywrap_tensorflow.TFE_Py_FastPathExecute(
        _ctx._context_handle, _ctx._eager_context.device_name, "DecodeWav",
        name, _ctx._post_execution_callbacks, contents, "desired_channels",
        desired_channels, "desired_samples", desired_samples)
      _result = _DecodeWavOutput._make(_result)
      return _result
    except _core._FallbackException:
      return decode_wav_eager_fallback(
          contents, desired_channels=desired_channels,
          desired_samples=desired_samples, name=name, ctx=_ctx)
    except _core._NotOkStatusException as e:
      if name is not None:
        message = e.message + " name: " + name
      else:
        message = e.message
      _six.raise_from(_core._status_to_exception(e.code, message), None)


def decode_wav_eager_fallback(contents, desired_channels=-1, desired_samples=-1, name=None, ctx=None):
  r"""This is the slowpath function for Eager mode.
  This is for function decode_wav
  """
  _ctx = ctx if ctx else _context.context()
  if desired_channels is None:
    desired_channels = -1
  desired_channels = _execute.make_int(desired_channels, "desired_channels")
  if desired_samples is None:
    desired_samples = -1
  desired_samples = _execute.make_int(desired_samples, "desired_samples")
  contents = _ops.convert_to_tensor(contents, _dtypes.string)
  _inputs_flat = [contents]
  _attrs = ("desired_channels", desired_channels, "desired_samples",
  desired_samples)
  _result = _execute.execute(b"DecodeWav", 2, inputs=_inputs_flat,
                             attrs=_attrs, ctx=_ctx, name=name)
  _execute.record_gradient(
      "DecodeWav", _inputs_flat, _attrs, _result, name)
  _result = _DecodeWavOutput._make(_result)
  return _result


@tf_export('encode_wav')
def encode_wav(audio, sample_rate, name=None):
  r"""Encode audio data using the WAV file format.

  This operation will generate a string suitable to be saved out to create a .wav
  audio file. It will be encoded in the 16-bit PCM format. It takes in float
  values in the range -1.0f to 1.0f, and any outside that value will be clamped to
  that range.

  `audio` is a 2-D float Tensor of shape `[length, channels]`.
  `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).

  Args:
    audio: A `Tensor` of type `float32`. 2-D with shape `[length, channels]`.
    sample_rate: A `Tensor` of type `int32`.
      Scalar containing the sample frequency.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of type `string`.
  """
  _ctx = _context._context
  if _ctx is None or not _ctx._eager_context.is_eager:
    _, _, _op = _op_def_lib._apply_op_helper(
        "EncodeWav", audio=audio, sample_rate=sample_rate, name=name)
    _result = _op.outputs[:]
    _inputs_flat = _op.inputs
    _attrs = None
    _execute.record_gradient(
      "EncodeWav", _inputs_flat, _attrs, _result, name)
    _result, = _result
    return _result

  else:
    try:
      _result = _pywrap_tensorflow.TFE_Py_FastPathExecute(
        _ctx._context_handle, _ctx._eager_context.device_name, "EncodeWav",
        name, _ctx._post_execution_callbacks, audio, sample_rate)
      return _result
    except _core._FallbackException:
      return encode_wav_eager_fallback(
          audio, sample_rate, name=name, ctx=_ctx)
    except _core._NotOkStatusException as e:
      if name is not None:
        message = e.message + " name: " + name
      else:
        message = e.message
      _six.raise_from(_core._status_to_exception(e.code, message), None)


def encode_wav_eager_fallback(audio, sample_rate, name=None, ctx=None):
  r"""This is the slowpath function for Eager mode.
  This is for function encode_wav
  """
  _ctx = ctx if ctx else _context.context()
  audio = _ops.convert_to_tensor(audio, _dtypes.float32)
  sample_rate = _ops.convert_to_tensor(sample_rate, _dtypes.int32)
  _inputs_flat = [audio, sample_rate]
  _attrs = None
  _result = _execute.execute(b"EncodeWav", 1, inputs=_inputs_flat,
                             attrs=_attrs, ctx=_ctx, name=name)
  _execute.record_gradient(
      "EncodeWav", _inputs_flat, _attrs, _result, name)
  _result, = _result
  return _result


@tf_export('mfcc')
def mfcc(spectrogram, sample_rate, upper_frequency_limit=4000, lower_frequency_limit=20, filterbank_channel_count=40, dct_coefficient_count=13, name=None):
  r"""Transforms a spectrogram into a form that's useful for speech recognition.

  Mel Frequency Cepstral Coefficients are a way of representing audio data that's
  been effective as an input feature for machine learning. They are created by
  taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
  higher frequencies that are less significant to the human ear. They have a long
  history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
  is a good resource to learn more.

  Args:
    spectrogram: A `Tensor` of type `float32`.
      Typically produced by the Spectrogram op, with magnitude_squared
      set to true.
    sample_rate: A `Tensor` of type `int32`.
      How many samples per second the source audio used.
    upper_frequency_limit: An optional `float`. Defaults to `4000`.
      The highest frequency to use when calculating the
      ceptstrum.
    lower_frequency_limit: An optional `float`. Defaults to `20`.
      The lowest frequency to use when calculating the
      ceptstrum.
    filterbank_channel_count: An optional `int`. Defaults to `40`.
      Resolution of the Mel bank used internally.
    dct_coefficient_count: An optional `int`. Defaults to `13`.
      How many output channels to produce per time slice.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of type `float32`.
  """
  _ctx = _context._context
  if _ctx is None or not _ctx._eager_context.is_eager:
    if upper_frequency_limit is None:
      upper_frequency_limit = 4000
    upper_frequency_limit = _execute.make_float(upper_frequency_limit, "upper_frequency_limit")
    if lower_frequency_limit is None:
      lower_frequency_limit = 20
    lower_frequency_limit = _execute.make_float(lower_frequency_limit, "lower_frequency_limit")
    if filterbank_channel_count is None:
      filterbank_channel_count = 40
    filterbank_channel_count = _execute.make_int(filterbank_channel_count, "filterbank_channel_count")
    if dct_coefficient_count is None:
      dct_coefficient_count = 13
    dct_coefficient_count = _execute.make_int(dct_coefficient_count, "dct_coefficient_count")
    _, _, _op = _op_def_lib._apply_op_helper(
        "Mfcc", spectrogram=spectrogram, sample_rate=sample_rate,
        upper_frequency_limit=upper_frequency_limit,
        lower_frequency_limit=lower_frequency_limit,
        filterbank_channel_count=filterbank_channel_count,
        dct_coefficient_count=dct_coefficient_count, name=name)
    _result = _op.outputs[:]
    _inputs_flat = _op.inputs
    _attrs = ("upper_frequency_limit", _op.get_attr("upper_frequency_limit"),
              "lower_frequency_limit", _op.get_attr("lower_frequency_limit"),
              "filterbank_channel_count",
              _op.get_attr("filterbank_channel_count"),
              "dct_coefficient_count", _op.get_attr("dct_coefficient_count"))
    _execute.record_gradient(
      "Mfcc", _inputs_flat, _attrs, _result, name)
    _result, = _result
    return _result

  else:
    try:
      _result = _pywrap_tensorflow.TFE_Py_FastPathExecute(
        _ctx._context_handle, _ctx._eager_context.device_name, "Mfcc", name,
        _ctx._post_execution_callbacks, spectrogram, sample_rate,
        "upper_frequency_limit", upper_frequency_limit,
        "lower_frequency_limit", lower_frequency_limit,
        "filterbank_channel_count", filterbank_channel_count,
        "dct_coefficient_count", dct_coefficient_count)
      return _result
    except _core._FallbackException:
      return mfcc_eager_fallback(
          spectrogram, sample_rate,
          upper_frequency_limit=upper_frequency_limit,
          lower_frequency_limit=lower_frequency_limit,
          filterbank_channel_count=filterbank_channel_count,
          dct_coefficient_count=dct_coefficient_count, name=name, ctx=_ctx)
    except _core._NotOkStatusException as e:
      if name is not None:
        message = e.message + " name: " + name
      else:
        message = e.message
      _six.raise_from(_core._status_to_exception(e.code, message), None)


def mfcc_eager_fallback(spectrogram, sample_rate, upper_frequency_limit=4000, lower_frequency_limit=20, filterbank_channel_count=40, dct_coefficient_count=13, name=None, ctx=None):
  r"""This is the slowpath function for Eager mode.
  This is for function mfcc
  """
  _ctx = ctx if ctx else _context.context()
  if upper_frequency_limit is None:
    upper_frequency_limit = 4000
  upper_frequency_limit = _execute.make_float(upper_frequency_limit, "upper_frequency_limit")
  if lower_frequency_limit is None:
    lower_frequency_limit = 20
  lower_frequency_limit = _execute.make_float(lower_frequency_limit, "lower_frequency_limit")
  if filterbank_channel_count is None:
    filterbank_channel_count = 40
  filterbank_channel_count = _execute.make_int(filterbank_channel_count, "filterbank_channel_count")
  if dct_coefficient_count is None:
    dct_coefficient_count = 13
  dct_coefficient_count = _execute.make_int(dct_coefficient_count, "dct_coefficient_count")
  spectrogram = _ops.convert_to_tensor(spectrogram, _dtypes.float32)
  sample_rate = _ops.convert_to_tensor(sample_rate, _dtypes.int32)
  _inputs_flat = [spectrogram, sample_rate]
  _attrs = ("upper_frequency_limit", upper_frequency_limit,
  "lower_frequency_limit", lower_frequency_limit, "filterbank_channel_count",
  filterbank_channel_count, "dct_coefficient_count", dct_coefficient_count)
  _result = _execute.execute(b"Mfcc", 1, inputs=_inputs_flat, attrs=_attrs,
                             ctx=_ctx, name=name)
  _execute.record_gradient(
      "Mfcc", _inputs_flat, _attrs, _result, name)
  _result, = _result
  return _result

def _InitOpDefLibrary(op_list_proto_bytes):
  op_list = _op_def_pb2.OpList()
  op_list.ParseFromString(op_list_proto_bytes)
  _op_def_registry.register_op_list(op_list)
  op_def_lib = _op_def_library.OpDefLibrary()
  op_def_lib.add_op_list(op_list)
  return op_def_lib
# op {
#   name: "AudioSpectrogram"
#   input_arg {
#     name: "input"
#     type: DT_FLOAT
#   }
#   output_arg {
#     name: "spectrogram"
#     type: DT_FLOAT
#   }
#   attr {
#     name: "window_size"
#     type: "int"
#   }
#   attr {
#     name: "stride"
#     type: "int"
#   }
#   attr {
#     name: "magnitude_squared"
#     type: "bool"
#     default_value {
#       b: false
#     }
#   }
# }
# op {
#   name: "DecodeWav"
#   input_arg {
#     name: "contents"
#     type: DT_STRING
#   }
#   output_arg {
#     name: "audio"
#     type: DT_FLOAT
#   }
#   output_arg {
#     name: "sample_rate"
#     type: DT_INT32
#   }
#   attr {
#     name: "desired_channels"
#     type: "int"
#     default_value {
#       i: -1
#     }
#   }
#   attr {
#     name: "desired_samples"
#     type: "int"
#     default_value {
#       i: -1
#     }
#   }
# }
# op {
#   name: "EncodeWav"
#   input_arg {
#     name: "audio"
#     type: DT_FLOAT
#   }
#   input_arg {
#     name: "sample_rate"
#     type: DT_INT32
#   }
#   output_arg {
#     name: "contents"
#     type: DT_STRING
#   }
# }
# op {
#   name: "Mfcc"
#   input_arg {
#     name: "spectrogram"
#     type: DT_FLOAT
#   }
#   input_arg {
#     name: "sample_rate"
#     type: DT_INT32
#   }
#   output_arg {
#     name: "output"
#     type: DT_FLOAT
#   }
#   attr {
#     name: "upper_frequency_limit"
#     type: "float"
#     default_value {
#       f: 4000
#     }
#   }
#   attr {
#     name: "lower_frequency_limit"
#     type: "float"
#     default_value {
#       f: 20
#     }
#   }
#   attr {
#     name: "filterbank_channel_count"
#     type: "int"
#     default_value {
#       i: 40
#     }
#   }
#   attr {
#     name: "dct_coefficient_count"
#     type: "int"
#     default_value {
#       i: 13
#     }
#   }
# }
_op_def_lib = _InitOpDefLibrary(b"\np\n\020AudioSpectrogram\022\t\n\005input\030\001\032\017\n\013spectrogram\030\001\"\022\n\013window_size\022\003int\"\r\n\006stride\022\003int\"\035\n\021magnitude_squared\022\004bool\032\002(\000\n\200\001\n\tDecodeWav\022\014\n\010contents\030\007\032\t\n\005audio\030\001\032\017\n\013sample_rate\030\003\"$\n\020desired_channels\022\003int\032\013\030\377\377\377\377\377\377\377\377\377\001\"#\n\017desired_samples\022\003int\032\013\030\377\377\377\377\377\377\377\377\377\001\n5\n\tEncodeWav\022\t\n\005audio\030\001\022\017\n\013sample_rate\030\003\032\014\n\010contents\030\007\n\311\001\n\004Mfcc\022\017\n\013spectrogram\030\001\022\017\n\013sample_rate\030\003\032\n\n\006output\030\001\"%\n\025upper_frequency_limit\022\005float\032\005%\000\000zE\"%\n\025lower_frequency_limit\022\005float\032\005%\000\000\240A\"#\n\030filterbank_channel_count\022\003int\032\002\030(\" \n\025dct_coefficient_count\022\003int\032\002\030\r")