.\models\sam\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_sam": [
"SAM_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SamConfig",
"SamMaskDecoderConfig",
"SamPromptEncoderConfig",
"SamVisionConfig",
],
"processing_sam": ["SamProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_sam"] = [
"SAM_PRETRAINED_MODEL_ARCHIVE_LIST",
"SamModel",
"SamPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_sam"] = [
"TF_SAM_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSamModel",
"TFSamPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_sam"] = ["SamImageProcessor"]
if TYPE_CHECKING:
from .configuration_sam import (
SAM_PRETRAINED_CONFIG_ARCHIVE_MAP,
SamConfig,
SamMaskDecoderConfig,
SamPromptEncoderConfig,
SamVisionConfig,
)
from .processing_sam import SamProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_sam import SAM_PRETRAINED_MODEL_ARCHIVE_LIST, SamModel, SamPreTrainedModel
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_sam import TF_SAM_PRETRAINED_MODEL_ARCHIVE_LIST, TFSamModel, TFSamPreTrainedModel
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_sam import SamImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
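# Usage sketch (not part of this file): with the `_LazyModule` registration above,
# each submodule listed in `_import_structure` is imported only on first attribute
# access, and the torch/TF/vision specific classes resolve only when the matching
# optional dependency is installed.
#
#     from transformers.models.sam import SamConfig, SamProcessor
#     config = SamConfig()  # first access triggers the import of `configuration_sam`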
.\models\seamless_m4t\configuration_seamless_m4t.py
""" SeamlessM4T 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEAMLESS_M4T_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/hf-seamless-m4t-medium": "https://huggingface.co/facebook/hf-seamless-m4t-medium/resolve/main/config.json",
}
class SeamlessM4TConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`~SeamlessM4TModel`]. It is used to instantiate a
    SeamlessM4T model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to that of the SeamlessM4T
    ["facebook/hf-seamless-m4t-medium"](https://huggingface.co/facebook/hf-seamless-m4t-medium) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
```
>>> from transformers import SeamlessM4TModel, SeamlessM4TConfig
    >>> # Initializing a SeamlessM4T "facebook/hf-seamless-m4t-medium" style configuration
    >>> configuration = SeamlessM4TConfig()
    >>> # Initializing a model from the "facebook/hf-seamless-m4t-medium" style configuration
    >>> model = SeamlessM4TModel(configuration)
    >>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "seamless_m4t"
def __init__(
self,
vocab_size=256102,
t2u_vocab_size=10082,
hidden_size=1024,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
max_position_embeddings=1024,
is_encoder_decoder=True,
encoder_layerdrop=0.05,
decoder_layerdrop=0.05,
activation_function="relu",
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
scale_embedding=True,
encoder_layers=24,
encoder_ffn_dim=8192,
encoder_attention_heads=16,
decoder_layers=24,
decoder_ffn_dim=8192,
decoder_attention_heads=16,
decoder_start_token_id=3,
max_new_tokens=256,
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
speech_encoder_layers=24,
speech_encoder_attention_heads=16,
speech_encoder_intermediate_size=4096,
speech_encoder_hidden_act="swish",
speech_encoder_dropout=0.0,
add_adapter=True,
speech_encoder_layerdrop=0.1,
feature_projection_input_dim=160,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
adaptor_kernel_size=8,
adaptor_stride=8,
adaptor_dropout=0.1,
num_adapter_layers=1,
position_embeddings_type="relative",
rotary_embedding_base=10000,
max_source_positions=4096,
conv_depthwise_kernel_size=31,
t2u_bos_token_id=0,
t2u_pad_token_id=1,
t2u_eos_token_id=2,
t2u_decoder_start_token_id=2,
t2u_max_new_tokens=1024,
t2u_encoder_layers=6,
t2u_encoder_ffn_dim=8192,
t2u_encoder_attention_heads=16,
t2u_decoder_layers=6,
t2u_decoder_ffn_dim=8192,
t2u_decoder_attention_heads=16,
t2u_max_position_embeddings=2048,
sampling_rate=16000,
upsample_initial_channel=512,
upsample_rates=[5, 4, 4, 2, 2],
upsample_kernel_sizes=[11, 8, 8, 4, 4],
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
leaky_relu_slope=0.1,
unit_hifi_gan_vocab_size=10000,
unit_embed_dim=1280,
lang_embed_dim=256,
spkr_embed_dim=256,
vocoder_num_langs=36,
vocoder_num_spkrs=200,
variance_predictor_kernel_size=3,
var_pred_dropout=0.5,
vocoder_offset=4,
**kwargs,
.\models\seamless_m4t\convert_fairseq2_to_hf.py
""" 将 Meta SeamlessM4T 检查点从 seamless_communication 转换为 HF."""
import argparse
import os
from pathlib import Path
import torch
from accelerate.utils.modeling import find_tied_parameters
from seamless_communication.models.inference.translator import Translator
from transformers import (
SeamlessM4TConfig,
SeamlessM4TFeatureExtractor,
SeamlessM4TModel,
SeamlessM4TProcessor,
SeamlessM4TTokenizer,
)
from transformers.utils import logging
UNIT_SUPPORTED_LANGUAGES = [
"__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__",
"__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__",
"__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__",
"__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__",
"__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",
]
VOCODER_SUPPORTED_LANGUAGES = [
"__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__",
"__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__",
"__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__",
"__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__",
"__ukr__", "__urd__", "__uzn__", "__vie__",
]
MEDIUM_SUPPORTED_LANGUAGES = ["ace","ace_Latn","acm","acq","aeb","afr","ajp","aka","amh","apc","arb","ars","ary","arz","asm","ast","awa","ayr","azb","azj","bak","bam","ban","bel","bem","ben","bho","bjn","bjn_Latn","bod","bos","bug","bul","cat","ceb","ces","cjk","ckb","crh","cym","dan","deu","dik","dyu","dzo","ell","eng","epo","est","eus","ewe","fao","pes","fij","fin","fon","fra","fur","fuv","gla","gle","glg","grn","guj","hat","hau","heb","hin","hne","hrv","hun","hye","ibo","ilo","ind","isl","ita","jav","jpn","kab","kac","kam","kan","kas","kas_Deva","kat","knc","knc_Latn","kaz","kbp","kea","khm","kik","kin","kir","kmb","kon","kor","kmr","lao","lvs","lij","lim","lin","lit","lmo","ltg","ltz","lua","lug","luo","lus","mag","mai","mal","mar","min","mkd","plt","mlt","mni","khk","mos","mri","zsm","mya","nld","nno","nob","npi","nso","nus","nya","oci","gaz","ory","pag","pan","pap","pol","por","prs","pbt","quy","ron","run","rus","sag","san","sat","scn","shn","sin","slk","slv","smo","sna","snd","som","sot","spa","als","srd","srp","ssw","sun","swe","swh","szl","tam","tat","tel","tgk","tgl","tha","tir","taq","taq_Tfng","tpi","tsn","tso","tuk","tum","tur","twi","tzm","uig","ukr","umb","urd","uzn","vec","vie","war","wol","xho","ydd","yor","yue","cmn","cmn_Hant","zul",]
LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
def assert_param_count(model_1, model_2):
count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
def param_count(model):
return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
def _grab_best_device(use_gpu=True):
if torch.cuda.device_count() > 0 and use_gpu:
device = "cuda"
else:
device = "cpu"
return torch.device(device)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
vocoder_convert_list = [
("ups", "hifi_gan.upsampler"),
("conv_pre", "hifi_gan.conv_pre"),
("resblocks", "hifi_gan.resblocks"),
("conv_post", "hifi_gan.conv_post"),
("lang", "language_embedding"),
("spkr", "speaker_embedding"),
("dict.", "unit_embedding."),
("dur_predictor.conv1.0", "dur_predictor.conv1"),
    ("dur_predictor.conv2.0", "dur_predictor.conv2"),
]
wav2vec_convert_list = [
("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
("speech_encoder.inner.layers", "encoder.layers"),
("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
("speech_encoder.adaptor_layers", "adapter.layers"),
("inner_proj", "intermediate_dense"),
("self_attn.output_proj", "self_attn.linear_out"),
("output_proj", "output_dense"),
("self_attn.k_proj", "self_attn.linear_k"),
("self_attn.v_proj", "self_attn.linear_v"),
("self_attn.q_proj", "self_attn.linear_q"),
("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
("conv.depthwise_conv", "conv_module.depthwise_conv"),
("conv.batch_norm", "conv_module.batch_norm"),
("conv_layer_norm", "conv_module.layer_norm"),
("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
("speech_encoder.proj2", "intermediate_ffn.output_dense"),
("speech_encoder.layer_norm", "inner_layer_norm"),
]
t2u_convert_list = [
("t2u_model.final_proj", "lm_head"),
("t2u_model.", "model."),
("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
("encoder_decoder_attn", "cross_attention"),
("linear_k", "k_proj"),
("linear_v", "v_proj"),
("linear_q", "q_proj"),
("ffn.inner_proj", "ffn.fc1"),
("ffn.output_proj", "ffn.fc2"),
("output_proj", "out_proj"),
("decoder_frontend.embed", "decoder.embed_tokens"),
]
text_convert_list = [
("text_encoder.", ""),
("text_decoder.", ""),
("text_encoder_frontend.embed", "embed_tokens"),
("text_decoder_frontend.embed", "embed_tokens"),
("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
("encoder_decoder_attn", "cross_attention"),
("linear_k", "k_proj"),
("linear_v", "v_proj"),
("linear_q", "q_proj"),
("ffn.inner_proj", "ffn.fc1"),
("ffn.output_proj", "ffn.fc2"),
("output_proj", "out_proj"),
("final_proj", "lm_head"),
]
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
def _load_hf_config(model_type="medium"):
kwargs = {
"vocab_size": 256206,
"t2u_vocab_size": 10082,
"hidden_size": 1024,
"max_position_embeddings": 4096,
"encoder_layers": 12,
"decoder_layers": 12,
"encoder_ffn_dim": 4096,
"decoder_ffn_dim": 4096,
"t2u_encoder_layers": 4,
"t2u_decoder_layers": 4,
"speech_encoder_layers": 12,
}
return SeamlessM4TConfig(**kwargs)
def _convert_model(
original_model,
hf_model,
convert_list,
device,
unwanted_prefix="model.",
filter_state_dict="speech",
exclude_state_dict=None,
):
state_dict = original_model.state_dict()
if isinstance(filter_state_dict, str):
def filter_func(x):
return filter_state_dict in x[0]
else:
def filter_func(item):
if exclude_state_dict is not None and exclude_state_dict in item[0]:
return False
for filter_el in filter_state_dict:
if filter_el in item[0]:
return True
return False
state_dict = dict(filter(filter_func, state_dict.items()))
for k, v in list(state_dict.items()):
new_k = k[len(unwanted_prefix):]
for old_layer_name, new_layer_name in convert_list:
if old_layer_name in new_k:
new_k = new_k.replace(old_layer_name, new_layer_name)
if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
new_k = new_k.replace("layer_norm", "final_layer_norm")
state_dict[new_k] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
extra_keys = set(extra_keys)
missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set({k for k in missing_keys if "final_logits_bias" not in k})
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
hf_model.load_state_dict(state_dict, strict=False)
n_params = param_count(hf_model)
logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
hf_model.eval()
hf_model.to(device)
del state_dict
return hf_model
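# Sketch (not part of the original script) of the renaming rule applied inside
# `_convert_model` above: strip `unwanted_prefix`, then apply each (old, new)
# substring substitution from a convert list. The `.layer_norm` special case
# handled in `_convert_model` is omitted here, and the helper name is illustrative only.
def _rename_key_example(key, convert_list, unwanted_prefix="model."):
    new_key = key[len(unwanted_prefix):]
    for old_layer_name, new_layer_name in convert_list:
        if old_layer_name in new_key:
            new_key = new_key.replace(old_layer_name, new_layer_name)
    return new_key

# A fairseq2 text-decoder cross-attention weight mapped to its HF name:
assert _rename_key_example(
    "model.text_decoder.layers.0.encoder_decoder_attn.output_proj.weight", text_convert_list
) == "layers.0.cross_attention.out_proj.weight"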
def load_model(save_dir, model_type, repo_id):
"""
Meta SeamlessM4T is made of 8 main components:
- speech_encoder (#1) and speech_encoder_frontend (#2)
- t2u_model (#3)
- text_encoder (#4) and text_encoder_frontend (#5)
    - text_decoder (#6) [and text_decoder_frontend (#5), which is identical to text_encoder_frontend]
- final_proj (#7)
- vocoder (#8)
"""
device = _grab_best_device()
if model_type == "medium":
name = "seamlessM4T_medium"
else:
name = "seamlessM4T_large"
original_model = Translator(name, "vocoder_36langs", device, torch.float32)
langs = MEDIUM_SUPPORTED_LANGUAGES if model_type == "medium" else LARGE_SUPPORTED_LANGUAGES
langs = [f"__{lang}__" for lang in langs]
vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")
save_dir = os.path.join(save_dir, name)
Path(save_dir).mkdir(exist_ok=True)
tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)
sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")
tokenizer.save_pretrained(save_dir)
tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)
if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
raise ValueError(
f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
)
text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
t2u_lang_code_to_id = {
code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
}
vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}
fe = SeamlessM4TFeatureExtractor(language_code=langs)
fe.save_pretrained(save_dir)
fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)
processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
processor.save_pretrained(save_dir)
processor.push_to_hub(repo_id=repo_id, create_pr=True)
processor = SeamlessM4TProcessor.from_pretrained(save_dir)
hf_config = _load_hf_config(model_type)
hf_model = SeamlessM4TModel(hf_config)
hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
hf_model.vocoder.apply_weight_norm()
hf_model.vocoder = _convert_model(
original_model,
hf_model.vocoder,
vocoder_convert_list,
device,
unwanted_prefix="vocoder.code_generator.",
filter_state_dict="vocoder",
)
hf_model.vocoder.remove_weight_norm()
wav2vec = hf_model.speech_encoder
hf_model.speech_encoder = _convert_model(
original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
)
hf_model.t2u_model = _convert_model(
original_model,
hf_model.t2u_model,
t2u_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict="t2u_model",
)
hf_model.text_encoder = _convert_model(
original_model,
hf_model.text_encoder,
text_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict=["model.text_encoder"],
exclude_state_dict="t2u_model",
)
hf_model.text_decoder = _convert_model(
original_model,
hf_model.text_decoder,
text_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict=["model.text_decoder"],
exclude_state_dict="t2u_model",
)
hf_model.lm_head = _convert_model(
original_model,
hf_model.lm_head,
[("final_proj.", "")],
device,
unwanted_prefix="model.",
filter_state_dict=["model.final_proj"],
exclude_state_dict="t2u_model",
)
print(find_tied_parameters(hf_model))
count_1 = param_count(hf_model)
count_2 = param_count(original_model)
print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")
del original_model
hf_model.generation_config._from_model_config = False
hf_model.save_pretrained(save_dir)
hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
hf_model = SeamlessM4TModel.from_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default="medium",
type=str,
help="Model type.",
)
parser.add_argument(
"--save_dir",
default="/home/ubuntu/weights",
type=str,
help="Path to the output PyTorch model.",
)
parser.add_argument(
"--repo_id",
default="facebook/hf-seamless-m4t-medium",
type=str,
help="Repo ID.",
)
args = parser.parse_args()
load_model(args.save_dir, args.model_type, args.repo_id)
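# Example invocation (a sketch; the save directory below is a placeholder path):
#   python convert_fairseq2_to_hf.py \
#       --model_type medium \
#       --save_dir /path/to/weights \
#       --repo_id facebook/hf-seamless-m4t-medium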
.\models\seamless_m4t\feature_extraction_seamless_m4t.py
"""
Feature extractor class for SeamlessM4T
"""
from typing import List, Optional, Union
import numpy as np
from ...utils import is_torch_available
if is_torch_available():
import torch
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
logger = logging.get_logger(__name__)
class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a SeamlessM4T feature extractor.
This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
This class extracts mel-filter bank features from raw speech.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
num_mel_bins (`int`, *optional*, defaults to 80):
Number of Mel-frequency bins.
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding vectors.
stride (`int`, *optional*, defaults to 2):
Stride used to reshape audios from shape (batch_size,num_frames,num_mel_bins) to
(batch_size,num_frames//stride,num_mel_bins*stride).
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
stride=2,
**kwargs,
):
self.num_mel_bins = num_mel_bins
self.return_attention_mask = True
self.stride = stride
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@staticmethod
def zero_mean_unit_var_norm(
input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
"""
        Every array in the list is normalized to have zero mean and unit variance.
"""
if attention_mask is not None:
attention_mask = np.array(attention_mask, np.int32)
normed_input_values = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
if length < normed_slice.shape[0]:
normed_slice[length:] = padding_value
normed_input_values.append(normed_slice)
else:
normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
return normed_input_values
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
        Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as
        inputs, so the waveform should not be normalized before feature extraction.
"""
if len(waveform.shape) == 2:
waveform = waveform[0]
waveform = np.squeeze(waveform) * (2**15)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = True,
pad_to_multiple_of: Optional[int] = 2,
max_length: Optional[int] = None,
truncation: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
do_normalize_per_mel_bins: Optional[bool] = True,
**kwargs,
.\models\seamless_m4t\modeling_seamless_m4t.py
import copy
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Wav2Vec2BaseModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_seamless_m4t import SeamlessM4TConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/hf-seamless-m4t-medium"
_CONFIG_FOR_DOC = "SeamlessM4TConfig"
SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/hf-seamless-m4t-medium",
]
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_hifigan": "https://huggingface.co/microsoft/speecht5_hifigan/resolve/main/config.json",
}
@dataclass
class SeamlessM4TGenerationOutput(ModelOutput):
"""
    Class defining the generated outputs from [`SeamlessM4TModel`], [`SeamlessM4TForTextToText`],
    [`SeamlessM4TForTextToSpeech`], [`SeamlessM4TForSpeechToSpeech`] and [`SeamlessM4TForSpeechToText`].
    Inherits from `ModelOutput`.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
            The length in samples of each element in the `waveform` batch.
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
            The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
            early because of the `eos_token_id`.
        unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
            The generated unit sequences. This is the output of the text-to-units model. The second dimension
            (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished early
            because of the `t2u_eos_token_id`.
    """
SEAMLESS_M4T_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~SeamlessM4TConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_FIRST_PART = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_TEXT_PART = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_SPEECH_PART = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
M4T_MODEL_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_FIRST_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
M4T_TEXT_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_TEXT_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
M4T_SPEECH_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_SPEECH_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
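# Quick sanity-check sketch (not part of the original file): with padding_idx=1,
# padding positions keep id 1 and non-padding tokens get consecutive ids starting at 2.
_example_ids = torch.tensor([[5, 6, 7, 1, 1]])
assert create_position_ids_from_input_ids(_example_ids, padding_idx=1).tolist() == [[2, 3, 4, 1, 1]]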
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
    Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
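# Sketch (not part of the original file): decoder inputs are the labels shifted one
# position to the right, with `decoder_start_token_id` prepended and any remaining
# -100 values replaced by `pad_token_id`.
_example_labels = torch.tensor([[10, 11, -100, -100]])
assert shift_tokens_right(_example_labels, pad_token_id=0, decoder_start_token_id=3).tolist() == [[3, 10, 11, 0]]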
def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
"""
    Computes an attention mask of the form `(batch, seq_len)` where, for each element of the batch, attention stops
    at the corresponding element of `seq_lens`.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
            The sequences to mask, where `*` is any number of sequence-specific dimensions, including none.
        seq_lens (`torch.Tensor` of shape `(batch)`):
            Each element represents the length of the sequence at the same index in `hidden_states`.

    Returns:
        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`.
"""
batch_size, mask_seq_len = hidden_states.shape[:2]
indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
mask = hidden_states.new_ones((batch_size, mask_seq_len))
mask = mask.masked_fill(bool_mask, 0)
return mask
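# Sketch (not part of the original file): positions at or beyond each sequence
# length are zeroed out in the returned (batch, seq_len) mask.
_example_states = torch.zeros(2, 4, 8)
_example_lens = torch.tensor([4, 2])
assert _compute_new_attention_mask(_example_states, _example_lens).tolist() == [[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0]]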
def format_speech_generation_kwargs(kwargs):
"""
    Formats kwargs for SeamlessM4T models that generate speech, attributing them to either the text generation or
    the speech generation sub-model.

    Args:
        kwargs (`dict`):
            Keyword arguments are of two types:
            - Without a prefix, they are passed as `**kwargs` to the `generate` method of each sub-model, except
              for `decoder_input_ids`, which is only passed to the text components.
            - With a *text_* or *speech_* prefix, they are passed to the `generate` method of the text model or the
              speech model, respectively, and take priority over the un-prefixed keywords. This lets you specify a
              generation strategy for one generation type but not for the other.
"""
kwargs_text = {}
kwargs_speech = {}
for key, value in kwargs.items():
if key.startswith("text_"):
key = key[len("text_") :]
kwargs_text[key] = value
elif key.startswith("speech_"):
key = key[len("speech_") :]
kwargs_speech[key] = value
else:
if key not in kwargs_text:
kwargs_text[key] = value
if key not in kwargs_speech:
kwargs_speech[key] = value
return kwargs_text, kwargs_speech
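# Sketch (not part of the original file): un-prefixed kwargs are shared by both
# sub-models, while `text_`/`speech_` prefixed kwargs go to one sub-model only and
# take priority over the shared ones.
_kwargs_text, _kwargs_speech = format_speech_generation_kwargs(
    {"max_new_tokens": 10, "text_num_beams": 5, "speech_do_sample": True}
)
assert _kwargs_text == {"max_new_tokens": 10, "num_beams": 5}
assert _kwargs_speech == {"max_new_tokens": 10, "do_sample": True}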
class SeamlessM4TConformerPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
self.conv = weight_norm(self.conv, name="weight", dim=2)
self.padding = SeamlessM4TConformerSamePadLayer(config.num_conv_pos_embeddings)
self.activation = ACT2FN[config.speech_encoder_hidden_act]
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class SeamlessM4TConformerRotaryPositionalEmbedding(nn.Module):
"""Rotary positional embedding
Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
"""
def __init__(self, config):
super().__init__()
dim = config.hidden_size // config.speech_encoder_attention_heads
base = config.rotary_embedding_base
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
self.register_buffer("inv_freq", inv_freq)
self.cached_sequence_length = None
self.cached_rotary_positional_embedding = None
def forward(self, hidden_states):
sequence_length = hidden_states.shape[1]
if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
return self.cached_rotary_positional_embedding
self.cached_sequence_length = sequence_length
time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
embeddings = torch.cat((freqs, freqs), dim=-1)
cos_embeddings = embeddings.cos()[:, None, None, :]
sin_embeddings = embeddings.sin()[:, None, None, :]
self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
return self.cached_rotary_positional_embedding
class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
"""相对位置编码模块。"""
def __init__(self, config):
super().__init__()
self.max_len = config.max_source_positions
self.d_model = config.hidden_size
self.pe = None
self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
def extend_pe(self, x):
if self.pe is not None:
if self.pe.size(1) >= x.size(1) * 2 - 1:
if self.pe.dtype != x.dtype or self.pe.device != x.device:
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
return
pe_positive = torch.zeros(x.size(1), self.d_model)
pe_negative = torch.zeros(x.size(1), self.d_model)
position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
)
pe_positive[:, 0::2] = torch.sin(position * div_term)
pe_positive[:, 1::2] = torch.cos(position * div_term)
pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = torch.cat([pe_positive, pe_negative], dim=1)
self.pe = pe.to(device=x.device, dtype=x.dtype)
def forward(self, hidden_states: torch.Tensor):
self.extend_pe(hidden_states)
start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
relative_position_embeddings = self.pe[:, start_idx:end_idx]
return relative_position_embeddings
class SeamlessM4TConformerSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
class SeamlessM4TConformerFeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
self.dropout = nn.Dropout(config.speech_encoder_dropout)
def forward(self, hidden_states):
norm_hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(norm_hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SeamlessM4TConformerFeedForward(nn.Module):
def __init__(self, config, act_fn=None, dropout=None):
super().__init__()
dropout = dropout if dropout is not None else config.speech_encoder_dropout
act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
self.intermediate_dropout = nn.Dropout(dropout)
self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(dropout)
def forward(self, hidden_states):
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
class SeamlessM4TConformerConvolutionModule(nn.Module):
"""Convolution block used in the conformer block"""
def __init__(self, config):
super().__init__()
if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
            raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.pointwise_conv1 = nn.Conv1d(
config.hidden_size,
2 * config.hidden_size,
kernel_size=1,
stride=1,
padding=0,
bias=False,
)
self.glu = nn.GLU(dim=1)
self.depthwise_conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
config.conv_depthwise_kernel_size,
stride=1,
padding="same",
groups=config.hidden_size,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(config.hidden_size)
self.activation = ACT2FN[config.speech_encoder_hidden_act]
self.pointwise_conv2 = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=1,
stride=1,
padding=0,
bias=False,
)
self.dropout = nn.Dropout(config.speech_encoder_dropout)
def forward(self, hidden_states, attention_mask=None):
hidden_states = self.layer_norm(hidden_states)
if attention_mask is not None:
hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.pointwise_conv1(hidden_states)
hidden_states = self.glu(hidden_states)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
"""Construct a SeamlessM4TConformerSelfAttention object.
Can be enhanced with rotary or relative position embeddings.
"""
def __init__(self, config, use_position_embeddings=True):
super().__init__()
self.head_size = config.hidden_size // config.speech_encoder_attention_heads
self.num_heads = config.speech_encoder_attention_heads
self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
if self.position_embeddings_type == "relative":
self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
relative_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
batch_size, sequence_length, hidden_size = hidden_states.size()
query_key_states = hidden_states
value_states = hidden_states
if self.position_embeddings_type == "rotary":
if relative_position_embeddings is None:
raise ValueError(
"`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
)
query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
if self.position_embeddings_type == "relative":
if relative_position_embeddings is None:
raise ValueError(
"`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'relative'"
)
scores = self._apply_relative_embeddings(
query=query, key=key, relative_position_embeddings=relative_position_embeddings
)
else:
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
if attention_mask is not None:
scores = scores + attention_mask
probs = torch.softmax(scores, dim=-1)
probs = self.dropout(probs)
hidden_states = torch.matmul(probs, value)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
hidden_states = self.linear_out(hidden_states)
return hidden_states, probs
def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
batch_size, sequence_length, hidden_size = hidden_states.size()
hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
cos = relative_position_embeddings[0, :sequence_length, ...]
sin = relative_position_embeddings[1, :sequence_length, ...]
hidden_states = hidden_states.transpose(0, 1)
rotated_states_begin = hidden_states[..., : self.head_size // 2]
rotated_states_end = hidden_states[..., self.head_size // 2 :]
rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
hidden_states = (hidden_states * cos) + (rotated_states * sin)
hidden_states = hidden_states.transpose(0, 1)
hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
return hidden_states
def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
proj_relative_position_embeddings = proj_relative_position_embeddings.view(
relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
)
proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
query = query.transpose(1, 2)
q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
return scores
class SeamlessM4TConformerEncoderLayer(nn.Module):
"""Conformer block based on https://arxiv.org/abs/2005.08100."""
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
dropout = config.speech_encoder_dropout
self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
self.ffn1 = SeamlessM4TConformerFeedForward(config)
self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
self.self_attn_dropout = nn.Dropout(dropout)
self.self_attn = SeamlessM4TConformerSelfAttention(config)
self.conv_module = SeamlessM4TConformerConvolutionModule(config)
self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
self.ffn2 = SeamlessM4TConformerFeedForward(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
def forward(
self,
hidden_states,
attention_mask: Optional[torch.Tensor] = None,
relative_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
conv_attention_mask: Optional[torch.Tensor] = None,
):
hidden_states = hidden_states
residual = hidden_states
hidden_states = self.ffn1_layer_norm(hidden_states)
hidden_states = self.ffn1(hidden_states)
hidden_states = hidden_states * 0.5 + residual
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
relative_position_embeddings=relative_position_embeddings,
output_attentions=output_attentions,
)
hidden_states = self.self_attn_dropout(hidden_states)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ffn2_layer_norm(hidden_states)
hidden_states = self.ffn2(hidden_states)
hidden_states = hidden_states * 0.5 + residual
hidden_states = self.final_layer_norm(hidden_states)
return hidden_states, attn_weights
class SeamlessM4TConformerEncoder(nn.Module):
    def __init__(self, config):
super().__init__()
self.config = config
if config.position_embeddings_type == "relative":
self.embed_positions = SeamlessM4TConformerRelPositionalEmbedding(config)
elif config.position_embeddings_type == "rotary":
self.embed_positions = SeamlessM4TConformerRotaryPositionalEmbedding(config)
else:
self.embed_positions = None
self.dropout = nn.Dropout(config.speech_encoder_dropout)
self.layers = nn.ModuleList(
[SeamlessM4TConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
class SeamlessM4TConformerAdapterLayer(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
dropout = config.adaptor_dropout
self.kernel_size = config.adaptor_kernel_size
self.stride = config.adaptor_stride
self.residual_layer_norm = nn.LayerNorm(embed_dim)
self.residual_conv = nn.Conv1d(
embed_dim,
2 * embed_dim,
self.kernel_size,
stride=self.stride,
padding=self.stride // 2,
)
self.activation = nn.GLU(dim=1)
self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
self.self_attn_conv = nn.Conv1d(
embed_dim,
2 * embed_dim,
self.kernel_size,
stride=self.stride,
padding=self.stride // 2,
)
self.self_attn = SeamlessM4TConformerSelfAttention(config, use_position_embeddings=False)
self.self_attn_dropout = nn.Dropout(dropout)
self.ffn_layer_norm = nn.LayerNorm(embed_dim)
self.ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=dropout)
def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
pad = self.kernel_size // 2
seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
return seq_lens.floor()
def forward(
self,
hidden_states,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
residual = self.residual_layer_norm(hidden_states)
residual = residual.transpose(1, 2)
residual = self.residual_conv(residual)
residual = self.activation(residual)
residual = residual.transpose(1, 2)
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.self_attn_conv(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
if attention_mask is not None:
sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
hidden_states.device
)
attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
attention_mask = _prepare_4d_attention_mask(
attention_mask,
hidden_states.dtype,
)
hidden_states, attn_weights = self.self_attn(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = self.self_attn_dropout(hidden_states)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.ffn_layer_norm(hidden_states)
hidden_states = self.ffn(hidden_states) + residual
return hidden_states
class SeamlessM4TConformerAdapter(nn.Module):
def __init__(self, config):
super().__init__()
self.layers = nn.ModuleList(SeamlessM4TConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
def forward(self, hidden_states, attention_mask):
for layer in self.layers:
hidden_states = layer(hidden_states, attention_mask)
return hidden_states
class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
        This implementation matches the one in tensor2tensor, but differs slightly from the description in
        Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
):
if input_ids is not None:
bsz, seq_len = input_ids.size()
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
else:
bsz, seq_len = inputs_embeds.size()[:-1]
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
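# Sketch (not part of the original file): the buffer of sinusoidal weights grows
# lazily, and the forward pass returns one embedding per position with shape
# (batch_size, seq_len, embedding_dim); padding positions (id == padding_idx) all
# map to the zeroed padding row of the table.
_example_pos_emb = SeamlessM4TSinusoidalPositionalEmbedding(num_positions=16, embedding_dim=8, padding_idx=1)
_example_embeddings = _example_pos_emb(input_ids=torch.tensor([[5, 6, 7, 1]]))
assert _example_embeddings.shape == (1, 4, 8)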
class SeamlessM4TAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[SeamlessM4TConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class SeamlessM4TFeedForwardNetwork(nn.Module):
def __init__(self, config: SeamlessM4TConfig, ffn_dim: int):
super().__init__()
self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
self.dropout = nn.Dropout(config.activation_dropout)
self.act = ACT2FN[config.activation_function]
def forward(self, hidden_states):
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.fc2.weight, torch.Tensor)
and hidden_states.dtype != self.fc2.weight.dtype
and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
):
hidden_states = hidden_states.to(self.fc2.weight.dtype)
hidden_states = self.fc2(hidden_states)
return hidden_states
class SeamlessM4TEncoderLayer(nn.Module):
    def __init__(self, config: SeamlessM4TConfig, encoder_ffn_dim=None, encoder_attention_heads=None):
super().__init__()
encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
encoder_attention_heads = (
config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
)
self.embed_dim = config.hidden_size
self.self_attn = SeamlessM4TAttention(
embed_dim=self.embed_dim,
num_heads=encoder_attention_heads,
dropout=config.attention_dropout,
)
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
self.ffn_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`):
input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`):
attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
large negative values.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = self.attn_dropout(hidden_states)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ffn_layer_norm(hidden_states)
hidden_states = self.ffn(hidden_states)
hidden_states = self.ffn_dropout(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class SeamlessM4TDecoderLayer(nn.Module):
def __init__(self, config: SeamlessM4TConfig, decoder_ffn_dim=None, decoder_attention_heads=None):
super().__init__()
decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
decoder_attention_heads = (
config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
)
self.embed_dim = config.hidden_size
self.self_attn = SeamlessM4TAttention(
embed_dim=self.embed_dim,
num_heads=decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.cross_attention = SeamlessM4TAttention(
self.embed_dim, decoder_attention_heads, config.attention_dropout, is_decoder=True
)
self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
self.ffn_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class SeamlessM4TPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SeamlessM4TConfig
base_model_prefix = "seamless_m4t"
supports_gradient_checkpointing = True
_no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"]
def _init_weights(self, module):
"""初始化模型权重"""
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, SeamlessM4TConformerSelfAttention):
if hasattr(module, "pos_bias_u"):
nn.init.xavier_uniform_(module.pos_bias_u)
if hasattr(module, "pos_bias_v"):
nn.init.xavier_uniform_(module.pos_bias_v)
elif isinstance(module, SeamlessM4TConformerPositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, SeamlessM4TConformerFeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
) -> torch.Tensor:
"""
Computes the last hidden states.
Parameters:
hidden_states (`Tuple[Tuple[torch.Tensor]]`):
The generated hidden states. Tuple (one element for each generated token) of tuples (one element for
each layer of the decoder) of torch.FloatTensor of shape (batch_size*num_beams*num_return_sequences,
generated_length, hidden_size).
beam_indices (`torch.LongTensor`, *optional*):
Beam indices of the generated token id at each generation step. `torch.LongTensor` of shape
`(batch_size*num_return_sequences, sequence_length)`. Only required if `num_beams>1` is used at
generate-time.
Return:
`torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`
containing the last hidden states.
"""
last_hidden_states = torch.concat([per_step_hidden_states[-1] for per_step_hidden_states in hidden_states], dim=1)
if beam_indices is None:
return last_hidden_states
beam_indices_mask = beam_indices < 0
max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
beam_indices = beam_indices.clone()[:, :max_beam_length]
beam_indices_mask = beam_indices_mask[:, :max_beam_length]
beam_indices[beam_indices_mask] = 0
beam_indices = beam_indices.unsqueeze(-1)
beam_indices = beam_indices.expand(-1, -1, last_hidden_states.shape[-1])
last_hidden_states = torch.gather(last_hidden_states, 0, beam_indices)
return last_hidden_states
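# Illustrative sketch (toy tensors; shapes and values are assumptions, not real model outputs): it
# replays the beam-indices gather above, picking for each returned sequence the row of the beam that
# actually produced each generated position.
import torch

num_rows, gen_len, hidden = 4, 3, 2  # num_rows plays the role of batch_size * num_beams
last_hidden_states = torch.arange(num_rows * gen_len * hidden, dtype=torch.float).reshape(num_rows, gen_len, hidden)
beam_indices = torch.tensor([[2, 0, 1], [3, 3, -1]])  # -1 marks padding after the sequence finished

beam_indices_mask = beam_indices < 0
max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
beam_indices = beam_indices.clone()[:, :max_beam_length]
beam_indices_mask = beam_indices_mask[:, :max_beam_length]
beam_indices[beam_indices_mask] = 0
beam_indices = beam_indices.unsqueeze(-1).expand(-1, -1, hidden)

selected = torch.gather(last_hidden_states, 0, beam_indices)
print(selected.shape)  # torch.Size([2, 3, 2]) -- one trajectory of hidden states per returned sequence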
class SeamlessM4TSpeechEncoder(SeamlessM4TPreTrainedModel):
main_input_name = "input_features"
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.feature_projection = SeamlessM4TConformerFeatureProjection(config)
self.encoder = SeamlessM4TConformerEncoder(config)
self.intermediate_ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=0.0)
self.adapter = SeamlessM4TConformerAdapter(config) if config.add_adapter else None
self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
self.post_init()
def forward(
self,
input_features: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
if input_features is None:
raise ValueError(
"""Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4TSpeechEncoder.forward`.
Make sure one of them is not `None`."""
)
hidden_states = self.feature_projection(input_features)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
expanded_hidden_states = self.intermediate_ffn(hidden_states)
hidden_states = hidden_states + 0.5 * expanded_hidden_states
if self.adapter is not None:
hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
hidden_states = self.inner_layer_norm(hidden_states)
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
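# Illustrative sketch (stand-in modules and shapes are assumptions): the speech encoder above applies a
# "macaron"-style half-step residual around the intermediate feed-forward block, i.e. x + 0.5 * FFN(x).
import torch
import torch.nn as nn

hidden_size = 8
ffn = nn.Sequential(nn.Linear(hidden_size, 16), nn.ReLU(), nn.Linear(16, hidden_size))  # stand-in FFN
hidden_states = torch.randn(2, 5, hidden_size)  # (batch, time, hidden)
hidden_states = hidden_states + 0.5 * ffn(hidden_states)  # same scaling as in forward above
print(hidden_states.shape)  # torch.Size([2, 5, 8])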
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4TEncoderLayer`].
"""
"""
SEAMLESS_M4T_START_DOCSTRING,
"""
"""
embed_tokens (`nn.Embedding`, *optional*):
Input embedding
is_t2u_encoder (`bool`, *optional*, defaults to `False`):
indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
"""
@add_start_docstrings(
"Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SeamlessM4TEncoderLayer`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens (`nn.Embedding`, *optional*):
Input embedding
""",
)
class SeamlessM4TEncoder(SeamlessM4TPreTrainedModel):
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens: Optional[nn.Embedding] = None,
is_t2u_encoder: bool = False,
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
embed_dim = config.hidden_size
self.is_t2u_encoder = is_t2u_encoder
self.max_source_positions = config.max_position_embeddings
if not self.is_t2u_encoder:
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
self.max_source_positions,
embed_dim,
self.padding_idx,
)
layers = []
for _ in range(config.encoder_layers):
layers.append(
SeamlessM4TEncoderLayer(
config,
encoder_attention_heads=config.encoder_attention_heads,
encoder_ffn_dim=config.encoder_ffn_dim,
)
)
self.layers = nn.ModuleList(layers)
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
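# Illustrative sketch (hypothetical sizes): with `scale_embedding=True`, token embeddings are
# multiplied by sqrt(hidden_size) before sinusoidal positions are added, as configured in __init__ above.
import math
import torch
import torch.nn as nn

hidden_size, vocab_size = 1024, 32
embed_scale = math.sqrt(hidden_size)  # ~32.0
embed_tokens = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
inputs_embeds = embed_tokens(torch.tensor([[5, 7, 9]])) * embed_scale
print(round(embed_scale, 2), inputs_embeds.shape)  # 32.0 torch.Size([1, 3, 1024])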
class SeamlessM4TDecoder(SeamlessM4TPreTrainedModel):
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens: Optional[nn.Embedding] = None,
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
self.embed_tokens.weight = embed_tokens.weight
else:
self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
self.max_target_positions,
config.hidden_size,
padding_idx=self.padding_idx,
)
layers = []
for _ in range(config.decoder_layers):
layers.append(
SeamlessM4TDecoderLayer(
config,
decoder_attention_heads=config.decoder_attention_heads,
decoder_ffn_dim=config.decoder_ffn_dim,
)
)
self.layers = nn.ModuleList(layers)
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@add_start_docstrings(
"Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4TEncoder`] without embeddings and the decoder is a [`SeamlessM4TDecoder`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
""",
)
class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel):
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
super().__init__(config)
self.encoder = SeamlessM4TEncoder(config, is_t2u_encoder=True)
self.decoder = SeamlessM4TDecoder(config, embed_tokens_decoder)
self.post_init()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
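# Illustrative sketch (toy tensor; the transformers import is assumed available as elsewhere in this
# file): when `return_dict=True` but encoder outputs arrive as a plain tuple, they are wrapped into a
# BaseModelOutput exactly as in the forward above.
import torch
from transformers.modeling_outputs import BaseModelOutput

encoder_outputs = (torch.randn(2, 7, 1024),)  # only last_hidden_state available
if not isinstance(encoder_outputs, BaseModelOutput):
    encoder_outputs = BaseModelOutput(
        last_hidden_state=encoder_outputs[0],
        hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
        attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
    )
print(type(encoder_outputs).__name__, encoder_outputs.last_hidden_state.shape)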
@add_start_docstrings(
"Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4TTextToUnit`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
""",
)
class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"vocoder",
"speech_encoder",
"text_encoder",
"text_decoder",
]
_tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):
config.__setattr__(param[4:], val)
super().__init__(config)
self.model = SeamlessM4TTextToUnitModel(config, embed_tokens_decoder)
self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.model.encoder
def get_decoder(self):
return self.model.decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
def _tie_weights(self) -> None:
if getattr(self.config, "tie_word_embeddings", True):
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
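# Illustrative sketch (hypothetical toy namespace instead of a real SeamlessM4TConfig): the
# constructor above copies every "t2u_"-prefixed field onto the un-prefixed name, so the
# text-to-unit sub-model reading `config.vocab_size` actually sees `config.t2u_vocab_size`.
import copy
from types import SimpleNamespace

toy_config = SimpleNamespace(vocab_size=256102, t2u_vocab_size=10082)
cfg = copy.deepcopy(toy_config)
for param, val in list(vars(cfg).items()):
    if param.startswith("t2u_"):
        setattr(cfg, param[4:], val)
print(cfg.vocab_size)  # 10082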
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SeamlessM4TConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
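# Illustrative check (toy sizes are assumptions): the padding (kernel_size * dilation - dilation) // 2
# returned by get_padding keeps the sequence length unchanged for the stride-1 dilated convolutions above.
import torch
import torch.nn as nn

kernel_size, dilation, channels, length = 3, 5, 4, 50
padding = (kernel_size * dilation - dilation) // 2
conv = nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=dilation, padding=padding)
print(conv(torch.randn(1, channels, length)).shape)  # torch.Size([1, 4, 50]) -- length preserved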
class SeamlessM4TVariancePredictor(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.unit_embed_dim
kernel_size = config.variance_predictor_kernel_size
var_pred_dropout = config.var_pred_dropout
self.conv1 = nn.Conv1d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
)
self.activation_fuction = nn.ReLU()
self.ln1 = nn.LayerNorm(embed_dim)
self.dropout_module = nn.Dropout(p=var_pred_dropout)
self.conv2 = nn.Conv1d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=1,
)
self.ln2 = nn.LayerNorm(embed_dim)
self.proj = nn.Linear(embed_dim, 1)
def forward(self, hidden_states: Tensor) -> Tensor:
hidden_states = self.conv1(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln1(hidden_states))
hidden_states = self.conv2(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln2(hidden_states))
return self.proj(hidden_states).squeeze(dim=2)
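# Illustrative usage sketch (hypothetical toy config via SimpleNamespace; real values come from
# SeamlessM4TConfig, and the class defined above must be in scope): the predictor maps
# (batch, time, unit_embed_dim) hidden states to one scalar per step, read downstream as a log-duration.
import torch
from types import SimpleNamespace

toy_config = SimpleNamespace(unit_embed_dim=16, variance_predictor_kernel_size=3, var_pred_dropout=0.0)
predictor = SeamlessM4TVariancePredictor(toy_config)
log_dur_pred = predictor(torch.randn(2, 10, 16))  # (batch, time)
dur_out = torch.clamp(torch.round(torch.exp(log_dur_pred) - 1).long(), min=1)  # integer durations >= 1
print(log_dur_pred.shape, bool((dur_out >= 1).all()))  # torch.Size([2, 10]) True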
class SeamlessM4THifiGan(nn.Module):
def __init__(self, config: SeamlessM4TConfig):
super().__init__()
model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
self.leaky_relu_slope = config.leaky_relu_slope
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a sequence of input embeddings into a speech waveform. Passing a batch of input embeddings returns a
batch of speech waveforms.
Args:
input_embeds (`torch.FloatTensor` of shape `(batch_size, model_in_dim, sequence_length)`):
Tensor containing the concatenated language, unit and speaker embeddings. Note that `model_in_dim` is the
sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform, of shape `(batch_size, num_frames)`.
"""
hidden_states = self.conv_pre(input_embeds)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
waveform = hidden_states.squeeze(1)
return waveform
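# Illustrative sketch (the rates below are example values, not read from a real config): each
# ConvTranspose1d in the upsampler multiplies the time dimension by its stride, so the waveform
# length is roughly the input length times the product of config.upsample_rates.
import math

example_upsample_rates = [5, 4, 4, 2, 2]  # assumption, for illustration only
print(math.prod(example_upsample_rates))  # 320 output frames per input step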
@add_start_docstrings(
"""Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).""",
HIFIGAN_START_DOCSTRING,
)
class SeamlessM4TCodeHifiGan(PreTrainedModel):
config_class = SeamlessM4TConfig
main_input_name = "input_embeds"
_no_split_modules = []
def __init__(self, config):
super().__init__(config)
self.pad_token_id = config.t2u_pad_token_id
self.dur_predictor = SeamlessM4TVariancePredictor(config)
self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
self.hifi_gan = SeamlessM4THifiGan(config)
self.post_init()
def _get_dur_output_lengths(self, input_ids, dur_out):
"""
Computes the output length after the duration layer.
"""
unit_lengths = (input_ids != self.pad_token_id).sum(1)
unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
cumulative_dur_out = torch.cumsum(dur_out, dim=1)
unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
return unit_lengths
def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the hifigan convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
return (
torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
)
def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
for i, (upsample_rate, kernel_size) in enumerate(
zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
):
input_lengths = _transpose_conv_out_length(
input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
)
for i in range(len(self.config.upsample_rates)):
for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
for dil in dilation:
input_lengths = _conv_out_length(
input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
)
for dil in dilation:
input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
return input_lengths
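# Illustrative check (toy sizes are assumptions): the closed-form length formula used in
# _conv_out_length matches what nn.Conv1d actually produces.
import torch
import torch.nn as nn

length, kernel_size, stride, pad, dilation = 37, 7, 1, 3, 1
formula = (length + 2 * pad - dilation * (kernel_size - 1) - 1) // stride + 1
conv = nn.Conv1d(1, 1, kernel_size, stride=stride, padding=pad, dilation=dilation)
print(formula, conv(torch.randn(1, 1, length)).shape[-1])  # 37 37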
def forward(
self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor
) -> Tuple[torch.Tensor]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTextToUnitForConditionalGeneration`]. [What are input
IDs?](../glossary#input-ids)
spkr_id (`torch.Tensor`):
The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
lang_id (`torch.Tensor`):
The id of the language used for speech synthesis. Must be lower than `config.vocoder_num_langs`.
"""
hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
spkr = self.speaker_embedding(spkr_id).transpose(1, 2)
lang = self.language_embedding(lang_id).transpose(1, 2)
log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
dur_out = torch.clamp(torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1)
if hidden_states.size(0) == 1:
hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
else:
if hidden_states.shape[0] > 1 and self.training:
logger.warning(
"""`self.training=True` and you use batching. You lose parallelism during the hifigan
forward pass because the samples are interleaved."""
)
hidden_states = [
torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
for (hidden_state, duration) in zip(hidden_states, dur_out)
]
hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
lang = lang.repeat(1, 1, hidden_states.shape[-1])
hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
hidden_states = self.hifi_gan(hidden_states)
unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
lengths = self._get_output_hifigan_lengths(unit_lengths)
return hidden_states, lengths
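# Illustrative sketch (toy tensors; shapes are assumptions): repeat_interleave expands each unit
# embedding along the time axis according to its predicted duration, as in the batch_size == 1
# branch of the forward above.
import torch

hidden_states = torch.arange(6, dtype=torch.float).reshape(1, 2, 3)  # (batch=1, channels=2, time=3)
dur_out = torch.tensor([[2, 1, 3]])  # one duration per time step
upsampled = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
print(upsampled.shape)  # torch.Size([1, 2, 6])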
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.hifi_gan.conv_pre)
for layer in self.hifi_gan.upsampler:
nn.utils.weight_norm(layer)
for layer in self.hifi_gan.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.hifi_gan.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
for layer in self.hifi_gan.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.hifi_gan.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
@add_start_docstrings(
"The text-to-text SeamlessM4T Model transformer which can be used for T2TT.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
main_input_name = "input_ids"
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.text_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
def generate(
self,
input_ids=None,
tgt_lang=None,
generation_config=None,
logits_processor=None,
stopping_criteria=None,
prefix_allowed_tokens_fn=None,
synced_gpus=False,
**kwargs,
):
pass
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The speech-to-text SeamlessM4T Model transformer which can be used for S2TT.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
main_input_name = "input_features"
_tied_weights_keys = [
"lm_head.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.speech_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_decoder.embed_tokens = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
def forward(
self,
input_features: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
"""
Forward pass of the SeamlessM4T model for speech-to-text tasks.
"""
def generate(
self,
input_features=None,
tgt_lang=None,
generation_config=None,
logits_processor=None,
stopping_criteria=None,
prefix_allowed_tokens_fn=None,
synced_gpus=False,
**kwargs,
):
"""
Generate sequences based on input features and configuration.
"""
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
"""
Prepare inputs for sequence generation based on given parameters.
"""
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The text-to-speech SeamlessM4T Model transformer which can be used for T2ST.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["speech_encoder"]
main_input_name = "input_ids"
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def get_encoder(self):
return self.text_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
**kwargs,
):
pass
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The speech-to-speech SeamlessM4T Model transformer which can be used for S2ST.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["text_encoder"]
main_input_name = "input_features"
_tied_weights_keys = [
"lm_head.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def get_encoder(self):
return self.speech_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_decoder.embed_tokens = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
def forward(
self,
input_features: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
@torch.no_grad()
def generate(
self,
input_features: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
**kwargs,
):
pass
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@add_start_docstrings(
"The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).",
SEAMLESS_M4T_START_DOCSTRING,
"""
current_modality (`str`, *optional*, defaults to `"text"`):
Default modality. Used to initialize the model.
""",
)
class SeamlessM4TModel(SeamlessM4TPreTrainedModel):
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config, current_modality="text"):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.current_modality = current_modality
if current_modality == "speech":
self.main_input_name = "input_features"
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def set_modality(self, modality="text"):
if modality == "text":
self.main_input_name = "input_ids"
self.current_modality = "text"
elif modality == "speech":
self.main_input_name = "input_features"
self.current_modality = "speech"
else:
raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
def get_encoder(self):
if self.current_modality == "text":
return self.text_encoder
else:
return self.speech_encoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_MODEL_INPUTS_DOCSTRING)
@torch.no_grad()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
input_features: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
generate_speech: Optional[bool] = True,
**kwargs,
):
pass
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
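# Illustrative sketch (toy cache; shapes are assumptions): _reorder_cache re-indexes every cached
# key/value tensor along the batch*beam dimension so that the cache follows the beams kept by beam search.
import torch

layer_past = (torch.randn(4, 2, 5, 8), torch.randn(4, 2, 5, 8))  # (batch*num_beams, heads, seq, head_dim)
beam_idx = torch.tensor([1, 1, 3, 0])  # beams selected at this step
reordered = tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
print(reordered[0].shape)  # torch.Size([4, 2, 5, 8])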