Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 206 additions & 2 deletions TTS/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,35 @@ def convert_boolean(x):
parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")

# --- NEW: option serveur pour la vitesse/rythme par défaut (VITS) ---
# length_scale < 1.0 = plus rapide ; > 1.0 = plus lent
parser.add_argument(
"--length_scale_default",
type=float,
default=1.0,
help="Default VITS length_scale. Smaller=faster, larger=slower.",
)
# --------------------------------------------------------------------

# --- NEW: options serveur pour le contrôle de la variabilité à l'inférence ---
# inference_noise_scale: variation prosodique globale au décodeur
# inference_noise_scale_dp: variation sur le duration predictor (rythme local)
# Par défaut None = ne pas forcer et laisser la valeur du modèle.
parser.add_argument(
"--inference_noise_scale_default",
type=float,
default=None,
help="Default inference_noise_scale. Higher=more variation. None keeps model default.",
)
parser.add_argument(
"--inference_noise_scale_dp_default",
type=float,
default=None,
help="Default inference_noise_scale_dp (duration predictor). Higher=more timing variation. None keeps model default.",
)
# ------------------------------------------------------------------------------

return parser


Expand Down Expand Up @@ -127,6 +156,126 @@ def convert_boolean(x):
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)

# --- NEW: helpers for a per-request length_scale ---
# Read an optional request parameter/header and apply it to the VITS model.
# Accepted: header "length-scale" or "length_scale", GET/POST field "length_scale".
def _read_length_scale_from_request() -> Union[None, float]:
    """Read an optional per-request ``length_scale`` override.

    The value is looked up, in order, in the ``length-scale`` header, the
    ``length_scale`` header and the ``length_scale`` GET/POST field; the first
    non-empty one wins.

    Returns:
        The parsed scale as a float, or ``None`` when nothing was sent or the
        value is unusable (not numeric, not finite, or not strictly
        positive). ``None`` lets the caller fall back to its own default.
    """
    raw = (
        request.headers.get("length-scale")
        or request.headers.get("length_scale")
        or request.values.get("length_scale")
    )
    if raw is None or raw == "":
        return None
    try:
        scale = float(raw)
    except (TypeError, ValueError):
        # Malformed input is ignored on purpose instead of failing the request.
        return None
    # float() also accepts "nan", "inf" and values <= 0, none of which is a
    # usable duration multiplier. The chained comparison is False for NaN.
    if not 0.0 < scale < float("inf"):
        return None
    return scale


def _apply_length_scale_temporarily(ls: Union[None, float]):
    """Set ``length_scale`` on the loaded TTS model and return an undo callback.

    Args:
        ls: Scale requested by the client. When ``None`` the server-wide
            ``args.length_scale_default`` is applied instead, so a value is
            always written if the model exposes the attribute.

    Returns:
        A zero-argument callable. Calling it puts back the value the model
        had before this function ran; it does nothing when the model has no
        ``length_scale`` attribute.
    """
    # Resolve the scale to use for this synthesis first.
    if ls is None:
        scale = args.length_scale_default
    else:
        scale = ls

    if not hasattr(synthesizer.tts_model, "length_scale"):
        # This model does not support length_scale: nothing to set or undo.
        def _nothing_to_restore():
            return None

        return _nothing_to_restore

    # Remember the current value so it can be restored after synthesis.
    previous = synthesizer.tts_model.length_scale
    synthesizer.tts_model.length_scale = scale

    def _restore():
        # Best effort: a failed restore must not break the response.
        try:
            synthesizer.tts_model.length_scale = previous
        except Exception:
            pass

    return _restore
# ---------------------------------------------------

# --- NEW: helpers for inference_noise_scale and inference_noise_scale_dp ---
# Read from headers/params and apply temporarily, with a reset after inference.
def _read_float_from_request(*keys) -> Union[None, float]:
    """Return the first request value found under ``keys`` as a finite float.

    Each key is looked up in the request headers first and, when the header
    is missing or empty, in the GET/POST values. Keys are tried in the order
    given and the first non-empty value decides the result.

    Args:
        *keys: Header / parameter names to try, in priority order.

    Returns:
        The parsed number, or ``None`` when no key carries a value or the
        first value found is not a finite number.
    """
    for key in keys:
        raw = request.headers.get(key)
        if raw is None or raw == "":
            raw = request.values.get(key)
        if raw in (None, ""):
            continue
        try:
            number = float(raw)
        except (TypeError, ValueError):
            # Malformed input is ignored instead of failing the request.
            return None
        # float() parses "nan" and "inf" too; neither is a usable setting.
        # The chained comparison is False for NaN.
        if not float("-inf") < number < float("inf"):
            return None
        return number
    return None


def _apply_inference_noise_scale_temporarily(val: Union[None, float]):
    """Override the model's ``inference_noise_scale`` and return an undo callback.

    Args:
        val: Value requested by the client. When ``None`` the server-wide
            ``args.inference_noise_scale_default`` is used; if that is also
            ``None`` the model is left untouched.

    Returns:
        A zero-argument callable that restores the previous value, or does
        nothing when no override was applied (no value to apply, or the model
        has no ``inference_noise_scale`` attribute).
    """

    def _nothing_to_restore():
        return None

    chosen = args.inference_noise_scale_default if val is None else val
    if chosen is None or not hasattr(synthesizer.tts_model, "inference_noise_scale"):
        return _nothing_to_restore

    # Remember the current value so it can be restored after synthesis.
    previous = synthesizer.tts_model.inference_noise_scale
    synthesizer.tts_model.inference_noise_scale = chosen

    def _restore():
        # Best effort: a failed restore must not break the response.
        try:
            synthesizer.tts_model.inference_noise_scale = previous
        except Exception:
            pass

    return _restore


def _apply_inference_noise_scale_dp_temporarily(val: Union[None, float]):
    """Override the model's ``inference_noise_scale_dp`` and return an undo callback.

    Args:
        val: Value requested by the client. When ``None`` the server-wide
            ``args.inference_noise_scale_dp_default`` is used; if that is
            also ``None`` the model is left untouched.

    Returns:
        A zero-argument callable that restores the previous value, or does
        nothing when no override was applied (no value to apply, or the model
        has no ``inference_noise_scale_dp`` attribute).
    """

    def _nothing_to_restore():
        return None

    chosen = args.inference_noise_scale_dp_default if val is None else val
    if chosen is None or not hasattr(synthesizer.tts_model, "inference_noise_scale_dp"):
        return _nothing_to_restore

    # Remember the current value so it can be restored after synthesis.
    previous = synthesizer.tts_model.inference_noise_scale_dp
    synthesizer.tts_model.inference_noise_scale_dp = chosen

    def _restore():
        # Best effort: a failed restore must not break the response.
        try:
            synthesizer.tts_model.inference_noise_scale_dp = previous
        except Exception:
            pass

    return _restore
# ---------------------------------------------------------------------------


def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
"""Transform an uri style_wav, in either a string (path to wav file to be use for style transfer)
Expand Down Expand Up @@ -197,10 +346,33 @@ def tts():
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)

# --- NEW: lecture et application temporaire du length_scale ---
# Permet de contrôler la vitesse/rythme depuis la requête.
req_ls = _read_length_scale_from_request()
_reset_length_scale = _apply_length_scale_temporarily(req_ls)
# --------------------------------------------------------------

# --- NEW: lecture et application des bruits d'inférence ---
# Headers/params acceptés:
# - inference-noise-scale, inference_noise_scale
# - inference-noise-scale-dp, inference_noise_scale_dp
req_ins = _read_float_from_request("inference-noise-scale", "inference_noise_scale")
req_ins_dp = _read_float_from_request("inference-noise-scale-dp", "inference_noise_scale_dp")
_reset_ins = _apply_inference_noise_scale_temporarily(req_ins)
_reset_ins_dp = _apply_inference_noise_scale_dp_temporarily(req_ins_dp)
# -----------------------------------------------------------

print(f" > Model input: {text}")
print(f" > Speaker Idx: {speaker_idx}")
print(f" > Language Idx: {language_idx}")
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
try:
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
finally:
# --- NEW: on rétablit les valeurs précédentes après synthèse ---
_reset_length_scale()
_reset_ins()
_reset_ins_dp()
# ---------------------------------------------------------------
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
Expand Down Expand Up @@ -241,10 +413,42 @@ def mary_tts_api_process():
data = parse_qs(request.get_data(as_text=True))
# NOTE: we ignore param. LOCALE and VOICE for now since we have only one active model
text = data.get("INPUT_TEXT", [""])[0]
# --- NEW: support length_scale en POST MaryTTS (optionnel) ---
# Si un client envoie length_scale dans le form-url-encoded, on le lit ici.
ls_str = data.get("length_scale", [None])[0]
req_ls = float(ls_str) if ls_str not in (None, "") else None

# --- NEW: support des bruits d'inférence en POST MaryTTS ---
ins_str = data.get("inference_noise_scale", [None])[0]
ins_dp_str = data.get("inference_noise_scale_dp", [None])[0]
req_ins = float(ins_str) if ins_str not in (None, "") else None
req_ins_dp = float(ins_dp_str) if ins_dp_str not in (None, "") else None
# -----------------------------------------------------------
else:
text = request.args.get("INPUT_TEXT", "")
# --- NEW: support length_scale en GET MaryTTS (optionnel) ---
req_ls = _read_length_scale_from_request()
# --- NEW: support des bruits d'inférence en GET MaryTTS ---
req_ins = _read_float_from_request("inference-noise-scale", "inference_noise_scale")
req_ins_dp = _read_float_from_request("inference-noise-scale-dp", "inference_noise_scale_dp")
# ------------------------------------------------------------

# --- NEW: application temporaire du length_scale ---
_reset_length_scale = _apply_length_scale_temporarily(req_ls)
# --- NEW: application temporaire des bruits d'inférence ---
_reset_ins = _apply_inference_noise_scale_temporarily(req_ins)
_reset_ins_dp = _apply_inference_noise_scale_dp_temporarily(req_ins_dp)
# ---------------------------------------------------

print(f" > Model input: {text}")
wavs = synthesizer.tts(text)
try:
wavs = synthesizer.tts(text)
finally:
# --- NEW: reset après synthèse ---
_reset_length_scale()
_reset_ins()
_reset_ins_dp()
# ---------------------------------
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
Expand Down
Loading
Loading