How to train xtts-v2 from scratch? #4318

YangGao1001 · 2025-07-25T08:56:49Z

YangGao1001
Jul 25, 2025

I want to train xtts-v2 from scratch on my own dataset. Which training script should I use?
I tried the train_gpt_xtts.py script under recipes/ljSpeech/xtts-v2 with xtts_checkpoint changed to None, but the resulting model cannot generate normal speech, only a faint metallic buzzing. Where could the problem lie?
Here are my training code and inference code.

Training code:

import os

# Select the GPUs visible to this process; set before the trainer/TTS imports below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.utils.manage import ModelManager
from TTS.tts.datasets.formatters import Emilia  # gy

# Logging parameters
RUN_NAME = "XTTS-v2-train-Emilia-JA"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Set here the path that the checkpoints will be saved. Default: ./run/training/
# OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training-Emilia-JA")
OUT_PATH = "/data/gy/xtts_v2/run/training-Emilia-JA"

# Training Parameters
# NOTE(review): three GPUs are made visible above while this flag is True; the
# inline remark asks for False under multi-GPU training. Confirm how the script
# is launched.
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False  # gy
START_WITH_EVAL = True  # if True it will start with evaluation
BATCH_SIZE = 3  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps

# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252
# for more efficient training. You can increase/decrease BATCH_SIZE but then set
# GRAD_ACUMM_STEPS accordingly. (Here: 3 * 84 = 252.)

# Define here the dataset that you want to train on.
config_dataset = BaseDatasetConfig(
    formatter="Emilia",
    dataset_name="Emilia_JA",
    path="/data/gy/DataSet/Amphion___Emilia/raw/JA",
    meta_file_train_list=[
        "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00000.jsonl",
        "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00001.jsonl",
        "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00002.jsonl",
        "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00003.jsonl",
        "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00004.jsonl",
    ],
    meta_file_val_list=["/data/gy/DataSet/Amphion___Emilia//raw/JA/JA_B00005.jsonl"],
    language="ja",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Local directory holding the auxiliary XTTS-v2 files used below.
CHECKPOINTS_OUT_PATH = "/home/gaoyang/code/coqui-ai-TTS/coqui/XTTS-v2"
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, "dvae.pth")
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, "mel_stats.pth")
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, "vocab.json")  # vocab.json file

# Disabled: main() passes xtts_checkpoint=None instead of this path.
# XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, "model.pth")  # model.pth file

# Training sentences generations
SPEAKER_REFERENCE = [
    # speaker reference to be used in training test sentences
    "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00005/JA_B00005_S00215/mp3/JA_B00005_S00215_W000000.wav"
]
LANGUAGE = config_dataset.language


def main():
    """Configure the XTTS-v2 GPT trainer and run training.

    Builds the GPT model arguments, the audio configuration and the trainer
    configuration from the module-level constants, loads the train/eval
    samples for ``DATASETS_CONFIG_LIST`` and calls ``Trainer.fit()``.
    """
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        # None: no pretrained XTTS checkpoint is passed (training from scratch).
        # NOTE(review): any sub-module this trainer does not optimise would then
        # presumably keep its random initialisation in the saved checkpoint --
        # confirm against GPTTrainer which modules are actually trained.
        xtts_checkpoint=None,
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
GPT XTTS training
""",
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        eval_split_size=0.1,  # gy
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=3,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        # NOTE(review): presumably inherited from the fine-tuning recipe;
        # confirm this learning rate suits training from scratch.
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[
            {
                "text": "乾杯！それは知っていて、寂しさは秋草に向かって、悲風は千里に来ます",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "私の父は天地の間で初めて私を愛してくれた人です",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    # NOTE(review): eval_split_size / eval_split_max_size set on `config` above
    # are not forwarded to this call -- confirm the loader's defaults are intended.
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        formatter=Emilia,
    )

    # init the trainer
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # nothing is restored through the Trainer; see xtts_checkpoint above
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    print(len(eval_samples))  # gy
    trainer.fit()


if __name__ == "__main__":
    main()

Inference code:

# Batch inference: synthesise every non-empty line of a text file with a trained
# XTTS checkpoint, save one wav per line and record the results in a CSV file.
import csv
import os

import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Training run directory that holds config.json and the checkpoint.
RUN_DIR = "/data/gy/xtts_v2/run/training-Emilia-JA/XTTS-v2-train-Emilia-JA-July-21-2025_02+45PM-0000000"
# Sample rate used when saving the generated audio (the training script's
# audio config sets output_sample_rate=24000).
OUTPUT_SAMPLE_RATE = 24000

print("Loading model...")
config = XttsConfig()
config.load_json(os.path.join(RUN_DIR, "config.json"))
model = Xtts.init_from_config(config)
# NOTE(review): only checkpoint_dir is given, so which checkpoint file gets
# loaded depends on load_checkpoint's default -- confirm it is the intended
# best_model_628943 checkpoint.
model.load_checkpoint(config, checkpoint_dir=RUN_DIR, use_deepspeed=False)
model.cuda()

# Input text file path (one sentence per line)
input_text_file = "/home/gaoyang/code/coqui-ai-TTS/demos/text/JA/text.txt"

# Output audio directory
output_dir = os.path.join(RUN_DIR, "test", "ja_ja", "best_model_628943")

# Reference audio path (a single reference is used for every sentence)
ref_wav_path = "/data/gy/DataSet/Amphion___Emilia/raw/JA/JA_B00001/JA_B00001_S00000/mp3/JA_B00001_S00000_W000002.wav"

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=ref_wav_path)

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# CSV file path
csv_path = os.path.join(output_dir, "custom_generated_sentences.csv")

# Open the CSV file and write the header; it stays open while the loop below
# appends one row per input line.
with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["speaker_reference", "text", "generated_wav", "language"])

    # Read the text file and process it line by line
    with open(input_text_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            text = line.strip()
            if not text:
                continue

            output_file = os.path.join(output_dir, f"{line_no}.wav")
            print(f"[{line_no}] Processing: {text}")

            try:
                # Synthesise the speech and save it
                out = model.inference(
                    text=text,
                    language="ja",
                    gpt_cond_latent=gpt_cond_latent,
                    speaker_embedding=speaker_embedding,
                    temperature=0.65,
                    length_penalty=1.0,
                    repetition_penalty=10.0,
                    top_k=50,
                    top_p=0.85,
                )
                torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), OUTPUT_SAMPLE_RATE)
            except Exception as e:
                # Deliberately broad: one failing sentence must not abort the batch.
                print(f"❌ Line {line_no} synthesis failed: {e}")
                writer.writerow([ref_wav_path, text, "", "ja"])
            else:
                print(f"✅ Saved to {output_file}")
                # Record the successful synthesis in the CSV
                writer.writerow([ref_wav_path, text, output_file, "ja"])

print("🎉 所有音频生成和 CSV 写入完成，输出目录:", output_dir)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

How to train xtts-v2 from scratch? #4318

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

How to train xtts-v2 from scratch? #4318

Uh oh!

YangGao1001 Jul 25, 2025

Logging parameters

Set here the path that the checkpoints will be saved. Default: ./run/training/

OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(file)), "run", "training-Emilia-JA")

Training Parameters

Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.

Define here the dataset that you want to use for the fine-tuning on.

Add here the configs of the datasets

XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, "model.pth") # model.pth file

Training sentences generations

输入文本文件路径

输出音频文件夹路径

参考音频路径（统一使用一个参考音频）

创建输出目录（如果不存在）

CSV 文件路径

打开 CSV 文件并写入表头

Replies: 0 comments

YangGao1001
Jul 25, 2025