Note: This project is currently under development.
Speech AI training and inference tools
git clone https://github.com/ddPn08/Latopia.git
cd Latopia
python -m venv venv
source venv/bin/activate
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
python setup.py develop
latopia webui
- Create config files
# ./configs/zundamon/config.toml
pretrained_model_path = "./models/checkpoints/vits/pretrained-40k.g.safetensors"
pretrained_discriminator_path = "./models/checkpoints/vits/pretrained-40k.d.safetensors"
cache_in_gpu = true
output_name = "zundamon"
output_dir = "./output"
mixed_precision = "fp16"
batch_size = 32
save_every_n_epoch = 5
# ./configs/zundamon/dataset.toml
[[subsets]]
data_dir = "./datasets/zundamon_voice_data/emotion/normal"
- Preprocess dataset
latopia preprocess all ./configs/zundamon/dataset.toml 40000 --max_workers 8 --device cuda --f0_method harvest
- Train
latopia train vits ./configs/zundamon/config.toml --device cuda
- Infer
latopia infer vits ./input.wav ./output.wav ./output/checkpoints/zundamon-30-G.safetensors ./models/encoders/checkpoint_best_legacy_500.pt --device cuda --torch_dtype fp16 --f0_method harvest