
Commit a6ff354

Authored by Ettore Di Giacinto
feat(tts): add pocket-tts backend (#8018)
* feat(pocket-tts): add new backend
* Add to the gallery
* fixups
* Update docs

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 3a2be4d · commit a6ff354

25 files changed (+847 −17 lines)

.github/workflows/backend.yml

Lines changed: 91 additions & 0 deletions
@@ -105,6 +105,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "9"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12-pocket-tts'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'cublas'
             cuda-major-version: "12"
             cuda-minor-version: "0"
@@ -340,6 +353,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-pocket-tts'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'cublas'
             cuda-major-version: "13"
             cuda-minor-version: "0"
@@ -405,6 +431,19 @@ jobs:
             backend: "vibevoice"
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
+          - build-type: 'l4t'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-pocket-tts'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            ubuntu-version: '2404'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
           - build-type: 'l4t'
             cuda-major-version: "13"
             cuda-minor-version: "0"
@@ -641,6 +680,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'hipblas'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-rocm-hipblas-pocket-tts'
+            runs-on: 'arc-runner-set'
+            base-image: "rocm/dev-ubuntu-24.04:6.4.4"
+            skip-drivers: 'false'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'hipblas'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -772,6 +824,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2204'
+          - build-type: 'l4t'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-pocket-tts'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            skip-drivers: 'true'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2204'
           - build-type: 'l4t'
             cuda-major-version: "12"
             cuda-minor-version: "0"
@@ -825,6 +890,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: 'intel'
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-intel-pocket-tts'
+            runs-on: 'arc-runner-set'
+            base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+            skip-drivers: 'false'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: 'intel'
             cuda-major-version: ""
             cuda-minor-version: ""
@@ -1278,6 +1356,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2404'
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-pocket-tts'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "pocket-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
   backend-jobs-darwin:
     uses: ./.github/workflows/backend_build_darwin.yml
     strategy:

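Each matrix entry above adds a pocket-tts image build for one acceleration target (CUDA 12/13, L4T, ROCm, Intel oneAPI, plain CPU), with `tag-suffix` deciding the published image tag. A minimal sketch of pulling two of the resulting images, assuming the `quay.io/go-skynet/local-ai-backends` naming used in `backend/index.yaml` later in this commit and that the corresponding CI runs have published them:

```bash
# Pull two of the images these matrix entries produce (tags taken from
# backend/index.yaml below; availability depends on the CI pipeline having run).
docker pull quay.io/go-skynet/local-ai-backends:latest-cpu-pocket-tts
docker pull quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-pocket-tts
```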
.github/workflows/test-extra.yml

Lines changed: 20 additions & 1 deletion
@@ -265,4 +265,23 @@ jobs:
       - name: Test moonshine
         run: |
           make --jobs=5 --output-sync=target -C backend/python/moonshine
-          make --jobs=5 --output-sync=target -C backend/python/moonshine test
+          make --jobs=5 --output-sync=target -C backend/python/moonshine test
+  tests-pocket-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+      - name: Test pocket-tts
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/pocket-tts
+          make --jobs=5 --output-sync=target -C backend/python/pocket-tts test

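The new `tests-pocket-tts` job follows the same shape as the other extra-backend jobs. Its steps can be reproduced on a local machine; a sketch, assuming an apt-based Ubuntu host and a user-level pip install:

```bash
# Local reproduction of the tests-pocket-tts CI job (sketch)
sudo apt-get update
sudo apt-get install -y build-essential ffmpeg ca-certificates cmake curl patch python3-pip
curl -LsSf https://astral.sh/uv/install.sh | sh        # install uv, used by the backend Makefiles
pip install --user --no-cache-dir grpcio-tools==1.64.1
make --jobs=5 --output-sync=target -C backend/python/pocket-tts       # prepare the backend
make --jobs=5 --output-sync=target -C backend/python/pocket-tts test  # run its tests
```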
Makefile

Lines changed: 16 additions & 14 deletions
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine backends/pocket-tts
 
 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -9,7 +9,7 @@ LAUNCHER_BINARY_NAME=local-ai-launcher
 
 CUDA_MAJOR_VERSION?=13
 CUDA_MINOR_VERSION?=0
-UBUNTU_VERSION?=2204
+UBUNTU_VERSION?=2404
 UBUNTU_CODENAME?=noble
 
 GORELEASER?=
@@ -316,6 +316,7 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/vibevoice
 	$(MAKE) -C backend/python/moonshine
+	$(MAKE) -C backend/python/pocket-tts
 
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
@@ -324,6 +325,7 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/vllm test
 	$(MAKE) -C backend/python/vibevoice test
 	$(MAKE) -C backend/python/moonshine test
+	$(MAKE) -C backend/python/pocket-tts test
 
 DOCKER_IMAGE?=local-ai
 DOCKER_AIO_IMAGE?=local-ai-aio
@@ -447,17 +449,16 @@ BACKEND_FASTER_WHISPER = faster-whisper|python|.|false|true
 BACKEND_COQUI = coqui|python|.|false|true
 BACKEND_BARK = bark|python|.|false|true
 BACKEND_EXLLAMA2 = exllama2|python|.|false|true
-
-# Python backends with ./backend context
-BACKEND_RFDETR = rfdetr|python|./backend|false|true
-BACKEND_KITTEN_TTS = kitten-tts|python|./backend|false|true
-BACKEND_NEUTTS = neutts|python|./backend|false|true
-BACKEND_KOKORO = kokoro|python|./backend|false|true
-BACKEND_VLLM = vllm|python|./backend|false|true
-BACKEND_DIFFUSERS = diffusers|python|./backend|--progress=plain|true
-BACKEND_CHATTERBOX = chatterbox|python|./backend|false|true
-BACKEND_VIBEVOICE = vibevoice|python|./backend|--progress=plain|true
-BACKEND_MOONSHINE = moonshine|python|./backend|false|true
+BACKEND_RFDETR = rfdetr|python|.|false|true
+BACKEND_KITTEN_TTS = kitten-tts|python|.|false|true
+BACKEND_NEUTTS = neutts|python|.|false|true
+BACKEND_KOKORO = kokoro|python|.|false|true
+BACKEND_VLLM = vllm|python|.|false|true
+BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true
+BACKEND_CHATTERBOX = chatterbox|python|.|false|true
+BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
+BACKEND_MOONSHINE = moonshine|python|.|false|true
+BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
 
 # Helper function to build docker image for a backend
 # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
@@ -503,12 +504,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX)))
 $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
+$(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
 
 # Pattern rule for docker-save targets
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar
 
-docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts
 
 ########################################################
 ### END Backends

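Per the `Usage` comment above the helper, the pipe-separated fields of `BACKEND_POCKET_TTS = pocket-tts|python|.|false|true` map to `BACKEND_NAME|DOCKERFILE_TYPE|BUILD_CONTEXT|PROGRESS_FLAG|NEEDS_BACKEND_ARG`, and `generate-docker-build-target` turns that definition into a `docker-build-pocket-tts` target (also appended to `docker-build-backends`). A sketch of driving it locally, assuming Docker is available:

```bash
# Build the pocket-tts backend image through the new Makefile target, then
# export it with the existing docker-save-% pattern rule (which saves
# local-ai-backend:pocket-tts to backend-images/pocket-tts.tar).
make docker-build-pocket-tts
make docker-save-pocket-tts
```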
README.md

Lines changed: 3 additions & 2 deletions
@@ -295,6 +295,7 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
 | **silero-vad** | Voice Activity Detection | CPU |
 | **neutts** | Text-to-speech with voice cloning | CUDA 12/13, ROCm, CPU |
 | **vibevoice** | Real-time TTS with voice cloning | CUDA 12/13, ROCm, Intel, CPU |
+| **pocket-tts** | Lightweight CPU-based TTS | CUDA 12/13, ROCm, Intel, CPU |
 
 ### Image & Video Generation
 | Backend | Description | Acceleration Support |
@@ -316,8 +317,8 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
 |-------------------|-------------------|------------------|
 | **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware |
 | **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware |
-| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice | AMD Graphics |
-| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice | Intel Arc, Intel iGPUs |
+| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice, pocket-tts | AMD Graphics |
+| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice, pocket-tts | Intel Arc, Intel iGPUs |
 | **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ |
 | **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs |
 | **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) |

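For context, once a pocket-tts image is installed as a LocalAI backend, speech generation would go through LocalAI's existing `/tts` endpoint. A minimal sketch, assuming a LocalAI instance on port 8080; the model name used here is a placeholder, not something defined by this commit:

```bash
# Hypothetical TTS request routed to the new backend; "pocket-tts" as the
# model name is a placeholder for whatever model config the user installs.
curl http://localhost:8080/tts \
  -H "Content-Type: application/json" \
  -d '{
        "backend": "pocket-tts",
        "model": "pocket-tts",
        "input": "Hello from LocalAI!"
      }' \
  --output hello.wav
```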
backend/index.yaml

Lines changed: 105 additions & 0 deletions
@@ -428,6 +428,28 @@
     nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice"
   icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4
+- &pocket-tts
+  urls:
+    - https://github.com/kyutai-labs/pocket-tts
+  description: |
+    Pocket TTS is a lightweight text-to-speech model designed to run efficiently on CPUs.
+  tags:
+    - text-to-speech
+    - TTS
+  license: mit
+  name: "pocket-tts"
+  alias: "pocket-tts"
+  capabilities:
+    nvidia: "cuda12-pocket-tts"
+    intel: "intel-pocket-tts"
+    amd: "rocm-pocket-tts"
+    nvidia-l4t: "nvidia-l4t-pocket-tts"
+    default: "cpu-pocket-tts"
+    nvidia-cuda-13: "cuda13-pocket-tts"
+    nvidia-cuda-12: "cuda12-pocket-tts"
+    nvidia-l4t-cuda-12: "nvidia-l4t-pocket-tts"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-pocket-tts"
+  icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1605,3 +1627,86 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice"
   mirrors:
     - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice
+## pocket-tts
+- !!merge <<: *pocket-tts
+  name: "pocket-tts-development"
+  capabilities:
+    nvidia: "cuda12-pocket-tts-development"
+    intel: "intel-pocket-tts-development"
+    amd: "rocm-pocket-tts-development"
+    nvidia-l4t: "nvidia-l4t-pocket-tts-development"
+    default: "cpu-pocket-tts-development"
+    nvidia-cuda-13: "cuda13-pocket-tts-development"
+    nvidia-cuda-12: "cuda12-pocket-tts-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-pocket-tts-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-pocket-tts-development"
+- !!merge <<: *pocket-tts
+  name: "cpu-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-cpu-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cpu-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-cpu-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda12-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda12-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda13-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda13-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "intel-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "intel-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "rocm-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "rocm-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "nvidia-l4t-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "nvidia-l4t-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda13-nvidia-l4t-arm64-pocket-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-pocket-tts"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-pocket-tts
+- !!merge <<: *pocket-tts
+  name: "cuda13-nvidia-l4t-arm64-pocket-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts

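These gallery entries are what make the backend installable by name: the `capabilities` map selects the image matching the detected hardware and falls back to `cpu-pocket-tts`, while the `-development` variants track the `master` tags. A sketch of installing it through the backend gallery, assuming the `local-ai backends install` subcommand available in recent LocalAI releases:

```bash
# Hypothetical install from the backend gallery; the name comes from the
# index entry above, and image selection follows its capabilities map.
local-ai backends install pocket-tts
```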