1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -124,6 +124,7 @@ jobs:
             sudo rm -rf "$AGENT_TOOLSDIRECTORY"
           fi
           pip install -e ".[dev]"
+          pip install xllamacpp
           if [ "$MODULE" == "metal" ]; then
             conda install -c conda-forge "ffmpeg<7"
             pip install "mlx>=0.22.0"
2 changes: 2 additions & 0 deletions setup.cfg
@@ -80,6 +80,7 @@ dev =
     sphinx-design
 all =
     llama-cpp-python>=0.2.25,!=0.2.58
+    xllamacpp
     transformers>=4.46.0
     torch>=2.0.0  # >=2.0 For CosyVoice
     accelerate>=0.28.0
@@ -155,6 +156,7 @@ intel =
     intel_extension_for_pytorch==2.1.10+xpu
 llama_cpp =
     llama-cpp-python>=0.2.25,!=0.2.58
+    xllamacpp
 transformers =
     transformers>=4.46.0
     torch
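With xllamacpp added to both the all and llama_cpp extras, either extra now pulls it in alongside llama-cpp-python. A quick sanity check from a checkout of this branch (hypothetical commands, assuming a standard editable install):

    pip install -e ".[llama_cpp]"
    python -c "import importlib.metadata as m; print(m.version('xllamacpp'))"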
14 changes: 11 additions & 3 deletions xinference/client/tests/test_client.py
@@ -126,11 +126,15 @@ def _check(stream=False):
             "AI is going to", generate_config={"stream": stream, "max_tokens": 5}
         )
         if stream:
+            count = 0
+            has_text = False
             for chunk in completion:
                 assert "text" in chunk["choices"][0]
-                assert (
-                    chunk["choices"][0]["text"] or chunk["choices"][0]["finish_reason"]
-                )
+                if chunk["choices"][0]["text"]:
+                    has_text = True
+                count += 1
+            assert has_text
+            assert count > 2
         else:
             assert "text" in completion["choices"][0]
             assert len(completion["choices"][0]["text"]) > 0
@@ -161,6 +165,10 @@ def _check(stream=False):
     assert len(client.list_models()) == 0


+def test_RESTful_client_xllamacpp(set_use_xllamacpp, setup):
+    test_RESTful_client(setup)
+
+
 @pytest.mark.skipif(os.name == "nt", reason="Skip windows")
 def test_list_cached_models(setup):
     endpoint, _ = setup
7 changes: 7 additions & 0 deletions xinference/conftest.py
@@ -304,3 +304,10 @@ def setup_with_auth():
             os.remove(auth_file)
         except:
             pass
+
+
+@pytest.fixture
+def set_use_xllamacpp():
+    os.environ["USE_XLLAMACPP"] = "1"
+    yield
+    del os.environ["USE_XLLAMACPP"]
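The fixture toggles USE_XLLAMACPP for the duration of a test; the wrapper tests added in test_client.py and test_restful_api.py request it and then re-run the existing suites against the new backend. A minimal sketch of the kind of env-gated switch this presumably drives (hypothetical helper, not code from this PR):

    import os

    def _use_xllamacpp() -> bool:
        # Hypothetical: opt in to the xllamacpp backend when the
        # USE_XLLAMACPP environment variable (set by the fixture) is "1".
        return os.environ.get("USE_XLLAMACPP") == "1"

One caveat on the fixture itself: del os.environ[...] drops any value that was set before the test ran, whereas pytest's built-in monkeypatch.setenv would restore it automatically.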
4 changes: 3 additions & 1 deletion xinference/core/model.py
@@ -231,6 +231,7 @@ def __init__(
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+        from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
@@ -251,7 +252,8 @@ def __init__(
         self._lock = (
             None
             if isinstance(
-                self._model, (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel)
+                self._model,
+                (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
             )
             else asyncio.locks.Lock()
         )
5 changes: 5 additions & 0 deletions xinference/core/tests/test_restful_api.py
@@ -317,6 +317,11 @@ async def test_restful_api(setup):
     assert custom_model_reg is None


+@pytest.mark.asyncio
+async def test_restful_api_xllamacpp(set_use_xllamacpp, setup):
+    await test_restful_api(setup)
+
+
 def test_restful_api_for_embedding(setup):
     model_name = "gte-base"
     model_spec = BUILTIN_EMBEDDING_MODELS[model_name]
2 changes: 1 addition & 1 deletion xinference/core/worker.py
@@ -1002,7 +1002,7 @@ async def terminate_model(self, model_uid: str, is_model_die=False):
         )
         try:
             subpool_address = self._model_uid_to_addr[model_uid]
-            await self._main_pool.remove_sub_pool(subpool_address)
+            await self._main_pool.remove_sub_pool(subpool_address, force=True)
         except Exception as e:
             logger.debug(
                 "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
2 changes: 2 additions & 0 deletions xinference/deploy/docker/Dockerfile
@@ -36,6 +36,8 @@ RUN pip install --upgrade -i "$PIP_INDEX" pip && \
     python3 setup.py build_web && \
     git restore . && \
     pip install -i "$PIP_INDEX" --no-deps "." && \
+    pip uninstall xllamacpp -y && \
+    pip install https://github.com/xorbitsai/xllamacpp/releases/download/v0.1.9-cu124/xllamacpp_cuda12x-0.1.9-cp310-cp310-manylinux_2_35_x86_64.whl && \
     # clean packages
     pip cache purge
1 change: 1 addition & 0 deletions xinference/deploy/docker/cpu.Dockerfile
@@ -26,6 +26,7 @@ RUN python -m pip install --upgrade -i "$PIP_INDEX" pip && \
     python setup.py build_web && \
     git restore . && \
     pip install -i "$PIP_INDEX" --no-deps "." && \
+    pip install -i "$PIP_INDEX" xllamacpp && \
     # clean packages
     pip cache purge
3 changes: 2 additions & 1 deletion xinference/model/llm/__init__.py
@@ -129,7 +129,7 @@ def register_custom_model():


 def _install():
-    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
@@ -169,6 +169,7 @@ def _install():
         [
             LlamaCppChatModel,
             LlamaCppModel,
+            XllamaCppModel,
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
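Registering XllamaCppModel in the llama.cpp class list makes it a candidate during backend selection. As a rough illustration of list-based dispatch (hypothetical names and signature, not the project's actual selection code):

    # First registered class whose match() predicate accepts the model wins.
    def select_backend(classes, family, spec, quantization):
        for cls in classes:
            if cls.match(family, spec, quantization):
                return cls
        raise ValueError("no backend matched the requested model")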