1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -124,6 +124,7 @@ jobs:
             sudo rm -rf "$AGENT_TOOLSDIRECTORY"
           fi
           pip install -e ".[dev]"
+          pip install xllamacpp
           if [ "$MODULE" == "metal" ]; then
             conda install -c conda-forge "ffmpeg<7"
             pip install "mlx>=0.22.0"
2 changes: 2 additions & 0 deletions setup.cfg
@@ -80,6 +80,7 @@ dev =
     sphinx-design
 all =
     llama-cpp-python>=0.2.25,!=0.2.58
+    xllamacpp
     transformers>=4.46.0
     torch>=2.0.0  # >=2.0 For CosyVoice
     accelerate>=0.28.0
@@ -155,6 +156,7 @@ intel =
     intel_extension_for_pytorch==2.1.10+xpu
 llama_cpp =
     llama-cpp-python>=0.2.25,!=0.2.58
+    xllamacpp
 transformers =
     transformers>=4.46.0
     torch
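With xllamacpp added to both the all and llama_cpp extras, either extra now pulls it in alongside llama-cpp-python. A quick sanity check from a checkout of this branch (hypothetical commands, assuming a standard editable install):

    pip install -e ".[llama_cpp]"
    python -c "import importlib.metadata as m; print(m.version('xllamacpp'))"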
14 changes: 11 additions & 3 deletions xinference/client/tests/test_client.py
@@ -126,11 +126,15 @@ def _check(stream=False):
             "AI is going to", generate_config={"stream": stream, "max_tokens": 5}
         )
         if stream:
+            count = 0
+            has_text = False
             for chunk in completion:
                 assert "text" in chunk["choices"][0]
-                assert (
-                    chunk["choices"][0]["text"] or chunk["choices"][0]["finish_reason"]
-                )
+                if chunk["choices"][0]["text"]:
+                    has_text = True
+                count += 1
+            assert has_text
+            assert count > 2
         else:
             assert "text" in completion["choices"][0]
             assert len(completion["choices"][0]["text"]) > 0
@@ -161,6 +165,10 @@ def _check(stream=False):
     assert len(client.list_models()) == 0


+def test_RESTful_client_xllamacpp(set_use_xllamacpp, setup):
+    test_RESTful_client(setup)
+
+
 @pytest.mark.skipif(os.name == "nt", reason="Skip windows")
 def test_list_cached_models(setup):
     endpoint, _ = setup
7 changes: 7 additions & 0 deletions xinference/conftest.py
@@ -304,3 +304,10 @@ def setup_with_auth():
             os.remove(auth_file)
         except:
             pass
+
+
+@pytest.fixture
+def set_use_xllamacpp():
+    os.environ["USE_XLLAMACPP"] = "1"
+    yield
+    del os.environ["USE_XLLAMACPP"]
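The fixture toggles USE_XLLAMACPP for the duration of a test; the wrapper tests added in test_client.py and test_restful_api.py request it and then re-run the existing suites against the new backend. A minimal sketch of the kind of env-gated switch this presumably drives (hypothetical helper, not code from this PR):

    import os

    def _use_xllamacpp() -> bool:
        # Hypothetical: opt in to the xllamacpp backend when the
        # USE_XLLAMACPP environment variable (set by the fixture) is "1".
        return os.environ.get("USE_XLLAMACPP") == "1"

One caveat on the fixture itself: del os.environ[...] drops any value that was set before the test ran, whereas pytest's built-in monkeypatch.setenv would restore it automatically.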
4 changes: 3 additions & 1 deletion xinference/core/model.py
@@ -231,6 +231,7 @@ def __init__(
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+        from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
@@ -251,7 +252,8 @@ def __init__(
         self._lock = (
             None
             if isinstance(
-                self._model, (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel)
+                self._model,
+                (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
             )
             else asyncio.locks.Lock()
         )
5 changes: 5 additions & 0 deletions xinference/core/tests/test_restful_api.py
@@ -317,6 +317,11 @@ async def test_restful_api(setup):
     assert custom_model_reg is None


+@pytest.mark.asyncio
+async def test_restful_api_xllamacpp(set_use_xllamacpp, setup):
+    await test_restful_api(setup)
+
+
 def test_restful_api_for_embedding(setup):
     model_name = "gte-base"
     model_spec = BUILTIN_EMBEDDING_MODELS[model_name]
2 changes: 1 addition & 1 deletion xinference/core/worker.py
@@ -1002,7 +1002,7 @@ async def terminate_model(self, model_uid: str, is_model_die=False):
         )
         try:
             subpool_address = self._model_uid_to_addr[model_uid]
-            await self._main_pool.remove_sub_pool(subpool_address)
+            await self._main_pool.remove_sub_pool(subpool_address, force=True)
         except Exception as e:
             logger.debug(
                 "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
2 changes: 2 additions & 0 deletions xinference/deploy/docker/Dockerfile
@@ -36,6 +36,8 @@ RUN pip install --upgrade -i "$PIP_INDEX" pip && \
     python3 setup.py build_web && \
     git restore . && \
     pip install -i "$PIP_INDEX" --no-deps "." && \
+    pip uninstall xllamacpp -y && \
+    pip install https://github.com/xorbitsai/xllamacpp/releases/download/v0.1.9-cu124/xllamacpp_cuda12x-0.1.9-cp310-cp310-manylinux_2_35_x86_64.whl && \
     # clean packages
     pip cache purge
1 change: 1 addition & 0 deletions xinference/deploy/docker/cpu.Dockerfile
@@ -26,6 +26,7 @@ RUN python -m pip install --upgrade -i "$PIP_INDEX" pip && \
     python setup.py build_web && \
     git restore . && \
     pip install -i "$PIP_INDEX" --no-deps "." && \
+    pip install -i "$PIP_INDEX" xllamacpp && \
     # clean packages
     pip cache purge
3 changes: 2 additions & 1 deletion xinference/model/llm/__init__.py
@@ -129,7 +129,7 @@ def register_custom_model():


 def _install():
-    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
@@ -169,6 +169,7 @@ def _install():
         [
             LlamaCppChatModel,
             LlamaCppModel,
+            XllamaCppModel,
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
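Registering XllamaCppModel in the llama.cpp class list makes it a candidate during backend selection. As a rough illustration of list-based dispatch (hypothetical names and signature, not the project's actual selection code):

    # First registered class whose match() predicate accepts the model wins.
    def select_backend(classes, family, spec, quantization):
        for cls in classes:
            if cls.match(family, spec, quantization):
                return cls
        raise ValueError("no backend matched the requested model")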