
Commit 99c2448

Feat/954 llama cpp (#1000)
Co-authored-by: David Berenstein <[email protected]>
1 parent 344cce7 commit 99c2448

File tree

5 files changed: +459 -1 lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -77,4 +77,3 @@ venv.bak/
 # Other
 *.log
 *.swp
-.DS_Store

src/distilabel/models/embeddings/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.

 from distilabel.models.embeddings.base import Embeddings
+from distilabel.models.embeddings.llamacpp import LlamaCppEmbeddings
 from distilabel.models.embeddings.sentence_transformers import (
     SentenceTransformerEmbeddings,
 )
@@ -22,4 +23,5 @@
     "Embeddings",
     "SentenceTransformerEmbeddings",
     "vLLMEmbeddings",
+    "LlamaCppEmbeddings",
 ]
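
With this re-export, `LlamaCppEmbeddings` sits alongside the other embedding backends and can be imported directly from `distilabel.models.embeddings`. A minimal sketch for illustration only: the GGUF file name is just the example model used elsewhere in this commit, and either `model_path` or `repo_id` still has to be set before `load()`.

```python
from distilabel.models.embeddings import LlamaCppEmbeddings

# Only the GGUF file name is set here; see the class docstring below for full examples.
embeddings = LlamaCppEmbeddings(model="all-MiniLM-L6-v2-Q2_K.gguf")
```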
src/distilabel/models/embeddings/llamacpp.py

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

from pydantic import Field, PrivateAttr

from distilabel.mixins.runtime_parameters import RuntimeParameter
from distilabel.models.embeddings.base import Embeddings
from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin

if TYPE_CHECKING:
    from llama_cpp import Llama


class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin):
    """`LlamaCpp` library implementation for embedding generation.

    Attributes:
        model: the name of the GGUF quantized model, compatible with the
            installed version of the `llama.cpp` Python bindings.
        model_path: the path to the GGUF quantized model, compatible with the
            installed version of the `llama.cpp` Python bindings.
        repo_id: the Hugging Face Hub repository id.
        verbose: whether to print verbose output. Defaults to `False`.
        n_gpu_layers: number of layers to run on the GPU. Defaults to `-1` (use the GPU if available).
        disable_cuda_device_placement: whether to disable CUDA device placement. Defaults to `True`.
        normalize_embeddings: whether to normalize the embeddings. Defaults to `False`.
        seed: RNG seed, `-1` for random.
        n_ctx: text context size, `0` = from model.
        n_batch: prompt processing maximum batch size.
        extra_kwargs: additional dictionary of keyword arguments that will be passed to the
            `Llama` class of `llama_cpp` library. Defaults to `{}`.

    Runtime parameters:
        - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`.
        - `verbose`: whether to print verbose output. Defaults to `False`.
        - `normalize_embeddings`: whether to normalize the embeddings. Defaults to `False`.
        - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the
            `Llama` class of `llama_cpp` library. Defaults to `{}`.

    References:
        - [Offline inference embeddings](https://llama-cpp-python.readthedocs.io/en/stable/#embeddings)

    Examples:
        Generate sentence embeddings using a local model:

        ```python
        from pathlib import Path
        from distilabel.models.embeddings import LlamaCppEmbeddings

        # You can follow along with this example by downloading the model to the
        # `Downloads` folder with the following command in the terminal:
        # curl -L -o ~/Downloads/all-MiniLM-L6-v2-Q2_K.gguf https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-Q2_K.gguf

        model_path = "Downloads/"
        model = "all-MiniLM-L6-v2-Q2_K.gguf"
        embeddings = LlamaCppEmbeddings(
            model=model,
            model_path=str(Path.home() / model_path),
        )

        embeddings.load()

        results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
        print(results)
        embeddings.unload()
        ```

        Generate sentence embeddings using a HuggingFace Hub model:

        ```python
        from distilabel.models.embeddings import LlamaCppEmbeddings
        # You need to set an environment variable to download a private model to the local machine.

        repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF"
        model = "all-MiniLM-L6-v2-Q2_K.gguf"
        embeddings = LlamaCppEmbeddings(model=model, repo_id=repo_id)

        embeddings.load()

        results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
        print(results)
        embeddings.unload()
        # [
        #   [-0.05447685346007347, -0.01623094454407692, ...],
        #   [4.4889533455716446e-05, 0.044016145169734955, ...],
        # ]
        ```

        Generate sentence embeddings on CPU:

        ```python
        from pathlib import Path
        from distilabel.models.embeddings import LlamaCppEmbeddings

        # You can follow along with this example by downloading the model to the
        # `Downloads` folder with the following command in the terminal:
        # curl -L -o ~/Downloads/all-MiniLM-L6-v2-Q2_K.gguf https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-Q2_K.gguf

        model_path = "Downloads/"
        model = "all-MiniLM-L6-v2-Q2_K.gguf"
        embeddings = LlamaCppEmbeddings(
            model=model,
            model_path=str(Path.home() / model_path),
            n_gpu_layers=0,
            disable_cuda_device_placement=True,
        )

        embeddings.load()

        results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
        print(results)
        embeddings.unload()
        # [
        #   [-0.05447685346007347, -0.01623094454407692, ...],
        #   [4.4889533455716446e-05, 0.044016145169734955, ...],
        # ]
        ```

    """

    model: str = Field(
        description="The name of the model to use for embeddings.",
    )

    model_path: RuntimeParameter[str] = Field(
        default=None,
        description="The path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings.",
    )

    repo_id: RuntimeParameter[str] = Field(
        default=None, description="The Hugging Face Hub repository id.", exclude=True
    )

    n_gpu_layers: RuntimeParameter[int] = Field(
        default=-1,
        description="The number of layers that will be loaded in the GPU.",
    )

    n_ctx: int = 512
    n_batch: int = 512
    seed: int = 4294967295

    normalize_embeddings: RuntimeParameter[bool] = Field(
        default=False,
        description="Whether to normalize the embeddings.",
    )
    verbose: RuntimeParameter[bool] = Field(
        default=False,
        description="Whether to print verbose output from llama.cpp library.",
    )
    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(
        default_factory=dict,
        description="Additional dictionary of keyword arguments that will be passed to the"
        " `Llama` class of `llama_cpp` library. See all the supported arguments at: "
        "https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__",
    )
    _model: Optional["Llama"] = PrivateAttr(...)

    def load(self) -> None:
        """Loads the `gguf` model using either the path or the Hugging Face Hub repository id."""
        super().load()
        CudaDevicePlacementMixin.load(self)

        try:
            from llama_cpp import Llama
        except ImportError as ie:
            raise ImportError(
                "`llama-cpp-python` package is not installed. Please install it using"
                " `pip install llama-cpp-python`."
            ) from ie

        if self.repo_id is not None:
            # use repo_id to download the model
            from huggingface_hub.utils import validate_repo_id

            validate_repo_id(self.repo_id)
            self._model = Llama.from_pretrained(
                repo_id=self.repo_id,
                filename=self.model,
                n_gpu_layers=self.n_gpu_layers,
                seed=self.seed,
                n_ctx=self.n_ctx,
                n_batch=self.n_batch,
                verbose=self.verbose,
                embedding=True,
                kwargs=self.extra_kwargs,
            )
        elif self.model_path is not None:
            self._model = Llama(
                model_path=str(Path(self.model_path) / self.model),
                n_gpu_layers=self.n_gpu_layers,
                seed=self.seed,
                n_ctx=self.n_ctx,
                n_batch=self.n_batch,
                verbose=self.verbose,
                embedding=True,
                kwargs=self.extra_kwargs,
            )
        else:
            raise ValueError("Either 'model_path' or 'repo_id' must be provided")

    def unload(self) -> None:
        """Unloads the `gguf` model."""
        CudaDevicePlacementMixin.unload(self)
        self._model.close()
        super().unload()

    @property
    def model_name(self) -> str:
        """Returns the name of the model."""
        return self.model

    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:
        """Generates embeddings for the provided inputs.

        Args:
            inputs: a list of texts for which an embedding has to be generated.

        Returns:
            The generated embeddings.
        """
        return self._model.embed(inputs, normalize=self.normalize_embeddings)
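
The docstring examples above do not exercise `normalize_embeddings` or `extra_kwargs`. Below is a hedged sketch of how they could be combined, assuming the same local GGUF file as in the other examples and that `extra_kwargs` is forwarded to `llama_cpp.Llama` as documented; `n_threads` is only an illustrative `llama.cpp` option, not something this commit configures.

```python
from pathlib import Path

from distilabel.models.embeddings import LlamaCppEmbeddings

embeddings = LlamaCppEmbeddings(
    model="all-MiniLM-L6-v2-Q2_K.gguf",  # GGUF file name from the docstring examples
    model_path=str(Path.home() / "Downloads"),  # directory that contains the GGUF file
    normalize_embeddings=True,  # ask llama.cpp to normalize the returned vectors
    extra_kwargs={"n_threads": 4},  # extra keyword arguments intended for llama_cpp.Llama
)

embeddings.load()
vectors = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
print(vectors)
embeddings.unload()
```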

tests/unit/conftest.py

Lines changed: 35 additions & 0 deletions
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import atexit
+import os
 from typing import TYPE_CHECKING, Any, Dict, List, Union
+from urllib.request import urlretrieve

 import pytest
 from pydantic import PrivateAttr
@@ -126,3 +129,35 @@ class DummyTaskOfflineBatchGeneration(DummyTask):
 @pytest.fixture
 def dummy_llm() -> AsyncLLM:
     return DummyAsyncLLM()
+
+
+@pytest.fixture(scope="session")
+def local_llamacpp_model_path(tmp_path_factory):
+    """
+    Session-scoped fixture that provides the local model path for LlamaCpp testing.
+
+    Downloads a small test model to a temporary directory.
+    The model is downloaded once per test session and cleaned up after all tests.
+
+    Args:
+        tmp_path_factory: Pytest fixture providing a temporary directory factory.
+
+    Returns:
+        str: The path to the directory containing the downloaded LlamaCpp model file.
+    """
+    model_name = "all-MiniLM-L6-v2-Q2_K.gguf"
+    model_url = f"https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/{model_name}"
+    tmp_path = tmp_path_factory.getbasetemp()
+    model_path = tmp_path / model_name
+
+    if not model_path.exists():
+        urlretrieve(model_url, model_path)
+
+    def cleanup():
+        if model_path.exists():
+            os.remove(model_path)
+
+    # Register the cleanup function to be called at exit
+    atexit.register(cleanup)
+
+    return str(tmp_path)
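
A hypothetical test (not part of this commit) illustrating how the fixture could be consumed: the fixture returns the directory that holds the downloaded GGUF file, so it maps directly onto `model_path`, while `model` carries the file name; `n_gpu_layers=0` keeps the test on CPU.

```python
from distilabel.models.embeddings import LlamaCppEmbeddings


def test_llamacpp_encode(local_llamacpp_model_path):
    embeddings = LlamaCppEmbeddings(
        model="all-MiniLM-L6-v2-Q2_K.gguf",  # file name downloaded by the fixture
        model_path=local_llamacpp_model_path,  # directory returned by the fixture
        n_gpu_layers=0,  # stay on CPU so the test does not need a GPU
    )
    embeddings.load()

    results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])

    # one embedding vector per input text, each made up of floats
    assert len(results) == 2
    assert all(isinstance(value, float) for value in results[0])

    embeddings.unload()
```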
