Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed assets/openarc_list.png
Binary file not shown.
Binary file removed assets/openarc_status.png
Binary file not shown.
Empty file removed docs/devices.md
Empty file.
13 changes: 0 additions & 13 deletions docs/openarc_load.md

This file was deleted.

File renamed without changes.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ dependencies = [
"librosa>=0.11.0",
"misaki[en]>=0.9.4",
"openai>1.109.1",
"openvino-genai>2025.3.0.0",
"openvino-genai>=2025.3.0.0",
"optimum[openvino]>1.26.1",
"pydantic>=2.11.7",
"pyyaml>=6.0.2",
"rich-click>=1.8.9",
"torch>2.6.0",
"torchvision>0.23.0",
"torchvision>=0.23.0",
"uvicorn>=0.35.0",
]

Expand Down
6 changes: 3 additions & 3 deletions src/cli/openarc_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,9 @@ def cli():
required=True,
help='Engine used to load the model (ovgenai, openvino, optimum)')
@click.option('--model-type', '--mt',
type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro']),
type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb']),
required=True,
help='Model type (llm, vlm, whisper, kokoro)')
help='Model type (llm, vlm, whisper, kokoro, emb)')
@click.option('--device', '--d',
required=True,
help='Device(s) to load the model on.')
Expand Down Expand Up @@ -353,7 +353,7 @@ def status(ctx):
console.print("[yellow]No models currently loaded.[/yellow]")
else:
# Create a table for all models
status_table = Table()
status_table = Table(title=f"📊 Loaded Models ({total_models})")
status_table.add_column("model_name", style="cyan", width=20)
status_table.add_column("device", style="blue", width=10)
status_table.add_column("model_type", style="magenta", width=15)
Expand Down
112 changes: 112 additions & 0 deletions src/engine/optimum/optimum_emb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@


import asyncio
import gc
import logging
from typing import Any, AsyncIterator, Dict, List, Union

import torch
import torch.nn.functional as F
from torch import Tensor

from transformers import AutoTokenizer
from optimum.intel import OVModelForFeatureExtraction

from src.server.models.optimum import PreTrainedTokenizerConfig

from typing import Any, AsyncIterator, Dict

from src.server.model_registry import ModelLoadConfig, ModelRegistry




class Optimum_EMB:
    """Embedding engine backed by optimum-intel's ``OVModelForFeatureExtraction``.

    Lifecycle: construct with a ``ModelLoadConfig``, then ``load_model()``,
    serve requests via ``generate_embeddings()``, and release resources with
    ``unload_model()``.
    """

    def __init__(self, load_config: "ModelLoadConfig"):
        # BUG FIX: the original initialized `self.encoder_tokenizer`, but
        # generate_embeddings() reads `self.tokenizer` — which was never set.
        # Both model and tokenizer are populated by load_model(); initializing
        # them here also makes unload_model() safe to call before any load.
        self.model = None
        self.tokenizer = None
        self.model_path = None
        self.load_config = load_config

    def last_token_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        """Pool each sequence's hidden state at its final non-padding token.

        Args:
            last_hidden_states: (batch, seq_len, hidden) model output.
            attention_mask: (batch, seq_len) mask; 1 = real token, 0 = padding.

        Returns:
            (batch, hidden) tensor of per-sequence embeddings.
        """
        # If every sequence attends at the last position, the batch is
        # left-padded (or unpadded), so position -1 is the last real token.
        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            return last_hidden_states[:, -1]
        # Right padding: gather each row at its last attended index.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

    async def generate_embeddings(self, tok_config: "PreTrainedTokenizerConfig") -> AsyncIterator[List[List[float]]]:
        """Tokenize input text, run the model, pool, and yield embeddings.

        Yields a single ``list`` of embedding vectors (``Tensor.tolist()``).
        Embeddings are L2-normalized only when ``return_tensors == "pt"``.

        Raises:
            RuntimeError: if load_model() has not been called yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model is not loaded; call load_model() first")

        # Forward the full tokenizer configuration straight through.
        batch_dict = self.tokenizer(
            text=tok_config.text,
            text_pair=tok_config.text_pair,
            text_target=tok_config.text_target,
            text_pair_target=tok_config.text_pair_target,
            add_special_tokens=tok_config.add_special_tokens,
            padding=tok_config.padding,
            truncation=tok_config.truncation,
            max_length=tok_config.max_length,
            stride=tok_config.stride,
            is_split_into_words=tok_config.is_split_into_words,
            pad_to_multiple_of=tok_config.pad_to_multiple_of,
            padding_side=tok_config.padding_side,
            return_tensors=tok_config.return_tensors,
            return_token_type_ids=tok_config.return_token_type_ids,
            return_attention_mask=tok_config.return_attention_mask,
            return_overflowing_tokens=tok_config.return_overflowing_tokens,
            return_special_tokens_mask=tok_config.return_special_tokens_mask,
            return_offsets_mapping=tok_config.return_offsets_mapping,
            return_length=tok_config.return_length,
            verbose=tok_config.verbose
        )
        batch_dict.to(self.model.device)
        outputs = self.model(**batch_dict)
        embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
        # Normalize only when we actually have a torch tensor batch.
        if tok_config.return_tensors == "pt":
            embeddings = F.normalize(embeddings, p=2, dim=1)
        yield embeddings.tolist()

    def collect_metrics(self, tok_config: "PreTrainedTokenizerConfig", perf_metrics) -> Dict[str, Any]:
        """Collect per-request performance metrics.

        Not implemented yet — currently returns None despite the annotation;
        callers must tolerate a missing metrics payload.
        """
        pass

    def load_model(self, loader: "ModelLoadConfig"):
        """Load model using a ModelLoadConfig configuration and cache the tokenizer.

        Args:
            loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
        """
        # export=False: model_path must already contain an exported OpenVINO IR.
        self.model = OVModelForFeatureExtraction.from_pretrained(loader.model_path,
                                                                 device=loader.device,
                                                                 export=False)
        self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
        logging.info(f"Model loaded successfully: {loader.model_name}")

    async def unload_model(self, registry: "ModelRegistry", model_name: str) -> bool:
        """Unregister model from registry and free memory resources.

        Args:
            registry: ModelRegistry to unregister from
            model_name: Public model name previously registered

        Returns:
            True if the model was found and unregistered, else False.
        """
        removed = await registry.register_unload(model_name)

        # Drop strong references so gc can reclaim the weights.
        if self.model is not None:
            del self.model
            self.model = None

        if self.tokenizer is not None:
            del self.tokenizer
            self.tokenizer = None

        gc.collect()
        logging.info(f"[{self.load_config.model_name}] weights and tokenizer unloaded and memory cleaned up")
        return removed
1 change: 1 addition & 0 deletions src/server/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool =
logger.info(" - POST /v1/chat/completions")
logger.info(" - POST /v1/audio/transcriptions: Whisper only")
logger.info(" - POST /v1/audio/speech: Kokoro only")
logger.info(" - POST /v1/embeddings")


uvicorn.run(
Expand Down
66 changes: 65 additions & 1 deletion src/server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import time
import uuid
import traceback
from typing import Any, AsyncIterator, List, Optional, Dict
from typing import Any, AsyncIterator, List, Optional, Dict, Union

from pydantic import BaseModel
from fastapi import Depends, FastAPI, HTTPException, Request
Expand All @@ -21,6 +21,7 @@
from src.server.worker_registry import WorkerRegistry
from src.server.models.openvino import OV_KokoroGenConfig
from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig
from src.server.models.optimum import PreTrainedTokenizerConfig

#===============================================================#
# Logging
Expand Down Expand Up @@ -151,6 +152,15 @@ class OpenAIKokoroRequest(BaseModel):
language: Optional[str] = None
response_format: Optional[str] = "wav"

# https://platform.openai.com/docs/api-reference/embeddings
class EmbeddingsRequest(BaseModel):
    """OpenAI-compatible request body for POST /v1/embeddings.

    Mirrors https://platform.openai.com/docs/api-reference/embeddings, plus a
    project-specific ``config`` escape hatch for full tokenizer control.
    """
    model: str
    input: Union[str, List[str], List[List[str]]]
    dimensions: Optional[int] = None
    encoding_format: Optional[str] = "float"  # not implemented
    # BUG FIX: original line ended with a trailing comma, which made the
    # field default the tuple (None,) instead of None.
    user: Optional[str] = None  # not implemented
    # end of openai api fields
    config: Optional[PreTrainedTokenizerConfig] = None
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(dont make this change) but would your approach in EmbeddingRequest work here?

config_kwargs = {

yours looks much cleaner since there are a t-o-n of sampling setting to keep organized. nice


@app.get("/v1/models", dependencies=[Depends(verify_api_key)])
async def openai_list_models():
Expand Down Expand Up @@ -336,3 +346,57 @@ async def openai_audio_speech(request: OpenAIKokoroRequest):
except Exception as exc:
raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {str(exc)}")

@app.post("/v1/embeddings", dependencies=[Depends(verify_api_key)])
async def embeddings(request: EmbeddingsRequest):
    """OpenAI-compatible embeddings endpoint.

    Builds a tokenizer config from the request (an explicit ``config`` wins
    over the raw ``input``), dispatches to the worker for the named model,
    and wraps the vectors in the OpenAI response envelope.

    Raises:
        HTTPException: 400 on invalid input (ValueError), 500 otherwise.
    """
    try:
        # Default config simply wraps the raw input text.
        tok_config = PreTrainedTokenizerConfig(
            text=request.input
        )

        # An explicit tokenizer config takes precedence; backfill the text
        # from `input` if the caller left it unset.
        if request.config:
            tok_config = request.config
            if not tok_config.text:
                tok_config.text = request.input

        # NOTE(review): OpenAI's `dimensions` means the size of the OUTPUT
        # embedding, not an input-token limit — mapping it onto max_length
        # truncates the prompt instead. Behavior preserved from the original;
        # confirm this is intended.
        if not tok_config.max_length and request.dimensions:
            tok_config.max_length = request.dimensions

        model_name = request.model
        created_ts = int(time.time())
        request_id = f"ov-{uuid.uuid4().hex[:24]}"

        result = await _workers.embed(model_name, tok_config)
        # Guard: a worker that returns no "data" used to crash len(None)
        # and surface as an opaque 500.
        data = result.get("data") or []
        metrics = result.get("metrics", {}) or {}

        prompt_tokens = metrics.get("input_token", 0)
        total_tokens = metrics.get("total_token", prompt_tokens)

        embs = [
            {
                "index": i,
                "object": "embedding",
                "embedding": vector,
            }
            for i, vector in enumerate(data)
        ]

        response = {
            "id": request_id,
            "object": "list",
            "created": created_ts,
            "model": model_name,
            "data": embs,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens,
            },
        }

        return response
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Embedding failed: {str(exc)}")
5 changes: 4 additions & 1 deletion src/server/model_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ class ModelType(str, Enum):
- llm: Text-to-text LLM models
- vlm: Image-to-text VLM models
- whisper: Whisper ASR models
- kokoro: Kokoro TTS models"""
- kokoro: Kokoro TTS models
- emb: Text-to-vector models"""

LLM = "llm"
VLM = "vlm"
WHISPER = "whisper"
KOKORO = "kokoro"
EMB = "emb"

class EngineType(str, Enum):
"""Engine used to load the model.
Expand Down Expand Up @@ -288,6 +290,7 @@ async def status(self) -> dict:
(EngineType.OV_GENAI, ModelType.VLM): "src.engine.ov_genai.vlm.OVGenAI_VLM",
(EngineType.OV_GENAI, ModelType.WHISPER): "src.engine.ov_genai.whisper.OVGenAI_Whisper",
(EngineType.OPENVINO, ModelType.KOKORO): "src.engine.openvino.kokoro.OV_Kokoro",
(EngineType.OV_OPTIMUM, ModelType.EMB): "src.engine.optimum.optimum_emb.Optimum_EMB",
}

async def create_model_instance(load_config: ModelLoadConfig) -> Any:
Expand Down
Loading