Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions gpu-metrics/gpu_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <string>
#include <vector>
#include <stdexcept>
#include <iostream>
#include <level_zero/ze_api.h>
#include <level_zero/zes_api.h>

namespace py = pybind11;

// Translate a Level Zero result code into a human-readable message.
// Common failure modes get their symbolic name (plus a hint where one is
// actionable); anything else falls back to the raw numeric code so the
// error is never silently dropped.
std::string getZesErrorString(ze_result_t res) {
    switch (res) {
        case ZE_RESULT_SUCCESS:
            return "ZE_RESULT_SUCCESS";
        case ZE_RESULT_ERROR_UNINITIALIZED:
            return "ZE_RESULT_ERROR_UNINITIALIZED (was zesInit called / is ZES_ENABLE_SYSMAN set?)";
        case ZE_RESULT_ERROR_DEVICE_LOST:
            return "ZE_RESULT_ERROR_DEVICE_LOST";
        case ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS:
            return "ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS (try elevated privileges)";
        case ZE_RESULT_ERROR_UNSUPPORTED_FEATURE:
            return "ZE_RESULT_ERROR_UNSUPPORTED_FEATURE";
        case ZE_RESULT_ERROR_NOT_AVAILABLE:
            return "ZE_RESULT_ERROR_NOT_AVAILABLE";
        default:
            return "Level Zero Error Code: " + std::to_string(res);
    }
}

// Throw a std::runtime_error describing `operation` whenever `res`
// signals anything other than success; a success code is a no-op.
void checkZesResult(ze_result_t res, const char* operation) {
    if (res == ZE_RESULT_SUCCESS) {
        return;
    }
    std::string message{operation};
    message += " failed: ";
    message += getZesErrorString(res);
    throw std::runtime_error(message);
}

// func to get all the gpu metrics
py::dict get_gpu_metrics() {
py::dict results;

// init sysman api
ze_result_t res = zesInit(0);
if (res != ZE_RESULT_SUCCESS) {
return results;
}

uint32_t driverCount = 0;
try {
checkZesResult(zesDriverGet(&driverCount, nullptr), "zesDriverGet (count)");
} catch (...) {
return results;
}

if (driverCount == 0) return results;

std::vector<zes_driver_handle_t> drivers(driverCount);
checkZesResult(zesDriverGet(&driverCount, drivers.data()), "zesDriverGet");

zes_driver_handle_t driver = drivers[0];

uint32_t deviceCount = 0;
checkZesResult(zesDeviceGet(driver, &deviceCount, nullptr), "zesDeviceGet (count)");
if (deviceCount == 0) return results;

std::vector<zes_device_handle_t> devices(deviceCount);
checkZesResult(zesDeviceGet(driver, &deviceCount, devices.data()), "zesDeviceGet");

for (uint32_t i = 0; i < deviceCount; ++i) {
zes_device_handle_t sysmanDevice = devices[i];
py::dict gpu_data;

// device props
zes_device_properties_t device_props = {};
device_props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
if (zesDeviceGetProperties(sysmanDevice, &device_props) == ZE_RESULT_SUCCESS) {
gpu_data["name"] = std::string(device_props.core.name);
} else {
gpu_data["name"] = "Unknown Intel GPU";
}

// temperature (might be empty arr depending on your perms)
uint32_t tempSensorCount = 0;
if (zesDeviceEnumTemperatureSensors(sysmanDevice, &tempSensorCount, nullptr) == ZE_RESULT_SUCCESS && tempSensorCount > 0) {
std::vector<zes_temp_handle_t> tempSensors(tempSensorCount);
if (zesDeviceEnumTemperatureSensors(sysmanDevice, &tempSensorCount, tempSensors.data()) == ZE_RESULT_SUCCESS) {
py::list temps;
for (uint32_t j = 0; j < tempSensorCount; ++j) {
double temperature = 0.0;
if (zesTemperatureGetState(tempSensors[j], &temperature) == ZE_RESULT_SUCCESS) {
temps.append(temperature);
}
}
gpu_data["temperature"] = temps;
}
}

// memory state
uint32_t memCount = 0;
if (zesDeviceEnumMemoryModules(sysmanDevice, &memCount, nullptr) == ZE_RESULT_SUCCESS && memCount > 0) {
std::vector<zes_mem_handle_t> memories(memCount);
if (zesDeviceEnumMemoryModules(sysmanDevice, &memCount, memories.data()) == ZE_RESULT_SUCCESS) {
py::list mems;
for (uint32_t j = 0; j < memCount; ++j) {
zes_mem_state_t mem_state = {};
mem_state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
if (zesMemoryGetState(memories[j], &mem_state) == ZE_RESULT_SUCCESS) {
py::dict mem_info;
mem_info["total"] = mem_state.size;
mem_info["free"] = mem_state.free;
mem_info["used"] = mem_state.size - mem_state.free;
mems.append(mem_info);
}
}
gpu_data["memory"] = mems;
}
}

// power and energy counters
uint32_t powerCount = 0;
if (zesDeviceEnumPowerDomains(sysmanDevice, &powerCount, nullptr) == ZE_RESULT_SUCCESS && powerCount > 0) {
std::vector<zes_pwr_handle_t> powers(powerCount);
if (zesDeviceEnumPowerDomains(sysmanDevice, &powerCount, powers.data()) == ZE_RESULT_SUCCESS) {
py::list pwrs;
for (uint32_t j = 0; j < powerCount; ++j) {
zes_power_energy_counter_t pwr_state = {};
if (zesPowerGetEnergyCounter(powers[j], &pwr_state) == ZE_RESULT_SUCCESS) {
py::dict p_info;
p_info["energy"] = pwr_state.energy;
p_info["timestamp"] = pwr_state.timestamp;
pwrs.append(p_info);
}
}
gpu_data["power"] = pwrs;
}
}

// engines (utilization / clocks)
uint32_t engineCount = 0;
if (zesDeviceEnumEngineGroups(sysmanDevice, &engineCount, nullptr) == ZE_RESULT_SUCCESS && engineCount > 0) {
std::vector<zes_engine_handle_t> engines(engineCount);
if (zesDeviceEnumEngineGroups(sysmanDevice, &engineCount, engines.data()) == ZE_RESULT_SUCCESS) {
py::list engs;
for (uint32_t j = 0; j < engineCount; ++j) {
zes_engine_stats_t eng_state = {};
if (zesEngineGetActivity(engines[j], &eng_state) == ZE_RESULT_SUCCESS) {
py::dict e_info;
e_info["active_time"] = eng_state.activeTime; // microsecs
e_info["timestamp"] = eng_state.timestamp; // microsecs

zes_engine_properties_t e_props = {};
e_props.stype = ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES;
if (zesEngineGetProperties(engines[j], &e_props) == ZE_RESULT_SUCCESS) {
e_info["type"] = static_cast<int>(e_props.type);
}

engs.append(e_info);
}
}
gpu_data["engines"] = engs;
}
}

// frequency (clock speeds)
uint32_t freqCount = 0;
if (zesDeviceEnumFrequencyDomains(sysmanDevice, &freqCount, nullptr) == ZE_RESULT_SUCCESS && freqCount > 0) {
std::vector<zes_freq_handle_t> freqs(freqCount);
if (zesDeviceEnumFrequencyDomains(sysmanDevice, &freqCount, freqs.data()) == ZE_RESULT_SUCCESS) {
py::list f_list;
for(uint32_t j = 0; j < freqCount; ++j) {
zes_freq_state_t f_state = {};
f_state.stype = ZES_STRUCTURE_TYPE_FREQ_STATE;
if(zesFrequencyGetState(freqs[j], &f_state) == ZE_RESULT_SUCCESS) {
f_list.append(f_state.actual); // mhz
}
}
gpu_data["clocks_mhz"] = f_list;
}
}

results[std::to_string(i).c_str()] = gpu_data;
}

return results;
}

// Python module definition: builds the importable `gpu_metrics` extension
// and exposes get_gpu_metrics() at module level.  The strings below are
// surfaced to Python as the module and function docstrings.
PYBIND11_MODULE(gpu_metrics, m) {
m.doc() = "Intel GPU Metrics via Level Zero Sysman API";
m.def("get_gpu_metrics", &get_gpu_metrics, "Fetch all available hardware metrics like temperature, memory, and clocks from Intel GPUs");
}
11 changes: 11 additions & 0 deletions gpu-metrics/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Packaging metadata for the gpu_metrics pybind11 extension.
[build-system]
requires = ["setuptools", "pybind11"]
build-backend = "setuptools.build_meta"

[project]
name = "gpu_metrics"
version = "0.1.0"
description = "Python bindings for Intel GPU Level Zero Metrics"
# NOTE(review): pybind11 is already a build-time requirement above; it is
# presumably not needed at runtime once the extension is compiled — confirm
# before keeping it as an install dependency.
dependencies = [
"pybind11"
]
23 changes: 23 additions & 0 deletions gpu-metrics/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Build script for the gpu_metrics pybind11 extension.

Compiles gpu_metrics.cpp and links it against the Level Zero loader
(ze_loader), which provides the Sysman API used to read GPU telemetry.
"""
from setuptools import setup, Extension
import pybind11

ext_modules = [
    Extension(
        "gpu_metrics",
        ["gpu_metrics.cpp"],
        include_dirs=[
            pybind11.get_include(),
            # System include dir where level_zero/ze_api.h is expected.
            # (The previous "/usr/lib/pkgconfig/../../include" entry
            # resolved to this same directory and was removed.)
            "/usr/include",
        ],
        libraries=["ze_loader"],  # crucial: links the Level Zero Sysman API
        extra_compile_args=["-std=c++17"],
    ),
]

setup(
    name="gpu_metrics",
    version="0.1.0",
    description="Python bindings for Intel GPU Level Zero Metrics",
    ext_modules=ext_modules,
)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"openvino-genai>=2025.3.0.0",
"optimum[openvino]>1.26.1",
"pip>=25.2",
"pybind11>=3.0.3",
"pydantic>=2.11.7",
"pynput>=1.8.1",
"pytest>=8.4.2",
Expand Down
8 changes: 8 additions & 0 deletions src/cli/modules/launch_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ def start_server(host: str = "0.0.0.0", port: int = 8001, reload: bool = False):
logger.info(" - POST /openarc/load Load a model")
logger.info(" - POST /openarc/unload Unload a model")
logger.info(" - GET /openarc/status Get model status")
logger.info(" - GET /openarc/metrics Get hardware telemetry")
logger.info(" - POST /openarc/models/update Update model configuration")
logger.info(" - POST /openarc/bench Run inference benchmark")
logger.info(" - GET /openarc/downloader List active model downloads")
logger.info(" - POST /openarc/downloader Start a model download")
Comment on lines +91 to +95
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The endpoint list printed at startup doesn’t include new routes added in this PR (GET /openarc/version and GET /openarc/models). Please keep this list in sync with the server routes so users can discover the full API surface.

Copilot uses AI. Check for mistakes.
logger.info(" - DELETE /openarc/downloader Cancel a model download")
logger.info(" - POST /openarc/downloader/pause Pause a model download")
logger.info(" - POST /openarc/downloader/resume Resume a model download")
logger.info("--------------------------------")
logger.info("OpenAI compatible endpoints:")
logger.info(" - GET /v1/models")
Expand Down
Loading