Merge pull request #11 from McLavish/feature/inference-recommender

McLavish · web-flow · commit d9ed5069cc9d · 2025-11-17T23:36:03.000+01:00
added recommender benchmark
diff --git a/benchmarks/400.inference/413.recommendation/config.json b/benchmarks/400.inference/413.recommendation/config.json
@@ -0,0 +1,6 @@
+{
+  "timeout": 60,
+  "memory": 1024,
+  "languages": ["python"],
+  "modules": ["storage"]
+}
diff --git a/benchmarks/400.inference/413.recommendation/input.py b/benchmarks/400.inference/413.recommendation/input.py
@@ -0,0 +1,30 @@
+import os
+
+
+def buckets_count():
+    return (2, 0)
+
+
+def upload_files(data_root, data_dir, upload_func):
+    for root, _, files in os.walk(data_dir):
+        prefix = os.path.relpath(root, data_root)
+        for file in files:
+            upload_func(0, os.path.join(prefix, file), os.path.join(root, file))
+
+
+def generate_input(
+    data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func
+):
+    model_file = "dlrm_tiny.pt"
+    upload_func(0, model_file, os.path.join(data_dir, "model", model_file))
+
+    requests_file = "requests.jsonl"
+    upload_func(1, requests_file, os.path.join(data_dir, "data", requests_file))
+
+    cfg = {"object": {}, "bucket": {}}
+    cfg["object"]["model"] = model_file
+    cfg["object"]["requests"] = requests_file
+    cfg["bucket"]["bucket"] = benchmarks_bucket
+    cfg["bucket"]["model"] = input_paths[0]
+    cfg["bucket"]["requests"] = input_paths[1]
+    return cfg
diff --git a/benchmarks/400.inference/413.recommendation/python/function.py b/benchmarks/400.inference/413.recommendation/python/function.py
@@ -0,0 +1,144 @@
+import datetime
+import json
+import os
+import uuid
+
+import torch
+import torch.nn as nn
+
+from . import storage
+
+client = storage.storage.get_instance()  # storage client used for all downloads in this module
+
+MODEL_FILE = "dlrm_tiny.pt"  # object name of the checkpoint, joined with the model prefix
+MODEL_CACHE = "/tmp/dlrm_gpu_model"  # directory created by _load_model; not otherwise used in this file
+
+_model = None  # loaded model, set by the first _load_model call and reused afterwards
+_device = torch.device("cpu")  # inference device; overwritten by _load_model via _select_device
+
+
+class TinyDLRM(nn.Module):
+    def __init__(self, num_users, num_items, num_categories, embed_dim=8):
+        super().__init__()
+        self.user_emb = nn.Embedding(num_users, embed_dim)
+        self.item_emb = nn.Embedding(num_items, embed_dim)
+        self.category_emb = nn.Embedding(num_categories, embed_dim)
+        in_dim = embed_dim * 3 + 2
+        hidden = 16
+        self.mlp = nn.Sequential(
+            nn.Linear(in_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+    def forward(self, user_id, item_id, category_id, dense):
+        features = torch.cat(
+            [
+                self.user_emb(user_id),
+                self.item_emb(item_id),
+                self.category_emb(category_id),
+                dense,
+            ],
+            dim=-1,
+        )
+        return torch.sigmoid(self.mlp(features))
+
+
+def _select_device():
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    raise RuntimeError("CUDA is not available")
+    return torch.device("cpu")
+
+
+def _load_model(bucket, prefix):
+    global _model, _device
+
+    if _model is not None:
+        return 0.0, 0.0
+
+    download_begin = datetime.datetime.now()
+    os.makedirs(MODEL_CACHE, exist_ok=True)
+    tmp_path = os.path.join("/tmp", f"{uuid.uuid4()}-{MODEL_FILE}")
+    client.download(bucket, os.path.join(prefix, MODEL_FILE), tmp_path)
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    checkpoint = torch.load(tmp_path, map_location="cpu")
+    meta = checkpoint["meta"]
+    _device = _select_device()
+    model = TinyDLRM(
+        meta["num_users"], meta["num_items"], meta["num_categories"], meta["embed_dim"]
+    )
+    model.load_state_dict(checkpoint["state_dict"])
+    model.to(_device)
+    model.eval()
+    _model = model
+    os.remove(tmp_path)
+    process_end = datetime.datetime.now()
+
+    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
+    process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
+    return download_time, process_time
+
+
+def _prepare_batch(requests):
+    user_ids = torch.tensor([req["user_id"] for req in requests], dtype=torch.long, device=_device)
+    item_ids = torch.tensor([req["item_id"] for req in requests], dtype=torch.long, device=_device)
+    category_ids = torch.tensor(
+        [req["category_id"] for req in requests], dtype=torch.long, device=_device
+    )
+    dense = torch.tensor(
+        [req.get("dense", [0.0, 0.0]) for req in requests], dtype=torch.float32, device=_device
+    )
+    return user_ids, item_ids, category_ids, dense
+
+
+def handler(event):
+    bucket = event.get("bucket", {}).get("bucket")
+    model_prefix = event.get("bucket", {}).get("model")
+    requests_prefix = event.get("bucket", {}).get("requests")
+    requests_key = event.get("object", {}).get("requests")
+
+    download_begin = datetime.datetime.now()
+    req_path = os.path.join("/tmp", f"{uuid.uuid4()}-{os.path.basename(requests_key)}")
+    client.download(bucket, os.path.join(requests_prefix, requests_key), req_path)
+    download_end = datetime.datetime.now()
+
+    model_download_time, model_process_time = _load_model(bucket, model_prefix)
+
+    with open(req_path, "r") as f:
+        payloads = [json.loads(line) for line in f if line.strip()]
+    os.remove(req_path)
+
+    inference_begin = datetime.datetime.now()
+    user_ids, item_ids, category_ids, dense = _prepare_batch(payloads)
+
+    with torch.no_grad():
+        scores = _model(user_ids, item_ids, category_ids, dense).squeeze(-1).tolist()
+    inference_end = datetime.datetime.now()
+
+    predictions = []
+    for req, score in zip(payloads, scores):
+        predictions.append(
+            {
+                "user_id": req["user_id"],
+                "item_id": req["item_id"],
+                "category_id": req["category_id"],
+                "score": score,
+                "device": str(_device),
+            }
+        )
+
+    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
+    compute_time = (inference_end - inference_begin) / datetime.timedelta(microseconds=1)
+
+    return {
+        "result": {"predictions": predictions},
+        "measurement": {
+            "download_time": download_time + model_download_time,
+            "compute_time": compute_time + model_process_time,
+            "model_time": model_process_time,
+            "model_download_time": model_download_time,
+        },
+    }
diff --git a/benchmarks/400.inference/413.recommendation/python/init.sh b/benchmarks/400.inference/413.recommendation/python/init.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# No additional initialization required for GPU recommendation benchmark.
diff --git a/benchmarks/400.inference/413.recommendation/python/package.sh b/benchmarks/400.inference/413.recommendation/python/package.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Report the size of the packaged benchmark directory given as the first argument.
+PACKAGE_DIR=$1
+# Quote the path so directories containing spaces are measured correctly.
+echo "DLRM GPU package size $(du -sh "${PACKAGE_DIR}" | cut -f1)"
diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt b/benchmarks/400.inference/413.recommendation/python/requirements.txt
@@ -0,0 +1 @@
+torch==2.2.2
diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.10 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.10
@@ -0,0 +1 @@
+torch==2.2.2
diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.11 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.11
@@ -0,0 +1 @@
+torch==2.2.2
diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.8 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.8
@@ -0,0 +1 @@
+torch==2.2.2
diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.9 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.9
@@ -0,0 +1 @@
+torch==2.2.2
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -11,6 +11,7 @@
 | Utilities      | 311.compression    | Python   | x64, arm64 | Create a .zip file for a group of files in storage and return to user to download. |
 | Inference      | 411.image-recognition    | Python    | x64 | Image recognition with ResNet and pytorch. |
 | Inference      | 412.language-bert    | Python    | x64 | Sentence classification with a compact BERT model served via ONNX Runtime. |
+| Inference      | 413.recommendation    | Python    | x64 | GPU DLRM-inspired recommender scoring implemented in PyTorch. |
 | Scientific      | 501.graph-pagerank    | Python    | x64, arm64 | PageRank implementation with igraph. |
 | Scientific      | 502.graph-mst    | Python    | x64, arm64 | Minimum spanning tree (MST)  implementation with igraph. |
 | Scientific      | 503.graph-bfs    | Python    | x64, arm64 | Breadth-first search (BFS) implementation with igraph. |
@@ -75,6 +76,10 @@ The benchmark is inspired by MLPerf and implements image recognition with Resnet
 
 This benchmark runs sequence classification with a compact BERT model exported to ONNX. The function downloads the model archive and text samples from storage, tokenizes the sentences, executes the ONNX Runtime session, and returns the predicted labels together with confidences.
 
+### Recommendation
+
+Inspired by MLPerf’s DLRM v2, this benchmark ships a tiny PyTorch DLRM model that runs on CUDA when a GPU is available. The function downloads the model and the request batch, moves the network to the GPU if possible, performs batched inference, and reports recommendation scores alongside timing measurements.
+
 ## Scientific
 
 ### Graph PageRank, BFS, MST
diff --git a/sebs/regression.py b/sebs/regression.py
@@ -22,6 +22,7 @@
     "311.compression",
     "411.image-recognition",
     "412.language-bert",
+    "413.recommendation",
     "501.graph-pagerank",
     "502.graph-mst",
     "503.graph-bfs",

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+#!/bin/bash`
	`2`	`+`
	`3`	`+# No additional initialization required for GPU recommendation benchmark.`