Merge pull request #2 from McLavish/feature/bert-inference

McLavish · web-flow · commit c478c91e5fb3 · 2025-11-05T10:32:35.000+01:00
Feature/bert inference
diff --git a/.gitmodules b/.gitmodules
@@ -3,4 +3,4 @@
   url = https://github.com/mcopik/pypapi.git
 [submodule "benchmarks-data"]
 	path = benchmarks-data
-  url = https://github.com/spcl/serverless-benchmarks-data.git
+  url = https://github.com/McLavish/serverless-benchmarks-data-dphpc.git
diff --git a/.mypy.ini b/.mypy.ini
@@ -3,6 +3,9 @@
 [mypy-docker]
 ignore_missing_imports = True
 
+[mypy-docker.*]
+ignore_missing_imports = True
+
 [mypy-tzlocal]
 ignore_missing_imports = True
 
diff --git a/benchmarks/400.inference/412.language-bert/config.json b/benchmarks/400.inference/412.language-bert/config.json
@@ -0,0 +1,6 @@
+{
+  "timeout": 60,
+  "memory": 512,
+  "languages": ["python"],
+  "modules": ["storage"]
+}
diff --git a/benchmarks/400.inference/412.language-bert/input.py b/benchmarks/400.inference/412.language-bert/input.py
@@ -0,0 +1,33 @@
+import os
+
+
+def buckets_count():
+    # model bucket and text bucket
+    return (2, 0)
+
+
+def upload_files(data_root, data_dir, upload_func):
+    for root, _, files in os.walk(data_dir):
+        prefix = os.path.relpath(root, data_root)
+        for file in files:
+            filepath = os.path.join(root, file)
+            relative_key = os.path.join(prefix, file)
+            upload_func(0, relative_key, filepath)
+
+
+def generate_input(
+    data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func
+):
+    model_archive = "bert-tiny-onnx.tar.gz"
+    upload_func(0, model_archive, os.path.join(data_dir, "model", model_archive))
+
+    text_filename = "sentences.jsonl"
+    upload_func(1, text_filename, os.path.join(data_dir, "text", text_filename))
+
+    input_config = {"object": {}, "bucket": {}}
+    input_config["object"]["model"] = model_archive
+    input_config["object"]["input"] = text_filename
+    input_config["bucket"]["bucket"] = benchmarks_bucket
+    input_config["bucket"]["model"] = input_paths[0]
+    input_config["bucket"]["text"] = input_paths[1]
+    return input_config
diff --git a/benchmarks/400.inference/412.language-bert/python/function.py b/benchmarks/400.inference/412.language-bert/python/function.py
@@ -0,0 +1,157 @@
+import datetime
+import json
+import os
+import tarfile
+import uuid
+from typing import Dict, List, Optional
+
+import numpy as np
+import onnxruntime as ort
+from tokenizers import Tokenizer
+
+from . import storage
+
+# Storage client obtained once at import time and reused by every invocation.
+client = storage.storage.get_instance()
+
+# Object name of the model archive inside the model prefix.
+MODEL_ARCHIVE = "bert-tiny-onnx.tar.gz"
+# Local directory the archive is extracted into.
+MODEL_DIRECTORY = "/tmp/bert_language_model"
+# Sub-directory of MODEL_DIRECTORY from which tokenizer.json, label_map.json
+# and model.onnx are loaded after extraction.
+MODEL_SUBDIR = "bert-tiny-onnx"
+
+# Module-level cache filled by _ensure_model(); all three stay None until the
+# first successful initialization.
+_session: Optional[ort.InferenceSession] = None
+_tokenizer: Optional[Tokenizer] = None
+_labels: Optional[Dict[int, str]] = None
+
+
+def _ensure_model(bucket: str, model_prefix: str):
+    """
+    Lazily download and initialize the ONNX model and tokenizer.
+    """
+    global _session, _tokenizer, _labels
+
+    model_path = os.path.join(MODEL_DIRECTORY, MODEL_SUBDIR)
+    model_download_begin = datetime.datetime.now()
+    model_download_end = model_download_begin
+
+    if _session is None or _tokenizer is None or _labels is None:
+        if not os.path.exists(model_path):
+            os.makedirs(MODEL_DIRECTORY, exist_ok=True)
+            archive_path = os.path.join("/tmp", f"{uuid.uuid4()}-{MODEL_ARCHIVE}")
+            client.download(bucket, os.path.join(model_prefix, MODEL_ARCHIVE), archive_path)
+            model_download_end = datetime.datetime.now()
+
+            with tarfile.open(archive_path, "r:gz") as tar:
+                tar.extractall(MODEL_DIRECTORY)
+            os.remove(archive_path)
+        else:
+            model_download_begin = datetime.datetime.now()
+            model_download_end = model_download_begin
+
+        model_process_begin = datetime.datetime.now()
+        tokenizer_path = os.path.join(model_path, "tokenizer.json")
+        _tokenizer = Tokenizer.from_file(tokenizer_path)
+        _tokenizer.enable_truncation(max_length=128)
+        _tokenizer.enable_padding(length=128)
+
+        label_map_path = os.path.join(model_path, "label_map.json")
+        with open(label_map_path, "r") as f:
+            raw_labels = json.load(f)
+        _labels = {int(idx): label for idx, label in raw_labels.items()}
+
+        onnx_path = os.path.join(model_path, "model.onnx")
+
+        available = ort.get_available_providers()
+        if "CUDAExecutionProvider" not in available:
+            raise RuntimeError(f"CUDAExecutionProvider unavailable (have: {available})")
+
+        _session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])
+        model_process_end = datetime.datetime.now()
+    else:
+        model_process_begin = datetime.datetime.now()
+        model_process_end = model_process_begin
+
+    model_download_time = (model_download_end - model_download_begin) / datetime.timedelta(
+        microseconds=1
+    )
+    model_process_time = (model_process_end - model_process_begin) / datetime.timedelta(
+        microseconds=1
+    )
+
+    return model_download_time, model_process_time
+
+
+def _prepare_inputs(sentences: List[str]):
+    assert _tokenizer is not None
+
+    encodings = _tokenizer.encode_batch(sentences)
+
+    input_ids = np.array([enc.ids for enc in encodings], dtype=np.int64)
+    attention_mask = np.array([enc.attention_mask for enc in encodings], dtype=np.int64)
+    token_type_ids = np.array(
+        [enc.type_ids if enc.type_ids else [0] * len(enc.ids) for enc in encodings],
+        dtype=np.int64,
+    )
+
+    return {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "token_type_ids": token_type_ids,
+    }
+
+
+def _softmax(logits: np.ndarray) -> np.ndarray:
+    shifted = logits - np.max(logits, axis=1, keepdims=True)
+    exp = np.exp(shifted)
+    return exp / np.sum(exp, axis=1, keepdims=True)
+
+
+def handler(event):
+    bucket = event.get("bucket", {}).get("bucket")
+    model_prefix = event.get("bucket", {}).get("model")
+    text_prefix = event.get("bucket", {}).get("text")
+    text_key = event.get("object", {}).get("input")
+
+    download_begin = datetime.datetime.now()
+    text_download_path = os.path.join("/tmp", f"{uuid.uuid4()}-{os.path.basename(text_key)}")
+    client.download(bucket, os.path.join(text_prefix, text_key), text_download_path)
+    download_end = datetime.datetime.now()
+
+    model_download_time, model_process_time = _ensure_model(bucket, model_prefix)
+    assert _session is not None and _labels is not None and _tokenizer is not None
+
+    with open(text_download_path, "r") as f:
+        sentences = [json.loads(line)["text"] for line in f if line.strip()]
+
+    os.remove(text_download_path)
+
+    inference_begin = datetime.datetime.now()
+    inputs = _prepare_inputs(sentences)
+    outputs = _session.run(None, inputs)
+    logits = outputs[0]
+    probabilities = _softmax(logits)
+    inference_end = datetime.datetime.now()
+
+    results = []
+    for sentence, probs in zip(sentences, probabilities):
+        label_idx = int(np.argmax(probs))
+        label = _labels.get(label_idx, str(label_idx))
+        results.append(
+            {
+                "text": sentence,
+                "label": label,
+                "confidence": float(probs[label_idx]),
+                "raw_scores": probs.tolist(),
+            }
+        )
+
+    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
+    compute_time = (inference_end - inference_begin) / datetime.timedelta(microseconds=1)
+
+    return {
+        "result": {"predictions": results},
+        "measurement": {
+            "download_time": download_time + model_download_time,
+            "compute_time": compute_time + model_process_time,
+            "model_time": model_process_time,
+            "model_download_time": model_download_time,
+        },
+    }
diff --git a/benchmarks/400.inference/412.language-bert/python/init.sh b/benchmarks/400.inference/412.language-bert/python/init.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# No additional initialization required for the BERT inference benchmark.
diff --git a/benchmarks/400.inference/412.language-bert/python/package.sh b/benchmarks/400.inference/412.language-bert/python/package.sh
@@ -0,0 +1,35 @@
+# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo
+
+PACKAGE_DIR=$1
+echo "Original size $(du -sh $1 | cut -f1)"
+
+CUR_DIR=$(pwd)
+cd $1
+# cleaning libs
+rm -rf external
+find . -type d -name "tests" -exec rm -rf {} +
+find . -type d -name "test" -exec rm -rf {} +
+find . -type d -name "bin" -not -path "*/torch/*" -exec rm -rf {} +
+
+# cleaning
+# stripping some of the numpy libs - libgfortran-2e0d59d6.so.5.0.0 - causes issues on Azure
+find -name "*.so" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip
+find -name "*.so.*" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip
+
+rm -r pip >/dev/null
+rm -r pip-* >/dev/null
+rm -r wheel >/dev/null
+rm -r wheel-* >/dev/null
+rm easy_install.py >/dev/null
+find . -name \*.pyc -delete
+cd ${CUR_DIR}
+echo "Stripped size $(du -sh $1 | cut -f1)"
+
+TORCH_DIR=".python_packages/lib/site-packages/torch"
+if [ -d "$1/${TORCH_DIR}" ]; then
+	cd $1
+	zip -qr torch.zip ${TORCH_DIR}
+	rm -rf ${TORCH_DIR}
+	cd ${CUR_DIR}
+	echo "Torch-zipped size $(du -sh $1 | cut -f1)"
+fi
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt b/benchmarks/400.inference/412.language-bert/python/requirements.txt
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.10 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.10
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.11 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.11
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.8 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.8
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.9 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.9
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -10,6 +10,7 @@
 | Multimedia      | 220.video-processing    | Python    | x64, arm64 | Add a watermark and generate gif of a video file. |
 | Utilities      | 311.compression    | Python   | x64, arm64 | Create a .zip file for a group of files in storage and return to user to download. |
 | Inference      | 411.image-recognition    | Python    | x64 | Image recognition with ResNet and pytorch. |
+| Inference      | 412.language-bert    | Python    | x64 | Sentence classification with a compact BERT model served via ONNX Runtime. |
 | Scientific      | 501.graph-pagerank    | Python    | x64, arm64 | PageRank implementation with igraph. |
 | Scientific      | 502.graph-mst    | Python    | x64, arm64 | Minimum spanning tree (MST)  implementation with igraph. |
 | Scientific      | 503.graph-bfs    | Python    | x64, arm64 | Breadth-first search (BFS) implementation with igraph. |
@@ -70,6 +71,10 @@ It implements the .zip file creation with the help of the `shutil` standard libr
 
 The benchmark is inspired by MLPerf and implements image recognition with Resnet50. It downloads the input and model from the storage and uses the CPU-only `pytorch` library in Python.
 
+### Language Inference
+
+This benchmark runs sequence classification with a compact BERT model exported to ONNX. The function downloads the model archive and text samples from storage, tokenizes the sentences, executes the ONNX Runtime session, and returns the predicted labels together with confidences. The function depends on `onnxruntime-gpu` and requires a CUDA-capable GPU: it fails if the `CUDAExecutionProvider` is not available.
+
 ## Scientific
 
 ### Graph PageRank, BFS, MST
@@ -87,4 +92,3 @@ This benchmark is inspired by the [DNAVisualization](https://github.com/Benjamin
 ## Applications
 
 **(WiP)** Coming soon!
-
diff --git a/install.py b/install.py
@@ -86,7 +86,7 @@ def execute(cmd, cwd=None):
             execute(f"git pull", cwd=data_dir)
         # clone
         else:
-            execute(f"git clone https://github.com/spcl/serverless-benchmarks-data.git {data_dir}")
+            execute(f"git clone https://github.com/McLavish/serverless-benchmarks-data-dphpc.git {data_dir}")
     else:
         raise error
 
@@ -99,4 +99,3 @@ def execute(cmd, cwd=None):
     execute("python3 setup.py build")
     execute("python3 pypapi/papi_build.py")
     os.chdir(cur_dir)
-
diff --git a/sebs/regression.py b/sebs/regression.py
@@ -21,6 +21,7 @@
     "220.video-processing",
     "311.compression",
     "411.image-recognition",
+    "412.language-bert",
     "501.graph-pagerank",
     "502.graph-mst",
     "503.graph-bfs",

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+#!/bin/bash`
	`2`	`+`
	`3`	`+# No additional initialization required for the BERT inference benchmark.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+numpy==1.24.4`
	`2`	`+onnxruntime-gpu==1.16.3`
	`3`	`+tokenizers==0.13.3`