sgl-project · zhyncs · Jul 29, 2025 · Jul 29, 2025 · Jul 29, 2025 · slin1237
@@ -1,3 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-31-8b-sglang
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 30Gi
+  storageClassName: default # use a storage class that supports ReadWriteMany
+  volumeMode: Filesystem
+---
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
@@ -27,41 +40,59 @@ spec:
       containers:
         - name: meta-llama-31-8b-instruct-sglang
           image: docker.io/lmsysorg/sglang:latest
-          imagePullPolicy: Always  # IfNotPresent or Never
+          imagePullPolicy: Always # alternatives: IfNotPresent or Never
           ports:
             - containerPort: 30000
           command: ["python3", "-m", "sglang.launch_server"]
-          args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
+          args:
+            [
+              "--model-path",
+              "meta-llama/Llama-3.1-8B-Instruct",
+              "--host",
+              "0.0.0.0",
+              "--port",
+              "30000",
+            ]
           env:
             - name: HF_TOKEN
               value: <secret>
           resources:
             limits:
               nvidia.com/gpu: 1
+              cpu: 8
+              memory: 40Gi
+            requests:
+              cpu: 2
+              memory: 16Gi
+              nvidia.com/gpu: 1
           volumeMounts:
             - name: shm
               mountPath: /dev/shm
             - name: hf-cache
               mountPath: /root/.cache/huggingface
-              readOnly: true
             - name: localtime
               mountPath: /etc/localtime
               readOnly: true
           livenessProbe:
             httpGet:
               path: /health
               port: 30000
-            initialDelaySeconds: 30
+            initialDelaySeconds: 120
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 30000
+            initialDelaySeconds: 120
             periodSeconds: 10
       volumes:
         - name: shm
           emptyDir:
             medium: Memory
             sizeLimit: 10Gi
         - name: hf-cache
-          hostPath:
-            path: /root/.cache/huggingface
-            type: Directory
+          persistentVolumeClaim:
+            claimName: llama-31-8b-sglang
         - name: localtime
           hostPath:
             path: /etc/localtime
@@ -76,6 +107,6 @@ spec:
     app: meta-llama-31-8b-instruct-sglang
   ports:
     - protocol: TCP
-      port: 30000  # port on host
-      targetPort: 30000  # port in container
-  type: LoadBalancer
+      port: 80 # port exposed by the Service
+      targetPort: 30000 # port in container
+  type: LoadBalancer # use ClusterIP for in-cluster-only access