[SDK] Fix trainer error: Update the version of base image and add "num_labels" for downloading pretrained models (kubeflow/trainer#2230)

helenxie-bit · web-flow · commit 90044594d018 · 2024-08-28T16:24:04.000Z
* fix trainer error

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* rerun tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* update the handling of num_labels in trainer

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* rerun tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* adjust the default value of 'num_labels'

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

---------

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
diff --git a/python/kubeflow/storage_initializer/hugging_face.py b/python/kubeflow/storage_initializer/hugging_face.py
@@ -37,6 +37,7 @@ class HuggingFaceModelParams:
     model_uri: str
     transformer_type: TRANSFORMER_TYPES
     access_token: str = None
+    num_labels: Optional[int] = None
 
     def __post_init__(self):
         # Custom checks or validations can be added here
diff --git a/python/kubeflow/trainer/Dockerfile b/python/kubeflow/trainer/Dockerfile
@@ -1,5 +1,5 @@
 # Use an official Pytorch runtime as a parent image
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/pytorch:24.06-py3
 
 # Set the working directory in the container
 WORKDIR /app
diff --git a/python/kubeflow/trainer/hf_llm_training.py b/python/kubeflow/trainer/hf_llm_training.py
@@ -28,17 +28,26 @@
 logger.setLevel(logging.INFO)
 
 
-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels):
     # Set up the model and tokenizer
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
 
-    model = transformer_type.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        cache_dir=model_dir,
-        local_files_only=True,
-        trust_remote_code=True,
-    )
+    if num_labels != "None":
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+            num_labels=int(num_labels),
+        )
+    else:
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=model_name,
@@ -151,6 +160,7 @@ def parse_arguments():
 
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
+    parser.add_argument("--num_labels", default="None", help="number of classes")
     parser.add_argument("--model_dir", help="directory containing model")
     parser.add_argument("--dataset_dir", help="directory containing dataset")
     parser.add_argument("--lora_config", help="lora_config")
@@ -178,7 +188,7 @@ def parse_arguments():
 
     logger.info("Setup model and tokenizer")
     model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, transformer_type, args.model_dir
+        args.model_uri, transformer_type, args.model_dir, args.num_labels
     )
 
     logger.info("Preprocess dataset")
diff --git a/python/kubeflow/training/api/training_client.py b/python/kubeflow/training/api/training_client.py
@@ -265,6 +265,8 @@ def train(
                 model_provider_parameters.model_uri,
                 "--transformer_type",
                 model_provider_parameters.transformer_type.__name__,
+                "--num_labels",
+                str(model_provider_parameters.num_labels),
                 "--model_dir",
                 VOLUME_PATH_MODEL,
                 "--dataset_dir",