keras-team · divyashreepathihalli · Mar 12, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 27, 2024
diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py
@@ -61,6 +61,26 @@ class CLIP(Task):
             transformer-based text encoder.
         transformer_layers (int): The number of layers in the transformer-based
             text encoder.
+    Example:
+    ```python
+    processor = CLIPProcessor(
+      input_resolution=224,
+      "path_to_vocab.json",
+      "path_to_merges.txt"
+      )
+    processed_image = processor.process_images(["cat.jpg"])
+    processed_text, attention_mask = processor.process_texts(
+      ["mountains", "cat on tortoise", "two cats"]
+      )
+    model = CLIP.from_preset("clip-vit-base-patch16")
+    image_logits, text_logits = model(
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
+        )
+    ```
     """
 
     def __init__(
@@ -133,7 +153,12 @@ def encode_images(self, image):
     def encode_text(self, text, attention_mask=None):
         return self.text_encoder(text, attention_mask=attention_mask)
 
-    def call(self, image, text, attention_mask=None):
+    def call(self, inputs):
+        image, text = inputs["image"], inputs["text"]
+        if "attention_mask" in inputs:
+            attention_mask = inputs["attention_mask"]
+        else:
+            attention_mask = None
         self.image_embeddings = self.encode_images(image)
         self.text_embeddings = self.encode_text(
             text, attention_mask=attention_mask

diff --git a/keras_cv/models/feature_extractor/clip/clip_model_test.py b/keras_cv/models/feature_extractor/clip/clip_model_test.py
@@ -34,27 +34,24 @@
     "https://storage.googleapis.com/keras-cv/models/clip/merges.txt",
 )
 
-MODEL_PATH = keras.utils.get_file(
-    None,
-    "https://storage.googleapis.com/keras-cv/models/clip/clip-vit-base-patch32.weights.h5",  # noqa: E501
-)
-
 
 class CLIPTest(TestCase):
     @pytest.mark.large
     def test_clip_model_golden_values(self):
-        model = CLIP()
-        model.load_weights(MODEL_PATH)
+        model = CLIP.from_preset("clip-vit-base-patch32")
         processed_image = np.ones(shape=[1, 224, 224, 3])
         processed_text = np.ones(shape=[3, 77])
         attention_mask = np.ones(shape=[3, 77])
         image_logits, text_logits = model(
-            processed_image, processed_text, attention_mask
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
         )
-        print(image_logits)
-        self.assertAllClose(image_logits, [[1.896713, 1.896713, 1.896713]])
+        self.assertAllClose(image_logits, [[1.896712, 1.896712, 1.896712]])
         self.assertAllClose(
-            text_logits, ops.transpose([[1.896713, 1.896713, 1.896713]])
+            text_logits, ops.transpose([[1.896712, 1.896712, 1.896712]])
         )
 
     def test_clip_preprocessor(self):
@@ -83,17 +80,26 @@ def test_presets(self):
         processed_text = np.ones(shape=[3, 77])
         attention_mask = np.ones(shape=[3, 77])
         image_logits, text_logits = model(
-            processed_image, processed_text, attention_mask
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
         )
 
     @pytest.mark.large
     def test_image_encoder_golden_values(self):
-        model = CLIP()
-        model.load_weights(MODEL_PATH)
+        model = CLIP.from_preset("clip-vit-base-patch32")
         processed_image = np.ones(shape=[1, 224, 224, 3])
         processed_text = np.ones(shape=[3, 77])
         attention_mask = np.ones(shape=[3, 77])
-        model(processed_image, processed_text, attention_mask)
+        model(
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
+        )
         self.assertAllClose(
             model.image_embeddings[:, :5],
             [[0.023215, 0.026526, 0.008914, -0.091689, 0.021791]],
@@ -105,8 +111,13 @@ def test_text_encoder_golden_values(self):
         processed_image = np.ones(shape=[1, 224, 224, 3])
         processed_text = np.ones(shape=[3, 77])
         attention_mask = np.ones(shape=[3, 77])
-        model(processed_image, processed_text, attention_mask)
-        print(model.text_embeddings)
+        model(
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
+        )
         self.assertAllClose(
             model.text_embeddings[0, :3],
             [0.007531, -0.038361, -0.035686],
@@ -118,7 +129,13 @@ def test_saved_model(self):
         processed_image = np.ones(shape=[1, 224, 224, 3])
         processed_text = np.ones(shape=[3, 77])
         attention_mask = np.ones(shape=[3, 77])
-        model_output, _ = model(processed_image, processed_text, attention_mask)
+        model_output, _ = model(
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
+        )
         save_path = os.path.join(self.get_temp_dir(), "model.keras")
         if keras_3():
             model.save(save_path)
@@ -130,6 +147,10 @@ def test_saved_model(self):
         self.assertIsInstance(restored_model, CLIP)
         # Check that output matches.
         restored_output, _ = restored_model(
-            processed_image, processed_text, attention_mask
+            {
+                "image": processed_image,
+                "text": processed_text,
+                "attention_mask": attention_mask,
+            }
         )
         self.assertAllClose(model_output, restored_output)