keras-team · divyashreepathihalli · Feb 22, 2024 · Feb 2, 2024 · Feb 2, 2024 · Feb 2, 2024
diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml
@@ -38,6 +38,8 @@ jobs:
         pip install torch>=2.0.1+cpu
         pip install "jax[cpu]"
         pip install keras-core
+        pip install keras-nlp-nightly --no-deps
+        pip install tensorflow-text==2.15
         pip install -e ".[tests]" --progress-bar off --upgrade
     - name: Test with pytest
       env:
@@ -75,6 +77,7 @@ jobs:
       run: |
         pip install -r requirements.txt
         pip install -e ".[tests]" --progress-bar off --upgrade
+        pip install keras-nlp-nightly
     - name: Test with pytest
       env:
         TEST_CUSTOM_OPS: false # TODO(ianstenbit): test custom ops, or figure out what our story is here

diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py
@@ -183,6 +183,7 @@
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetLBackbone
 from keras_cv.models.backbones.vit_det.vit_det_backbone import ViTDetBackbone
 from keras_cv.models.classification.image_classifier import ImageClassifier
+from keras_cv.models.feature_extractor.clip import CLIP
 from keras_cv.models.object_detection.retinanet.retinanet import RetinaNet
 from keras_cv.models.object_detection.yolo_v8.yolo_v8_backbone import (
     YOLOV8Backbone,

diff --git a/keras_cv/models/feature_extractor/__init__.py b/keras_cv/models/feature_extractor/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2023 The KerasCV Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/keras_cv/models/feature_extractor/clip/__init__.py b/keras_cv/models/feature_extractor/clip/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2023 The KerasCV Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from keras_cv.models.feature_extractor.clip.clip_image_model import (
+    CLIPImageEncoder,
+)
+from keras_cv.models.feature_extractor.clip.clip_model import CLIP
+from keras_cv.models.feature_extractor.clip.clip_processor import CLIPProcessor
+from keras_cv.models.feature_extractor.clip.clip_text_model import (
+    CLIPTextEncoder,
+)
+from keras_cv.models.feature_extractor.clip.clip_tokenizer import CLIPTokenizer
diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py
@@ -0,0 +1,318 @@
+# Copyright 2023 The KerasCV Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from keras_cv.backend import keras
+from keras_cv.backend import ops
+
+
+def get_initializer(initializer_range=0.02):
+    """
+    Creates a `keras.initializers.TruncatedNormal` with the given range.
+
+    Args:
+        initializer_range (*float*, defaults to 0.02): Standard deviation of the
+        initializer range.
+
+    Returns:
+        `keras.initializers.TruncatedNormal`: The truncated normal initializer.
+    """
+    return keras.initializers.TruncatedNormal(stddev=initializer_range)
+
+
+class QuickGELU(keras.layers.Layer):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def call(self, x):
+        return x * ops.sigmoid(1.702 * x)
+
+
+class ResidualAttention(keras.layers.Layer):
+    def __init__(
+        self,
+        proj_dim,
+        num_heads,
+        num_hidden_layers,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.proj_dim = proj_dim
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.fc_std = np.power(2 * self.proj_dim, -0.5) * 0.02
+
+        self.in_proj_std = (
+            np.power(self.proj_dim, -0.5)
+            * (np.power(2 * self.num_hidden_layers, -0.5))
+            * 0.02
+        )
+        self.attn = CLIPAttention(
+            self.proj_dim,
+            self.num_heads,
+            self.num_hidden_layers,
+            name="multi_head_attention",
+        )
+        self.ln_1 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_1")
+        self.mlp_dense_1 = keras.layers.Dense(
+            self.proj_dim * 4,
+            name="c_fc",
+        )
+        self.mlp_activation = QuickGELU(name="gelu")
+        self.mlp_dense_2 = keras.layers.Dense(
+            self.proj_dim,
+            name="c_proj",
+        )
+        self.ln_2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_2")
+
+    def attention(self, x, causal_attention_mask=None, attention_mask=None):
+        mask = None
+        if causal_attention_mask is not None:
+            mask = (
+                ops.cast(causal_attention_mask, dtype=x.dtype)
+                if causal_attention_mask is not None
+                else None
+            )
+        if attention_mask is not None:
+            attention_mask = (
+                ops.cast(attention_mask, dtype=x.dtype)
+                if attention_mask is not None
+                else None
+            )
+            mask = ops.add(causal_attention_mask, attention_mask)
+
+        return self.attn(
+            x,
+            attention_mask=mask,
+        )[0]
+
+    def build(self, input_shape):
+        super().build(input_shape)
+        self.attn.build(None)
+        self.ln_1.build([None, None, self.proj_dim])
+        self.mlp_dense_1.build([None, None, self.proj_dim])
+        self.mlp_dense_2.build([None, None, self.proj_dim * 4])
+        self.ln_2.build([None, None, self.proj_dim])
+
+    def call(self, x, causal_attention_mask=None, attention_mask=None):
+        attn_x = x + self.attention(
+            self.ln_1(x),
+            causal_attention_mask=causal_attention_mask,
+            attention_mask=attention_mask,
+        )
+        x = self.mlp_dense_1(self.ln_2(attn_x))
+        x = self.mlp_activation(x)
+        x = self.mlp_dense_2(x)
+        x = attn_x + x
+        return x
+
+    def compute_output_shape(self, inputs_shape):
+        return inputs_shape
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "proj_dim": self.proj_dim,
+                "num_heads": self.num_heads,
+                "num_hidden_layers": self.num_hidden_layers,
+            }
+        )
+        return config
+
+
+class CLIPEncoder(keras.layers.Layer):
+    def __init__(self, width, num_layers, heads, **kwargs):
+        super().__init__(**kwargs)
+        self.width = width
+        self.num_layers = num_layers
+        self.heads = heads
+        self.resblocks = [
+            ResidualAttention(
+                self.width,
+                self.heads,
+                self.num_layers,
+            )
+            for _ in range(self.num_layers)
+        ]
+
+    def build(self, input_shape):
+        super().build(input_shape)
+        for block in self.resblocks:
+            block.build(input_shape)
+
+    def call(
+        self,
+        x,
+        causal_attention_mask=None,
+        attention_mask=None,
+    ):
+        for block in self.resblocks:
+            x = block(
+                x,
+                causal_attention_mask=causal_attention_mask,
+                attention_mask=attention_mask,
+            )
+        return x
+
+    def compute_output_shape(self, inputs_shape):
+        return inputs_shape
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "width": self.width,
+                "num_layers": self.num_layers,
+                "heads": self.heads,
+            }
+        )
+        return config
+
+
+class CLIPAttention(keras.layers.Layer):
+    """
+    - Documentation page: https://huggingface.co/docs/transformers/model_doc/clip # noqa: E501
+    - Implementation: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py # noqa: E501
+    """
+
+    def __init__(
+        self, proj_dim, num_heads, num_hidden_layers, dropout=0.0, **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.proj_dim = proj_dim
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.dropout = dropout
+        self.head_dim = self.proj_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.proj_dim:
+            raise ValueError(
+                f"proj_dim must be divisible by num_heads (got `proj_dim`"
+                f": {self.proj_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale = self.head_dim**-0.5
+        in_proj_std = (
+            (self.proj_dim**-0.5)
+            * ((2 * self.num_hidden_layers) ** -0.5)
+            * 0.02
+        )
+        out_proj_std = (self.proj_dim**-0.5) * 0.02
+        self.q_proj = keras.layers.Dense(
+            units=self.proj_dim,
+            kernel_initializer=get_initializer(in_proj_std),
+            name="q_proj",
+        )
+        self.k_proj = keras.layers.Dense(
+            units=self.proj_dim,
+            kernel_initializer=get_initializer(in_proj_std),
+            name="k_proj",
+        )
+        self.v_proj = keras.layers.Dense(
+            units=self.proj_dim,
+            kernel_initializer=get_initializer(in_proj_std),
+            name="v_proj",
+        )
+        self.out_proj = keras.layers.Dense(
+            units=self.proj_dim,
+            kernel_initializer=get_initializer(out_proj_std),
+            name="out_proj",
+        )
+
+    def build(self, input_shape):
+        super().build(input_shape)
+        self.q_proj.build([None, None, self.proj_dim])
+        self.k_proj.build([None, None, self.proj_dim])
+        self.v_proj.build([None, None, self.proj_dim])
+        self.out_proj.build([None, None, self.proj_dim])
+
+    def _transpose_for_scores(self, tensor, batch_size):
+        """
+        Copied from https://github.com/huggingface/transformers/blob/8e164c5400b7b413c7b8fb32e35132001effc970/src/transformers/models/bert/modeling_tf_bert.py#L252 # noqa: E501
+        """
+        # [batch_size, seq_len, all_head_dim] ->
+        # [batch_size, seq_len, num_heads, head_dim]
+        tensor = ops.reshape(
+            tensor, (batch_size, -1, self.num_heads, self.head_dim)
+        )
+        # [batch_size, seq_len, num_heads, head_dim] ->
+        # [batch_size, num_heads, seq_len, head_dim]
+        return ops.transpose(tensor, axes=[0, 2, 1, 3])
+
+    def call(
+        self,
+        x,
+        attention_mask=None,
+        output_attentions=None,
+        training=False,
+    ):
+        batch_size = ops.shape(x)[0]
+        mixed_query_layer = self.q_proj(inputs=x)
+        mixed_key_layer = self.k_proj(inputs=x)
+        mixed_value_layer = self.v_proj(inputs=x)
+        query_layer = self._transpose_for_scores(mixed_query_layer, batch_size)
+        key_layer = self._transpose_for_scores(mixed_key_layer, batch_size)
+        value_layer = self._transpose_for_scores(mixed_value_layer, batch_size)
+
+        # Scaled dot product between key and query = raw attention scores.
+        attention_scores = ops.matmul(
+            query_layer, ops.transpose(key_layer, axes=[0, 1, 3, 2])
+        )
+        dk = ops.cast(ops.sqrt(self.head_dim), dtype=attention_scores.dtype)
+        attention_scores = ops.divide(
+            attention_scores, dk
+        )  # (batch_size, num_heads, seq_len_q, seq_len_k)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the
+            # call() function)
+            attention_scores = ops.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        _attention_probs = ops.softmax(attention_scores + 1e-9, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = keras.layers.Dropout(self.dropout)(
+            inputs=_attention_probs, training=training
+        )
+
+        attn_output = ops.matmul(attention_probs, value_layer)
+        attn_output = ops.transpose(attn_output, axes=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, proj_dim)
+        attn_output = ops.reshape(attn_output, (batch_size, -1, self.proj_dim))
+
+        attn_output = self.out_proj(attn_output, training=training)
+        outputs = (
+            (attn_output, _attention_probs)
+            if output_attentions
+            else (attn_output,)
+        )
+
+        return outputs
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "proj_dim": self.proj_dim,
+                "num_heads": self.num_heads,
+                "num_hidden_layers": self.num_hidden_layers,
+                "dropout": self.dropout,
+            }
+        )
+        return config