diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index 7459372a5..ec4ab57c8 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -41,12 +41,18 @@ def build(loss_type,
     return tf.losses.mean_squared_error(
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
   elif loss_type == LossType.JRC_LOSS:
-    alpha = 0.5 if loss_param is None else loss_param.alpha
-    auto_weight = False if loss_param is None else not loss_param.HasField(
-        'alpha')
     session = kwargs.get('session_ids', None)
+    if loss_param is None:
+      return jrc_loss(label, pred, session, name=loss_name)
     return jrc_loss(
-        label, pred, session, alpha, auto_weight=auto_weight, name=loss_name)
+        label,
+        pred,
+        session,
+        loss_param.alpha,
+        loss_weight_strategy=loss_param.loss_weight_strategy,
+        sample_weights=loss_weight,
+        same_label_loss=loss_param.same_label_loss,
+        name=loss_name)
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
diff --git a/easy_rec/python/compat/array_ops.py b/easy_rec/python/compat/array_ops.py
new file mode 100644
index 000000000..d788bc8c1
--- /dev/null
+++ b/easy_rec/python/compat/array_ops.py
@@ -0,0 +1,229 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import gen_math_ops
+
+
+def convert_to_int_tensor(tensor, name, dtype=tf.int32):
+  """Convert `tensor` to an integer Tensor of `dtype`; reject non-integers."""
+  converted = ops.convert_to_tensor(tensor, name=name, preferred_dtype=dtype)
+  # `preferred_dtype` is only a hint, so the resulting dtype must be checked.
+  if not converted.dtype.is_integer:
+    raise TypeError('%s must be an integer tensor; dtype=%s' %
+                    (name, converted.dtype))
+  # Integer input: normalize it to the requested integer dtype.
+  return gen_math_ops.cast(converted, dtype)
+
+
+def _with_nonzero_rank(data):
+  """Promote a scalar `data` to a 1-element vector; pass others through."""
+  static_rank = data.shape.ndims
+  if static_rank is None:
+    # Rank unknown statically: [1] + shape, sliced to its last `rank` entries.
+    dyn_shape = tf.shape(data)
+    dyn_rank = tf.rank(data)
+    return tf.reshape(data, tf.concat([[1], dyn_shape], axis=0)[-dyn_rank:])
+  if static_rank == 0:
+    return tf.stack([data])
+  return data
+
+
+def get_positive_axis(axis, ndims):
+  """Check an `axis` argument and return its non-negative equivalent.
+
+  When `ndims` is given, `axis` must satisfy `-ndims <= axis < ndims`; a
+  non-negative `axis` is returned unchanged and a negative one is returned
+  as `axis + ndims`.
+  When `ndims` is `None`, a non-negative `axis` is returned unchanged and a
+  negative `axis` is rejected, since it cannot be resolved.
+
+  Args:
+    axis: An integer constant
+    ndims: An integer constant, or `None`
+
+  Returns:
+    The normalized `axis` value.
+
+  Raises:
+    TypeError: If `axis` is not an `int`.
+    ValueError: If `axis` is out-of-bounds, or if `axis` is negative and
+      `ndims is None`.
+  """
+  if not isinstance(axis, int):
+    raise TypeError('axis must be an int; got %s' % type(axis).__name__)
+  if ndims is None:
+    if axis < 0:
+      raise ValueError(
+          'axis may only be negative if ndims is statically known.')
+    return axis
+  if not -ndims <= axis < ndims:
+    raise ValueError('axis=%s out of bounds: expected %s<=axis<%s' %
+                     (axis, -ndims, ndims))
+  # In range: fold a negative index onto its non-negative counterpart.
+  return axis if axis >= 0 else axis + ndims
+
+
+def tile_one_dimension(data, axis, multiple):
+  """Tile `data` `multiple` times along `axis`, leaving other axes as-is."""
+  # `axis` is expected to be a non-negative int.
+  static_rank = data.shape.ndims
+  if static_rank is None:
+    unit = tf.ones(tf.rank(data), tf.int32)
+    factors = tf.concat([unit[:axis], [multiple], unit[axis + 1:]], axis=0)
+  else:
+    factors = [1] * static_rank
+    factors[axis] = multiple
+  return tf.tile(data, factors)
+
+
+def _all_dimensions(x):
+  """Return a 1-D tensor holding the axis indices `0 .. rank(x) - 1`."""
+  # Statically known rank: build a constant, skipping the Rank/Range ops.
+  if isinstance(x, ops.Tensor) and x.get_shape().ndims is not None:
+    return constant_op.constant(np.arange(x.get_shape().ndims), dtype=tf.int32)
+  if isinstance(x, sparse_tensor.SparseTensor):
+    rank_shape = x.dense_shape.get_shape()  # `dense_shape` is 1-D.
+    if rank_shape.is_fully_defined():
+      static_rank = rank_shape.dims[0].value
+      return constant_op.constant(np.arange(static_rank), dtype=tf.int32)
+  # Rank known only at run time: let `rank` and `range` resolve it there.
+  return gen_math_ops._range(0, tf.rank(x), 1)
+
+
+# This op is intended to exactly match the semantics of numpy.repeat, with
+# one exception: numpy.repeat has special (and somewhat non-intuitive) behavior
+# when axis is not specified.  Rather than implement that special behavior, we
+# simply make `axis` be a required argument.
+#
+# External (OSS) `tf.repeat` feature request:
+# https://github.com/tensorflow/tensorflow/issues/8246
+def repeat_with_axis(data, repeats, axis, name=None):
+  """Repeats elements of `data` along `axis` (cf. `numpy.repeat`).
+
+  Args:
+    data: An `N`-dimensional tensor.
+    repeats: A 1-D integer tensor specifying how many times each element in
+      `axis` should be repeated.  `len(repeats)` must equal `data.shape[axis]`.
+      Supports broadcasting from a scalar value.
+    axis: `int`.  The axis along which to repeat values.  Must be less than
+      `max(N, 1)`.
+    name: A name for the operation.
+
+  Returns:
+    A tensor with `max(N, 1)` dimensions.  Has the same shape as `data`,
+    except that dimension `axis` has size `sum(repeats)`.
+  #### Examples:
+    ```python
+    >>> repeat_with_axis(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat_with_axis([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat_with_axis([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    ```
+  """
+  if not isinstance(axis, int):
+    raise TypeError('axis must be an int; got %s' % type(axis).__name__)
+
+  with ops.name_scope(name, 'Repeat', [data, repeats]):
+    data = ops.convert_to_tensor(data, name='data')
+    repeats = convert_to_int_tensor(repeats, name='repeats')
+    repeats.shape.with_rank_at_most(1)
+
+    # If `data` is a scalar, then upgrade it to a vector.
+    data = _with_nonzero_rank(data)
+    data_shape = tf.shape(data)
+
+    # If `axis` is negative, then convert it to a positive value.
+    axis = get_positive_axis(axis, data.shape.ndims)
+
+    # Check data Tensor shapes.
+    if repeats.shape.ndims == 1:
+      data.shape.dims[axis].assert_is_compatible_with(repeats.shape[0])
+
+    # If we know that `repeats` is a scalar, then we can just tile & reshape.
+    if repeats.shape.ndims == 0:
+      expanded = tf.expand_dims(data, axis + 1)
+      tiled = tile_one_dimension(expanded, axis + 1, repeats)
+      result_shape = tf.concat([data_shape[:axis], [-1], data_shape[axis + 1:]],
+                               axis=0)
+      return tf.reshape(tiled, result_shape)
+
+    # Broadcast the `repeats` tensor so rank(repeats) == axis + 1.
+    if repeats.shape.ndims != axis + 1:
+      repeats_shape = tf.shape(repeats)
+      repeats_ndims = tf.rank(repeats)
+      broadcast_shape = tf.concat(
+          [data_shape[:axis + 1 - repeats_ndims], repeats_shape], axis=0)
+      repeats = tf.broadcast_to(repeats, broadcast_shape)
+      repeats.set_shape([None] * (axis + 1))
+
+    # Create a "sequence mask" based on `repeats`, where slices across `axis`
+    # contain one `True` value for each repetition.  E.g., if
+    # `repeats = [3, 1, 2]`, then `mask = [[1, 1, 1], [1, 0, 0], [1, 1, 0]]`.
+    max_repeat = gen_math_ops.maximum(
+        0, gen_math_ops._max(repeats, _all_dimensions(repeats)))
+    mask = tf.sequence_mask(repeats, max_repeat)
+
+    # Add a new dimension around each value that needs to be repeated, and
+    # then tile that new dimension to match the maximum number of repetitions.
+    expanded = tf.expand_dims(data, axis + 1)
+    tiled = tile_one_dimension(expanded, axis + 1, max_repeat)
+
+    # Use `boolean_mask` to discard the extra repeated values.  This also
+    # flattens all dimensions up through `axis`.
+    masked = tf.boolean_mask(tiled, mask)
+
+    # Reshape the output tensor to add the outer dimensions back.
+    if axis == 0:
+      result = masked
+    else:
+      result_shape = tf.concat([data_shape[:axis], [-1], data_shape[axis + 1:]],
+                               axis=0)
+      result = tf.reshape(masked, result_shape)
+
+    # Preserve shape information.
+    if data.shape.ndims is not None:
+      new_axis_size = 0 if repeats.shape[0] == 0 else None
+      result.set_shape(data.shape[:axis].concatenate(
+          [new_axis_size]).concatenate(data.shape[axis + 1:]))
+
+    return result
+
+
+def repeat(input, repeats, axis=None, name=None):  # pylint: disable=redefined-builtin
+  """Repeat elements of `input`, optionally along a given `axis`.
+
+  Args:
+    input: An `N`-dimensional Tensor.
+    repeats: A 1-D `int` Tensor with the repetition count of each element. It
+      is broadcast to fit the shape of the given axis. `len(repeats)` must
+      equal `input.shape[axis]` if axis is not None.
+    axis: An int, the axis to repeat along. With the default (axis=None) the
+      input is flattened first and a flat output array is returned.
+    name: A name for the operation.
+
+  Returns:
+    A Tensor with the same shape as `input` except along the given axis.
+      If axis is None, the output is flat, matching the flattened
+      input array.
+  #### Examples:
+    ```python
+    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    >>> repeat(3, repeats=4)
+    [3, 3, 3, 3]
+    >>> repeat([[1,2], [3,4]], repeats=2)
+    [1, 1, 2, 2, 3, 3, 4, 4]
+    ```
+  """
+  if axis is not None:
+    return repeat_with_axis(input, repeats, axis, name)
+  # No axis given: operate on the flattened input along its only axis.
+  return repeat_with_axis(tf.reshape(input, [-1]), repeats, 0, name)
diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 1eb27717d..d446adb76 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -177,7 +177,8 @@ def _internal_input_layer(features,
                           scope=None,
                           cols_to_output_tensors=None,
                           from_template=False,
-                          feature_name_to_output_tensors=None):
+                          feature_name_to_output_tensors=None,
+                          sort_feature_columns_by_name=True):
   """See input_layer, `scope` is a name or variable scope to use."""
   feature_columns = _normalize_feature_columns(feature_columns)
   for column in feature_columns:
@@ -195,9 +196,11 @@ def _internal_input_layer(features,
   def _get_logits():  # pylint: disable=missing-docstring
     builder = _LazyBuilder(features)
     output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
+    if sort_feature_columns_by_name:
+      ordered_columns = sorted(feature_columns, key=lambda x: x.name)
+    else:
+      ordered_columns = feature_columns
+    for column in ordered_columns:
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         tensor = column._get_dense_tensor(  # pylint: disable=protected-access
@@ -239,7 +242,8 @@ def input_layer(features,
                 trainable=True,
                 cols_to_vars=None,
                 cols_to_output_tensors=None,
-                feature_name_to_output_tensors=None):
+                feature_name_to_output_tensors=None,
+                sort_feature_columns_by_name=True):
   """Returns a dense `Tensor` as input layer based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -287,6 +291,7 @@ def input_layer(features,
     cols_to_output_tensors: If not `None`, must be a dictionary that will be
       filled with a mapping from '_FeatureColumn' to the associated
       output `Tensor`s.
+    sort_feature_columns_by_name: whether to sort feature columns
 
   Returns:
     A `Tensor` which represents input layer of a model. Its shape
@@ -303,7 +308,8 @@ def input_layer(features,
       trainable=trainable,
       cols_to_vars=cols_to_vars,
       cols_to_output_tensors=cols_to_output_tensors,
-      feature_name_to_output_tensors=feature_name_to_output_tensors)
+      feature_name_to_output_tensors=feature_name_to_output_tensors,
+      sort_feature_columns_by_name=sort_feature_columns_by_name)
 
 
 # TODO(akshayka): InputLayer should be a subclass of Layer, and it
@@ -2530,7 +2536,46 @@ def name(self):
 
   @property
   def raw_name(self):
-    return self.categorical_column.name
+    return self.categorical_column.raw_name
+
+  @property
+  def cardinality(self):
+    """Number of distinct ids of the underlying categorical column.
+
+    For a weighted column, the categorical column it wraps is inspected.
+
+    Returns:
+      The bucket / vocabulary count, or 1 when the column type is not
+      one of the categorical column types handled here.
+    """
+    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, \
+        BucketizedColumn, WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, \
+        CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn, \
+        VocabularyFileCategoricalColumn
+
+    fc = self.categorical_column
+    # Weighted columns wrap another categorical column; inspect that one.
+    weighted_types = (WeightedCategoricalColumn,
+                      SequenceWeightedCategoricalColumn)
+    if isinstance(fc, weighted_types):
+      fc = fc.categorical_column
+
+    if isinstance(fc, (HashedCategoricalColumn, CrossedColumn)):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      # `vocabulary_size` is already an int count, so `len()` must not be used.
+      return fc.vocabulary_size + fc.num_oov_buckets
+    return 1
 
   @property
   def _var_scope_name(self):
@@ -2605,7 +2650,7 @@ def _get_dense_tensor_internal(self,
           # get zero embedding
           import os
           if os.environ.get('tf.estimator.mode', '') != \
-             os.environ.get('tf.estimator.ModeKeys.TRAIN', 'train'):
+              os.environ.get('tf.estimator.ModeKeys.TRAIN', 'train'):
             initializer = init_ops.zeros_initializer()
           else:
             initializer = self.initializer
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index e1e4d9304..0ca532bea 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -1328,6 +1328,71 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+def constant_numeric_column(key,
+                            shape=(1,),
+                            default_value=None,
+                            dtype=dtypes.float32,
+                            feature_name=None):
+  """Represents a numeric feature whose dense value is a constant.
+
+  The `ConstantNumericColumn` defined in this module reads the input only to
+  get the batch size and emits `default_value` (0 if `None`) for every row,
+  e.g. to mask out an input feature.
+
+  Example:
+
+  ```python
+  mask = constant_numeric_column('mask', default_value=0.0)
+  columns = [mask, ...]
+  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      dictionary key for feature parsing configs and feature `Tensor`
+      objects, and as the column name when `feature_name` is not given.
+    shape: An iterable of integers specifies the shape of the `Tensor`. An
+      integer can be given which means a single dimension `Tensor` with given
+      width. The `Tensor` representing the column will have the shape of
+      [batch_size] + `shape`.
+    default_value: A single value compatible with `dtype` or an iterable of
+      values compatible with `dtype` which the column takes on during
+      `tf.Example` parsing if data is missing. A default value of `None` will
+      cause `tf.io.parse_example` to fail if an example does not contain this
+      column. If a single value is provided, the same value will be applied as
+      the default value for every item. If an iterable of values is provided,
+      the shape of the `default_value` should be equal to the given `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    feature_name: Optional name of the column. If empty or `None`, `key` is
+      used as the column name.
+
+  Returns:
+    A `ConstantNumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int
+    ValueError: if any dimension in shape is not a positive integer
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = _check_shape(shape, key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  default_value = fc_utils.check_default_value(shape, default_value, dtype, key)
+
+  fc_utils.assert_key_is_string(key)
+  return ConstantNumericColumn(
+      feature_name=feature_name,
+      key=key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype)
+
+
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -2619,6 +2684,131 @@ def _normalize_feature_columns(feature_columns):
   return sorted(feature_columns, key=lambda x: x.name)
 
 
+class ConstantNumericColumn(
+    DenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple(
+        'ConstantNumericColumn',
+        ('feature_name', 'key', 'shape', 'default_value', 'dtype'))):
+  """see `constant_numeric_column`."""
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    """See `FeatureColumn` base class."""
+    return self.feature_name if self.feature_name else self.key
+
+  @property
+  def raw_name(self):
+    """See `FeatureColumn` base class."""
+    return self.key
+
+  @property
+  def parse_example_spec(self):
+    """See `FeatureColumn` base class."""
+    return {
+        self.key:
+            parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                        self.default_value)
+    }
+
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
+
+  def _transform_input_tensor(self, input_tensor):
+    """Build a float32 `[batch_size] + shape` tensor of the constant value."""
+    row_shape = [1] + list(self.shape)
+    def_val = 0 if self.default_value is None else self.default_value
+    row = tf.constant(def_val, dtypes.float32, row_shape)
+    # The input only supplies the batch size. Tile along the batch axis alone
+    # so that a `shape` of any rank works, not just rank 1.
+    batch_size = tf.shape(input_tensor)[0]
+    return tf.tile(row, [batch_size] + [1] * len(self.shape))
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = inputs.get(self.key)
+    return self._transform_input_tensor(input_tensor)
+
+  def transform_feature(self, transformation_cache, state_manager):
+    """See `FeatureColumn` base class.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      A float32 tensor of shape `[batch_size] + shape` holding the constant
+      value (`default_value`, or 0 when it is `None`). The tensor stored
+      under `key` is fetched only to read its batch size; its values are
+      ignored.
+    """
+    input_tensor = transformation_cache.get(self.key, state_manager)
+    return self._transform_input_tensor(input_tensor)
+
+  @property
+  def variable_shape(self):
+    """See `DenseColumn` base class."""
+    return tensor_shape.TensorShape(self.shape)
+
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return self.variable_shape
+
+  def get_dense_tensor(self, transformation_cache, state_manager):
+    """Returns dense `Tensor` representing the constant feature.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Dense `Tensor` created within `transform_feature`.
+    """
+    # Feature has been already transformed. Return the intermediate
+    # representation created by transform_feature.
+    return transformation_cache.get(self, state_manager)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  @property
+  def parents(self):
+    """See `FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See `FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See `FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
+
 class NumericColumn(
     DenseColumn,
     fc_old._DenseColumn,  # pylint: disable=protected-access
@@ -3377,6 +3567,40 @@ def raw_name(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.raw_name
 
+  @property
+  def cardinality(self):
+    """Number of distinct ids of the underlying categorical column.
+
+    For a weighted column, the categorical column it wraps is inspected.
+
+    Returns:
+      The bucket / vocabulary count, or 1 when the column type is not
+      one of the categorical column types handled here.
+    """
+    fc = self.categorical_column
+    # Weighted columns wrap another categorical column; inspect that one.
+    weighted_types = (WeightedCategoricalColumn,
+                      SequenceWeightedCategoricalColumn)
+    if isinstance(fc, weighted_types):
+      fc = fc.categorical_column
+
+    if isinstance(fc, (HashedCategoricalColumn, CrossedColumn)):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      # `vocabulary_size` is already an int count, so `len()` must not be used.
+      return fc.vocabulary_size + fc.num_oov_buckets
+    return 1
+
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
@@ -3727,6 +3951,40 @@ def raw_name(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.raw_name
 
+  @property
+  def cardinality(self):
+    """Number of distinct ids of the underlying categorical column.
+
+    For a weighted column, the categorical column it wraps is inspected.
+
+    Returns:
+      The bucket / vocabulary count, or 1 when the column type is not
+      one of the categorical column types handled here.
+    """
+    fc = self.categorical_column
+    # Weighted columns wrap another categorical column; inspect that one.
+    weighted_types = (WeightedCategoricalColumn,
+                      SequenceWeightedCategoricalColumn)
+    if isinstance(fc, weighted_types):
+      fc = fc.categorical_column
+
+    if isinstance(fc, (HashedCategoricalColumn, CrossedColumn)):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      # `vocabulary_size` is already an int count, so `len()` must not be used.
+      return fc.vocabulary_size + fc.num_oov_buckets
+    return 1
+
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
@@ -5193,3 +5451,13 @@ def deserialize_feature_columns(configs, custom_objects=None):
       deserialize_feature_column(c, custom_objects, columns_by_name)
       for c in configs
   ]
+
+
+def is_embedding_column(fc):
+  """Tell whether `fc` is one of the embedding feature column types."""
+  embedding_types = (
+      EmbeddingColumn,
+      fc_old._SharedEmbeddingColumn,  # pylint: disable=protected-access
+      SharedEmbeddingColumn,
+  )
+  return isinstance(fc, embedding_types)
diff --git a/easy_rec/python/compat/sort_ops.py b/easy_rec/python/compat/sort_ops.py
new file mode 100644
index 000000000..bd7f92ab1
--- /dev/null
+++ b/easy_rec/python/compat/sort_ops.py
@@ -0,0 +1,216 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@argsort
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('sort')
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts the elements of a tensor along one axis.
+
+  Usage:
+
+  ```python
+  import tensorflow as tf
+  a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+  b = tf.sort(a,axis=-1,direction='ASCENDING',name=None)
+  c = tf.keras.backend.eval(b)
+  # Here, c = [  1.     2.8   10.    26.9   62.3  166.32]
+  ```
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis to sort along. Defaults to -1, i.e. the last axis.
+    direction: Sort order of the result, either `'ASCENDING'` or
+      `'DESCENDING'`.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of the same dtype and shape as `values` whose elements are
+        ordered along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    sorted_values = _sort_or_argsort(values, axis, direction, False)
+  return sorted_values
+
+
+@tf_export('argsort')
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices that would sort a tensor along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`; along the given axis, each entry is the index of the element
+  that occupies that position in the sorted slice.
+
+  Usage:
+
+  ```python
+  import tensorflow as tf
+  a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+  b = tf.argsort(a,axis=-1,direction='ASCENDING',stable=False,name=None)
+  c = tf.keras.backend.eval(b)
+  # Here, c = [0 3 1 2 5 4]
+  ```
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis to sort along. Defaults to -1, i.e. the last axis.
+    direction: Sort order of the result, either `'ASCENDING'` or
+      `'DESCENDING'`.
+    stable: If True, equal elements in the original tensor will not be
+      re-ordered in the returned order. The argument is currently unused by
+      this implementation; pass `stable=True` if you require a stable order,
+      for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    sort_indices = _sort_or_argsort(values, axis, direction, True)
+  return sort_indices
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Shared implementation behind `sort` and `argsort`.
+
+  Args:
+    values: The values to sort.
+    axis: The axis to sort along; must resolve to a constant scalar.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: If true, return sorting indices instead of sorted values.
+
+  Returns:
+    The sorted values, or the indices into the original tensor that sort it,
+        depending on `return_argsort`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    valid_directions = ', '.join(sorted(_SORT_IMPL.keys()))
+    raise ValueError('%s should be one of %s' % (direction, valid_directions))
+  # The axis has to be known at graph-construction time as a plain scalar.
+  axis_tensor = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_value = tensor_util.constant_value(axis_tensor)
+  if axis_tensor.shape.ndims != 0 or axis_value is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_index = int(axis_value)  # Plain int; avoids a NumPy casting error.
+
+  values_tensor = framework_ops.convert_to_tensor(values, name='values')
+
+  return _SORT_IMPL[direction](values_tensor, axis_index, return_argsort)
+
+
+def _descending_sort(values, axis, return_argsort=False):
+  """Sorts values in descending order along `axis` using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+      indices that would sort the values.
+
+  Returns:
+    The sorted values, or the sorting indices if `return_argsort` is True.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
+  """Sort ascending by running the descending sort on the negated values."""
+  result = _descending_sort(-values, axis, return_argsort)
+  # Indices are final as-is; sorted values must be negated back.
+  return result if return_argsort else -result
+
+
+# Maps each supported `direction` value to the function implementing it.
+_SORT_IMPL = dict(
+    ASCENDING=_ascending_sort,
+    DESCENDING=_descending_sort)
diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
index 04fc07baf..1f62faef1 100644
--- a/easy_rec/python/feature_column/feature_column.py
+++ b/easy_rec/python/feature_column/feature_column.py
@@ -129,6 +129,8 @@ def _cmp_embed_config(a, b):
           self.parse_sequence_feature(config)
         elif config.feature_type == config.ExprFeature:
           self.parse_expr_feature(config)
+        elif config.feature_type == config.ConstFeature:
+          self.parse_const_feature(config)
         else:
           assert False, 'invalid feature type: %s' % config.feature_type
       except FeatureKeyError:
@@ -331,10 +333,7 @@ def parse_tag_feature(self, config):
           default_value=0,
           feature_name=feature_name)
 
-    if len(config.input_names) > 1:
-      tag_fc = feature_column.weighted_categorical_column(
-          tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
-    elif config.HasField('kv_separator'):
+    if len(config.input_names) > 1 or config.HasField('kv_separator'):
       tag_fc = feature_column.weighted_categorical_column(
           tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
 
@@ -400,9 +399,7 @@ def parse_raw_feature(self, config):
           self._deep_columns[feature_name] = fc
 
   def parse_expr_feature(self, config):
-    """Generate raw features columns.
-
-    if boundaries is set, will be converted to category_column first.
+    """Generate expression features columns.
 
     Args:
       config: instance of easy_rec.python.protos.feature_config_pb2.FeatureConfig
@@ -412,7 +409,27 @@ def parse_expr_feature(self, config):
     fc = feature_column.numeric_column(
         feature_name, shape=(1,), feature_name=feature_name)
     if self.is_wide(config):
-      self._add_wide_embedding_column(fc, config)
+      self._wide_columns[feature_name] = fc
+    if self.is_deep(config):
+      self._deep_columns[feature_name] = fc
+
+  def parse_const_feature(self, config):
+    """Generate a constant numeric feature column.
+
+    Used for mask input features; registered as a wide and/or deep column.
+
+    Args:
+      config: instance of easy_rec.python.protos.feature_config_pb2.FeatureConfig
+    """
+    feature_name = config.feature_name if config.HasField('feature_name') \
+        else config.input_names[0]
+    dim = config.raw_input_dim
+    if config.HasField('embedding_dim'):  # embedding_dim overrides raw_input_dim
+      dim = config.embedding_dim
+    fc = feature_column.constant_numeric_column(
+        feature_name, shape=(dim,), feature_name=feature_name)
+    if self.is_wide(config):
+      self._wide_columns[feature_name] = fc
     if self.is_deep(config):
       self._deep_columns[feature_name] = fc
 
diff --git a/easy_rec/python/inference/predictor.py b/easy_rec/python/inference/predictor.py
index dba53f967..e17871892 100644
--- a/easy_rec/python/inference/predictor.py
+++ b/easy_rec/python/inference/predictor.py
@@ -222,6 +222,9 @@ def _build_model(self):
               logging.info('Load input binding: %s -> %s' % (name, tensor.name))
               input_name = tensor.name
               input_name, _ = input_name.split(':')
+              input_op = self._graph.get_operation_by_name(input_name)
+              if input_op.type == 'PlaceholderWithDefault':
+                continue
               try:
                 input_id = input_name.split('_')[-1]
                 input_id = int(input_id)
diff --git a/easy_rec/python/input/augment.py b/easy_rec/python/input/augment.py
new file mode 100644
index 000000000..c9802c88c
--- /dev/null
+++ b/easy_rec/python/input/augment.py
@@ -0,0 +1,91 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def assign(input_tensor, position=None, value=None):
+  input_tensor[tuple(position)] = value  # in-place; called via tf.py_func
+  return input_tensor
+
+
+def item_mask(aug_data, length, gamma=0.3):
+  """Set floor(length * gamma) random rows among the first `length` to 0."""
+  length_f = tf.cast(length, dtype=tf.float32)
+  num_mask = tf.cast(tf.math.floor(length_f * gamma), dtype=tf.int32)
+  # Shuffle the valid positions and keep a prefix as the rows to blank out.
+  positions = tf.range(length, dtype=tf.int32)
+  mask_index = tf.random.shuffle(positions)[:num_mask]
+  # The row write itself happens in Python, inside `assign`.
+  masked_seq = tf.py_func(
+      assign, inp=[aug_data, [mask_index], 0], Tout=aug_data.dtype)
+  return masked_seq, length
+
+
+def item_crop(aug_data, length, eta=0.6):
+  length1 = tf.cast(length, dtype=tf.float32)
+  max_length = tf.cast(get_shape_list(aug_data)[0], dtype=tf.int32)
+  embedding_size = get_shape_list(aug_data)[1]
+  # Crop: keep a random window of floor(length * eta) rows, zeros after it.
+  num_left = tf.cast(tf.math.floor(length1 * eta), dtype=tf.int32)
+  crop_begin = tf.random.uniform([1],
+                                 minval=0,
+                                 maxval=length - num_left,  # presumably > 0; verify
+                                 dtype=tf.int32)[0]
+  cropped_item_seq = tf.zeros([get_shape_list(aug_data)[0], embedding_size])
+  cropped_item_seq = tf.where(
+      crop_begin + num_left < max_length,
+      tf.concat([
+          aug_data[crop_begin:crop_begin + num_left],
+          cropped_item_seq[:max_length - num_left]
+      ],
+                axis=0),
+      tf.concat([aug_data[crop_begin:], cropped_item_seq[:crop_begin]], axis=0))
+  return cropped_item_seq, num_left  # num_left is returned in place of length
+
+
+def item_reorder(aug_data, length, beta=0.6):  # shuffle a random window
+  length1 = tf.cast(length, dtype=tf.float32)  # float copy for the beta product
+  num_reorder = tf.cast(tf.math.floor(length1 * beta), dtype=tf.int32)
+  reorder_begin = tf.random.uniform([1],
+                                    minval=0,
+                                    maxval=length - num_reorder,
+                                    dtype=tf.int32)[0]
+  shuffle_index = tf.range(reorder_begin, reorder_begin + num_reorder)
+  shuffle_index = tf.random.shuffle(shuffle_index)  # window positions, shuffled
+  x = tf.range(get_shape_list(aug_data)[0])  # all row positions
+  left = tf.slice(x, [0], [reorder_begin])  # positions before the window
+  right = tf.slice(x, [reorder_begin + num_reorder], [-1])  # after it
+  reordered_item_index = tf.concat([left, shuffle_index, right], axis=0)
+  reordered_item_seq = tf.scatter_nd(
+      tf.expand_dims(reordered_item_index, axis=1), aug_data,
+      tf.shape(aug_data))  # row i is written to row reordered_item_index[i]
+  return reordered_item_seq, length
+
+
+def augment(x):
+  """Apply one randomly picked augmentation (crop, mask or reorder) to `x`."""
+  seq, length = x
+  # A shuffled [0, 1, 2] supplies the pick: 0 crop, 1 mask, otherwise reorder.
+  choice = tf.random.shuffle(tf.range(3, dtype=tf.int32))[:1][0]
+  aug_seq, aug_len = tf.cond(
+      tf.equal(choice, 0), lambda: item_crop(seq, length), lambda: tf.cond(
+          tf.equal(choice, 1), lambda: item_mask(seq, length), lambda:
+          item_reorder(seq, length)))
+  return [aug_seq, aug_len]
+
+
+def input_aug_data(original_data, seq_len):
+  """Build two separately augmented views of every sequence in the batch."""
+  lengths = tf.cast(seq_len, dtype=tf.int32)
+  aug_seq1, aug_len1 = tf.map_fn(
+      augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+  aug_seq2, aug_len2 = tf.map_fn(
+      augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+  aug_seq1 = tf.reshape(aug_seq1, tf.shape(original_data))
+  aug_seq2 = tf.reshape(aug_seq2, tf.shape(original_data))
+  return aug_seq1, aug_seq2, aug_len1, aug_len2
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 52581b4e2..9b8c4b3b0 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -1,11 +1,14 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
+import os
 from abc import abstractmethod
 from collections import OrderedDict
 
+from easy_rec.python.utils import conditional
 import six
 import tensorflow as tf
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
 
 from easy_rec.python.core import sampler as sampler_lib
@@ -18,6 +21,7 @@
 from easy_rec.python.utils.input_utils import get_type_defaults
 from easy_rec.python.utils.load_class import get_register_class_meta
 from easy_rec.python.utils.load_class import load_by_path
+from easy_rec.python.utils.tf_utils import get_config_type
 from easy_rec.python.utils.tf_utils import get_tf_type
 
 if tf.__version__ >= '2.0':
@@ -93,12 +97,14 @@ def __init__(self,
     # from the types defined in input_fields
     # it is used in create_multi_placeholders
     self._multi_value_types = {}
-
+    self._const_features = set()
     self._normalizer_fn = {}
     for fc in self._feature_configs:
       for input_name in fc.input_names:
         assert input_name in self._input_fields, 'invalid input_name in %s' % str(
             fc)
+        if fc.feature_type == fc.ConstFeature:
+          self._const_features.add(input_name)
         if input_name not in self._effective_fields:
           self._effective_fields.append(input_name)
 
@@ -225,6 +231,19 @@ def should_stop(self, curr_epoch):
       total_epoch = 1
     return total_epoch is not None and curr_epoch >= total_epoch
 
+  def get_erase_features(self):
+    """Return the input names that only ConstFeature configs read.
+
+    Names also read by another feature type are dropped from the stored set.
+    """
+    if not self._const_features:
+      return self._const_features
+    for fc in self._feature_configs:
+      if fc.feature_type != fc.ConstFeature:
+        for input_name in fc.input_names:
+          self._const_features.discard(input_name)
+    return self._const_features
+
   def create_multi_placeholders(self, export_config):
     """Create multiply placeholders on export, one for each feature.
 
@@ -252,6 +271,7 @@ def create_multi_placeholders(self, export_config):
           self._input_fields[fid] != sample_weight_field
       ]
 
+    erase_features = self.get_erase_features()
     inputs = {}
     for fid in effective_fids:
       input_name = self._input_fields[fid]
@@ -265,12 +285,25 @@ def create_multi_placeholders(self, export_config):
         tf_type = self._multi_value_types[input_name]
         logging.info('multi value input_name: %s, dtype: %s' %
                      (input_name, tf_type))
-        finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
+        if input_name in erase_features:
+          conf_type = get_config_type(tf_type)
+          def_val = self.get_type_defaults(conf_type,
+                                           self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None, None],
+                                               name=placeholder_name)
+        else:
+          finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
       else:
         ftype = self._input_field_types[fid]
         tf_type = get_tf_type(ftype)
         logging.info('input_name: %s, dtype: %s' % (input_name, tf_type))
-        finput = tf.placeholder(tf_type, [None], name=placeholder_name)
+        if input_name in erase_features:
+          def_val = self.get_type_defaults(ftype,
+                                           self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None],
+                                               name=placeholder_name)
+        else:
+          finput = tf.placeholder(tf_type, [None], name=placeholder_name)
       inputs[input_name] = finput
     features = {x: inputs[x] for x in inputs}
     features = self._preprocess(features)
@@ -302,11 +335,15 @@ def create_placeholders(self, export_config):
           len(effective_fids))
     input_vals = tf.reshape(
         input_vals, [-1, len(effective_fids)], name='input_reshape')
+
+    erase_features = self.get_erase_features()
     features = {}
     for tmp_id, fid in enumerate(effective_fids):
       ftype = self._input_field_types[fid]
       tf_type = get_tf_type(ftype)
       input_name = self._input_fields[fid]
+      if input_name in erase_features:
+        continue
       if tf_type in [tf.float32, tf.double, tf.int32, tf.int64]:
         features[input_name] = tf.string_to_number(
             input_vals[:, tmp_id],
@@ -472,6 +509,21 @@ def _parse_id_feature(self, fc, parsed_dict, field_dict):
               tf.int32,
               name='%s_str_2_int' % input_0)
 
+  def _parse_const_feature(self, fc, parsed_dict, field_dict, batch_size):
+    input_0 = fc.input_names[0]
+    input_tensor = field_dict[input_0]
+
+    def expand_input():
+      multiples = [1] * input_tensor.shape.ndims
+      multiples[0] = batch_size  # repeat along axis 0 only
+      return tf.tile(input_tensor, multiples)
+
+    input_tensor = tf.cond(
+        tf.equal(tf.shape(input_tensor)[0], batch_size), lambda: input_tensor,
+        expand_input)  # tiled only when dim 0 differs from batch_size
+    feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
+    parsed_dict[feature_name] = input_tensor
+
   def _parse_raw_feature(self, fc, parsed_dict, field_dict):
     input_0 = fc.input_names[0]
     feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
@@ -762,6 +814,14 @@ def _preprocess(self, field_dict):
           parsed_dict[k] = v
           self._appended_fields.append(k)
 
+    batch_size = 1
+    for fc in self._feature_configs:
+      feature_type = fc.feature_type
+      if feature_type != fc.ConstFeature:
+        input_0 = fc.input_names[0]
+        batch_size = tf.shape(field_dict[input_0])[0]
+        break
+
     for fc in self._feature_configs:
       feature_name = fc.feature_name
       feature_type = fc.feature_type
@@ -779,6 +839,8 @@ def _preprocess(self, field_dict):
         self._parse_id_feature(fc, parsed_dict, field_dict)
       elif feature_type == fc.ExprFeature:
         self._parse_expr_feature(fc, parsed_dict, field_dict)
+      elif feature_type == fc.ConstFeature:
+        self._parse_const_feature(fc, parsed_dict, field_dict, batch_size)
       else:
         feature_name = fc.feature_name if fc.HasField(
             'feature_name') else fc.input_names[0]
@@ -952,11 +1014,15 @@ def _input_fn(mode=None, params=None, config=None):
         dataset = self._build(mode, params)
         return dataset
       elif mode is None:  # serving_input_receiver_fn for export SavedModel
+        place_on_cpu = os.getenv('place_embedding_on_cpu')
+        place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
         if export_config.multi_placeholder:
-          inputs, features = self.create_multi_placeholders(export_config)
+          with conditional(place_on_cpu, ops.device('/CPU:0')):
+            inputs, features = self.create_multi_placeholders(export_config)
           return tf.estimator.export.ServingInputReceiver(features, inputs)
         else:
-          inputs, features = self.create_placeholders(export_config)
+          with conditional(place_on_cpu, ops.device('/CPU:0')):
+            inputs, features = self.create_placeholders(export_config)
           print('built feature placeholders. features: {}'.format(
               features.keys()))
           return tf.estimator.export.ServingInputReceiver(features, inputs)
diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
new file mode 100644
index 000000000..3093d9f8e
--- /dev/null
+++ b/easy_rec/python/layers/backbone.py
@@ -0,0 +1,349 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import six
+import tensorflow as tf
+from google.protobuf import struct_pb2
+
+from easy_rec.python.layers.common_layers import EnhancedInputLayer
+from easy_rec.python.layers.keras import MLP
+from easy_rec.python.layers.utils import Parameter
+from easy_rec.python.protos import backbone_pb2
+from easy_rec.python.utils.dag import DAG
+from easy_rec.python.utils.load_class import load_keras_layer
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class Package(object):
+  """A sub DAG of tf ops for reuse."""
+  __packages = {}
+
+  def __init__(self, config, features, input_layer, l2_reg=None):
+    self._config = config  # validated below while the block DAG is built
+    self._features = features
+    self._input_layer = input_layer
+    self._l2_reg = l2_reg
+    self._dag = DAG()  # block dependency graph
+    self._name_to_blocks = {}
+    self.loss_dict = {}  # handed to customized layers by call_keras_layer
+    input_feature_groups = set()
+    for block in config.blocks:  # pass 1: register nodes, check input layers
+      if len(block.inputs) == 0:
+        raise ValueError('block takes at least one input: %s' % block.name)
+      self._dag.add_node(block.name)
+      self._name_to_blocks[block.name] = block
+      layer = block.WhichOneof('layer')
+      if layer == 'input_layer':
+        if len(block.inputs) != 1:
+          raise ValueError('input layer `%s` takes only one input' % block.name)
+        one_input = block.inputs[0]
+        name = one_input.WhichOneof('name')
+        if name != 'feature_group_name':
+          raise KeyError(
+              '`feature_group_name` should be set for input layer: ' +
+              block.name)
+        input_name = one_input.feature_group_name
+        if not input_layer.has_group(input_name):
+          raise KeyError('invalid feature group name: ' + input_name)
+        if input_name in input_feature_groups:
+          logging.warning('input `%s` already exists in other block' %
+                          input_name)
+        input_feature_groups.add(input_name)
+
+    num_groups = len(input_feature_groups)
+    num_blocks = len(self._name_to_blocks) - num_groups
+    assert num_blocks > 0, 'there must be at least one block in backbone'
+
+    num_pkg_input = 0
+    for block in config.blocks:  # pass 2: add DAG edges for non-input blocks
+      layer = block.WhichOneof('layer')
+      if layer == 'input_layer':
+        continue
+      if block.name in input_feature_groups:
+        raise KeyError('block name can not be one of feature groups:' +
+                       block.name)
+      for input_node in block.inputs:
+        input_type = input_node.WhichOneof('name')
+        if input_type == 'package_name':  # counted only, no DAG edge
+          num_pkg_input += 1
+          continue
+        input_name = getattr(input_node, input_type)
+        if input_name in self._name_to_blocks:
+          assert input_name != block.name, 'input name can not equal to block name:' + input_name
+          self._dag.add_edge(input_name, block.name)
+        elif input_name not in input_feature_groups:
+          if input_layer.has_group(input_name):  # implicit input block
+            logging.info('adding an input_layer block: ' + input_name)
+            new_block = backbone_pb2.Block()
+            new_block.name = input_name
+            input_cfg = backbone_pb2.Input()
+            input_cfg.feature_group_name = input_name
+            new_block.inputs.append(input_cfg)
+            new_block.input_layer.CopyFrom(backbone_pb2.InputLayer())
+            self._name_to_blocks[input_name] = new_block
+            self._dag.add_node(input_name)
+            self._dag.add_edge(input_name, block.name)
+            input_feature_groups.add(input_name)
+          else:
+            raise KeyError(
+                'invalid input name `%s`, must be the name of either a feature group or an another block'
+                % input_name)
+    num_groups = len(input_feature_groups)
+    assert num_pkg_input > 0 or num_groups > 0, 'there must be at least one input layer/feature group'
+
+    if len(config.concat_blocks) == 0:  # default outputs: all DAG leaves
+      leaf = self._dag.all_leaves()
+      logging.warning(
+          '%s has no `concat_blocks`, try to use all leaf blocks: %s' %
+          (config.name, ','.join(leaf)))
+      self._config.concat_blocks.extend(leaf)
+
+    Package.__packages[self._config.name] = self  # looked up by block_input
+
+  def block_input(self, config, block_outputs, training=None):
+    inputs = []
+    for input_node in config.inputs:
+      input_type = input_node.WhichOneof('name')
+      input_name = getattr(input_node, input_type)
+      if input_type == 'package_name':
+        if input_name not in Package.__packages:
+          raise KeyError('package name `%s` does not exists' % input_name)
+        package = Package.__packages[input_name]
+        input_feature = package(training)  # runs the referenced package
+        if len(package.loss_dict) > 0:
+          self.loss_dict.update(package.loss_dict)
+      elif input_name in block_outputs:
+        input_feature = block_outputs[input_name]
+      else:
+        raise KeyError('input name `%s` does not exists' % input_name)
+      # NOTE(review): the three fn strings below are eval'ed; trusted config only.
+      if input_node.HasField('input_slice'):
+        fn = eval('lambda x: x' + input_node.input_slice.strip())
+        input_feature = fn(input_feature)
+      if input_node.HasField('input_fn'):
+        fn = eval(input_node.input_fn)
+        input_feature = fn(input_feature)
+      inputs.append(input_feature)
+
+    if config.merge_inputs_into_list:
+      output = inputs
+    else:
+      output = merge_inputs(inputs, config.input_concat_axis, config.name)
+
+    if config.HasField('extra_input_fn'):  # applied to the merged inputs
+      fn = eval(config.extra_input_fn)
+      output = fn(output)
+    return output
+
+  def __call__(self, is_training, **kwargs):  # kwargs are not forwarded
+    with tf.variable_scope(self._config.name, reuse=tf.AUTO_REUSE):
+      return self.call(is_training)
+
+  def call(self, is_training):
+    """Run every block in topological order and merge the `concat_blocks`."""
+    block_outputs = {}
+    blocks = self._dag.topological_sort()
+    logging.info(self._config.name + ' topological order: ' + ','.join(blocks))
+    # Blocks run in dependency order, so every input is computed before use.
+    for block in blocks:
+      config = self._name_to_blocks[block]
+      if config.layers:  # sequential layers
+        logging.info('call sequential %d layers' % len(config.layers))
+        output = self.block_input(config, block_outputs, is_training)
+        for layer in config.layers:
+          output = self.call_layer(output, layer, block, is_training)
+        block_outputs[block] = output
+        continue
+      # just one of layer
+      layer = config.WhichOneof('layer')
+      if layer is None:  # identity layer
+        block_outputs[block] = self.block_input(config, block_outputs,
+                                                is_training)
+      elif layer == 'input_layer':
+        conf = config.input_layer
+        input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
+        feature_group = config.inputs[0].feature_group_name
+        output = input_fn(feature_group, is_training)
+        block_outputs[block] = output
+      else:
+        inputs = self.block_input(config, block_outputs, is_training)
+        output = self.call_layer(inputs, config, block, is_training)
+        block_outputs[block] = output
+
+    outputs = []
+    for block_name in self._config.concat_blocks:
+      if block_name not in block_outputs:
+        raise ValueError('No output `%s` of backbone to be concat' % block_name)
+      temp = block_outputs[block_name]
+      # A block may emit several tensors; flatten them into the merge list.
+      if type(temp) in (tuple, list):
+        outputs.extend(temp)
+      else:
+        outputs.append(temp)
+    return merge_inputs(outputs, msg='backbone')
+
+  def call_keras_layer(self, layer_conf, inputs, name, training):
+    """Build the keras layer described by `layer_conf` and apply it."""
+    layer_cls, customize = load_keras_layer(layer_conf.class_name)
+    if layer_cls is None:
+      raise ValueError('Invalid keras layer class name: ' +
+                       layer_conf.class_name)
+
+    param_type = layer_conf.WhichOneof('params')
+    if customize:
+      if param_type is None or param_type == 'st_params':
+        params = Parameter(layer_conf.st_params, True, l2_reg=self._l2_reg)
+      else:
+        pb_params = getattr(layer_conf, param_type)
+        params = Parameter(pb_params, False, l2_reg=self._l2_reg)
+      layer = layer_cls(params, name=name)
+      return layer(inputs, training=training, loss_dict=self.loss_dict)
+    else:  # internal keras layer
+      if param_type is None:
+        layer = layer_cls(name=name)
+      else:
+        assert param_type == 'st_params', 'internal keras layer only support st_params'
+        try:
+          kwargs = convert_to_dict(layer_conf.st_params)
+          logging.info('call %s layer with params %r' %
+                       (layer_conf.class_name, kwargs))
+          layer = layer_cls(name=name, **kwargs)
+        except TypeError as e:
+          logging.warning(e)
+          args = list(map(format_value, layer_conf.st_params.values()))
+          logging.info('try to call %s layer with params %r' %
+                       (layer_conf.class_name, args))
+          layer = layer_cls(*args, name=name)
+      try:
+        return layer(inputs, training=training)
+      except TypeError:
+        return layer(inputs)
+
+  def call_layer(self, inputs, config, name, training):
+    """Apply the layer selected by the `layer` oneof of `config` to `inputs`."""
+    layer_name = config.WhichOneof('layer')
+    if layer_name == 'keras_layer':
+      return self.call_keras_layer(config.keras_layer, inputs, name, training)
+    if layer_name == 'lambda':
+      conf = getattr(config, 'lambda')
+      fn = eval(conf.expression)  # NOTE(review): config must be trusted
+      return fn(inputs)
+    if layer_name == 'repeat':
+      conf = config.repeat
+      outputs = []
+      for i in range(conf.num_repeat):
+        name_i = '%s_%d' % (name, i)
+        output = self.call_keras_layer(conf.keras_layer, inputs, name_i,
+                                       training)
+        outputs.append(output)
+      if len(outputs) == 1:
+        return outputs[0]
+      if conf.HasField('output_concat_axis'):
+        return tf.concat(outputs, conf.output_concat_axis)
+      return outputs
+    if layer_name == 'recurrent':
+      conf = config.recurrent
+      fixed_input_index = -1
+      if conf.HasField('fixed_input_index'):
+        fixed_input_index = conf.fixed_input_index
+      if fixed_input_index >= 0:
+        assert type(inputs) in (tuple, list), '%s inputs must be a list' % name
+      # Copy so neither a tuple nor a list shared with other blocks is mutated.
+      output = list(inputs) if fixed_input_index >= 0 else inputs
+      for i in range(conf.num_steps):
+        name_i = '%s_%d' % (name, i)
+        layer = conf.keras_layer
+        output_i = self.call_keras_layer(layer, output, name_i, training)
+        if fixed_input_index >= 0:
+          j = 0
+          for idx in range(len(output)):
+            if idx == fixed_input_index:
+              continue
+            if type(output_i) in (tuple, list):
+              output[idx] = output_i[j]
+            else:
+              output[idx] = output_i
+            j += 1
+        else:
+          output = output_i
+      if fixed_input_index >= 0:
+        del output[fixed_input_index]
+        if len(output) == 1:
+          return output[0]
+      return output
+
+    raise NotImplementedError('Unsupported backbone layer:%s' % layer_name)
+
+
+class Backbone(object):
+  """Configurable backbone: a main block package plus an optional top MLP."""
+
+  def __init__(self, config, features, input_layer, l2_reg=None):
+    self._config = config
+    self._l2_reg = l2_reg
+    self.loss_dict = {}
+    for pkg in config.packages:
+      Package(pkg, features, input_layer, l2_reg)  # registers itself by name
+
+    main_pkg = backbone_pb2.BlockPackage()
+    main_pkg.name = 'backbone'
+    main_pkg.blocks.MergeFrom(config.blocks)
+    main_pkg.concat_blocks.extend(config.concat_blocks)
+    self._main_pkg = Package(main_pkg, features, input_layer, l2_reg)
+
+  def __call__(self, is_training, **kwargs):
+    output = self._main_pkg(is_training, **kwargs)
+    if len(self._main_pkg.loss_dict) > 0:  # expose losses gathered by the package
+      self.loss_dict = self._main_pkg.loss_dict
+
+    if self._config.HasField('top_mlp'):  # optional MLP on the merged output
+      params = Parameter.make_from_pb(self._config.top_mlp)
+      params.l2_regularizer = self._l2_reg
+      final_mlp = MLP(params, name='backbone_top_mlp')
+      output = final_mlp(output, training=is_training)
+    return output
+
+
+def merge_inputs(inputs, axis=-1, msg=''):
+  """Merge inputs into one flat list, or tf.concat them along `axis`."""
+  if len(inputs) == 0:
+    raise ValueError('no inputs to be concat:' + msg)
+  if len(inputs) == 1:
+    return inputs[0]
+
+  # Exact-type test on purpose: only plain `list` inputs get flattened.
+  is_list = [type(item) == list for item in inputs]
+  if any(is_list):
+    if not all(is_list):
+      logging.warning('%s: try to merge inputs into list' % msg)
+    # Bare items are wrapped so everything chains into a single new list.
+    return [e for item in inputs
+            for e in (item if type(item) == list else [item])]
+
+  if axis != -1:
+    logging.info('concat inputs %s axis=%d' % (msg, axis))
+  return tf.concat(inputs, axis=axis)
+
+
+def format_value(value):
+  """Convert one protobuf Struct value into a plain Python object."""
+  value_type = type(value)
+  if value_type == six.text_type:
+    return str(value)
+  if value_type == float:  # integral floats come back as int, e.g. 8.0 -> 8
+    return int(value) if int(value) == value else value
+  if value_type == struct_pb2.ListValue:
+    return [format_value(v) for v in value]  # py3 map() is not a list
+  if value_type == struct_pb2.Struct:
+    return convert_to_dict(value)
+  return value
+
+
+def convert_to_dict(struct):
+  """Return `struct` as a dict with str keys and format_value'd values."""
+  return {
+      str(name): format_value(item) for name, item in struct.items()
+  }
diff --git a/easy_rec/python/layers/cmbf.py b/easy_rec/python/layers/cmbf.py
index b633bac2b..e5f1caeb2 100644
--- a/easy_rec/python/layers/cmbf.py
+++ b/easy_rec/python/layers/cmbf.py
@@ -33,7 +33,8 @@ def __init__(self, model_config, feature_configs, features, cmbf_config,
       has_feature = True
     self._txt_seq_features = None
     if input_layer.has_group('text'):
-      self._txt_seq_features = input_layer(features, 'text', is_combine=False)
+      self._txt_seq_features, _, _ = input_layer(
+          features, 'text', is_combine=False)
       has_feature = True
     self._other_features = None
     if input_layer.has_group('other'):  # e.g. statistical feature
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 165fce5e1..fae4fe3fc 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -1,8 +1,12 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import six
 import tensorflow as tf
 
+from easy_rec.python.compat.layers import layer_norm as tf_layer_norm
+from easy_rec.python.utils.activation import get_activation
+
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
@@ -14,6 +18,8 @@ def highway(x,
             scope='highway',
             dropout=0.0,
             reuse=None):
+  if isinstance(activation, six.string_types):
+    activation = get_activation(activation)
   with tf.variable_scope(scope, reuse):
     if size is None:
       size = x.shape.as_list()[-1]
@@ -61,3 +67,80 @@ def text_cnn(x,
   pool_flat = tf.concat(
       pooled_outputs, 1)  # shape: (batch_size, num_filters * len(filter_sizes))
   return pool_flat
+
+
+def layer_norm(input_tensor, name=None, reuse=None):
+  """Layer-normalize `input_tensor` over its last axis via tf_layer_norm."""
+  return tf_layer_norm(
+      inputs=input_tensor,
+      begin_norm_axis=-1,
+      begin_params_axis=-1,
+      reuse=reuse,
+      scope=name)  # `name` is passed on as the scope
+
+
+class EnhancedInputLayer(object):
+  """Wrap the raw input layer with optional normalization and dropout."""
+
+  def __init__(self, config, input_layer, feature_dict):
+    if config.do_batch_norm and config.do_layer_norm:
+      raise ValueError(
+          'can not do batch norm and layer norm for input layer at the same time'
+      )
+    self._config = config
+    self._input_layer = input_layer
+    self._feature_dict = feature_dict
+
+  def __call__(self, group, is_training, **kwargs):
+    with tf.name_scope('input_' + group):
+      return self.call(group, is_training)
+
+  def call(self, group, is_training):
+    if self._config.output_seq_and_normal_feature:  # returned without norm/dropout
+      seq_features, target_feature, target_features = self._input_layer(
+          self._feature_dict, group, is_combine=False)
+      return seq_features, target_features
+
+    features, feature_list = self._input_layer(self._feature_dict, group)
+    num_features = len(feature_list)
+    # One Bernoulli(keep_prob) draw per feature drives whole-feature dropout.
+    do_ln = self._config.do_layer_norm
+    do_bn = self._config.do_batch_norm
+    do_feature_dropout = is_training and 0.0 < self._config.feature_dropout_rate < 1.0
+    if do_feature_dropout:
+      keep_prob = 1.0 - self._config.feature_dropout_rate
+      bern = tf.distributions.Bernoulli(probs=keep_prob, dtype=tf.float32)
+      mask = bern.sample(num_features)
+    elif do_bn:
+      features = tf.layers.batch_normalization(features, training=is_training)
+    elif do_ln:
+      features = layer_norm(features)
+    # Each per-feature tensor is normalized / dropped out on its own below.
+    do_dropout = 0.0 < self._config.dropout_rate < 1.0
+    if do_feature_dropout or do_ln or do_bn or do_dropout:
+      for i in range(num_features):
+        fea = feature_list[i]
+        if self._config.do_batch_norm:
+          fea = tf.layers.batch_normalization(fea, training=is_training)
+        elif self._config.do_layer_norm:
+          fea = layer_norm(fea)
+        if do_dropout:
+          fea = tf.layers.dropout(
+              fea, self._config.dropout_rate, training=is_training)
+        if do_feature_dropout:
+          fea = tf.div(fea, keep_prob) * mask[i]  # rescale, then keep or zero
+        feature_list[i] = fea
+      if do_feature_dropout:  # rebuild the concatenated tensor from the list
+        features = tf.concat(feature_list, axis=-1)
+
+    if do_dropout and not do_feature_dropout:
+      features = tf.layers.dropout(
+          features, self._config.dropout_rate, training=is_training)
+    # The config flags below pick the output form; the default is `features`.
+    if self._config.only_output_feature_list:
+      return feature_list
+    if self._config.only_output_3d_tensor:
+      return tf.stack(feature_list, axis=1)
+    if self._config.output_2d_tensor_and_feature_list:
+      return features, feature_list
+    return features
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 7a57f5661..e09891845 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -18,7 +18,8 @@ def __init__(self,
                name='dnn',
                is_training=False,
                last_layer_no_activation=False,
-               last_layer_no_batch_norm=False):
+               last_layer_no_batch_norm=False,
+               reuse=None):
     """Initializes a `DNN` Layer.
 
     Args:
@@ -28,6 +29,7 @@ def __init__(self,
       is_training: train phase or not, impact batch_norm and dropout
       last_layer_no_activation: in last layer, use or not use activation
       last_layer_no_batch_norm: in last layer, use or not use batch norm
+      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
     """
     self._config = dnn_config
     self._l2_reg = l2_reg
@@ -38,6 +40,7 @@ def __init__(self,
         self._config.activation, training=is_training)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
+    self._reuse = reuse
 
   @property
   def hidden_units(self):
@@ -59,14 +62,16 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
           units=unit,
           kernel_regularizer=self._l2_reg,
           activation=None,
-          name='%s/dnn_%d' % (self._name, i))
+          name='%s/dnn_%d' % (self._name, i),
+          reuse=self._reuse)
       if self._config.use_bn and ((i + 1 < hidden_units_len) or
                                   not self._last_layer_no_batch_norm):
         deep_fea = tf.layers.batch_normalization(
             deep_fea,
             training=self._is_training,
             trainable=True,
-            name='%s/dnn_%d/bn' % (self._name, i))
+            name='%s/dnn_%d/bn' % (self._name, i),
+            reuse=self._reuse)
       if (i + 1 < hidden_units_len) or not self._last_layer_no_activation:
         deep_fea = self.activation(
             deep_fea, name='%s/dnn_%d/act' % (self._name, i))
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
new file mode 100644
index 000000000..daccf750e
--- /dev/null
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -0,0 +1,201 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import logging
+import math
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.framework.meta_graph import read_meta_graph_file
+
+from easy_rec.python.compat.sort_ops import argsort
+
+from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def get_feature_complexity(feature_configs):
+  """Map each feature's name (or first input name) to its `complexity`."""
+  complexity_by_name = {}
+  for fc in feature_configs:
+    first_input = fc.input_names[0]
+    key = fc.feature_name if fc.HasField('feature_name') else first_input
+    complexity_by_name[key] = fc.complexity
+  return complexity_by_name
+
+
+def sigmoid(x):  # logistic; exponentiates only -|x|, so it never overflows
+  return 1 / (1 + math.exp(-x)) if x >= 0 else math.exp(x) / (1 + math.exp(x))
+
+
+def get_feature_importance(pipeline_config, feature_group_name=None):
+  """Collect the keep probability the FSCD layer learned for each feature.
+
+  Returns an OrderedDict of feature name -> sigmoid(fscd_delta) read from the
+  latest checkpoint under `model_dir`, optionally for one feature group only.
+  """
+  assert pipeline_config.model_config.HasField(
+      'variational_dropout'), 'variational_dropout must be in model_config'
+
+  checkpoint_path = tf.train.latest_checkpoint(pipeline_config.model_dir)
+  meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
+
+  features_map = dict()
+  for col_def in meta_graph_def.collection_def[
+      'variational_dropout'].bytes_list.value:
+    features_map.update(json.loads(col_def))
+
+  feature_importance = OrderedDict()
+  tf.logging.info('Reading checkpoint from %s ...' % checkpoint_path)
+  reader = tf.train.NewCheckpointReader(checkpoint_path)
+  for feature_group in pipeline_config.model_config.feature_groups:
+    group_name = feature_group.group_name
+    if feature_group_name is not None and feature_group_name != group_name:
+      continue
+    if group_name not in features_map:
+      # for now, sequence feature groups are not supported
+      logging.warning('%s not in feature map', group_name)
+      continue
+    feature_dims = features_map[group_name]
+    delta_name = 'fscd_delta_%s' % group_name
+    if not reader.has_tensor(delta_name):
+      logging.warning(
+          "feature group `%s` doesn't be involved in FSCD layer", group_name)
+      for feature, _ in feature_dims:
+        feature_importance[feature] = 1.0
+      continue
+
+    delta = reader.get_tensor(delta_name)
+    indices = argsort(delta, direction='DESCENDING')
+    keep_prob = tf.nn.sigmoid(delta)
+    with tf.Session() as sess:
+      idx = indices.eval(session=sess)
+      probs = keep_prob.eval(session=sess)
+    for i in idx:
+      feature = feature_dims[i][0]
+      raw = feature_importance.get(feature)
+      if raw is not None and probs[i] > raw:
+        logging.info('%s importance change from %f to %f', feature, raw,
+                     probs[i])
+      if raw is None or probs[i] > raw:
+        feature_importance[feature] = probs[i]
+  return feature_importance
+
+
+class FSCDLayer(object):
+  """Rank features by variational dropout.
+
+  paper: Towards a Better Tradeoff between Effectiveness and Efficiency in Pre-Ranking,
+    A Learnable Feature Selection based Approach
+  arXiv: 2105.07706
+  """
+
+  def __init__(self,
+               feature_configs,
+               variational_dropout_config,
+               is_training=False,
+               name=''):
+    self._config = variational_dropout_config
+    self.is_training = is_training
+    self.name = name
+    self.feature_complexity = get_feature_complexity(feature_configs)
+
+  def compute_dropout_mask(self, n):
+    """Returns (z, delta): relaxed Bernoulli keep mask and keep ratios."""
+    delta_name = 'fscd_delta_%s' % self.name
+    delta = tf.get_variable(
+        name=delta_name,
+        shape=[n],
+        dtype=tf.float32,
+        initializer=tf.constant_initializer(0.))
+    delta = tf.nn.sigmoid(delta)
+    # float32 eps: with the float64 eps, 1 - eps rounds to 1.0 in float32, so
+    # delta could reach 1 and log(1 - delta) below would be -inf.
+    epsilon = np.finfo(np.float32).eps
+    max_keep_ratio = self._config.max_keep_ratio
+    min_keep_ratio = self._config.min_keep_ratio
+    if max_keep_ratio >= 1.0:
+      max_keep_ratio = 1.0 - epsilon
+    if min_keep_ratio <= 0.0:
+      min_keep_ratio = epsilon
+    delta = tf.clip_by_value(delta, min_keep_ratio, max_keep_ratio)
+
+    unif_noise = tf.random_uniform([n], dtype=tf.float32, name='uniform_noise')
+    approx = (
+        tf.log(delta) - tf.log(1. - delta) + tf.log(unif_noise) -
+        tf.log(1. - unif_noise))
+    return tf.sigmoid(approx / self._config.temperature), delta
+
+  def compute_regular_params(self, cols_to_feature):
+    """Returns {feature_column: alpha}, the regularization weight of each."""
+    alphas = {}
+    for fc, fea in cols_to_feature.items():
+      dim = int(fea.shape[-1])
+      complexity = self.feature_complexity[fc.raw_name]
+      cardinal = 1
+      if isinstance(
+          fc, (EmbeddingColumn, _SharedEmbeddingColumn, SharedEmbeddingColumn)):
+        cardinal = fc.cardinality
+      c = self._config.feature_complexity_weight * complexity
+      c += self._config.feature_cardinality_weight * cardinal
+      c += self._config.feature_dimension_weight * dim
+      # alpha = logit(sigmoid(c)) == c; the log form hits log(0) for c > ~37
+      alphas[fc] = c
+      theta = 1.0 - sigmoid(c)
+      logging.info('%s complexity: %s cardinality: %s dimension: %s c: %s '
+                   'theta: %s alpha: %s', fc.raw_name, complexity, cardinal,
+                   dim, c, theta, c)
+    return alphas
+
+  def __call__(self, cols_to_feature):
+    """cols_to_feature: an ordered dict mapping feature_column to feature_values."""
+    feature_dimension = []
+    output_tensors = []
+    alphas = []
+    z, delta = self.compute_dropout_mask(len(cols_to_feature))  # keep ratio
+    tf.summary.histogram('fscd_keep_ratio', delta)
+    tf.summary.histogram('fscd_keep_mask', z)
+    regular = self.compute_regular_params(cols_to_feature)
+
+    feature_columns = cols_to_feature.keys()
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      value = cols_to_feature[column]
+      alpha = regular[column]
+      i = len(output_tensors)
+      if self.is_training:
+        scaled_value = tf.div(value, delta[i])
+        out = tf.multiply(scaled_value, z[i], name='fscd_dropout')
+      else:
+        out = value
+      cols_to_feature[column] = out
+      output_tensors.append(out)
+      alphas.append(alpha)
+      feature_dimension.append((column.raw_name, int(value.shape[-1])))
+
+    output_features = tf.concat(output_tensors, 1)
+    tf.add_to_collection('variational_dropout',
+                         json.dumps({self.name: feature_dimension}))
+
+    batch_size = tf.shape(output_features)[0]
+    t_alpha = tf.convert_to_tensor(alphas, dtype=tf.float32)
+    loss = tf.reduce_sum(t_alpha * z) / tf.to_float(batch_size)
+
+    tf.add_to_collection('variational_dropout_loss', loss)
+    return output_features
+
+
+# def dropout(p):
+#    u = np.random.uniform()
+#    x = math.log(p) - math.log(1-p) + math.log(u) - math.log(1-u)
+#    z = sigmoid(x/0.1)
+#    return z
+#
+#
+# if __name__ == '__main__':
+#    for i in range(100):
+#      print(dropout(0.5))
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 731f47c82..4c36811fa 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -1,8 +1,11 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import os
 from collections import OrderedDict
 
 import tensorflow as tf
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 
@@ -13,13 +16,12 @@
 from easy_rec.python.layers import sequence_feature_layer
 from easy_rec.python.layers import variational_dropout_layer
 from easy_rec.python.layers.common_layers import text_cnn
+from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.protos.feature_config_pb2 import WideOrDeep
+from easy_rec.python.utils import conditional
 from easy_rec.python.utils import shape_utils
 
-from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
-from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
-
-from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import is_embedding_column  # NOQA
 
 
 class InputLayer(object):
@@ -36,7 +38,9 @@ def __init__(self,
                ev_params=None,
                embedding_regularizer=None,
                kernel_regularizer=None,
-               is_training=False):
+               is_training=False,
+               is_predicting=False):
+    self._feature_configs = feature_configs
     self._feature_groups = {
         x.group_name: FeatureGroup(x) for x in feature_groups_config
     }
@@ -62,6 +66,7 @@ def __init__(self,
     self._embedding_regularizer = embedding_regularizer
     self._kernel_regularizer = kernel_regularizer
     self._is_training = is_training
+    self._is_predicting = is_predicting
     self._variational_dropout_config = variational_dropout_config
 
   def has_group(self, group_name):
@@ -92,8 +97,11 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
     feature_name_to_output_tensors = {}
     negative_sampler = self._feature_groups[group_name]._config.negative_sampler
     if is_combine:
-      concat_features, group_features = self.single_call_input_layer(
-          features, group_name, feature_name_to_output_tensors)
+      place_on_cpu = os.getenv('place_embedding_on_cpu')
+      place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
+      with conditional(self._is_predicting and place_on_cpu, ops.device('/CPU:0')):
+        concat_features, group_features = self.single_call_input_layer(
+            features, group_name, feature_name_to_output_tensors)
       if group_name in self._group_name_to_seq_features:
         # for target attention
         group_seq_arr = self._group_name_to_seq_features[group_name]
@@ -116,19 +124,32 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
         return concat_features, group_features
     else:  # return sequence feature in raw format instead of combine them
       if self._variational_dropout_config is not None:
-        raise ValueError(
+        logging.warning(
             'variational dropout is not supported in not combined mode now.')
 
       feature_group = self._feature_groups[group_name]
       group_columns, group_seq_columns = feature_group.select_columns(
           self._fc_parser)
 
-      assert len(group_columns) == 0, \
-          'there are none sequence columns: %s' % str(group_columns)
+      embedding_reg_lst = []
+      output_features = None
+      group_features = []
+      if group_columns:
+        cols_to_output_tensors = OrderedDict()
+        output_features = feature_column.input_layer(
+            features,
+            group_columns,
+            cols_to_output_tensors=cols_to_output_tensors,
+            feature_name_to_output_tensors=feature_name_to_output_tensors,
+            sort_feature_columns_by_name=False)
+        group_features = [cols_to_output_tensors[x] for x in group_columns]
+
+        for col, val in cols_to_output_tensors.items():
+          if is_embedding_column(col):
+            embedding_reg_lst.append(val)
 
       builder = feature_column._LazyBuilder(features)
       seq_features = []
-      embedding_reg_lst = []
       for fc in group_seq_columns:
         with variable_scope.variable_scope('input_layer/' +
                                            fc.categorical_column.name):
@@ -140,7 +161,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
           embedding_reg_lst.append(tmp_embedding)
       regularizers.apply_regularization(
           self._embedding_regularizer, weights_list=embedding_reg_lst)
-      return seq_features
+      return seq_features, output_features, group_features
 
   def single_call_input_layer(self,
                               features,
@@ -169,18 +190,17 @@ def single_call_input_layer(self,
         group_columns,
         cols_to_output_tensors=cols_to_output_tensors,
         feature_name_to_output_tensors=feature_name_to_output_tensors)
-    # embedding_reg_lst = [output_features]
+
     embedding_reg_lst = []
-    for col, val in cols_to_output_tensors.items():
-      if isinstance(col, EmbeddingColumn) or isinstance(col,
-                                                        SharedEmbeddingColumn):
-        embedding_reg_lst.append(val)
     builder = feature_column._LazyBuilder(features)
     seq_features = []
     for column in sorted(group_seq_columns, key=lambda x: x.name):
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):
-        seq_feature, seq_len = column._get_sequence_dense_tensor(builder)
+        place_on_cpu = os.getenv('place_embedding_on_cpu')
+        place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
+        with conditional(self._is_predicting and place_on_cpu, ops.device('/CPU:0')):
+          seq_feature, seq_len = column._get_sequence_dense_tensor(builder)
         embedding_reg_lst.append(seq_feature)
 
         sequence_combiner = column.sequence_combiner
@@ -213,30 +233,47 @@ def single_call_input_layer(self,
           cols_to_output_tensors[column] = cnn_feature
         else:
           raise NotImplementedError
+
     if self._variational_dropout_config is not None:
-      features_dimension = OrderedDict([
-          (k.raw_name, int(v.shape[-1]))
-          for k, v in cols_to_output_tensors.items()
-      ])
-      concat_features = array_ops.concat(
-          [output_features] + seq_features, axis=-1)
-      variational_dropout = variational_dropout_layer.VariationalDropoutLayer(
-          self._variational_dropout_config,
-          features_dimension,
-          self._is_training,
-          name=group_name)
-      concat_features = variational_dropout(concat_features)
-      group_features = tf.split(
-          concat_features, list(features_dimension.values()), axis=-1)
+      if self._variational_dropout_config.regularize_by_feature_complexity:
+        fscd = FSCDLayer(
+            self._feature_configs,
+            self._variational_dropout_config,
+            is_training=self._is_training,
+            name=group_name)
+        output_features = fscd(cols_to_output_tensors)
+        concat_features = array_ops.concat(
+            [output_features] + seq_features, axis=-1)
+        group_features = [cols_to_output_tensors[x] for x in group_columns] + \
+                         [cols_to_output_tensors[x] for x in group_seq_columns]
+      else:
+        features_dimension = OrderedDict([
+            (k.raw_name, int(v.shape[-1]))
+            for k, v in cols_to_output_tensors.items()
+        ])
+        concat_features = array_ops.concat(
+            [output_features] + seq_features, axis=-1)
+        variational_dropout = variational_dropout_layer.VariationalDropoutLayer(
+            self._variational_dropout_config,
+            features_dimension,
+            self._is_training,
+            name=group_name)
+        concat_features = variational_dropout(concat_features)
+        group_features = tf.split(
+            concat_features, list(features_dimension.values()), axis=-1)
     else:
       concat_features = array_ops.concat(
           [output_features] + seq_features, axis=-1)
       group_features = [cols_to_output_tensors[x] for x in group_columns] + \
                        [cols_to_output_tensors[x] for x in group_seq_columns]
 
-      if embedding_reg_lst:
-        regularizers.apply_regularization(
-            self._embedding_regularizer, weights_list=embedding_reg_lst)
+    for fc, val in cols_to_output_tensors.items():
+      if is_embedding_column(fc):
+        embedding_reg_lst.append(val)
+
+    if embedding_reg_lst:
+      regularizers.apply_regularization(
+          self._embedding_regularizer, weights_list=embedding_reg_lst)
     return concat_features, group_features
 
   def get_wide_deep_dict(self):
diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
new file mode 100644
index 000000000..39d7c8be8
--- /dev/null
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -0,0 +1,15 @@
+from .blocks import MLP
+from .blocks import Gate
+from .blocks import Highway
+from .bst import BST
+from .din import DIN
+from .fibinet import BiLinear
+from .fibinet import FiBiNet
+from .fibinet import SENet
+from .interaction import FM
+from .interaction import Cross
+from .interaction import DotInteraction
+from .mask_net import MaskBlock
+from .mask_net import MaskNet
+from .numerical_embedding import AutoDisEmbedding
+from .numerical_embedding import PeriodicEmbedding
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
new file mode 100644
index 000000000..1a6715a8e
--- /dev/null
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -0,0 +1,158 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""Convenience blocks for building models."""
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.utils.activation import get_activation
+
+
+class MLP(tf.keras.layers.Layer):
+  """A stack of dense layers with optional batch norm and dropout.
+
+  Keys read from `params`:
+    hidden_units: required, output size of every dense layer, in order.
+    use_bn / use_final_bn: batch norm on hidden layers / on the last layer.
+    activation / final_activation: activation of hidden / last layer.
+    dropout_ratio: per-layer dropout rates; layers past its end get none.
+    use_bias, initializer, use_bn_after_activation, l2_regularizer.
+  """
+
+  def __init__(self, params, name='mlp', **kwargs):
+    super(MLP, self).__init__(name=name, **kwargs)
+    params.check_required('hidden_units')
+    use_bn = params.get_or_default('use_bn', True)
+    use_final_bn = params.get_or_default('use_final_bn', True)
+    use_bias = params.get_or_default('use_bias', True)
+    dropout_rate = list(params.get_or_default('dropout_ratio', []))
+    activation = params.get_or_default('activation', 'relu')
+    initializer = params.get_or_default('initializer', 'he_uniform')
+    final_activation = params.get_or_default('final_activation', None)
+    use_bn_after_act = params.get_or_default('use_bn_after_activation', False)
+    units = list(params.hidden_units)
+    logging.info(
+        'MLP(%s) units: %s, dropout: %r, activate=%s, use_bn=%r, final_bn=%r,'
+        ' final_activate=%s, bias=%r, initializer=%s, bn_after_activation=%r',
+        name, units, dropout_rate, activation, use_bn, use_final_bn,
+        final_activation, use_bias, initializer, use_bn_after_act)
+
+    self._sub_layers = []
+    num_dropout = len(dropout_rate)
+
+    def rate_at(idx):
+      # layers beyond the end of `dropout_ratio` are built without dropout
+      return dropout_rate[idx] if idx < num_dropout else 0.0
+
+    for idx, width in enumerate(units[:-1]):
+      self.add_rich_layer(width, use_bn, rate_at(idx), activation, initializer,
+                          use_bias, use_bn_after_act, 'dnn_%d' % idx,
+                          params.l2_regularizer)
+
+    # the output layer has its own batch-norm switch and activation
+    last = len(units) - 1
+    final_rate = rate_at(last)
+    self.add_rich_layer(units[-1], use_final_bn, final_rate, final_activation,
+                        initializer, use_bias, use_bn_after_act,
+                        'dnn_%d' % last, params.l2_regularizer)
+
+  def add_rich_layer(self,
+                     num_units,
+                     use_bn,
+                     dropout_rate,
+                     activation,
+                     initializer,
+                     use_bias=True,
+                     use_bn_after_activation=False,
+                     name='mlp',
+                     l2_reg=None):
+    """Queues one dense layer plus its optional companions.
+
+    Order: dense, batch norm, activation when bn precedes the activation;
+    otherwise dense with fused activation, then batch norm if enabled.
+    A dropout layer follows when 0 < dropout_rate < 1; rates >= 1 raise
+    ValueError.
+    """
+
+    def batch_norm(x, training):
+      return tf.layers.batch_normalization(
+          x, training=training, name='%s/%s/bn' % (self.name, name))
+
+    act_fn = get_activation(activation)
+    # bn before activation needs a linear dense layer and a separate activation
+    bn_first = use_bn and not use_bn_after_activation
+    dense = tf.keras.layers.Dense(
+        units=num_units,
+        activation=None if bn_first else act_fn,
+        use_bias=use_bias,
+        kernel_initializer=initializer,
+        kernel_regularizer=l2_reg,
+        name=name)
+    self._sub_layers.append(dense)
+    if bn_first:
+      # a plain function is used instead of tf.keras.layers.BatchNormalization,
+      # which has a stale issue on some versions of tf
+      self._sub_layers.append(batch_norm)
+      act = tf.keras.layers.Activation(act_fn, name='%s/act' % name)
+      self._sub_layers.append(act)
+    elif use_bn:
+      self._sub_layers.append(batch_norm)
+
+    if dropout_rate >= 1.0:
+      raise ValueError('invalid dropout_ratio: %.3f' % dropout_rate)
+    if dropout_rate > 0.0:
+      dropout = tf.keras.layers.Dropout(dropout_rate, name='%s/dropout' % name)
+      self._sub_layers.append(dropout)
+
+  def call(self, x, training=None, **kwargs):
+    """Feeds `x` through the sub layers in the order they were added."""
+    from inspect import isfunction
+    takes_training = ('Dropout', 'BatchNormalization')
+    for layer in self._sub_layers:
+      if isfunction(layer) or layer.__class__.__name__ in takes_training:
+        x = layer(x, training=training)
+      else:
+        x = layer(x)
+    return x
+
+
+class Highway(tf.keras.layers.Layer):
+  """Keras wrapper of `common_layers.highway`; dropout only while training."""
+
+  def __init__(self, params, name='highway', **kwargs):
+    super(Highway, self).__init__(name=name, **kwargs)
+    params.check_required('emb_size')
+    self.emb_size = params.emb_size
+    self.num_layers = params.get_or_default('num_layers', 1)
+    self.activation = params.get_or_default('activation', 'gelu')
+    self.dropout_rate = params.get_or_default('dropout_rate', 0.0)
+
+  def call(self, inputs, training=None, **kwargs):
+    from easy_rec.python.layers.common_layers import highway
+    return highway(
+        inputs, self.emb_size,
+        activation=self.activation,
+        num_layers=self.num_layers,
+        dropout=self.dropout_rate if training else 0.0)
+
+
+class Gate(tf.keras.layers.Layer):
+  """Weighted sum gate.
+
+  `inputs[weight_index]` holds the weights; column j of it scales the j-th
+  of the remaining inputs before they are summed.
+  """
+
+  def __init__(self, params, name='gate', **kwargs):
+    super(Gate, self).__init__(name=name, **kwargs)
+    self.weight_index = params.get_or_default('weight_index', 0)
+
+  def call(self, inputs, **kwargs):
+    assert len(inputs) > 1, 'input of Gate layer must be a list containing at least 2 elements'
+    weights = inputs[self.weight_index]
+    # every input except the weight tensor, in its original order
+    others = [x for i, x in enumerate(inputs) if i != self.weight_index]
+    output = weights[:, 0, None] * others[0]
+    for j, x in enumerate(others[1:], 1):
+      output += weights[:, j, None] * x
+    return output
diff --git a/easy_rec/python/layers/keras/bst.py b/easy_rec/python/layers/keras/bst.py
new file mode 100644
index 000000000..f8b876fb4
--- /dev/null
+++ b/easy_rec/python/layers/keras/bst.py
@@ -0,0 +1,122 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
+
+from easy_rec.python.input.augment import input_aug_data
+from easy_rec.python.layers import multihead_cross_attention
+from easy_rec.python.loss.nce_loss import nce_loss
+from easy_rec.python.utils.activation import get_activation
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+class BST(Layer):
+  """Transformer encoder over sequence features and an optional target item."""
+
+  def __init__(self, params, name='bst', l2_reg=None, **kwargs):
+    super(BST, self).__init__(name=name, **kwargs)
+    self.l2_reg = l2_reg
+    self.config = params.get_pb_config()
+
+  def encode(self, seq_input, max_position, training=True):
+    """Returns the encoder output at position 0; dropout only if `training`."""
+    seq_fea = multihead_cross_attention.embedding_postprocessor(
+        seq_input,
+        position_embedding_name=self.name + '/position_embeddings',
+        max_position_embeddings=max_position,
+        reuse_position_embedding=tf.AUTO_REUSE)
+
+    n = tf.count_nonzero(seq_input, axis=-1)
+    seq_mask = tf.cast(n > 0, tf.int32)
+    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
+        from_tensor=seq_fea, to_mask=seq_mask)
+
+    # dropout rates are picked per call; the config itself is never modified
+    cfg = self.config
+    hidden_dropout = cfg.hidden_dropout_prob if training else 0.0
+    attention_dropout = cfg.attention_probs_dropout_prob if training else 0.0
+    hidden_act = get_activation(cfg.hidden_act)
+    attention_fea = multihead_cross_attention.transformer_encoder(
+        seq_fea,
+        hidden_size=cfg.hidden_size,
+        num_hidden_layers=cfg.num_hidden_layers,
+        num_attention_heads=cfg.num_attention_heads,
+        attention_mask=attention_mask,
+        intermediate_size=cfg.intermediate_size,
+        intermediate_act_fn=hidden_act,
+        hidden_dropout_prob=hidden_dropout,
+        attention_probs_dropout_prob=attention_dropout,
+        initializer_range=cfg.initializer_range,
+        name=self.name + '/transformer',
+        reuse=tf.AUTO_REUSE)
+    # attention_fea shape: [batch_size, seq_length, hidden_size]
+    out_fea = attention_fea[:, 0, :]  # target feature
+    print('bst output shape:', out_fea.shape)
+    return out_fea
+
+  def call(self, inputs, training=None, **kwargs):
+    """inputs: (seq_features, target_features); returns the encoded feature."""
+    seq_features, target_features = inputs
+    assert len(seq_features) > 0, '[%s] sequence feature is empty' % self.name
+
+    seq_embeds = [seq_fea for seq_fea, _ in seq_features]
+    max_position = self.config.max_position_embeddings
+    # max_seq_len: the max sequence length in current mini-batch, all sequences are padded to this length
+    batch_size, max_seq_len, _ = get_shape_list(seq_features[0][0], 3)
+    valid_len = tf.assert_less_equal(
+        max_seq_len,
+        max_position,
+        message='sequence length is greater than `max_position_embeddings`:' +
+        str(max_position) + ' in feature group:' + self.name)
+    with tf.control_dependencies([valid_len]):
+      # seq_input: [batch_size, seq_len, embed_size]
+      seq_input = tf.concat(seq_embeds, axis=-1)
+    if len(target_features) > 0:
+      max_position += 1
+
+    seq_embed_size = seq_input.shape.as_list()[-1]
+    if seq_embed_size != self.config.hidden_size:
+      seq_input = tf.layers.dense(
+          seq_input,
+          self.config.hidden_size,
+          activation=tf.nn.relu,
+          kernel_regularizer=self.l2_reg)
+
+    # seq_len: [batch_size, 1], the true length of each sequence
+    seq_len = seq_features[0][1]
+    if self.config.need_contrastive_learning:
+      assert 'loss_dict' in kwargs, 'no `loss_dict` in kwargs of bst layer: %s' % self.name
+      loss = self.contrastive_loss(seq_input, seq_len, max_position, training)
+      if self.config.auto_contrastive_loss_weight:
+        uncertainty = tf.Variable(
+            0, name='%s_contrastive_loss_weight' % self.name, dtype=tf.float32)
+        loss = tf.exp(-uncertainty) * loss + 0.5 * uncertainty
+      else:
+        loss *= self.config.contrastive_loss_weight
+      loss_dict = kwargs['loss_dict']
+      loss_dict['%s_contrastive_loss' % self.name] = loss
+
+    if len(target_features) > 0:
+      target_feature = tf.concat(target_features, axis=-1)
+      target_size = target_feature.shape.as_list()[-1]
+      assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
+                                            ' in feature group:' + self.name
+      if target_size != self.config.hidden_size:
+        target_feature = tf.layers.dense(
+            target_feature,
+            self.config.hidden_size,
+            activation=tf.nn.relu,
+            kernel_regularizer=self.l2_reg)
+      # target_feature: [batch_size, 1, embed_size]
+      target_feature = tf.expand_dims(target_feature, 1)
+      # seq_input: [batch_size, seq_len+1, embed_size]
+      seq_input = tf.concat([target_feature, seq_input], axis=1)
+
+    return self.encode(seq_input, max_position, training)
+
+  def contrastive_loss(self, seq_input, seq_len, max_position, training=True):
+    """NCE loss between the encodings of two augmented views of the input."""
+    aug_seq1, aug_seq2, aug_len1, aug_len2 = input_aug_data(seq_input, seq_len)
+    seq_output1 = self.encode(aug_seq1, max_position, training)
+    seq_output2 = self.encode(aug_seq2, max_position, training)
+    return nce_loss(seq_output1, seq_output2)
diff --git a/easy_rec/python/layers/keras/din.py b/easy_rec/python/layers/keras/din.py
new file mode 100644
index 000000000..cee57ac90
--- /dev/null
+++ b/easy_rec/python/layers/keras/din.py
@@ -0,0 +1,73 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
+
+from easy_rec.python.layers import dnn
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+class DIN(Layer):
+  """DIN-style target attention: pools the sequence embeddings with scores computed against the target item."""
+  def __init__(self, params, name='din', l2_reg=None, **kwargs):
+    super(DIN, self).__init__(name=name, **kwargs)
+    self.l2_reg = l2_reg
+    self.config = params.get_pb_config()
+
+  def call(self, inputs, training=None, **kwargs):
+    seq_features, target_features = inputs  # [(seq_emb, seq_len), ...], [target_emb, ...]
+    assert len(seq_features) > 0, '[%s] sequence feature is empty' % self.name
+    assert len(target_features) > 0, '[%s] target feature is empty' % self.name
+    # query: concatenated target embeddings; keys: concatenated sequence embeddings
+    query = tf.concat(target_features, axis=-1)
+    seq_input = [seq_fea for seq_fea, _ in seq_features]
+    keys = tf.concat(seq_input, axis=-1)
+
+    query_emb_size = int(query.shape[-1])
+    seq_emb_size = keys.shape.as_list()[-1]
+    if query_emb_size != seq_emb_size:
+      logging.info(
+          '<din> the embedding size of sequence [%d] and target item [%d] is not equal'
+          ' in feature group: %s', seq_emb_size, query_emb_size, self.name)
+      if query_emb_size < seq_emb_size:
+        query = tf.pad(query, [[0, 0], [0, seq_emb_size - query_emb_size]])
+      else:
+        assert False, 'the embedding size of target item is larger than the one of sequence'
+
+    batch_size, max_seq_len, _ = get_shape_list(keys, 3)
+    queries = tf.tile(tf.expand_dims(query, 1), [1, max_seq_len, 1])
+    din_all = tf.concat([queries, keys, queries - keys, queries * keys],
+                        axis=-1)
+    din_layer = dnn.DNN(
+        self.config.attention_dnn,
+        self.l2_reg,
+        self.name + '/din_attention',
+        training,
+        last_layer_no_activation=True,
+        last_layer_no_batch_norm=True)
+    output = din_layer(din_all)  # [B, L, 1]
+    scores = tf.transpose(output, [0, 2, 1])  # [B, 1, L]
+    # positions at or beyond seq_len are replaced by a large negative score before normalization
+    seq_len = seq_features[0][1]
+    seq_mask = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.bool)
+    seq_mask = tf.expand_dims(seq_mask, 1)
+    paddings = tf.ones_like(scores) * (-2**32 + 1)
+    scores = tf.where(seq_mask, scores, paddings)  # [B, 1, L]
+    if self.config.attention_normalizer == 'softmax':
+      scores = tf.nn.softmax(scores)  # (B, 1, L)
+    elif self.config.attention_normalizer == 'sigmoid':
+      scores = scores / (seq_emb_size**0.5)
+      scores = tf.nn.sigmoid(scores)
+    else:
+      raise ValueError('unsupported attention normalizer: ' +
+                       self.config.attention_normalizer)
+
+    if query_emb_size < seq_emb_size:
+      keys = keys[:, :, :query_emb_size]  # [B, L, E]
+    output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
+    if self.config.need_target_feature:
+      output = tf.concat([output, query], axis=-1)
+    logging.info('din output shape: %s', output.shape)  # was a debug print() to stdout
+    return output
diff --git a/easy_rec/python/layers/keras/fibinet.py b/easy_rec/python/layers/keras/fibinet.py
new file mode 100644
index 000000000..98cdb3179
--- /dev/null
+++ b/easy_rec/python/layers/keras/fibinet.py
@@ -0,0 +1,245 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import itertools
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.layers.common_layers import layer_norm
+from easy_rec.python.layers.keras.blocks import MLP
+from easy_rec.python.layers.utils import Parameter
+
+if int(tf.__version__.split('.')[0]) >= 2:  # numeric major version; a string compare misorders '10.x'
+  tf = tf.compat.v1
+
+
+class SENet(tf.keras.layers.Layer):
+  """SENET Layer used in FiBiNET.
+
+  Input shape
+    - A list of 2D tensor with shape: ``(batch_size,embedding_size)``.
+      The ``embedding_size`` of each field can have different value.
+
+  Output shape
+    - A 2D tensor with shape: ``(batch_size,sum_of_embedding_size)``.
+
+  References:
+    1. [FiBiNET](https://arxiv.org/pdf/1905.09433.pdf)
+      Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction
+    2. [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
+  """
+
+  def __init__(self, params, name='SENet', **kwargs):
+    super(SENet, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    self.config = params.get_pb_config()
+
+  def call(self, inputs, **kwargs):
+    g = self.config.num_squeeze_group
+    for emb in inputs:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      dim = int(emb.shape[-1])
+      assert dim >= g and dim % g == 0, 'field embedding dimension %d must be divisible by %d' % (
+          dim, g)
+
+    field_size = len(inputs)
+    feature_size_list = [emb.shape.as_list()[-1] for emb in inputs]
+
+    # Squeeze
+    # the embedding dimension must be divisible by g
+    group_embs = [
+        tf.reshape(emb, [-1, g, int(emb.shape[-1]) // g]) for emb in inputs
+    ]
+
+    squeezed = []
+    for emb in group_embs:
+      squeezed.append(tf.reduce_max(emb, axis=-1))  # [B, g]
+      squeezed.append(tf.reduce_mean(emb, axis=-1))  # [B, g]
+    z = tf.concat(squeezed, axis=1)  # [bs, field_size * num_groups * 2]
+
+    # Excitation
+    r = self.config.reduction_ratio
+    reduction_size = max(1, field_size * g * 2 // r)
+
+    initializer = tf.glorot_normal_initializer()
+    a1 = tf.layers.dense(
+        z,
+        reduction_size,
+        kernel_initializer=initializer,
+        activation=tf.nn.relu,
+        name='%s/W1' % self.name)
+    weights = tf.layers.dense(
+        a1,
+        sum(feature_size_list),
+        kernel_initializer=initializer,
+        name='%s/W2' % self.name)
+
+    # Re-weight
+    inputs = tf.concat(inputs, axis=-1)
+    output = inputs * weights
+
+    # Fuse, add skip-connection
+    if self.config.use_skip_connection:
+      output += inputs
+
+    # Layer Normalization
+    if self.config.use_output_layer_norm:
+      output = layer_norm(output)
+    return output
+
+
+def _full_interaction(v_i, v_j):
+  # per-sample inner product: [bs, 1, dim] x [bs, dim, 1] -> [bs, 1, 1], squeezed to [bs, 1]
+  row = tf.expand_dims(v_i, axis=1)
+  col = tf.expand_dims(v_j, axis=-1)
+  return tf.squeeze(tf.matmul(row, col), axis=1)
+
+
+class BiLinear(tf.keras.layers.Layer):
+  """BilinearInteraction Layer used in FiBiNET.
+
+  Input shape
+    - A list of 2D tensor with shape: ``(batch_size,embedding_size)``.
+      Its length is ``field_size``.
+      The ``embedding_size`` of each field can have different value.
+
+  Output shape
+    - 2D tensor with shape: ``(batch_size,output_size)``.
+
+  Attributes:
+    num_output_units: the number of output units
+    type: ['all', 'each', 'interaction'], types of bilinear functions used in this layer
+    use_plus: whether to use bi-linear+
+
+  References:
+    1. [FiBiNET](https://arxiv.org/pdf/1905.09433.pdf)
+      Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction
+    2. [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
+  """
+
+  def __init__(self, params, name='bilinear', **kwargs):
+    super(BiLinear, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    params.check_required(['num_output_units'])
+    bilinear_plus = params.get_or_default('use_plus', True)
+    self.bilinear_type = params.get_or_default('type', 'interaction').lower()
+    self.output_size = params.num_output_units
+
+    if self.bilinear_type not in ['all', 'each', 'interaction']:
+      raise NotImplementedError(
+          "bilinear_type only support: ['all', 'each', 'interaction']")
+
+    if bilinear_plus:
+      self.func = _full_interaction
+    else:
+      self.func = tf.multiply
+
+  def call(self, inputs, **kwargs):
+    embeddings = inputs
+    logging.info('Bilinear Layer with %d inputs' % len(embeddings))
+    if len(embeddings) > 200:
+      logging.warning('There are too many inputs for bilinear layer: %d' %
+                      len(embeddings))
+    equal_dim = True
+    _dim = embeddings[0].shape[-1]
+    for emb in embeddings:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      if emb.shape[-1] != _dim:
+        equal_dim = False
+    if not equal_dim and self.bilinear_type != 'interaction':
+      raise ValueError(
+          'all embedding dimensions must be same when not use bilinear type: interaction'
+      )
+    dim = int(_dim)
+
+    field_size = len(embeddings)
+    initializer = tf.glorot_normal_initializer()
+
+    # bi-linear+: the shape of p is [bs, f*(f-1)/2]
+    # bi-linear:
+    # when equal_dim=True, the shape of p is [bs, f*(f-1)/2*k], where k is the embedding size
+    # when equal_dim=False, the shape of p is [bs, (k_2+k_3+...+k_f)+...+(k_i+k_{i+1}+...+k_f)+...+k_f],
+    # where k_i is the embedding size of the i-th field
+    if self.bilinear_type == 'all':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/all' % self.name,
+              reuse=tf.AUTO_REUSE) for v_i in embeddings[:-1]
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    elif self.bilinear_type == 'each':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/each_%d' % (self.name, i),
+              reuse=tf.AUTO_REUSE) for i, v_i in enumerate(embeddings[:-1])
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    else:  # interaction
+      p = [
+          self.func(
+              tf.layers.dense(
+                  embeddings[i],
+                  embeddings[j].shape.as_list()[-1],
+                  kernel_initializer=initializer,
+                  name='%s/interaction_%d_%d' % (self.name, i, j),
+                  reuse=tf.AUTO_REUSE), embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+
+    output = tf.layers.dense(
+        tf.concat(p, axis=-1), self.output_size, kernel_initializer=initializer)
+    return output
+
+
+class FiBiNet(tf.keras.layers.Layer):
+  """FiBiNet++:Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction.
+
+  References:
+    - [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
+  """
+
+  def __init__(self, params, name='fibinet', **kwargs):
+    super(FiBiNet, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    self._config = params.get_pb_config()
+    if self._config.HasField('mlp'):
+      p = Parameter.make_from_pb(self._config.mlp)
+      p.l2_regularizer = params.l2_regularizer
+      self.final_mlp = MLP(p, name=name)
+    else:
+      self.final_mlp = None
+
+  def call(self, inputs, training=None, **kwargs):
+    feature_list = []
+    # the SENet branch always runs; the bilinear branch only when it is configured
+    params = Parameter.make_from_pb(self._config.senet)
+    senet = SENet(params, name='%s/senet' % self.name)
+    senet_output = senet(inputs)
+    feature_list.append(senet_output)
+
+    if self._config.HasField('bilinear'):
+      params = Parameter.make_from_pb(self._config.bilinear)
+      bilinear = BiLinear(params, name='%s/bilinear' % self.name)
+      bilinear_output = bilinear(inputs)
+      feature_list.append(bilinear_output)
+
+    if len(feature_list) > 1:
+      feature = tf.concat(feature_list, axis=-1)
+    else:
+      feature = feature_list[0]
+
+    if self.final_mlp is not None:
+      feature = self.final_mlp(feature, training=training)
+    return feature
diff --git a/easy_rec/python/layers/keras/interaction.py b/easy_rec/python/layers/keras/interaction.py
new file mode 100644
index 000000000..55f56f7a1
--- /dev/null
+++ b/easy_rec/python/layers/keras/interaction.py
@@ -0,0 +1,312 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+from easy_rec.python.utils.activation import get_activation
+
+
+class FM(tf.keras.layers.Layer):
+  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
+
+  References
+    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+  Input shape.
+    - List (or tuple) of 2D tensor with shape: ``(batch_size,embedding_size)``.
+    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+  Output shape
+    - 2D tensor with shape: ``(batch_size, 1)``, or ``(batch_size,embedding_size)`` if `use_variant` is set.
+  """
+
+  def __init__(self, params, name='fm', **kwargs):
+    super(FM, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    self.use_variant = params.get_or_default('use_variant', False)
+
+  def call(self, inputs, **kwargs):
+    if isinstance(inputs, (list, tuple)):
+      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
+      if len(emb_dims) != 1:
+        dims = ','.join([str(d) for d in emb_dims])
+        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
+      with tf.name_scope(self.name):
+        fea = tf.stack(inputs, axis=1)
+    else:
+      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
+      fea = inputs
+
+    with tf.name_scope(self.name):
+      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
+      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
+      cross_term = tf.subtract(square_of_sum, sum_of_square)
+      if self.use_variant:
+        cross_term = 0.5 * cross_term
+      else:
+        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1, keepdims=True)
+    return cross_term
+
+
+class DotInteraction(tf.keras.layers.Layer):
+  """Dot interaction layer of DLRM model.
+
+  See theory in the DLRM paper: https://arxiv.org/pdf/1906.00091.pdf,
+  section 2.1.3. Sparse activations and dense activations are combined.
+  Dot interaction is applied to a batch of input Tensors [e1,...,e_k] of the
+  same dimension and the output is a batch of Tensors with all distinct pairwise
+  dot products of the form dot(e_i, e_j) for i <= j if self_interaction is
+  True, otherwise dot(e_i, e_j) i < j.
+
+  Attributes:
+    self_interaction: Boolean indicating if features should self-interact.
+      If it is True, then the diagonal entries of the interaction matrix are
+      also taken.
+    skip_gather: An optimization flag. If it's set then the upper triangle part
+      of the dot interaction matrix dot(e_i, e_j) is set to 0. The resulting
+      activations will be of dimension [num_features * num_features] from which
+      half will be zeros. Otherwise activations will be only lower triangle part
+      of the interaction matrix. The latter saves space but is much slower.
+    name: String name of the layer.
+  """
+
+  def __init__(self, params, name=None, **kwargs):
+    super(DotInteraction, self).__init__(name=name, **kwargs)
+    self._self_interaction = params.get_or_default('self_interaction', False)
+    self._skip_gather = params.get_or_default('skip_gather', False)
+
+  def call(self, inputs, **kwargs):
+    """Performs the interaction operation on the tensors in the list.
+
+    The tensors represent as transformed dense features and embedded categorical
+    features.
+    Pre-condition: The tensors should all have the same shape.
+
+    Args:
+      inputs: List of features with shapes [batch_size, feature_dim].
+
+    Returns:
+      activations: Tensor representing interacted features. It has a dimension
+      `num_features * num_features` if skip_gather is True, otherwise
+      `num_features * (num_features + 1) / 2` if self_interaction is True and
+      `num_features * (num_features - 1) / 2` if self_interaction is False.
+    """
+    if isinstance(inputs, (list, tuple)):
+      # concat_features shape: batch_size, num_features, feature_dim
+      try:
+        concat_features = tf.stack(inputs, axis=1)
+      except (ValueError, tf.errors.InvalidArgumentError) as e:
+        raise ValueError('Input tensors` dimensions must be equal, original '
+                         'error message: {}'.format(e))
+    else:
+      assert inputs.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+      concat_features = inputs
+
+    batch_size = tf.shape(concat_features)[0]
+
+    # Interact features, select lower-triangular portion, and re-shape.
+    xactions = tf.matmul(concat_features, concat_features, transpose_b=True)
+    num_features = xactions.shape[-1]
+    ones = tf.ones_like(xactions)
+    if self._self_interaction:
+      # Selecting lower-triangular portion including the diagonal.
+      lower_tri_mask = tf.linalg.band_part(ones, -1, 0)
+      upper_tri_mask = ones - lower_tri_mask
+      out_dim = num_features * (num_features + 1) // 2
+    else:
+      # Selecting lower-triangular portion not included the diagonal.
+      upper_tri_mask = tf.linalg.band_part(ones, 0, -1)
+      lower_tri_mask = ones - upper_tri_mask
+      out_dim = num_features * (num_features - 1) // 2
+
+    if self._skip_gather:
+      # Setting upper triangle part of the interaction matrix to zeros.
+      activations = tf.where(
+          condition=tf.cast(upper_tri_mask, tf.bool),
+          x=tf.zeros_like(xactions),
+          y=xactions)
+      out_dim = num_features * num_features
+    else:
+      activations = tf.boolean_mask(xactions, lower_tri_mask)
+    activations = tf.reshape(activations, (batch_size, out_dim))
+    return activations
+
+
+class Cross(tf.keras.layers.Layer):
+  """Cross Layer in Deep & Cross Network to learn explicit feature interactions.
+
+  A layer that creates explicit and bounded-degree feature interactions
+  efficiently. The `call` method accepts `inputs` as a tuple of size 2
+  tensors. The first input `x0` is the base layer that contains the original
+  features (usually the embedding layer); the second input `xi` is the output
+  of the previous `Cross` layer in the stack, i.e., the i-th `Cross`
+  layer. For the first `Cross` layer in the stack, x0 = xi.
+
+  The output is x_{i+1} = x0 .* (W * xi + bias + diag_scale * xi) + xi,
+  where .* designates elementwise multiplication, W could be a full-rank
+  matrix, or a low-rank matrix U*V to reduce the computational cost, and
+  diag_scale increases the diagonal of W to improve training stability (
+  especially for the low-rank case).
+
+  References:
+      1. [R. Wang et al.](https://arxiv.org/pdf/2008.13535.pdf)
+        See Eq. (1) for full-rank and Eq. (2) for low-rank version.
+      2. [R. Wang et al.](https://arxiv.org/pdf/1708.05123.pdf)
+
+  Example:
+
+      ```python
+      # after embedding layer in a functional model; `params` must expose `get_or_default`:
+      input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64)
+      x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)(input)
+      x1 = Cross(params)((x0, x0))
+      x2 = Cross(params)((x0, x1))
+      logits = tf.keras.layers.Dense(units=10)(x2)
+      model = tf.keras.Model(input, logits)
+      ```
+
+  Args (keys read from `params`):
+      projection_dim: project dimension to reduce the computational cost.
+        Default is `None` such that a full (`input_dim` by `input_dim`) matrix
+        W is used. If enabled, a low-rank matrix W = U*V will be used, where U
+        is of size `input_dim` by `projection_dim` and V is of size
+        `projection_dim` by `input_dim`. `projection_dim` need to be smaller
+        than `input_dim`/2 to improve the model efficiency. In practice, we've
+        observed that `projection_dim` = d/4 consistently preserved the
+        accuracy of a full-rank version.
+      diag_scale: a non-negative float used to increase the diagonal of the
+        kernel W by `diag_scale`, that is, W + diag_scale * I, where I is an
+        identity matrix.
+      use_bias: whether to add a bias term for this layer. If set to False,
+        no bias term will be used.
+      preactivation: Activation applied to output matrix of the layer, before
+        multiplication with the input. Can be used to control the scale of the
+        layer's outputs and improve stability.
+      kernel_initializer: Initializer to use on the kernel matrix.
+      bias_initializer: Initializer to use on the bias vector.
+      kernel_regularizer: Regularizer to use on the kernel matrix.
+      bias_regularizer: Regularizer to use on bias vector.
+
+  Input shape: A tuple of 2 (batch_size, `input_dim`) dimensional inputs.
+  Output shape: A single (batch_size, `input_dim`) dimensional output.
+  """
+
+  def __init__(self, params, **kwargs):
+    super(Cross, self).__init__(**kwargs)
+    self._projection_dim = params.get_or_default('projection_dim', None)
+    self._diag_scale = params.get_or_default('diag_scale', 0.0)
+    self._use_bias = params.get_or_default('use_bias', True)
+    preactivation = params.get_or_default('preactivation', None)
+    preact = get_activation(preactivation)
+    self._preactivation = tf.keras.activations.get(preact)
+    kernel_initializer = params.get_or_default('kernel_initializer',
+                                               'truncated_normal')
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    bias_initializer = params.get_or_default('bias_initializer', 'zeros')
+    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
+    kernel_regularizer = params.get_or_default('kernel_regularizer', None)
+    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
+    bias_regularizer = params.get_or_default('bias_regularizer', None)
+    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
+    self._input_dim = None
+    self._supports_masking = True
+
+    if self._diag_scale < 0:  # pytype: disable=unsupported-operands
+      raise ValueError(
+          '`diag_scale` should be non-negative. Got `diag_scale` = {}'.format(
+              self._diag_scale))
+
+  def build(self, input_shape):
+    # `input_shape` is a pair of shapes for (x0, x) inputs, or one flat shape for a single tensor
+    first = input_shape[0]
+    last_dim = first[-1] if hasattr(first, '__getitem__') else input_shape[-1]
+
+    if self._projection_dim is None:
+      self._dense = tf.keras.layers.Dense(
+          last_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          bias_initializer=self._bias_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          use_bias=self._use_bias,
+          dtype=self.dtype,
+          activation=self._preactivation,
+      )
+    else:
+      self._dense_u = tf.keras.layers.Dense(
+          self._projection_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          kernel_regularizer=self._kernel_regularizer,
+          use_bias=False,
+          dtype=self.dtype,
+      )
+      self._dense_v = tf.keras.layers.Dense(
+          last_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          bias_initializer=self._bias_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          use_bias=self._use_bias,
+          dtype=self.dtype,
+          activation=self._preactivation,
+      )
+    self.built = True
+
+  def call(self, inputs, **kwargs):
+    """Computes the feature cross.
+
+    Args:
+      inputs: The input tensor(x0, x)
+      - x0: The input tensor
+      - x: Optional second input tensor. If provided, the layer will compute
+        crosses between x0 and x; if not provided, the layer will compute
+        crosses between x0 and itself.
+
+    Returns:
+     Tensor of crosses.
+    """
+    if isinstance(inputs, (list, tuple)):
+      x0, x = inputs
+    else:
+      x0, x = inputs, inputs
+
+    if not self.built:
+      self.build(x0.shape)
+
+    if x0.shape[-1] != x.shape[-1]:
+      raise ValueError(
+          '`x0` and `x` dimension mismatch! Got `x0` dimension {}, and x '
+          'dimension {}. This case is not supported yet.'.format(
+              x0.shape[-1], x.shape[-1]))
+
+    if self._projection_dim is None:
+      prod_output = self._dense(x)
+    else:
+      prod_output = self._dense_v(self._dense_u(x))
+
+    if self._diag_scale:
+      prod_output = prod_output + self._diag_scale * x
+
+    return x0 * prod_output + x
+
+  def get_config(self):
+    config = {
+        'projection_dim':
+            self._projection_dim,
+        'diag_scale':
+            self._diag_scale,
+        'use_bias':
+            self._use_bias,
+        'preactivation':
+            tf.keras.activations.serialize(self._preactivation),
+        'kernel_initializer':
+            tf.keras.initializers.serialize(self._kernel_initializer),
+        'bias_initializer':
+            tf.keras.initializers.serialize(self._bias_initializer),
+        'kernel_regularizer':
+            tf.keras.regularizers.serialize(self._kernel_regularizer),
+        'bias_regularizer':
+            tf.keras.regularizers.serialize(self._bias_regularizer),
+    }
+    base_config = super(Cross, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def _clone_initializer(initializer):  # builds a new instance from the given initializer's config
+  return type(initializer).from_config(initializer.get_config())
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
new file mode 100644
index 000000000..fa1503b11
--- /dev/null
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -0,0 +1,135 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+from easy_rec.python.layers.common_layers import layer_norm
+from easy_rec.python.layers.keras.blocks import MLP
+from easy_rec.python.layers.utils import Parameter
+
+
+class MaskBlock(tf.keras.layers.Layer):
+  """MaskBlock use in MaskNet.
+
+  Args:
+    projection_dim: project dimension to reduce the computational cost.
+    Default is `None` such that a full (`input_dim` by `aggregation_size`) matrix
+    W is used. If enabled, a low-rank matrix W = U*V will be used, where U
+    is of size `input_dim` by `projection_dim` and V is of size
+    `projection_dim` by `aggregation_size`. `projection_dim` need to be smaller
+    than `aggregation_size`/2 to improve the model efficiency. In practice, we've
+    observed that `projection_dim` = d/4 consistently preserved the
+    accuracy of a full-rank version.
+  """
+
+  def __init__(self, params, name='mask_block', reuse=None, **kwargs):
+    super(MaskBlock, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    self.config = params.get_pb_config()
+    self.l2_reg = params.l2_regularizer
+    self._projection_dim = params.get_or_default('projection_dim', None)
+    self.reuse = reuse
+
+  def call(self, inputs, **kwargs):
+    net, mask_input = inputs
+    mask_input_dim = int(mask_input.shape[-1])
+    if self.config.HasField('reduction_factor'):
+      aggregation_size = int(mask_input_dim * self.config.reduction_factor)
+    elif self.config.HasField('aggregation_size'):  # HasField returns a bool; `is not None` was always true
+      aggregation_size = self.config.aggregation_size
+    else:
+      raise ValueError(
+          'Need one of reduction factor or aggregation size for MaskBlock.')
+
+    if self.config.input_layer_norm:
+      input_name = net.name.replace(':', '_')
+      net = layer_norm(net, reuse=tf.AUTO_REUSE, name='ln_' + input_name)
+
+    # initializer = tf.initializers.variance_scaling()
+    initializer = tf.glorot_uniform_initializer()
+
+    if self._projection_dim is None:
+      mask = tf.layers.dense(
+        mask_input,
+        aggregation_size,
+        activation=tf.nn.relu,
+        kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
+        name='%s/hidden' % self.name,
+        reuse=self.reuse)
+    else:
+      u = tf.layers.dense(
+        mask_input,
+        self._projection_dim,
+        kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
+        use_bias=False,
+        name='%s/prj_u' % self.name,
+        reuse=self.reuse)
+      mask = tf.layers.dense(
+        u,
+        aggregation_size,
+        activation=tf.nn.relu,
+        kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
+        name='%s/prj_v' % self.name,
+        reuse=self.reuse)
+    mask = tf.layers.dense(
+        mask, net.shape[-1], name='%s/mask' % self.name, reuse=self.reuse)
+    masked_net = net * mask
+
+    output_size = self.config.output_size
+    hidden = tf.layers.dense(
+        masked_net,
+        output_size,
+        use_bias=False,
+        name='%s/output' % self.name,
+        reuse=self.reuse)
+    ln_hidden = layer_norm(
+        hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
+    return tf.nn.relu(ln_hidden)
+
+
+class MaskNet(tf.keras.layers.Layer):
+  """MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask.
+
+  Refer: https://arxiv.org/pdf/2102.07619.pdf
+  """
+
+  def __init__(self, params, name='mask_net', **kwargs):
+    super(MaskNet, self).__init__(name=name, **kwargs)  # keyword: Layer's 1st positional arg is `trainable`
+    self.params = params
+    self.config = params.get_pb_config()
+    if self.config.HasField('mlp'):
+      p = Parameter.make_from_pb(self.config.mlp)
+      p.l2_regularizer = params.l2_regularizer
+      self.mlp = MLP(p, name='%s/mlp' % name)
+    else:
+      self.mlp = None
+
+  def call(self, inputs, training=None, **kwargs):
+    if self.config.use_parallel:
+      mask_outputs = []  # parallel: every block masks the raw inputs; outputs are concatenated
+      for i, block_conf in enumerate(self.config.mask_blocks):
+        params = Parameter.make_from_pb(block_conf)
+        params.l2_regularizer = self.params.l2_regularizer
+        mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
+        mask_outputs.append(mask_layer((inputs, inputs)))
+      all_mask_outputs = tf.concat(mask_outputs, axis=1)
+
+      if self.mlp is not None:
+        output = self.mlp(all_mask_outputs, training=training)
+      else:
+        output = all_mask_outputs
+      return output
+    else:
+      net = inputs  # serial: each block consumes the previous block's output, masked by the raw inputs
+      for i, block_conf in enumerate(self.config.mask_blocks):
+        params = Parameter.make_from_pb(block_conf)
+        params.l2_regularizer = self.params.l2_regularizer
+        mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
+        net = mask_layer((net, inputs))
+
+      if self.mlp is not None:
+        output = self.mlp(net, training=training)
+      else:
+        output = net
+      return output
diff --git a/easy_rec/python/layers/keras/numerical_embedding.py b/easy_rec/python/layers/keras/numerical_embedding.py
new file mode 100644
index 000000000..4d6a16ca5
--- /dev/null
+++ b/easy_rec/python/layers/keras/numerical_embedding.py
@@ -0,0 +1,196 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+
+import tensorflow as tf
+
+from easy_rec.python.utils.activation import get_activation
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class NLinear(object):
+  """N separate linear layers, one for each of N token (feature) embeddings.
+
+  For comparison, recall `tf.layers.dense`: applied to a three-dimensional
+  input of shape ``(batch_size, n_tokens, d_embedding)``, it uses one and the
+  same linear transformation for every one of the ``n_tokens`` token (feature)
+  embeddings.
+
+  `NLinear` instead owns ``n_tokens`` linear layers, one per token, each
+  mapping ``d_in`` to ``d_out`` dimensions. The i-th transformation is applied
+  only to the i-th token embedding, as sketched in the following
+  pseudocode::
+
+      layers = [tf.layers.Dense(d_out) for _ in range(n_tokens)]
+      x = tf.random.normal([batch_size, n_tokens, d_in])
+      result = tf.stack([layers[i](x[:, i]) for i in range(n_tokens)], 1)
+
+  Examples:
+      .. testcode::
+
+          batch_size = 2
+          n_features = 3
+          d_embedding_in = 4
+          d_embedding_out = 5
+          x = tf.random.normal([batch_size, n_features, d_embedding_in])
+          m = NLinear(n_features, d_embedding_in, d_embedding_out)
+          assert m(x).shape == (batch_size, n_features, d_embedding_out)
+  """
+
+  def __init__(self, n_tokens, d_in, d_out, bias=True, scope='nd_linear'):
+    """Create the per-token weights (and biases) under `scope`.
+
+    Args:
+        n_tokens: the number of tokens (features)
+        d_in: the input dimension
+        d_out: the output dimension
+        bias: indicates if the underlying linear layers have biases
+        scope: variable scope name
+    """
+    self.bias = None
+    with tf.variable_scope(scope):
+      # One [d_in, d_out] kernel per token; the leading 1 broadcasts on batch.
+      self.weight = tf.get_variable(
+          name='weights', shape=[1, n_tokens, d_in, d_out], dtype=tf.float32)
+      if bias:
+        self.bias = tf.get_variable(
+            name='bias',
+            shape=[1, n_tokens, d_out],
+            dtype=tf.float32,
+            initializer=tf.constant_initializer(0.0))
+
+  def __call__(self, x, *args, **kwargs):
+    """Apply the i-th linear map to the i-th token of `x` ([B, N, D])."""
+    if x.shape.ndims != 3:
+      raise ValueError(
+          'The input must have three dimensions (batch_size, n_tokens, d_embedding)'
+      )
+    d_actual, d_expected = x.shape[2], self.weight.shape[2]
+    if d_actual != d_expected:
+      raise ValueError('invalid input embedding dimension %d, expect %d' %
+                       (int(d_actual), int(d_expected)))
+
+    # Broadcast to [B, N, D, D_out], then sum out the D axis -> [B, N, D_out].
+    out = tf.reduce_sum(tf.expand_dims(x, -1) * self.weight, axis=-2)
+    return out if self.bias is None else out + self.bias
+
+
+class PeriodicEmbedding(tf.keras.layers.Layer):
+  """Periodic embeddings for numerical features described in [1].
+
+  References:
+    * [1] Yury Gorishniy, Ivan Rubachev, Artem Babenko,
+    "On Embeddings for Numerical Features in Tabular Deep Learning", 2022
+    https://arxiv.org/pdf/2203.05556.pdf
+
+  Attributes:
+    embedding_dim: the embedding size, must be an even positive integer.
+    sigma: the scale of the weight initialization.
+      **This is a super important parameter which significantly affects performance**.
+      Its optimal value can be dramatically different for different datasets, so
+      no "default value" can exist for this parameter, and it must be tuned for
+      each dataset. In the original paper, during hyperparameter tuning, this
+      parameter was sampled from the distribution ``LogUniform[1e-2, 1e2]``.
+      A similar grid would be ``[1e-2, 1e-1, 1e0, 1e1, 1e2]``.
+      If possible, add more intermediate values to this grid.
+    output_3d_tensor: whether to also return the 3-D embedding tensor
+    output_tensor_list: whether to also return the list of feature embeddings
+  """
+
+  def __init__(self, params, name='periodic_embedding', **kwargs):
+    # `Layer.__init__` takes `trainable` first, so pass `name` by keyword.
+    super(PeriodicEmbedding, self).__init__(name=name, **kwargs)
+    params.check_required(['embedding_dim', 'sigma'])
+    self.embedding_dim = int(params.embedding_dim)
+    if self.embedding_dim % 2:
+      raise ValueError('embedding_dim must be even')
+    self.initializer = tf.random_normal_initializer(stddev=params.sigma)
+    self.add_linear_layer = params.get_or_default('add_linear_layer', True)
+    self.linear_activation = params.get_or_default('linear_activation', 'relu')
+    self.output_tensor_list = params.get_or_default('output_tensor_list', False)
+    self.output_3d_tensor = params.get_or_default('output_3d_tensor', False)
+
+  def call(self, inputs, **kwargs):
+    """Embed 2-D `inputs` [B, N] into a [B, N * embedding_dim] tensor."""
+    if inputs.shape.ndims != 2:
+      raise ValueError('inputs of PeriodicEmbedding must have 2 dimensions.')
+
+    num_features = int(inputs.shape[-1])
+    emb_dim = self.embedding_dim // 2
+    with tf.variable_scope(self.name):
+      c = tf.get_variable(
+          'coefficients',
+          shape=[1, num_features, emb_dim],
+          initializer=self.initializer)
+
+      features = inputs[..., None]  # [B, N, 1]
+      v = 2 * math.pi * c * features  # [B, N, E]
+      emb = tf.concat([tf.sin(v), tf.cos(v)], axis=-1)  # [B, N, 2E]
+
+      dim = self.embedding_dim
+      if self.add_linear_layer:
+        emb = NLinear(num_features, dim, dim)(emb)
+        act = get_activation(self.linear_activation)
+        if callable(act):
+          emb = act(emb)
+      output = tf.reshape(emb, [-1, num_features * dim])
+
+      if self.output_tensor_list:
+        return output, tf.unstack(emb, axis=1)
+      if self.output_3d_tensor:
+        return output, emb
+      return output
+
+
+class AutoDisEmbedding(tf.keras.layers.Layer):
+  """An Embedding Learning Framework for Numerical Features in CTR Prediction.
+
+  Refer: https://arxiv.org/pdf/2012.08986v2.pdf
+  """
+
+  def __init__(self, params, name='auto_dis_embedding', **kwargs):
+    # `Layer.__init__` takes `trainable` first, so pass `name` by keyword.
+    super(AutoDisEmbedding, self).__init__(name=name, **kwargs)
+    params.check_required(['embedding_dim', 'num_bins', 'temperature'])
+    self.emb_dim = int(params.embedding_dim)
+    self.num_bins = int(params.num_bins)
+    self.temperature = params.temperature
+    self.keep_prob = params.get_or_default('keep_prob', 0.8)
+    self.output_tensor_list = params.get_or_default('output_tensor_list', False)
+    self.output_3d_tensor = params.get_or_default('output_3d_tensor', False)
+
+  def call(self, inputs, **kwargs):
+    """Embed 2-D `inputs` [B, N] into a [B, N * embedding_dim] tensor."""
+    if inputs.shape.ndims != 2:
+      raise ValueError('inputs of AutoDisEmbedding must have 2 dimensions.')
+
+    num_features = int(inputs.shape[-1])
+    with tf.variable_scope(self.name):
+      meta_emb = tf.get_variable(
+          'meta_embedding',
+          shape=[1, num_features, self.num_bins, self.emb_dim])
+      w = tf.get_variable('project_w', shape=[1, num_features, self.num_bins])
+      mat = tf.get_variable(
+          'project_mat', shape=[1, num_features, self.num_bins, self.num_bins])
+
+      x = tf.expand_dims(inputs, axis=-1)  # [B, N, 1]
+      hidden = tf.nn.leaky_relu(w * x)  # [B, N, num_bin]
+
+      y = tf.matmul(mat, hidden[..., None])  # [B, N, num_bin, 1]
+      y = tf.squeeze(y, axis=3)  # [B, N, num_bin]
+
+      # `keep_prob` weights the skip connection from `hidden`.
+      x_bar = y + self.keep_prob * hidden  # [B, N, num_bin]
+      x_hat = tf.nn.softmax(x_bar / self.temperature)  # [B, N, num_bin]
+
+      emb = tf.matmul(x_hat[:, :, None, :], meta_emb)  # [B, N, 1, D]
+      emb = tf.squeeze(emb, axis=2)  # [B, N, D]
+      output = tf.reshape(emb, [-1, self.emb_dim * num_features])  # [B, N*D]
+
+      if self.output_tensor_list:
+        return output, tf.unstack(emb, axis=1)
+      if self.output_3d_tensor:
+        return output, emb
+      return output
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
new file mode 100644
index 000000000..24dab9754
--- /dev/null
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -0,0 +1,95 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.layers.keras.bst import BST
+from easy_rec.python.layers.keras.din import DIN
+from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class SequenceEncoder(object):
+
+  def __init__(self, input_layer, feature_configs, feature_groups_config,
+               l2_reg):
+    """Index configs by name and validate `force_share_embeddings` groups."""
+    self._input_layer = input_layer
+    self._feature_groups_config = {
+        x.group_name: x for x in feature_groups_config
+    }
+    self._l2_reg = l2_reg
+    self._feature_config_by_name = {
+        x.feature_name if x.HasField('feature_name') else x.input_names[0]: x
+        for x in feature_configs
+    }
+
+    for name, group in self._feature_groups_config.items():
+      force_share = any(
+          e.force_share_embeddings for e in group.sequence_encoders)
+      if force_share and not self.check_share_embedding(group):
+        raise ValueError(
+            'sequence feature group `%s` check share embedding failed, '
+            'you should add `embedding_name` to feature config' % name)
+
+  def check_share_embedding(self, feature_group):
+    """Check sequence and target features use the same `embedding_name` set."""
+    seq_emb_names = set()
+    target_emb_names = set()
+    for feature in feature_group.feature_names:
+      conf = self._feature_config_by_name[feature]
+      if not conf.HasField('embedding_name'):
+        return False
+      if conf.feature_type == FeatureConfig.FeatureType.SequenceFeature:
+        seq_emb_names.add(conf.embedding_name)
+      else:
+        target_emb_names.add(conf.embedding_name)
+
+    if seq_emb_names != target_emb_names:
+      tf.logging.error(
+          'sequence share embedding names: %s, target share embedding names: %s'
+          % (','.join(seq_emb_names), ','.join(target_emb_names)))
+      return False
+    return True
+
+  def __call__(self, features, group_name, is_training=True, *args, **kwargs):
+    """Encode the sequence features of feature group `group_name`.
+
+    Returns:
+      None if the group has no sequence encoder; the encoder output if there
+      is exactly one; otherwise all outputs concatenated on the last axis.
+    Raises:
+      ValueError: if an encoder type other than `bst` / `din` is configured.
+    """
+    group_config = self._feature_groups_config[group_name]
+    if len(group_config.sequence_encoders) == 0:
+      return None
+
+    seq_features, target_feature, _ = self._input_layer(
+        features, group_name, is_combine=False)
+    assert len(
+        seq_features) > 0, 'sequence feature is empty in group: ' + group_name
+    outputs = []
+    for encoder in group_config.sequence_encoders:
+      encoder_type = encoder.WhichOneof('encoder').lower()
+      if encoder_type == 'bst':
+        bst = BST(encoder.bst, self._l2_reg, name=group_name)
+        encoding = bst([seq_features, target_feature], is_training, **kwargs)
+      elif encoder_type == 'din':
+        din = DIN(encoder.din, self._l2_reg, name=group_name)
+        encoding = din([seq_features, target_feature], is_training)
+      else:
+        # Raise instead of `assert False`, which `python -O` would strip.
+        raise ValueError('unsupported sequence encode type: ' + encoder_type)
+      outputs.append(encoding)
+    if len(outputs) == 0:
+      logging.warning(
+          "there's no sequence encoder configured in feature group: " +
+          group_name)
+      return None
+    if len(outputs) == 1:
+      return outputs[0]
+    return tf.concat(outputs, axis=-1)
diff --git a/easy_rec/python/layers/uniter.py b/easy_rec/python/layers/uniter.py
index fa5c6a3ca..3018bad61 100644
--- a/easy_rec/python/layers/uniter.py
+++ b/easy_rec/python/layers/uniter.py
@@ -32,7 +32,8 @@ def __init__(self, model_config, feature_configs, features, uniter_config,
       tower_num += 1
     self._txt_seq_features = None
     if input_layer.has_group('text'):
-      self._txt_seq_features = input_layer(features, 'text', is_combine=False)
+      self._txt_seq_features, _, _ = input_layer(
+          features, 'text', is_combine=False)
       tower_num += 1
     self._use_token_type = True if tower_num > 1 else False
     self._other_features = None
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 43204241c..b95eef2fe 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -19,6 +19,7 @@
 
 import json
 
+from google.protobuf import struct_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import variables
@@ -158,3 +159,73 @@ def mark_input_src(name, src_desc):
                             'name': name,
                             'src': src_desc
                         }))
+
+
+class Parameter(object):
+  """Uniform accessor over either a pb message or a protobuf `Struct`."""
+
+  def __init__(self, params, is_struct, l2_reg=None):
+    self.params = params
+    self.is_struct = is_struct
+    self._l2_reg = l2_reg
+
+  @staticmethod
+  def make_from_pb(config):
+    return Parameter(config, False)
+
+  def get_pb_config(self):
+    assert not self.is_struct, 'Struct parameter can not convert to pb config'
+    return self.params
+
+  @property
+  def l2_regularizer(self):
+    return self._l2_reg
+
+  @l2_regularizer.setter
+  def l2_regularizer(self, value):
+    self._l2_reg = value
+
+  def __getattr__(self, key):
+    """Look up `key` in the wrapped params; nested Structs are re-wrapped."""
+    if self.is_struct:
+      value = self.params[key]
+      if isinstance(value, struct_pb2.Struct):
+        return Parameter(value, True, self._l2_reg)
+      return value
+    return getattr(self.params, key)
+
+  def __getitem__(self, key):
+    return self.__getattr__(key)
+
+  def get_or_default(self, key, def_val):
+    """Return the value of `key`, or `def_val` when it is unset."""
+    if self.is_struct:
+      if key not in self.params:
+        return def_val
+      value = self.params[key]
+      # Struct numbers are floats; cast them to the type of the default.
+      if def_val is not None and isinstance(value, float):
+        return type(def_val)(value)
+      return value
+    value = getattr(self.params, key)  # pb message
+    if hasattr(value, '__len__'):
+      if len(value) > 0:
+        return value
+    elif self.params.HasField(key):
+      return value
+    return def_val
+
+  def check_required(self, keys):
+    """Raise KeyError naming the first of `keys` missing from a Struct."""
+    if not self.is_struct:  # pb messages are not checked
+      return
+    if not isinstance(keys, (list, tuple)):
+      keys = [keys]
+    for key in keys:
+      if key not in self.params:
+        raise KeyError('%s must be set in params' % key)
+
+  def has_field(self, key):  # whether `key` is set in the wrapped params
+    if self.is_struct:
+      return key in self.params
+    return self.params.HasField(key)
diff --git a/easy_rec/python/loss/info_nce_loss.py b/easy_rec/python/loss/info_nce_loss.py
new file mode 100644
index 000000000..3fd6b6b18
--- /dev/null
+++ b/easy_rec/python/loss/info_nce_loss.py
@@ -0,0 +1,41 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def info_nce_loss(query, positive, temperature=0.1):
+  """Compute the in-batch InfoNCE loss for self-supervised learning.
+
+  Row i of `positive` is the positive key of row i of `query`; every other
+  row of `positive` serves as a negative key for that query. Similarities are
+  plain dot products divided by `temperature`, i.e. cosine similarities only
+  when both inputs are already L2-normalized.
+
+  Args:
+    query: 2-D `Tensor` of query embeddings.
+    positive: 2-D `Tensor` whose last dimension matches that of `query`.
+    temperature: divisor applied to the similarity logits.
+
+  Raises:
+    ValueError: if an input is not 2-D or the last dimensions differ.
+
+  References:
+      https://arxiv.org/abs/1807.03748v2
+      https://arxiv.org/abs/2010.05113
+  """
+  if query.shape.ndims != 2:
+    raise ValueError('<query> must have 2 dimensions.')
+  if positive.shape.ndims != 2:
+    raise ValueError('<positive> must have 2 dimensions.')
+  if query.shape[-1] != positive.shape[-1]:
+    raise ValueError(
+        'Vectors of <query> and <positive> should have the same number of components.'
+    )
+
+  # Pairwise similarity logits; entry (i, i) is the positive pair of query i.
+  logits = tf.matmul(query, positive, transpose_b=True) / temperature
+  labels = tf.range(tf.shape(query)[0])
+  return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index fc8266b2c..778068e7e 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -12,7 +12,9 @@ def jrc_loss(labels,
              logits,
              session_ids,
              alpha=0.5,
-             auto_weight=False,
+             loss_weight_strategy='fixed',
+             sample_weights=1.0,
+             same_label_loss=True,
              name=''):
   """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
 
@@ -23,14 +25,18 @@ def jrc_loss(labels,
     logits: a `Tensor` with shape [batch_size, 2]. e.g. the value of last neuron before activation.
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     alpha: the weight to balance ranking loss and calibration loss
-    auto_weight: bool, whether to learn loss weight between ranking loss and calibration loss
+    loss_weight_strategy: str, the loss weight strategy to balancing between ce_loss and ge_loss
+    sample_weights: Coefficients for the loss. This must be scalar or broadcastable to
+      `labels` (i.e. same rank and each dimension is either 1 or the same).
+    same_label_loss: enable ge_loss for sample with same label in a session or not.
     name: the name of loss
   """
   loss_name = name if name else 'jrc_loss'
-  logging.info('[{}] alpha: {}, auto_weight: {}'.format(loss_name, alpha,
-                                                        auto_weight))
+  logging.info('[{}] alpha: {}, loss_weight_strategy: {}'.format(
+      loss_name, alpha, loss_weight_strategy))
 
-  ce_loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
+  ce_loss = tf.losses.sparse_softmax_cross_entropy(
+      labels, logits, weights=sample_weights)
 
   labels = tf.expand_dims(labels, 1)  # [B, 1]
   labels = tf.concat([1 - labels, labels], axis=1)  # [B, 2]
@@ -54,13 +60,56 @@ def jrc_loss(labels,
   y_neg, y_pos = y[:, :, 0], y[:, :, 1]
   l_neg, l_pos = logits[:, :, 0], logits[:, :, 1]
 
+  if tf.is_numeric_tensor(sample_weights):
+    logging.info('[%s] use sample weight' % loss_name)
+    weights = tf.expand_dims(tf.cast(sample_weights, tf.float32), 0)
+    pairwise_weights = tf.tile(weights, tf.stack([batch_size, 1]))
+    y_pos *= pairwise_weights
+    y_neg *= pairwise_weights
+
   # Compute list-wise generative loss -log p(x|y, z)
-  loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
-  loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
-  ge_loss = tf.reduce_mean((loss_pos + loss_neg) / tf.reduce_sum(mask, axis=0))
+  if same_label_loss:
+    logging.info('[%s] enable same_label_loss' % loss_name)
+    loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
+    loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
+    ge_loss = tf.reduce_mean(
+        (loss_pos + loss_neg) / tf.reduce_sum(mask, axis=0))
+  else:
+    logging.info('[%s] disable same_label_loss' % loss_name)
+    diag = tf.one_hot(tf.range(batch_size), batch_size)
+    l_pos = l_pos + (1 - diag) * y_pos * -1e9
+    l_neg = l_neg + (1 - diag) * y_neg * -1e9
+    loss_pos = -tf.linalg.diag_part(y_pos * tf.nn.log_softmax(l_pos, axis=0))
+    loss_neg = -tf.linalg.diag_part(y_neg * tf.nn.log_softmax(l_neg, axis=0))
+    ge_loss = tf.reduce_mean(loss_pos + loss_neg)
+
+  tf.summary.scalar('loss/%s_ce' % loss_name, ce_loss)
+  tf.summary.scalar('loss/%s_ge' % loss_name, ge_loss)
 
   # The final JRC model
-  if auto_weight:
+  if loss_weight_strategy == 'fixed':
+    loss = alpha * ce_loss + (1 - alpha) * ge_loss
+  elif loss_weight_strategy == 'random_uniform':
+    weight = tf.random_uniform([])
+    loss = weight * ce_loss + (1 - weight) * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, weight)
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, 1 - weight)
+  elif loss_weight_strategy == 'random_normal':
+    weights = tf.random_normal([2])
+    loss_weight = tf.nn.softmax(weights)
+    loss = loss_weight[0] * ce_loss + loss_weight[1] * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, loss_weight[0])
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, loss_weight[1])
+  elif loss_weight_strategy == 'random_bernoulli':
+    bern = tf.distributions.Bernoulli(probs=0.5, dtype=tf.float32)
+    weights = bern.sample(2)
+    loss_weight = tf.cond(
+        tf.equal(tf.reduce_sum(weights), 1), lambda: weights,
+        lambda: tf.convert_to_tensor([0.5, 0.5]))
+    loss = loss_weight[0] * ce_loss + loss_weight[1] * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, loss_weight[0])
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, loss_weight[1])
+  elif loss_weight_strategy == 'uncertainty':
     uncertainty1 = tf.Variable(
         0, name='%s_ranking_loss_weight' % loss_name, dtype=tf.float32)
     tf.summary.scalar('loss/%s_ranking_uncertainty' % loss_name, uncertainty1)
@@ -71,5 +120,6 @@ def jrc_loss(labels,
     loss = tf.exp(-uncertainty1) * ce_loss + 0.5 * uncertainty1
     loss += tf.exp(-uncertainty2) * ge_loss + 0.5 * uncertainty2
   else:
-    loss = alpha * ce_loss + (1 - alpha) * ge_loss
+    raise ValueError('Unsupported loss weight strategy `%s` for jrc loss' %
+                     loss_weight_strategy)
   return loss
diff --git a/easy_rec/python/loss/nce_loss.py b/easy_rec/python/loss/nce_loss.py
new file mode 100644
index 000000000..f2e406d20
--- /dev/null
+++ b/easy_rec/python/loss/nce_loss.py
@@ -0,0 +1,39 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import tensorflow as tf
+
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+def mask_samples(batch_size):
+  """Build a [2B, 2B] bool mask that is False where row % B == col % B."""
+  all_true = tf.ones((batch_size, batch_size), bool)
+  # Clear the diagonal of the B x B block.
+  no_diag = tf.fill(tf.shape(tf.linalg.diag_part(all_true)), False)
+  block = tf.linalg.set_diag(all_true, no_diag)
+  # Repeat the block twice along both axes.
+  return tf.tile(block, [2, 2])
+
+
+def nce_loss(z_i, z_j, temp=1):
+  """Contrastive loss over paired embedding batches `z_i` and `z_j`.
+
+  Row k of `z_i` and row k of `z_j` form a positive pair (logit column 0).
+  """
+  batch_size = get_shape_list(z_i)[0]
+  N = 2 * batch_size
+  z = tf.concat((z_i, z_j), axis=0)
+  sim = tf.matmul(z, tf.transpose(z)) / temp
+  # `tf.linalg.diag_part` (as in `mask_samples`) is available in TF1 and TF2.
+  sim_i_j = tf.linalg.diag_part(
+      tf.slice(sim, [batch_size, 0], [batch_size, batch_size]))
+  sim_j_i = tf.linalg.diag_part(
+      tf.slice(sim, [0, batch_size], [batch_size, batch_size]))
+  positive_samples = tf.reshape(tf.concat((sim_i_j, sim_j_i), axis=0), (N, 1))
+  negative_samples = tf.reshape(
+      tf.boolean_mask(sim, mask_samples(batch_size)), (N, -1))
+  logits = tf.concat((positive_samples, negative_samples), axis=1)
+  return tf.reduce_mean(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=tf.zeros(N, dtype=tf.int32), logits=logits))
diff --git a/easy_rec/python/model/collaborative_metric_learning.py b/easy_rec/python/model/collaborative_metric_learning.py
index d785e7141..b19537239 100644
--- a/easy_rec/python/model/collaborative_metric_learning.py
+++ b/easy_rec/python/model/collaborative_metric_learning.py
@@ -48,21 +48,22 @@ def __init__(
       raise ValueError('unsupported loss type: %s' %
                        LossType.Name(self._loss_type))
 
-    self._highway_features = {}
-    self._highway_num = len(self._model_config.highway)
-    for _id in range(self._highway_num):
-      highway_cfg = self._model_config.highway[_id]
-      highway_feature, _ = self._input_layer(self._feature_dict,
-                                             highway_cfg.input)
-      self._highway_features[highway_cfg.input] = highway_feature
-
-    self.input_features = []
-    if self._model_config.HasField('input'):
-      input_feature, _ = self._input_layer(self._feature_dict,
-                                           self._model_config.input)
-      self.input_features.append(input_feature)
-
-    self.dnn = copy_obj(self._model_config.dnn)
+    if not self.has_backbone:
+      self._highway_features = {}
+      self._highway_num = len(self._model_config.highway)
+      for _id in range(self._highway_num):
+        highway_cfg = self._model_config.highway[_id]
+        highway_feature, _ = self._input_layer(self._feature_dict,
+                                               highway_cfg.input)
+        self._highway_features[highway_cfg.input] = highway_feature
+
+      self.input_features = []
+      if self._model_config.HasField('input'):
+        input_feature, _ = self._input_layer(self._feature_dict,
+                                             self._model_config.input)
+        self.input_features.append(input_feature)
+
+      self.dnn = copy_obj(self._model_config.dnn)
 
     if self._labels is not None:
       if self._model_config.HasField('session_id'):
@@ -79,32 +80,35 @@ def __init__(
       self.sample_id = None
 
   def build_predict_graph(self):
-    for _id in range(self._highway_num):
-      highway_cfg = self._model_config.highway[_id]
-      highway_fea = tf.layers.batch_normalization(
-          self._highway_features[highway_cfg.input],
-          training=self._is_training,
-          trainable=True,
-          name='highway_%s_bn' % highway_cfg.input)
-      highway_fea = highway(
-          highway_fea,
-          highway_cfg.emb_size,
-          activation=gelu,
-          scope='highway_%s' % _id)
-      print('highway_fea: ', highway_fea)
-      self.input_features.append(highway_fea)
-
-    feature = tf.concat(self.input_features, axis=1)
-
-    num_dnn_layer = len(self.dnn.hidden_units)
-    last_hidden = self.dnn.hidden_units.pop()
-    dnn_net = dnn.DNN(self.dnn, self._l2_reg, 'dnn', self._is_training)
-    net_output = dnn_net(feature)
-    tower_emb = tf.layers.dense(
-        inputs=net_output,
-        units=last_hidden,
-        kernel_regularizer=self._l2_reg,
-        name='dnn/dnn_%d' % (num_dnn_layer - 1))
+    if self.has_backbone:
+      tower_emb = self.backbone
+    else:
+      for _id in range(self._highway_num):
+        highway_cfg = self._model_config.highway[_id]
+        highway_fea = tf.layers.batch_normalization(
+            self._highway_features[highway_cfg.input],
+            training=self._is_training,
+            trainable=True,
+            name='highway_%s_bn' % highway_cfg.input)
+        highway_fea = highway(
+            highway_fea,
+            highway_cfg.emb_size,
+            activation=gelu,
+            scope='highway_%s' % _id)
+        print('highway_fea: ', highway_fea)
+        self.input_features.append(highway_fea)
+
+      feature = tf.concat(self.input_features, axis=1)
+
+      num_dnn_layer = len(self.dnn.hidden_units)
+      last_hidden = self.dnn.hidden_units.pop()
+      dnn_net = dnn.DNN(self.dnn, self._l2_reg, 'dnn', self._is_training)
+      net_output = dnn_net(feature)
+      tower_emb = tf.layers.dense(
+          inputs=net_output,
+          units=last_hidden,
+          kernel_regularizer=self._l2_reg,
+          name='dnn/dnn_%d' % (num_dnn_layer - 1))
 
     if self._model_config.output_l2_normalized_emb:
       norm_emb = tf.nn.l2_normalize(tower_emb, axis=-1)
diff --git a/easy_rec/python/model/dbmtl.py b/easy_rec/python/model/dbmtl.py
index 913793474..e87ee9ae7 100644
--- a/easy_rec/python/model/dbmtl.py
+++ b/easy_rec/python/model/dbmtl.py
@@ -37,24 +37,27 @@ def __init__(self,
                                          features,
                                          self._model_config.bottom_uniter,
                                          self._input_layer)
-    else:
-      self._features, _ = self._input_layer(self._feature_dict, 'all')
+    elif not self.has_backbone:
+      self._features, self._feature_list = self._input_layer(
+          self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
-    if self._model_config.HasField('bottom_cmbf'):
-      bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_uniter'):
-      bottom_fea = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_dnn'):
-      bottom_dnn = dnn.DNN(
-          self._model_config.bottom_dnn,
-          self._l2_reg,
-          name='bottom_dnn',
-          is_training=self._is_training)
-      bottom_fea = bottom_dnn(self._features)
-    else:
-      bottom_fea = self._features
+    bottom_fea = self.backbone
+    if bottom_fea is None:
+      if self._model_config.HasField('bottom_cmbf'):
+        bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
+      elif self._model_config.HasField('bottom_uniter'):
+        bottom_fea = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
+      elif self._model_config.HasField('bottom_dnn'):
+        bottom_dnn = dnn.DNN(
+            self._model_config.bottom_dnn,
+            self._l2_reg,
+            name='bottom_dnn',
+            is_training=self._is_training)
+        bottom_fea = bottom_dnn(self._features)
+      else:
+        bottom_fea = self._features
 
     # MMOE block
     if self._model_config.HasField('expert_dnn'):
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 7416c5cc4..522d3632e 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -12,6 +12,7 @@
 
 from easy_rec.python.compat import regularizers
 from easy_rec.python.layers import input_layer
+from easy_rec.python.layers.backbone import Backbone
 from easy_rec.python.utils import constant
 from easy_rec.python.utils import estimator_utils
 from easy_rec.python.utils import restore_filter
@@ -36,6 +37,7 @@ def __init__(self,
     self._base_model_config = model_config
     self._model_config = model_config
     self._is_training = is_training
+    self._is_predicting = labels is None
     self._feature_dict = features
 
     # embedding variable parameters
@@ -46,7 +48,7 @@ def __init__(self,
     self._emb_reg = regularizers.l2_regularizer(self.embedding_regularization)
     self._l2_reg = regularizers.l2_regularizer(self.l2_regularization)
     # only used by model with wide feature groups, e.g. WideAndDeep
-    self._wide_output_dim = -1
+    self._wide_output_dim = self.get_wide_output_dim()
 
     self._feature_configs = feature_configs
     self.build_input_layer(model_config, feature_configs)
@@ -60,6 +62,31 @@ def __init__(self,
     if constant.SAMPLE_WEIGHT in features:
       self._sample_weight = features[constant.SAMPLE_WEIGHT]
 
+    self._backbone_output = None
+    if model_config.HasField('backbone'):
+      self._backbone = Backbone(
+          model_config.backbone,
+          features,
+          input_layer=self._input_layer,
+          l2_reg=self._l2_reg)
+    else:
+      self._backbone = None
+
+  @property
+  def has_backbone(self):  # True iff the base model config sets `backbone`
+    return self._base_model_config.HasField('backbone')
+
+  @property
+  def backbone(self):
+    """Backbone output, built on first access (its losses are merged into
+    `self._loss_dict`); None when no backbone is configured.
+    """
+    # Test against None: `bool()` of a graph-mode `tf.Tensor` raises TypeError.
+    if self._backbone_output is None and self._backbone is not None:
+      self._backbone_output = self._backbone(self._is_training)
+      self._loss_dict.update(self._backbone.loss_dict)
+    return self._backbone_output
+
   @property
   def embedding_regularization(self):
     return self._base_model_config.embedding_regularization
@@ -87,6 +114,13 @@ def l2_regularization(self):
       l2_regularization = model_config.l2_regularization
     return l2_regularization
 
+  def get_wide_output_dim(self):
+    """Return `wide_output_dim` of the selected model config, -1 if absent."""
+    base_config = self._base_model_config
+    selected = getattr(base_config, base_config.WhichOneof('model'))
+    has_dim = hasattr(selected, 'wide_output_dim')
+    return selected.wide_output_dim if has_dim else -1
+
   def build_input_layer(self, model_config, feature_configs):
     self._input_layer = input_layer.InputLayer(
         feature_configs,
@@ -97,7 +131,8 @@ def build_input_layer(self, model_config, feature_configs):
         kernel_regularizer=self._l2_reg,
         variational_dropout_config=model_config.variational_dropout
         if model_config.HasField('variational_dropout') else None,
-        is_training=self._is_training)
+        is_training=self._is_training,
+        is_predicting=self._is_predicting)
 
   @abstractmethod
   def build_predict_graph(self):
diff --git a/easy_rec/python/model/esmm.py b/easy_rec/python/model/esmm.py
index c6eaad483..50567ae63 100644
--- a/easy_rec/python/model/esmm.py
+++ b/easy_rec/python/model/esmm.py
@@ -31,7 +31,9 @@ def __init__(self,
 
     self._group_num = len(self._model_config.groups)
     self._group_features = []
-    if self._group_num > 0:
+    if self.has_backbone:
+      logging.info('use bottom backbone network')
+    elif self._group_num > 0:
       logging.info('group_num: {0}'.format(self._group_num))
       for group_id in range(self._group_num):
         group = self._model_config.groups[group_id]
@@ -173,7 +175,9 @@ def build_predict_graph(self):
     Returns:
       self._prediction_dict: Prediction result of two tasks.
     """
-    if self._group_num > 0:
+    if self.has_backbone:
+      all_fea = self.backbone
+    elif self._group_num > 0:
       group_fea_arr = []
       # Both towers share the underlying network.
       for group_id in range(self._group_num):
diff --git a/easy_rec/python/model/mind.py b/easy_rec/python/model/mind.py
index c414703d2..270060297 100644
--- a/easy_rec/python/model/mind.py
+++ b/easy_rec/python/model/mind.py
@@ -32,7 +32,7 @@ def __init__(self,
         'invalid model config: %s' % self._model_config.WhichOneof('model')
     self._model_config = self._model_config.mind
 
-    self._hist_seq_features = self._input_layer(
+    self._hist_seq_features, _, _ = self._input_layer(
         self._feature_dict, 'hist', is_combine=False)
     self._user_features, _ = self._input_layer(self._feature_dict, 'user')
     self._item_features, _ = self._input_layer(self._feature_dict, 'item')
diff --git a/easy_rec/python/model/mmoe.py b/easy_rec/python/model/mmoe.py
index acf1d6d59..3cc644f6d 100644
--- a/easy_rec/python/model/mmoe.py
+++ b/easy_rec/python/model/mmoe.py
@@ -26,7 +26,10 @@ def __init__(self,
     self._model_config = self._model_config.mmoe
     assert isinstance(self._model_config, MMoEConfig)
 
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 43e5663ce..21e8f2c55 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -5,6 +5,7 @@
 import tensorflow as tf
 
 from easy_rec.python.builders import loss_builder
+from easy_rec.python.layers.dnn import DNN
 from easy_rec.python.model.rank_model import RankModel
 from easy_rec.python.protos import tower_pb2
 from easy_rec.python.protos.loss_pb2 import LossType
@@ -27,6 +28,71 @@ def __init__(self,
     self._task_num = None
     self._label_name_dict = {}
 
+  def build_predict_graph(self):
+    """Build per-task towers and output logits on top of the backbone."""
+    if not self.has_backbone:
+      raise NotImplementedError(
+          'method `build_predict_graph` must be implemented when backbone '
+          'network does not exist')
+    model = self._model_config.WhichOneof('model')
+    assert model == 'model_params', '`model_params` must be configured'
+    config = self._model_config.model_params
+    self._init_towers(config.task_towers)
+
+    backbone = self.backbone
+    if isinstance(backbone, (list, tuple)):
+      if len(backbone) != len(config.task_towers):
+        raise ValueError(
+            'The number of backbone outputs and task towers must be equal')
+      task_input_list = backbone
+    else:
+      # a single backbone output is shared by all task towers
+      task_input_list = [backbone] * len(config.task_towers)
+
+    tower_features = {}
+    for i, task_tower_cfg in enumerate(config.task_towers):
+      tower_name = task_tower_cfg.tower_name
+      tower_output = task_input_list[i]
+      if task_tower_cfg.HasField('dnn'):
+        tower_dnn = DNN(
+            task_tower_cfg.dnn,
+            self._l2_reg,
+            name=tower_name,
+            is_training=self._is_training)
+        tower_output = tower_dnn(tower_output)
+      tower_features[tower_name] = tower_output
+
+    tower_outputs = {}
+    relation_features = {}
+    # bayes network: a tower with `relation_dnn` also consumes the relation
+    # features of the towers in `relation_tower_names`; each of those must
+    # come earlier in `task_towers` and have its own `relation_dnn`
+    for task_tower_cfg in config.task_towers:
+      tower_name = task_tower_cfg.tower_name
+      relation_fea = tower_features[tower_name]
+      if task_tower_cfg.HasField('relation_dnn'):
+        relation_dnn = DNN(
+            task_tower_cfg.relation_dnn,
+            self._l2_reg,
+            name=tower_name + '/relation_dnn',
+            is_training=self._is_training)
+        tower_inputs = [relation_fea]
+        for relation_tower_name in task_tower_cfg.relation_tower_names:
+          tower_inputs.append(relation_features[relation_tower_name])
+        relation_input = tf.concat(
+            tower_inputs, axis=-1, name=tower_name + '/relation_input')
+        relation_fea = relation_dnn(relation_input)
+        relation_features[tower_name] = relation_fea
+
+      tower_outputs[tower_name] = tf.layers.dense(
+          relation_fea,
+          task_tower_cfg.num_class,
+          kernel_regularizer=self._l2_reg,
+          name=tower_name + '/output')
+
+    self._add_to_prediction_dict(tower_outputs)
+    return self._prediction_dict
+
   def _init_towers(self, task_tower_configs):
     """Init task towers."""
     self._task_towers = task_tower_configs
@@ -88,6 +154,17 @@ def build_metric_graph(self, eval_config):
 
   def build_loss_graph(self):
     """Build loss graph for multi task model."""
+    strategy = self._base_model_config.loss_weight_strategy
+    loss_weight_arr = [1.0] * len(self._task_towers)
+    if strategy == self._base_model_config.Random:
+      num = 0
+      for task_tower_cfg in self._task_towers:
+        losses = task_tower_cfg.losses
+        num += 1 if len(losses) == 0 else len(losses)
+      weights = tf.random_normal([num])
+      loss_weight_arr = tf.nn.softmax(weights)
+
+    offset = 0
     for task_tower_cfg in self._task_towers:
       tower_name = task_tower_cfg.tower_name
       loss_weight = task_tower_cfg.weight
@@ -111,8 +188,13 @@ def build_loss_graph(self):
             loss_weight=loss_weight,
             num_class=task_tower_cfg.num_class,
             suffix='_%s' % tower_name)
+        if strategy == self._base_model_config.Random:
+          for loss_name in loss_dict.keys():
+            loss_dict[
+                loss_name] = loss_dict[loss_name] * loss_weight_arr[offset]
+        offset += 1
       else:
-        for loss in losses:
+        for i, loss in enumerate(losses):
           loss_param = loss.WhichOneof('loss_param')
           if loss_param is not None:
             loss_param = getattr(loss, loss_param)
@@ -125,19 +207,30 @@ def build_loss_graph(self):
               loss_name=loss.loss_name,
               loss_param=loss_param)
           for loss_name, loss_value in loss_ops.items():
-            if loss.learn_loss_weight:
-              uncertainty = tf.Variable(
-                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
-              tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
-              if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-                loss_dict[loss_name] = 0.5 * tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+            if strategy == self._base_model_config.Fixed:
+              loss_dict[loss_name] = loss_value * loss.weight
+            elif strategy == self._base_model_config.Uncertainty:
+              if loss.learn_loss_weight:
+                uncertainty = tf.Variable(
+                    0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+                tf.summary.scalar('loss/%s_uncertainty' % loss_name,
+                                  uncertainty)
+                if loss.loss_type in {
+                    LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS
+                }:
+                  loss_dict[loss_name] = 0.5 * tf.exp(
+                      -uncertainty) * loss_value + 0.5 * uncertainty
+                else:
+                  loss_dict[loss_name] = tf.exp(
+                      -uncertainty) * loss_value + 0.5 * uncertainty
               else:
-                loss_dict[loss_name] = tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+                loss_dict[loss_name] = loss_value * loss.weight
+            elif strategy == self._base_model_config.Random:
+              loss_dict[loss_name] = loss_value * loss_weight_arr[i + offset]
             else:
-              loss_dict[loss_name] = loss_value * loss.weight
-
+              raise ValueError('Unsupported loss weight strategy: %s' %
+                               strategy)
+        offset += len(losses)
       self._loss_dict.update(loss_dict)
 
     kd_loss_dict = loss_builder.build_kd_loss(self.kd, self._prediction_dict,
diff --git a/easy_rec/python/model/ple.py b/easy_rec/python/model/ple.py
index f3ad71215..e04781bcd 100644
--- a/easy_rec/python/model/ple.py
+++ b/easy_rec/python/model/ple.py
@@ -27,7 +27,10 @@ def __init__(self,
 
     self._layer_nums = len(self._model_config.extraction_networks)
     self._task_nums = len(self._model_config.task_towers)
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def gate(self, selector_fea, vec_feas, name):
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 25eff23ea..a5f447d86 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -29,6 +29,29 @@ def __init__(self,
     if self._labels is not None:
       self._label_name = list(self._labels.keys())[0]
 
+  def build_predict_graph(self):
+    """Build predictions from the backbone output.
+
+    A dense head layer named 'output' is added only when the last dimension
+    of the backbone output differs from `self._num_class`.
+
+    Returns:
+      self._prediction_dict, after `_add_to_prediction_dict(output)`.
+
+    Raises:
+      NotImplementedError: if no backbone is configured.
+    """
+    if not self.has_backbone:
+      raise NotImplementedError(
+          'method `build_predict_graph` must be implemented when backbone '
+          'network does not exist')
+    output = self.backbone
+    if int(output.shape[-1]) != self._num_class:
+      logging.info('add head logits layer for rank model')
+      output = tf.layers.dense(output, self._num_class, name='output')
+    self._add_to_prediction_dict(output)
+    return self._prediction_dict
+
   def _output_to_prediction_impl(self,
                                  output,
                                  loss_type,
@@ -193,7 +216,12 @@ def build_loss_graph(self):
           loss_weight=self._sample_weight,
           num_class=self._num_class)
     else:
-      for loss in self._losses:
+      strategy = self._base_model_config.loss_weight_strategy
+      loss_weight = [1.0]
+      if strategy == self._base_model_config.Random and len(self._losses) > 1:
+        weights = tf.random_normal([len(self._losses)])
+        loss_weight = tf.nn.softmax(weights)
+      for i, loss in enumerate(self._losses):
         loss_param = loss.WhichOneof('loss_param')
         if loss_param is not None:
           loss_param = getattr(loss, loss_param)
@@ -205,18 +233,26 @@ def build_loss_graph(self):
             loss_name=loss.loss_name,
             loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
-          if loss.learn_loss_weight:
-            uncertainty = tf.Variable(
-                0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
-            tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
-            if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-              loss_dict[loss_name] = 0.5 * tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+          if strategy == self._base_model_config.Fixed:
+            loss_dict[loss_name] = loss_value * loss.weight
+          elif strategy == self._base_model_config.Uncertainty:
+            if loss.learn_loss_weight:
+              uncertainty = tf.Variable(
+                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+              tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
+              if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+                loss_dict[loss_name] = 0.5 * tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
+              else:
+                loss_dict[loss_name] = tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
             else:
-              loss_dict[loss_name] = tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+              loss_dict[loss_name] = loss_value * loss.weight
+          elif strategy == self._base_model_config.Random:
+            loss_dict[loss_name] = loss_value * loss_weight[i]
           else:
-            loss_dict[loss_name] = loss_value * loss.weight
+            raise ValueError('Unsupported loss weight strategy: %s' %
+                             strategy)
 
     self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/model/simple_multi_task.py b/easy_rec/python/model/simple_multi_task.py
index b4c0613bc..05dd7a773 100644
--- a/easy_rec/python/model/simple_multi_task.py
+++ b/easy_rec/python/model/simple_multi_task.py
@@ -27,7 +27,10 @@ def __init__(self,
     self._model_config = self._model_config.simple_multi_task
     assert isinstance(self._model_config, SimpleMultiTaskConfig)
 
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
new file mode 100644
index 000000000..67b230c04
--- /dev/null
+++ b/easy_rec/python/protos/backbone.proto
@@ -0,0 +1,95 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/keras_layer.proto";
+
+message InputLayer {
+    optional bool do_batch_norm = 1;
+    optional bool do_layer_norm = 2;
+    optional float dropout_rate = 3;
+    optional float feature_dropout_rate = 4;
+    optional bool only_output_feature_list = 5;
+    optional bool only_output_3d_tensor = 6;
+    optional bool output_2d_tensor_and_feature_list = 7;
+    optional bool output_seq_and_normal_feature = 8;
+}
+
+message Lambda {
+    required string expression = 1;
+}
+
+message Input {
+    oneof name {
+        string feature_group_name = 1;
+        string block_name = 2;
+        string package_name = 3;
+    }
+    optional string input_fn = 11;
+    optional string input_slice = 12;
+}
+
+message RecurrentLayer {
+    required uint32 num_steps = 1 [default = 1];
+    optional uint32 fixed_input_index = 2;
+    required KerasLayer keras_layer = 3;
+}
+
+message RepeatLayer {
+    required uint32 num_repeat = 1 [default = 1];
+    // default output the list of multiple outputs
+    optional int32 output_concat_axis = 2;
+    required KerasLayer keras_layer = 3;
+}
+
+message Layer {
+    oneof layer {
+        Lambda lambda = 1;
+        KerasLayer keras_layer = 2;
+        RecurrentLayer recurrent = 3;
+        RepeatLayer repeat = 4;
+        InputLayer input_layer = 5;
+    }
+}
+
+message Block {
+    required string name = 1;
+    // the input names of feature groups or other blocks
+    repeated Input inputs = 2;
+    optional int32 input_concat_axis = 3 [default = -1];
+    optional bool merge_inputs_into_list = 4;
+    optional string extra_input_fn = 5;
+
+    // sequential layers
+    repeated Layer layers = 6;
+
+    // only take effect when there are no layers
+    oneof layer {
+        InputLayer input_layer = 101;
+        Lambda lambda = 102;
+        KerasLayer keras_layer = 103;
+        RecurrentLayer recurrent = 104;
+        RepeatLayer repeat = 105;
+    }
+}
+
+// a package of blocks for reuse; e.g. call in a contrastive learning manner
+message BlockPackage {
+    // package name
+    required string name = 1;
+    // a few blocks generating a DAG
+    repeated Block blocks = 2;
+    // the names of output blocks
+    repeated string concat_blocks = 3;
+}
+
+message BackboneTower {
+    // a few sub DAGs
+    repeated BlockPackage packages = 1;
+    // a few blocks generating a DAG
+    repeated Block blocks = 2;
+    // the names of output blocks
+    repeated string concat_blocks = 3;
+    // optional top mlp layer
+    optional MLP top_mlp = 4;
+}
diff --git a/easy_rec/python/protos/cmbf.proto b/easy_rec/python/protos/cmbf.proto
index 598bf1ecf..34e082115 100644
--- a/easy_rec/python/protos/cmbf.proto
+++ b/easy_rec/python/protos/cmbf.proto
@@ -1,9 +1,50 @@
 syntax = "proto2";
 package protos;
 
-import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/dnn.proto";
 
+message CMBFTower {
+    // The number of heads of cross modal fusion layer
+    required uint32 multi_head_num = 1 [default = 1];
+    // The number of heads of image feature learning layer
+    required uint32 image_multi_head_num = 101 [default = 1];
+    // The number of heads of text feature learning layer
+    required uint32 text_multi_head_num = 102 [default = 1];
+    // The dimension of text heads
+    required uint32 text_head_size = 2;
+    // The dimension of image heads
+    required uint32 image_head_size = 3 [default = 64];
+    // The number of patches of image feature, take effect when there is only one image feature
+    required uint32 image_feature_patch_num = 4 [default = 1];
+    // Do dimension reduce to this size for image feature before single modal learning module
+    required uint32 image_feature_dim = 5 [default = 0];
+    // The number of self attention layers for image features
+    required uint32 image_self_attention_layer_num = 6 [default = 0];
+    // The number of self attention layers for text features
+    required uint32 text_self_attention_layer_num = 7 [default = 1];
+    // The number of cross modal layers
+    required uint32 cross_modal_layer_num = 8 [default = 1];
+    // The dimension of image cross modal heads
+    required uint32 image_cross_head_size = 9;
+    // The dimension of text cross modal heads
+    required uint32 text_cross_head_size = 10;
+    // Dropout probability for hidden layers
+    required float hidden_dropout_prob = 11 [default = 0.0];
+    // Dropout probability of the attention probabilities
+    required float attention_probs_dropout_prob = 12 [default = 0.0];
+
+    // Whether to add embeddings for different text sequence features
+    required bool use_token_type = 13 [default = false];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 14 [default = true];
+    // Maximum sequence length that might ever be used with this model
+    required uint32 max_position_embeddings = 15 [default = 0];
+    // Dropout probability for text sequence embeddings
+    required float text_seq_emb_dropout_prob = 16 [default = 0.1];
+    // dnn layers for other features
+    optional DNN other_feature_dnn = 17;
+}
+
 message CMBF {
     required CMBFTower config = 1;
 
diff --git a/easy_rec/python/protos/dbmtl.proto b/easy_rec/python/protos/dbmtl.proto
index 841b8adec..9adff1f62 100644
--- a/easy_rec/python/protos/dbmtl.proto
+++ b/easy_rec/python/protos/dbmtl.proto
@@ -3,7 +3,8 @@ package protos;
 
 import "easy_rec/python/protos/dnn.proto";
 import "easy_rec/python/protos/tower.proto";
-import "easy_rec/python/protos/layer.proto";
+import "easy_rec/python/protos/cmbf.proto";
+import "easy_rec/python/protos/uniter.proto";
 
 message DBMTL {
     // shared bottom cmbf layer
diff --git a/easy_rec/python/protos/dnn.proto b/easy_rec/python/protos/dnn.proto
index 021d34dbb..ff40f0fe4 100644
--- a/easy_rec/python/protos/dnn.proto
+++ b/easy_rec/python/protos/dnn.proto
@@ -12,3 +12,20 @@ message DNN {
     // use batch normalization
     optional bool use_bn = 4 [default = true];
 }
+
+message MLP {
+    // hidden units for each layer
+    repeated uint32 hidden_units = 1;
+    // ratio of dropout
+    repeated float dropout_ratio = 2;
+    // activation function
+    optional string activation = 3 [default = 'relu'];
+    // use batch normalization
+    optional bool use_bn = 4 [default = true];
+    optional bool use_final_bn = 5 [default = true];
+    optional string final_activation = 6 [default = 'relu'];
+    optional bool use_bias = 7 [default = true];
+    // kernel_initializer
+    optional string initializer = 8 [default = 'he_uniform'];
+    optional bool use_bn_after_activation = 9;
+}
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 27dcefadc..1e926c368 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -1,6 +1,7 @@
 syntax = "proto2";
 package protos;
 
+import "easy_rec/python/protos/backbone.proto";
 import "easy_rec/python/protos/fm.proto";
 import "easy_rec/python/protos/deepfm.proto";
 import "easy_rec/python/protos/wide_and_deep.proto";
@@ -24,9 +25,17 @@ import "easy_rec/python/protos/loss.proto";
 import "easy_rec/python/protos/rocket_launching.proto";
 import "easy_rec/python/protos/variational_dropout.proto";
 import "easy_rec/python/protos/multi_tower_recall.proto";
+import "easy_rec/python/protos/tower.proto";
+
 // for input performance test
 message DummyModel {
+}
 
+// configure backbone network common parameters
+message ModelParams {
+  optional float l2_regularization = 1;
+  optional uint32 wide_output_dim = 2;
+  repeated BayesTaskTower task_towers = 3;
 }
 
 // for knowledge distillation
@@ -44,17 +53,19 @@ message KD {
   optional float loss_weight = 4 [default=1.0];
   // only for loss_type == CROSS_ENTROPY_LOSS
   optional float temperature = 5 [default=1.0];
-
 }
 
 message EasyRecModel {
     required string model_class = 1;
+    // just a name for backbone config
+    optional string model_name = 99;
 
     // actually input layers, each layer produce a group of feature
     repeated FeatureGroupConfig feature_groups = 2;
 
     // model parameters
     oneof model {
+        ModelParams model_params = 100;
         DummyModel dummy = 101;
         WideAndDeep wide_and_deep = 102;
         DeepFM deepfm = 103;
@@ -102,4 +113,12 @@ message EasyRecModel {
 
     repeated Loss losses = 15;
 
+    enum LossWeightStrategy {
+        Fixed = 0;
+        Uncertainty = 1;
+        Random = 2;
+    }
+    // optional so that configs written before this field existed stay valid
+    optional LossWeightStrategy loss_weight_strategy = 16 [default = Fixed];
+    optional BackboneTower backbone = 17;
 }
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index 596e87e4d..e05e73753 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -42,6 +42,7 @@ message FeatureConfig {
         LookupFeature = 4;
         SequenceFeature = 5;
         ExprFeature = 6;
+        ConstFeature = 7;
     }
 
     enum FieldType {
@@ -127,6 +128,9 @@ message FeatureConfig {
 
     // embedding variable params
     optional EVParams ev_params = 31;
+
+    // fg complexity
+    optional float complexity = 32 [default = 1.0];
 }
 
 message FeatureConfigV2 {
diff --git a/easy_rec/python/protos/fm.proto b/easy_rec/python/protos/fm.proto
index c90af8cab..31d8f27d7 100644
--- a/easy_rec/python/protos/fm.proto
+++ b/easy_rec/python/protos/fm.proto
@@ -2,5 +2,6 @@ syntax = "proto2";
 package protos;
 
 message FM {
+    optional bool use_variant = 1;
     optional float l2_regularization = 5 [default = 1e-4];
 }
diff --git a/easy_rec/python/protos/keras_layer.proto b/easy_rec/python/protos/keras_layer.proto
new file mode 100644
index 000000000..2798260d3
--- /dev/null
+++ b/easy_rec/python/protos/keras_layer.proto
@@ -0,0 +1,27 @@
+syntax = "proto2";
+package protos;
+
+import "google/protobuf/struct.proto";
+import "easy_rec/python/protos/layer.proto";
+import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/fm.proto";
+import "easy_rec/python/protos/seq_encoder.proto";
+
+message KerasLayer {
+    required string class_name = 1;
+    oneof params {
+        google.protobuf.Struct st_params = 2;
+        PeriodicEmbedding periodic_embedding = 3;
+        AutoDisEmbedding auto_dis_embedding = 4;
+        FM fm = 5;
+        MaskBlock mask_block = 6;
+        MaskNet masknet = 7;
+        SENet senet = 8;
+        Bilinear bilinear = 9;
+        FiBiNet fibinet = 10;
+        MLP mlp = 11;
+        DINEncoder din = 12;
+        BSTEncoder bst = 13;
+        MMoELayer mmoe = 14;
+    }
+}
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 6cea6d3bd..52a1cbf30 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -4,73 +4,68 @@ package protos;
 import "easy_rec/python/protos/dnn.proto";
 
 message HighWayTower {
-    required string input = 1;
+    optional string input = 1;
     required uint32 emb_size = 2;
+    required string activation = 3 [default = 'gelu'];
+    optional float dropout_rate = 4;
 }
 
-message CMBFTower {
-    // The number of heads of cross modal fusion layer
-    required uint32 multi_head_num = 1 [default = 1];
-    // The number of heads of image feature learning layer
-    required uint32 image_multi_head_num = 101 [default = 1];
-    // The number of heads of text feature learning layer
-    required uint32 text_multi_head_num = 102 [default = 1];
-    // The dimension of text heads
-    required uint32 text_head_size = 2;
-    // The dimension of image heads
-    required uint32 image_head_size = 3 [default = 64];
-    // The number of patches of image feature, take effect when there is only one image feature
-    required uint32 image_feature_patch_num = 4 [default = 1];
-    // Do dimension reduce to this size for image feature before single modal learning module
-    required uint32 image_feature_dim = 5 [default = 0];
-    // The number of self attention layers for image features
-    required uint32 image_self_attention_layer_num = 6 [default = 0];
-    // The number of self attention layers for text features
-    required uint32 text_self_attention_layer_num = 7 [default = 1];
-    // The number of cross modal layers
-    required uint32 cross_modal_layer_num = 8 [default = 1];
-    // The dimension of image cross modal heads
-    required uint32 image_cross_head_size = 9;
-    // The dimension of text cross modal heads
-    required uint32 text_cross_head_size = 10;
-    // Dropout probability for hidden layers
-    required float hidden_dropout_prob = 11 [default = 0.0];
-    // Dropout probability of the attention probabilities
-    required float attention_probs_dropout_prob = 12 [default = 0.0];
+message PeriodicEmbedding {
+    required uint32 embedding_dim = 1;
+    required float  sigma = 2;
+    optional bool add_linear_layer = 3 [default = true];
+    optional string linear_activation = 4 [default = 'relu'];
+    optional bool output_3d_tensor = 5;
+    optional bool output_tensor_list = 6;
+}
+
+message AutoDisEmbedding {
+    required uint32 embedding_dim = 1;
+    required uint32 num_bins = 2;
+    required float keep_prob = 3 [default = 0.8];
+    required float temperature = 4;
+    optional bool output_3d_tensor = 5;
+    optional bool output_tensor_list = 6;
+}
+
+message SENet {
+    required uint32 reduction_ratio = 1 [default = 4];
+    optional uint32 num_squeeze_group = 2 [default = 2];
+    optional bool use_skip_connection = 3 [default = true];
+    optional bool use_output_layer_norm = 4 [default = true];
+}
+
+message Bilinear {
+    required string type = 1 [default = 'interaction'];
+    required bool use_plus = 2 [default = true];
+    required uint32 num_output_units = 3;
+}
+
+message FiBiNet {
+    optional Bilinear bilinear = 1;
+    required SENet senet = 2;
+    optional MLP mlp = 8;
+}
+
+message MaskBlock {
+    optional float reduction_factor = 1;
+    required uint32 output_size = 2;
+    optional uint32 aggregation_size = 3;
+    optional bool input_layer_norm = 4 [default = true];
+    optional uint32 projection_dim = 5;
+}
 
-    // Whether to add embeddings for different text sequence features
-    required bool use_token_type = 13 [default = false];
-    // Whether to add position embeddings for the position of each token in the text sequence
-    required bool use_position_embeddings = 14 [default = true];
-    // Maximum sequence length that might ever be used with this model
-    required uint32 max_position_embeddings = 15 [default = 0];
-    // Dropout probability for text sequence embeddings
-    required float text_seq_emb_dropout_prob = 16 [default = 0.1];
-    // dnn layers for other features
-    optional DNN other_feature_dnn = 17;
+message MaskNet {
+    repeated MaskBlock mask_blocks = 1;
+    required bool use_parallel = 2 [default = true];
+    optional MLP mlp = 3;
 }
 
-message UniterTower {
-    // Size of the encoder layers and the pooler layer
-    required uint32 hidden_size = 1;
-    // Number of hidden layers in the Transformer encoder
-    required uint32 num_hidden_layers = 2;
-    // Number of attention heads for each attention layer in the Transformer encoder
-    required uint32 num_attention_heads = 3;
-    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
-    required uint32 intermediate_size = 4;
-    // The non-linear activation function (function or string) in the encoder and pooler.
-    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
-    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
-    required float hidden_dropout_prob = 6 [default = 0.1];
-    // The dropout ratio for the attention probabilities
-    required float attention_probs_dropout_prob = 7 [default = 0.1];
-    // The maximum sequence length that this model might ever be used with
-    required uint32 max_position_embeddings = 8 [default = 512];
-    // Whether to add position embeddings for the position of each token in the text sequence
-    required bool use_position_embeddings = 9 [default = true];
-    // The stddev of the truncated_normal_initializer for initializing all weight matrices
-    required float initializer_range = 10 [default = 0.02];
-    // dnn layers for other features
-    optional DNN other_feature_dnn = 11;
+message MMoELayer {
+    // number of tasks
+    required uint32 num_task = 1;
+    // mmoe expert mlp layer definition
+    optional MLP expert_mlp = 2;
+    // number of mmoe experts
+    optional uint32 num_expert = 3;
 }
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index c5b74f47d..5c913bf6e 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -93,4 +93,6 @@ message PairwiseLogisticLoss {
 message JRCLoss {
   required string session_name = 1;
   optional float alpha = 2 [default = 0.5];
+  optional bool same_label_loss = 3 [default = true];
+  optional string loss_weight_strategy = 4 [default = 'fixed'];  // optional: pre-existing configs omit it
 }
diff --git a/easy_rec/python/protos/seq_encoder.proto b/easy_rec/python/protos/seq_encoder.proto
new file mode 100644
index 000000000..2b845a429
--- /dev/null
+++ b/easy_rec/python/protos/seq_encoder.proto
@@ -0,0 +1,37 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+
+
+message BSTEncoder {
+    // Size of the encoder layers and the pooler layer
+    required uint32 hidden_size = 1;
+    // Number of hidden layers in the Transformer encoder
+    required uint32 num_hidden_layers = 2;
+    // Number of attention heads for each attention layer in the Transformer encoder
+    required uint32 num_attention_heads = 3;
+    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
+    required uint32 intermediate_size = 4;
+    // The non-linear activation function (function or string) in the encoder and pooler.
+    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
+    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
+    required float hidden_dropout_prob = 6 [default = 0.1];
+    // The dropout ratio for the attention probabilities
+    required float attention_probs_dropout_prob = 7 [default = 0.1];
+    // The maximum sequence length that this model might ever be used with
+    required uint32 max_position_embeddings = 8 [default = 512];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 9 [default = true];
+    // The stddev of the truncated_normal_initializer for initializing all weight matrices
+    required float initializer_range = 10 [default = 0.02];
+}
+
+message DINEncoder {
+    // din attention layer
+    required DNN attention_dnn = 1;
+    // whether to keep target item feature
+    required bool need_target_feature = 2 [default = true];
+    // option: softmax, sigmoid
+    required string attention_normalizer = 3 [default = 'softmax'];
+}
diff --git a/easy_rec/python/protos/uniter.proto b/easy_rec/python/protos/uniter.proto
index 7e78ad23e..9efc1dc9e 100644
--- a/easy_rec/python/protos/uniter.proto
+++ b/easy_rec/python/protos/uniter.proto
@@ -1,9 +1,33 @@
 syntax = "proto2";
 package protos;
 
-import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/dnn.proto";
 
+message UniterTower {
+    // Size of the encoder layers and the pooler layer
+    required uint32 hidden_size = 1;
+    // Number of hidden layers in the Transformer encoder
+    required uint32 num_hidden_layers = 2;
+    // Number of attention heads for each attention layer in the Transformer encoder
+    required uint32 num_attention_heads = 3;
+    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
+    required uint32 intermediate_size = 4;
+    // The non-linear activation function (function or string) in the encoder and pooler.
+    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
+    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
+    required float hidden_dropout_prob = 6 [default = 0.1];
+    // The dropout ratio for the attention probabilities
+    required float attention_probs_dropout_prob = 7 [default = 0.1];
+    // The maximum sequence length that this model might ever be used with
+    required uint32 max_position_embeddings = 8 [default = 512];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 9 [default = true];
+    // The stddev of the truncated_normal_initializer for initializing all weight matrices
+    required float initializer_range = 10 [default = 0.02];
+    // dnn layers for other features
+    optional DNN other_feature_dnn = 11;
+}
+
 message Uniter {
     required UniterTower config = 1;
 
diff --git a/easy_rec/python/protos/variational_dropout.proto b/easy_rec/python/protos/variational_dropout.proto
index e72ca54c6..a1bb39974 100644
--- a/easy_rec/python/protos/variational_dropout.proto
+++ b/easy_rec/python/protos/variational_dropout.proto
@@ -7,4 +7,15 @@ message  VariationalDropoutLayer{
     optional float regularization_lambda = 1 [default = 0.01];
     // variational_dropout dimension
     optional bool embedding_wise_variational_dropout = 2 [default = false];
+
+    // whether to use FSCD model
+    optional bool regularize_by_feature_complexity = 3 [default = false];
+    optional float feature_complexity_weight = 4 [default = 1.0];
+    optional float feature_dimension_weight = 5 [default = 1e-2];
+    optional float feature_cardinality_weight = 6 [default = 1e-7];
+    // temperature
+    optional float temperature = 7 [default = 0.1];
+
+    optional float min_keep_ratio = 8 [default = 1e-3];
+    optional float max_keep_ratio = 9 [default = 1.0];
 }
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index 4560f89c6..5680cadb3 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -306,6 +306,11 @@ def test_bst(self):
         'samples/model_config/bst_on_taobao.config', self._test_dir)
     self.assertTrue(self._success)
 
+  # def test_bst_contrastive_learning(self):
+  #   self._success = test_utils.test_single_train_eval(
+  #       'samples/model_config/bst_cl_on_taobao.config', self._test_dir)
+  #   self.assertTrue(self._success)
+
   def test_dcn(self):
     self._success = test_utils.test_single_train_eval(
         'samples/model_config/dcn_on_taobao.config', self._test_dir)
@@ -955,6 +960,7 @@ def test_distribute_eval_deepfm_multi_cls(self):
 
   def test_distribute_eval_deepfm_single_cls(self):
     cur_eval_path = 'data/test/distribute_eval_test/dwd_distribute_eval_avazu_out_test_combo'
+    #cur_eval_path = '/Users/weisu.yxd/Code/EasyRec/experiments/distribute_eval_test/dwd_distribute_eval_avazu_out_test_combo'
     self._success = test_utils.test_distributed_eval(
         'samples/model_config/deepfm_distribute_eval_combo_on_avazu_ctr.config',
         cur_eval_path, self._test_dir)
diff --git a/easy_rec/python/tools/__init__.py b/easy_rec/python/tools/__init__.py
index e69de29bb..d8300f4e3 100644
--- a/easy_rec/python/tools/__init__.py
+++ b/easy_rec/python/tools/__init__.py
@@ -0,0 +1 @@
+# from .explainer.explainer import create_explainer
diff --git a/easy_rec/python/tools/explainer/__init__.py b/easy_rec/python/tools/explainer/__init__.py
new file mode 100644
index 000000000..c1917b9fd
--- /dev/null
+++ b/easy_rec/python/tools/explainer/__init__.py
@@ -0,0 +1 @@
+# from .methods import DeepExplain
diff --git a/easy_rec/python/tools/explainer/deep_shap.py b/easy_rec/python/tools/explainer/deep_shap.py
new file mode 100644
index 000000000..64508232f
--- /dev/null
+++ b/easy_rec/python/tools/explainer/deep_shap.py
@@ -0,0 +1,766 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import warnings
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.ops import gradients_impl as tf_gradients_impl
+
+if not hasattr(tf_gradients_impl, '_IsBackpropagatable'):
+  from tensorflow.python.ops import gradients_util as tf_gradients_impl
+
+
+class DeepShap(object):
+  """Meant to approximate SHAP values for deep learning models.
+
+  This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
+  approximate the conditional expectations of SHAP values using a selection of background samples.
+  Lundberg and Lee, NIPS 2017 showed that the per node attribution rules in DeepLIFT (Shrikumar,
+  Greenside, and Kundaje, arXiv 2017) can be chosen to approximate Shapley values. By integrating
+  over many background samples Deep estimates approximate SHAP values such that they sum
+  up to the difference between the expected model output on the passed background samples and the
+  current model output (f(x) - E[f(x)]).
+  """
+
+  def __init__(self,
+               inputs,
+               output,
+               data,
+               session=None,
+               learning_phase_flags=None):
+    """An explainer object for a deep model using a given background dataset.
+
+    Note that the complexity of the method scales linearly with the number of background data
+    samples. Passing the entire training dataset as `data` will give very accurate expected
+    values, but be unreasonably expensive. The variance of the expectation estimates scale by
+    roughly 1/sqrt(N) for N background data samples. So 100 samples will give a good estimate,
+    and 1000 samples a very good estimate of the expected values.
+
+    Parameters
+    ----------
+    inputs : [tf.Operation]
+    output : tf.Operation
+        A pair of TensorFlow operations (or a list and an op) that
+        specifies the input and output of the model to be explained. Note that SHAP values
+        are specific to a single output value, so you get an explanation for each element of
+        the output tensor (which must be a flat rank one vector).
+
+    data : [numpy.array] or [pandas.DataFrame] or function
+        The background dataset to use for integrating out features. DeepExplainer integrates
+        over all these samples for each explanation. The data passed here must match the input
+        operations given to the model. If a function is supplied, it must be a function that
+        takes a particular input example and generates the background dataset for that example
+    session : None or tensorflow.Session
+        The TensorFlow session that has the model we are explaining. If None is passed then
+        we do our best to find the right session, first looking for a keras session, then
+        falling back to the default TensorFlow session.
+
+    learning_phase_flags : None or list of tensors
+        If you have your own custom learning phase flags pass them here. When explaining a prediction
+        we need to ensure we are not in training mode, since this changes the behavior of ops like
+        batch norm or dropout. If None is passed then we look for tensors in the graph that look like
+        learning phase flags. Note that we assume all the flags should
+        have a value of False during predictions (and hence explanations).
+    """
+    self.model_inputs = inputs
+    self.model_output = output
+    assert type(
+        self.model_output
+    ) != list, 'The model output to be explained must be a single tensor!'
+    assert len(self.model_output.shape
+               ) < 3, 'The model output must be a vector or a single value!'
+    self.multi_output = True
+    if len(self.model_output.shape) == 1:
+      self.multi_output = False
+
+    # check if we have multiple inputs
+    self.multi_input = True
+    if type(self.model_inputs) != list or len(self.model_inputs) == 1:
+      self.multi_input = False
+      if type(self.model_inputs) != list:
+        self.model_inputs = [self.model_inputs]
+    if type(data) != list and not hasattr(data, '__call__'):
+      data = [data]
+    self.data = data
+
+    self._vinputs = {
+    }  # used to track what op inputs depends on the model inputs
+    self.orig_grads = {}
+
+    if session is None:
+      try:
+        session = tf.compat.v1.keras.backend.get_session()
+      except Exception:
+        session = tf.keras.backend.get_session()
+    self.session = tf.get_default_session() if session is None else session
+    self.graph = self.session.graph
+
+    # if no learning phase flags were given we go looking for them
+    # ...this will catch the one that keras uses
+    # we need to find them since we want to make sure learning phase flags are set to False
+    if learning_phase_flags is None:
+      self.learning_phase_ops = []
+      for op in self.graph.get_operations():
+        if 'learning_phase' in op.name and op.type == 'Const' and len(
+            op.outputs[0].shape) == 0:
+          if op.outputs[0].dtype == tf.bool:
+            self.learning_phase_ops.append(op)
+      learning_phase_flags = [op.outputs[0] for op in self.learning_phase_ops]
+    else:
+      self.learning_phase_ops = [t.op for t in learning_phase_flags]
+    # always keep the flag tensors themselves: run() feeds each of them False
+    self.learning_phase_flags = learning_phase_flags
+
+    # save the expected output of the model
+    # if self.data is a function, set self.expected_value to None
+    if (hasattr(self.data, '__call__')):
+      self.expected_value = None
+    else:
+      if self.data[0].shape[0] > 5000:
+        warnings.warn(
+            'You have provided over 5k background samples! For better performance consider using smaller random sample.'
+        )
+      self.expected_value = self.run(self.model_output, self.model_inputs,
+                                     self.data).mean(0)
+
+    self._init_between_tensors(self.model_output.op, self.model_inputs)
+
+    # make a blank array that will get lazily filled in with the SHAP value computation
+    # graphs for each output. Lazy is important since if there are 1000 outputs and we
+    # only explain the top 5 it would be a waste to build graphs for the other 995
+    if not self.multi_output:
+      self.phi_symbolics = [None]
+    else:
+      noutputs = self.model_output.shape.as_list()[1]
+      if noutputs is not None:
+        self.phi_symbolics = [None for i in range(noutputs)]
+      else:
+        raise Exception(
+            'The model output tensor to be explained cannot have a static shape in dim 1 of None!'
+        )
+
+  def run(self, out, model_inputs, X):
+    """Evaluate `out` in the session with all learning phase flags fed False."""
+    feeds = {tensor: value for tensor, value in zip(model_inputs, X)}
+    # the flags are added last, so they win over any identical input key
+    feeds.update((flag, False) for flag in self.learning_phase_flags)
+    return self.session.run(out, feeds)
+
+  def phi_symbolic(self, i):
+    """Return the attribution graph for output `i`, building it on first use."""
+    cached = self.phi_symbolics[i]
+    if cached is not None:
+      return cached
+
+    def build():
+      out = self.model_output[:, i] if self.multi_output else self.model_output
+      return tf.gradients(out, self.model_inputs)
+
+    self.phi_symbolics[i] = self.execute_with_overridden_gradients(build)
+    return self.phi_symbolics[i]
+
+  def custom_grad(self, op, *grads):
+    """Dispatch a gradient request for `op` to the handler for its op type."""
+    handler_key = op.type
+    if handler_key.startswith('shap_'):
+      handler_key = handler_key[5:]  # handlers are keyed without the prefix
+    return op_handlers[handler_key](self, op, *grads)
+
+  def execute_with_overridden_gradients(self, f):
+    """Return f() evaluated while the ops in op_handlers use self.custom_grad."""
+    # we do this by hacking our way into the registry (TODO: find a public API for this if it exists)
+    reg = tf_ops._gradient_registry._registry
+    ops_not_in_registry = ['TensorListReserve']
+    # NOTE: location_tag taken from tensorflow source for None type ops
+    location_tag = ('UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN')
+    # TODO: unclear why some ops are not in the registry with TF 2.0 like TensorListReserve
+    for non_reg_ops in ops_not_in_registry:
+      reg[non_reg_ops] = {'type': None, 'location': location_tag}
+    for n in op_handlers:
+      if n in reg:
+        self.orig_grads[n] = reg[n]['type']
+        reg['shap_' + n] = {
+            'type': self.custom_grad,
+            'location': reg[n]['location']
+        }
+        reg[n]['type'] = self.custom_grad
+
+    # In TensorFlow 1.10 they started pruning out nodes that they think can't be backpropped
+    # unfortunately that includes the index of embedding layers so we disable that check here
+    if hasattr(tf_gradients_impl, '_IsBackpropagatable'):
+      orig_IsBackpropagatable = tf_gradients_impl._IsBackpropagatable
+      tf_gradients_impl._IsBackpropagatable = lambda tensor: True
+
+    # define the computation graph for the attribution values using a custom gradient-like computation
+    try:
+      out = f()
+    finally:
+      # reinstate the backpropagatable check
+      if hasattr(tf_gradients_impl, '_IsBackpropagatable'):
+        tf_gradients_impl._IsBackpropagatable = orig_IsBackpropagatable
+
+      # restore the original gradient definitions and drop the placeholder entries
+      for n in op_handlers:
+        if n in reg:
+          del reg['shap_' + n]
+          reg[n]['type'] = self.orig_grads[n]
+      for non_reg_ops in ops_not_in_registry:
+        del reg[non_reg_ops]
+    return out
+
+  def shap_values(self,
+                  X,
+                  ranked_outputs=None,
+                  output_rank_order='max',
+                  check_additivity=True):
+    """Return approximate SHAP values for the model applied to the data given by X.
+
+    Parameters
+    ----------
+    X : list, numpy.array, or pandas.DataFrame
+        A tensor (or list of tensors) of samples (where X.shape[0] == # samples) on which to
+        explain the model's output.
+
+    ranked_outputs : None or int
+        If ranked_outputs is None then we explain all the outputs in a multi-output model. If
+        ranked_outputs is a positive integer then we only explain that many of the top model
+        outputs (where "top" is determined by output_rank_order). Note that this causes a pair
+        of values to be returned (shap_values, indexes), where shap_values is a list of numpy
+        arrays for each of the output ranks, and indexes is a matrix that indicates for each sample
+        which output indexes were chosen as "top".
+
+    output_rank_order : "max", "min", or "max_abs"
+        How to order the model outputs when using ranked_outputs, either by maximum, minimum, or
+        maximum absolute value.
+
+    Returns
+    -------
+    array or list
+        For a models with a single output this returns a tensor of SHAP values with the same shape
+        as X. For a model with multiple outputs this returns a list of SHAP value tensors, each of
+        which are the same shape as X. If ranked_outputs is None then this list of tensors matches
+        the number of model outputs. If ranked_outputs is a positive integer a pair is returned
+        (shap_values, indexes), where shap_values is a list of tensors with a length of
+        ranked_outputs, and indexes is a matrix that indicates for each sample which output indexes
+        were chosen as "top".
+    """
+    # check if we have multiple inputs
+    if not self.multi_input:
+      if type(X) == list and len(X) != 1:
+        assert False, 'Expected a single tensor as model input!'
+      elif type(X) != list:
+        X = [X]
+    else:
+      assert type(X) == list, 'Expected a list of model inputs!'
+    assert len(self.model_inputs) == len(
+        X
+    ), 'Number of model inputs (%d) does not match the number given (%d)!' % (
+        len(self.model_inputs), len(X))
+
+    # rank and determine the model outputs that we will explain
+    if ranked_outputs is not None and self.multi_output:
+      model_output_values = self.run(self.model_output, self.model_inputs, X)
+
+      if output_rank_order == 'max':
+        model_output_ranks = np.argsort(-model_output_values)
+      elif output_rank_order == 'min':
+        model_output_ranks = np.argsort(model_output_values)
+      elif output_rank_order == 'max_abs':
+        model_output_ranks = np.argsort(-np.abs(model_output_values))
+      else:
+        assert False, 'output_rank_order must be max, min, or max_abs!'
+      model_output_ranks = model_output_ranks[:, :ranked_outputs]
+    else:
+      model_output_ranks = np.tile(
+          np.arange(len(self.phi_symbolics)), (X[0].shape[0], 1))
+
+    # compute the attributions
+    output_phis = []
+    for i in range(model_output_ranks.shape[1]):
+      phis = []
+      for k in range(len(X)):
+        phis.append(np.zeros(X[k].shape))
+      for j in range(X[0].shape[0]):
+        if (hasattr(self.data, '__call__')):
+          bg_data = self.data([X[l][j] for l in range(len(X))])
+          if type(bg_data) != list:
+            bg_data = [bg_data]
+        else:
+          bg_data = self.data
+
+        # tile the inputs to line up with the background data samples
+        tiled_X = [
+            np.tile(X[l][j:j + 1], (bg_data[l].shape[0],) +
+                    tuple([1
+                           for k in range(len(X[l].shape) - 1)]))
+            for l in range(len(X))
+        ]
+
+        # we use the first sample for the current sample and the rest for the references
+        joint_input = [
+            np.concatenate([tiled_X[l], bg_data[l]], 0) for l in range(len(X))
+        ]
+
+        # run attribution computation graph
+        feature_ind = model_output_ranks[j, i]
+        sample_phis = self.run(
+            self.phi_symbolic(feature_ind), self.model_inputs, joint_input)
+
+        # assign the attributions to the right part of the output arrays
+        for l in range(len(X)):
+          phis[l][j] = (sample_phis[l][bg_data[l].shape[0]:] *
+                        (X[l][j] - bg_data[l])).mean(0)
+
+      output_phis.append(phis[0] if not self.multi_input else phis)
+
+    # check that the SHAP values sum up to the model output; this needs the
+    # background expectation (None when `data` is a function) and one entry of
+    # output_phis per model output (not the case for ranked explanations)
+    if check_additivity and self.expected_value is not None and (
+        ranked_outputs is None or not self.multi_output):
+      model_output = self.run(self.model_output, self.model_inputs, X)
+      expected_value = np.reshape(self.expected_value, -1)
+      if not self.multi_output:  # rank-1 output: add the missing output axis
+        model_output = model_output[:, np.newaxis]
+      for l in range(len(expected_value)):
+        diffs = model_output[:, l] - expected_value[l]
+        for phi in (output_phis[l] if self.multi_input else [output_phis[l]]):
+          diffs = diffs - phi.sum(axis=tuple(range(1, phi.ndim)))
+        assert np.abs(
+          diffs).max() < 1e-2, "The SHAP explanations do not sum up to the model's output! This is either because of a " \
+                               'rounding error or because an operator in your computation graph was not fully supported. If ' \
+                               'the sum difference of %f is significant compared the scale of your model outputs please post ' \
+                               'as a github issue, with a reproducible example if possible so we can debug it.' % np.abs(
+          diffs).max()
+
+    if not self.multi_output:
+      return output_phis[0]
+    elif ranked_outputs is not None:
+      return output_phis, model_output_ranks
+    else:
+      return output_phis
+
+  def _init_between_tensors(self, out_op, model_inputs):
+    """Collect the ops, tensors and op types between model_inputs and out_op.
+
+    Sets three attributes:
+      between_ops: ops found by walking back from out_op and then forward
+        from the consumers of model_inputs.
+      between_tensors: dict keyed by the names of the outputs of those ops
+        and of model_inputs (values are True).
+      used_types: dict keyed by the op types present in between_ops.
+    """
+    # tensors on learning phase branches are not followed by either walk
+    blocked = tensors_blocked_by_false(self.learning_phase_ops)
+    breakers = [k for k in op_handlers if op_handlers[k] == break_dependence]
+
+    # ops reachable backwards from the output ...
+    upstream_ops = backward_walk_ops([out_op], blocked, breakers)
+    # ... narrowed to those reachable forwards from consumers of the inputs
+    entry_ops = [c for minput in model_inputs for c in minput.consumers()]
+    self.between_ops = forward_walk_ops(
+        entry_ops, blocked, breakers, within_ops=upstream_ops)
+
+    # note all the tensors that are on the path between the inputs and the output
+    names = [t.name for op in self.between_ops for t in op.outputs]
+    names.extend(t.name for t in model_inputs)
+    self.between_tensors = {name: True for name in names}
+
+    # save what types are being used
+    self.used_types = {op.type: True for op in self.between_ops}
+
+  def _variable_inputs(self, op):
+    """Return which inputs of this operation are variable (i.e. depend on the model inputs)."""
+    if op not in self._vinputs:
+      # cached per op; dtype is the builtin `bool` because the deprecated
+      # `np.bool` alias was removed in NumPy 1.24 and raises AttributeError
+      self._vinputs[op] = np.array(
+          [t.name in self.between_tensors for t in op.inputs], dtype=bool)
+    return self._vinputs[op]
+
+
+def tensors_blocked_by_false(ops):
+  """Follows a set of ops assuming their value is False and find blocked Switch paths.
+
+  This is used to prune away parts of the model graph that are only used during the training
+  phase (like dropout, batch norm, etc.).
+  """
+  blocked, visited = [], set()
+
+  def recurse(op):
+    if op in visited:  # shared sub-graphs are expanded only once
+      return
+    visited.add(op)
+    if op.type == 'Switch':
+      blocked.append(op.outputs[1])  # true branch: blocked, flags are False
+    else:
+      for out in op.outputs:
+        for c in out.consumers():
+          recurse(c)
+
+  for op in ops:
+    recurse(op)
+  return blocked
+
+
+def backward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist):
+  """Walk producers depth-first from start_ops; return each reached op once."""
+  found_ops, seen, op_stack = [], set(), list(start_ops)
+  while op_stack:
+    op = op_stack.pop()
+    if op.type in op_type_blacklist or op in seen:
+      continue
+    seen.add(op)  # O(1) duplicate test; found_ops keeps the visit order
+    found_ops.append(op)
+    op_stack.extend(t.op for t in op.inputs if t not in tensor_blacklist)
+  return found_ops
+
+
+def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist,
+                     within_ops):
+  """Walk consumers depth-first from start_ops, staying inside within_ops."""
+  found_ops, op_stack, remaining = [], list(start_ops), set(within_ops)
+  while op_stack:
+    op = op_stack.pop()
+    if op.type not in op_type_blacklist and op in remaining:
+      remaining.remove(op)  # marks op as visited; set lookups are O(1)
+      found_ops.append(op)
+      for out in op.outputs:
+        if out not in tensor_blacklist:
+          op_stack.extend(out.consumers())
+  return found_ops
+
+
+def linearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+  """Build a handler: linear rule if one input varies, 2d rule if both do."""
+
+  def handler(explainer, op, *grads):
+    var = explainer._variable_inputs(op)
+    if var[input_ind0] and var[input_ind1]:
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer,
+                                     op, *grads)
+    if var[input_ind0]:
+      return linearity_1d_handler(input_ind0, explainer, op, *grads)
+    if var[input_ind1]:
+      return linearity_1d_handler(input_ind1, explainer, op, *grads)
+    # no inputs vary, we must be hidden by a switch function
+    return [None] * len(op.inputs)
+
+  return handler
+
+
+def nonlinearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+  """Build a handler: 1d rule if one input varies, 2d rule if both do."""
+
+  def handler(explainer, op, *grads):
+    var = explainer._variable_inputs(op)
+    if var[input_ind0] and var[input_ind1]:
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer,
+                                     op, *grads)
+    if var[input_ind0]:
+      return nonlinearity_1d_handler(input_ind0, explainer, op, *grads)
+    if var[input_ind1]:
+      return nonlinearity_1d_handler(input_ind1, explainer, op, *grads)
+    # no inputs vary, we must be hidden by a switch function
+    return [None] * len(op.inputs)
+
+  return handler
+
+
+def nonlinearity_1d(input_ind):
+  """Return a handler that calls nonlinearity_1d_handler with `input_ind` bound."""
+  def handler(explainer, op, *grads):
+    return nonlinearity_1d_handler(input_ind, explainer, op, *grads)
+
+  return handler
+
+
+def nonlinearity_1d_handler(input_ind, explainer, op, *grads):
+  """Rescaled-gradient rule for an op where only input `input_ind` varies."""
+  op_inputs = op.inputs
+  if op_inputs is None:
+    op_inputs = op.outputs[0].op.inputs
+
+  for i in range(len(op_inputs)):  # make sure only the given input varies
+    if i != input_ind:
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
+
+  xin0, rin0 = tf.split(op_inputs[input_ind], 2)
+  xout, rout = tf.split(op.outputs[0], 2)  # outputs are not indexed by input
+  delta_in0 = xin0 - rin0
+  if delta_in0.shape is None:
+    dup0 = [2, 1]
+  else:
+    dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  out = [None for _ in op_inputs]
+  if op.type.startswith('shap_'):
+    op.type = op.type[5:]
+  orig_grad = explainer.orig_grads[op.type](op, grads[0])
+  out[input_ind] = tf.where(
+      tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
+      orig_grad[input_ind] if len(op_inputs) > 1 else orig_grad,
+      grads[0] * tf.tile((xout - rout) / delta_in0, dup0))
+  return out
+
+
+def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op,
+                            *grads):
+  """Attribute a two-input op when both of its inputs vary."""
+  assert input_ind0 == 0 and input_ind1 == 1, "TODO: Can't yet handle double inputs that are not first!"
+  xout, rout = tf.split(op.outputs[0], 2)
+  in0 = op.inputs[input_ind0]
+  in1 = op.inputs[input_ind1]
+  xin0, rin0 = tf.split(in0, 2)
+  xin1, rin1 = tf.split(in1, 2)
+  delta_in0 = xin0 - rin0
+  delta_in1 = xin1 - rin1
+  dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  out10 = op_func(xin0, rin1)
+  out01 = op_func(rin0, xin1)
+  out11, out00 = xout, rout
+  out0 = 0.5 * (out11 - out01 + out10 - out00)
+  out0 = grads[0] * tf.tile(out0 / delta_in0, dup0)
+  out1 = 0.5 * (out11 - out10 + out01 - out00)
+  out1 = grads[0] * tf.tile(out1 / delta_in1, dup0)
+  # Avoid divide by zero nans
+  out0 = tf.where(
+      tf.abs(tf.tile(delta_in0, dup0)) < 1e-7, tf.zeros_like(out0), out0)
+  out1 = tf.where(
+      tf.abs(tf.tile(delta_in1, dup0)) < 1e-7, tf.zeros_like(out1), out1)
+
+  # see if due to broadcasting our gradient shapes don't match our input shapes
+  if (np.any(np.array(out1.shape) != np.array(in1.shape))):
+    broadcast_index = np.where(
+        np.array(out1.shape) != np.array(in1.shape))[0][0]
+    out1 = tf.reduce_sum(out1, axis=broadcast_index, keepdims=True)
+  elif (np.any(np.array(out0.shape) != np.array(in0.shape))):
+    broadcast_index = np.where(
+        np.array(out0.shape) != np.array(in0.shape))[0][0]
+    out0 = tf.reduce_sum(out0, axis=broadcast_index, keepdims=True)
+
+  return [out0, out1]
+
+
+def softmax(explainer, op, *grads):
+  """Just decompose softmax into its components and recurse, we can handle all of them :)
+
+  We assume the 'axis' is the last dimension because the TF codebase swaps the 'axis' to
+  the last dimension before the softmax op if 'axis' is not already the last dimension.
+  The input is shifted by its max over the last axis before tf.exp (for numerical
+  stability); the gradient computed on the shifted input is rescaled at the end of
+  this function to account for that shift.
+  """
+  in0 = op.inputs[0]
+  in0_max = tf.reduce_max(in0, axis=-1, keepdims=True, name='in0_max')
+  in0_centered = in0 - in0_max
+  evals = tf.exp(in0_centered, name='custom_exp')
+  rsum = tf.reduce_sum(evals, axis=-1, keepdims=True)
+  div = evals / rsum
+
+  # mark these as in-between the inputs and outputs
+  for op in [evals.op, rsum.op, div.op, in0_centered.op]:
+    for t in op.outputs:
+      if t.name not in explainer.between_tensors:
+        explainer.between_tensors[t.name] = False
+
+  out = tf.gradients(div, in0_centered, grad_ys=grads[0])[0]
+
+  # remove the names we just added
+  for op in [evals.op, rsum.op, div.op, in0_centered.op]:
+    for t in op.outputs:
+      if explainer.between_tensors[t.name] is False:
+        del explainer.between_tensors[t.name]
+
+  # rescale to account for our shift by in0_max (which we did for numerical stability)
+  xin0, rin0 = tf.split(in0, 2)
+  xin0_centered, rin0_centered = tf.split(in0_centered, 2)
+  delta_in0 = xin0 - rin0
+  dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  return tf.where(
+      tf.tile(tf.abs(delta_in0), dup0) < 1e-6, out,
+      out * tf.tile((xin0_centered - rin0_centered) / delta_in0, dup0))
+
+
+def maxpool(explainer, op, *grads):
+  """Attribution rule for max pooling; assumes samples stacked above references."""
+  x_in, r_in = tf.split(op.inputs[0], 2)
+  x_out, r_out = tf.split(op.outputs[0], 2)
+  delta = x_in - r_in
+  tile_dims = [2] + [1 for _ in delta.shape[1:]]
+  joint_max = tf.maximum(x_out, r_out)
+  diffs = tf.concat([joint_max - r_out, x_out - joint_max], 0)
+  if op.type.startswith('shap_'):
+    op.type = op.type[5:]
+  pooled_grad = explainer.orig_grads[op.type](op, grads[0] * diffs)
+  x_pos, r_pos = tf.split(pooled_grad, 2)
+  scaled = tf.where(
+      tf.abs(delta) < 1e-7, tf.zeros_like(delta), (x_pos + r_pos) / delta)
+  return tf.tile(scaled, tile_dims)
+
+
+def gather(explainer, op, *grads):
+  """Attribution handler for `GatherV2` / `ResourceGather` ops.
+
+  Two cases are supported: only the indices (input 1) vary, or only the
+  params (input 0) vary, in which case the op is linear and the original
+  gradient is used.
+
+  Args:
+    explainer: object exposing `_variable_inputs(op)` and the `orig_grads`
+      dict (op type name -> original gradient function).
+    op: the gather op; its type may carry a 'shap_' prefix.
+    *grads: incoming gradients; only `grads[0]` is used.
+
+  Returns:
+    a list with one entry per op input (`None` for inputs that receive no
+    attribution).
+
+  Raises:
+    AssertionError: if the indices are not rank 2, or if any other
+      combination of inputs varies.
+  """
+  # params = op.inputs[0]
+  indices = op.inputs[1]
+  # axis = op.inputs[2]
+  var = explainer._variable_inputs(op)
+  # Work on a local copy of the type name: assigning to `op.type` would
+  # mutate the op (and `type` may be read-only).
+  type_name = op.type[5:] if op.type.startswith('shap_') else op.type
+  if var[1] and not var[0]:
+    assert len(indices.shape
+               ) == 2, 'Only scalar indices supported right now in GatherV2!'
+
+    xin1, rin1 = tf.split(tf.cast(op.inputs[1], tf.float32), 2)
+    xout, rout = tf.split(op.outputs[0], 2)
+    dup_in1 = [2] + [1 for _ in xin1.shape[1:]]
+    dup_out = [2] + [1 for _ in xout.shape[1:]]
+    delta_in1_t = tf.tile(xin1 - rin1, dup_in1)
+    # sum over every output dimension beyond those of the indices
+    out_sum = tf.reduce_sum(
+        grads[0] * tf.tile(xout - rout, dup_out),
+        list(range(len(indices.shape), len(grads[0].shape))))
+    # unchanged indices get a zero attribution (also avoids dividing by ~0)
+    indices_attr = tf.where(
+        tf.abs(delta_in1_t) < 1e-6, tf.zeros_like(delta_in1_t),
+        out_sum / delta_in1_t)
+    if type_name == 'ResourceGather':
+      # ResourceGather has no axis input, hence only two entries
+      return [None, indices_attr]
+    return [None, indices_attr, None]
+  elif var[0] and not var[1]:
+    return [explainer.orig_grads[type_name](op, grads[0]),
+            None]  # linear in this case
+  else:
+    # explicit raise so that the check is not stripped under `python -O`
+    raise AssertionError('Axis not yet supported to be varying for gather op!')
+
+
+def linearity_1d(input_ind):
+  """Create an op handler that only lets input `input_ind` vary.
+
+  Args:
+    input_ind: index of the single op input that is allowed to vary.
+
+  Returns:
+    a function `handler(explainer, op, *grads)` that forwards to
+    `linearity_1d_handler` with `input_ind` bound.
+  """
+
+  def handler(explainer, op, *grads):
+    bound_args = (input_ind, explainer, op) + grads
+    return linearity_1d_handler(*bound_args)
+
+  return handler
+
+
+def linearity_1d_handler(input_ind, explainer, op, *grads):
+  """Use the original gradient of a linear op after checking its inputs.
+
+  Args:
+    input_ind: index of the only input allowed to vary. Every index in
+      `range(len(op.inputs))` different from `input_ind` is checked, so a
+      negative value means that no input may vary.
+    explainer: object exposing `_variable_inputs(op)` and the `orig_grads`
+      dict (op type name -> original gradient function).
+    op: the op being explained; its type may carry a 'shap_' prefix.
+    *grads: incoming gradients, forwarded unchanged.
+
+  Returns:
+    whatever the original gradient function returns.
+
+  Raises:
+    AssertionError: if an input other than `input_ind` varies.
+  """
+  # make sure only the given input varies
+  for i in range(len(op.inputs)):
+    if i != input_ind:
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
+  # Strip the 'shap_' prefix into a local name instead of assigning to
+  # `op.type`: the op must not be mutated (and `type` may be read-only).
+  type_name = op.type[5:] if op.type.startswith('shap_') else op.type
+  return explainer.orig_grads[type_name](op, *grads)
+
+
+def linearity_with_excluded(input_inds):
+  """Create an op handler that forbids the inputs in `input_inds` to vary.
+
+  Args:
+    input_inds: indices of the op inputs that must stay constant.
+
+  Returns:
+    a function `handler(explainer, op, *grads)` that forwards to
+    `linearity_with_excluded_handler` with `input_inds` bound.
+  """
+
+  def handler(explainer, op, *grads):
+    bound_args = (input_inds, explainer, op) + grads
+    return linearity_with_excluded_handler(*bound_args)
+
+  return handler
+
+
+def linearity_with_excluded_handler(input_inds, explainer, op, *grads):
+  """Use the original gradient of a linear op whose listed inputs are fixed.
+
+  Args:
+    input_inds: indices of the inputs that must not vary; negative values are
+      measured from the end of `op.inputs`.
+    explainer: object exposing `_variable_inputs(op)` and the `orig_grads`
+      dict (op type name -> original gradient function).
+    op: the op being explained; its type may carry a 'shap_' prefix.
+    *grads: incoming gradients, forwarded unchanged.
+
+  Returns:
+    whatever the original gradient function returns.
+
+  Raises:
+    AssertionError: if one of the excluded inputs varies.
+  """
+  # make sure the given inputs don't vary (negative is measured from the end of the list)
+  num_inputs = len(op.inputs)
+  for i in range(num_inputs):
+    if i in input_inds or i - num_inputs in input_inds:
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
+  # Strip the 'shap_' prefix into a local name instead of assigning to
+  # `op.type`: the op must not be mutated (and `type` may be read-only).
+  type_name = op.type[5:] if op.type.startswith('shap_') else op.type
+  return explainer.orig_grads[type_name](op, *grads)
+
+
+def passthrough(explainer, op, *grads):
+  """Handler for ops that are always linear: reuse the original gradient.
+
+  Args:
+    explainer: object exposing the `orig_grads` dict (op type name ->
+      original gradient function).
+    op: the op being explained; its type may carry a 'shap_' prefix.
+    *grads: incoming gradients, forwarded unchanged.
+
+  Returns:
+    whatever the original gradient function returns.
+  """
+  # Strip the 'shap_' prefix into a local name instead of assigning to
+  # `op.type`: the op must not be mutated (and `type` may be read-only).
+  type_name = op.type[5:] if op.type.startswith('shap_') else op.type
+  return explainer.orig_grads[type_name](op, *grads)
+
+
+def break_dependence(explainer, op, *grads):
+  """Stop attributions from flowing through `op`.
+
+  Used for op types that may sit above input data values in the graph while
+  their outputs do not depend on those values (for example ops that only
+  depend on the shape).
+
+  Returns:
+    a list holding one `None` per op input.
+  """
+  return [None] * len(op.inputs)
+
+
+# Registry mapping an op type name to the handler used to attribute through it.
+op_handlers = {}
+
+# ops that are always linear
+for _op_type in ('Identity', 'StridedSlice', 'Squeeze', 'ExpandDims', 'Pack',
+                 'BiasAdd', 'Unpack', 'Add', 'Sub', 'Merge', 'Sum', 'Mean',
+                 'Cast', 'Transpose', 'Enter', 'Exit', 'NextIteration', 'Tile',
+                 'TensorArrayScatterV3', 'TensorArrayReadV3',
+                 'TensorArrayWriteV3'):
+  op_handlers[_op_type] = passthrough
+
+# ops that don't pass any attributions to their inputs
+for _op_type in ('Shape', 'RandomUniform', 'ZerosLike'):
+  op_handlers[_op_type] = break_dependence
+# op_handlers["StopGradient"] = break_dependence # this allows us to stop attributions when we want to (like softmax re-centering)
+
+# ops that are linear and only allow a single input to vary
+for _op_type in ('Reshape', 'Pad', 'ReverseV2'):
+  op_handlers[_op_type] = linearity_1d(0)
+op_handlers['ConcatV2'] = linearity_with_excluded([-1])
+for _op_type in ('Conv2D', 'Switch', 'AvgPool', 'FusedBatchNorm'):
+  op_handlers[_op_type] = linearity_1d(0)
+
+# ops that are nonlinear and only allow a single input to vary
+for _op_type in ('Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus', 'Exp',
+                 'ClipByValue', 'Rsqrt', 'Square', 'Max'):
+  op_handlers[_op_type] = nonlinearity_1d(0)
+
+# ops that are nonlinear and allow two inputs to vary
+for _op_type, _op_func in (
+    ('SquaredDifference', lambda x, y: (x - y) * (x - y)),
+    ('Minimum', lambda x, y: tf.minimum(x, y)),
+    ('Maximum', lambda x, y: tf.maximum(x, y)),
+):
+  op_handlers[_op_type] = nonlinearity_1d_nonlinearity_2d(0, 1, _op_func)
+
+# ops that allow up to two inputs to vary and are linear when only one input varies
+for _op_type, _op_func in (
+    ('Mul', lambda x, y: x * y),
+    ('RealDiv', lambda x, y: x / y),
+    ('MatMul', lambda x, y: tf.matmul(x, y)),
+):
+  op_handlers[_op_type] = linearity_1d_nonlinearity_2d(0, 1, _op_func)
+
+# ops that need their own custom attribution functions
+for _op_type, _op_handler in (
+    ('GatherV2', gather),
+    ('ResourceGather', gather),
+    ('MaxPool', maxpool),
+    ('Softmax', softmax),
+):
+  op_handlers[_op_type] = _op_handler
+
+# do not leak the loop helpers as module attributes
+del _op_type, _op_func, _op_handler
diff --git a/easy_rec/python/tools/explainer/explainer.py b/easy_rec/python/tools/explainer/explainer.py
new file mode 100644
index 000000000..04d2bc4dc
--- /dev/null
+++ b/easy_rec/python/tools/explainer/explainer.py
@@ -0,0 +1,534 @@
+import abc
+import collections
+import logging
+import os
+import time
+
+import numpy as np
+import six
+import tensorflow as tf
+from six import moves
+from tensorflow.python.platform import gfile
+from tensorflow.python.saved_model import signature_constants
+
+# from easy_rec.python.tools.explainer.deep_shap import DeepShap
+from easy_rec.python.protos.dataset_pb2 import DatasetConfig
+from easy_rec.python.tools.explainer.methods import DeepExplain
+from easy_rec.python.utils.config_util import get_configs_from_pipeline_file
+from easy_rec.python.utils.input_utils import get_type_defaults
+from easy_rec.python.utils.load_class import get_register_class_meta
+
+_EXPLAINER_CLASS_MAP = {}
+_register_abc_meta = get_register_class_meta(
+    _EXPLAINER_CLASS_MAP, have_abstract_class=True)
+
+
+class Explainer(six.with_metaclass(_register_abc_meta, object)):
+  """Base class which loads a SavedModel and builds an attribution explainer.
+
+  Sub-classes implement `feature_importance` for a concrete input source.
+  """
+  version = 1
+
+  def __init__(self, deep_explain, model_path, method_name):
+    """Base class for explainer.
+
+    Args:
+      deep_explain: a deep explain context manager
+      model_path:  saved_model directory (other paths are rejected by
+        `_build_model`)
+      method_name: explain method name
+    """
+    self.deep_explain = deep_explain
+    self.method = method_name
+    self._inputs_map = collections.OrderedDict()
+    self._outputs_map = collections.OrderedDict()
+    self._model_path = model_path
+    self._explainer = None
+    # may be overridden by sub-classes to restrict the expected input fields
+    self._effective_fields = None
+    self._build_model()
+
+  def _build_model(self):
+    """Load the SavedModel into the DeepExplain session and bind its signature.
+
+    Fills `_input_fields_info`, `_input_fields`, `_inputs_map`, `_input_names`
+    and `_outputs_map`.
+
+    Raises:
+      ValueError: if the model path is not a directory.
+      AssertionError: if the directory does not look like a saved model.
+    """
+    model_path = self._model_path
+    logging.info('loading model from %s' % model_path)
+    if gfile.IsDirectory(model_path):
+      assert tf.saved_model.loader.maybe_saved_model_directory(model_path), \
+        'saved model does not exists in %s' % model_path
+    else:
+      raise ValueError('currently only savedmodel is supported, path:' +
+                       model_path)
+
+    input_fields = _get_input_fields_from_pipeline_config(model_path)
+    self._input_fields_info, self._input_fields = input_fields
+
+    de = self.deep_explain
+    meta_graph_def = tf.saved_model.loader.load(
+        de.session, [tf.saved_model.tag_constants.SERVING], model_path)
+    # parse signature
+    signature_def = meta_graph_def.signature_def[
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    inputs = signature_def.inputs
+    input_info = []
+    self._is_multi_placeholder = len(inputs.items()) > 1
+    if self._is_multi_placeholder:
+      for gid, item in enumerate(inputs.items()):
+        name, tensor = item
+        logging.info('Load input binding: %s -> %s' % (name, tensor.name))
+        input_name, _ = tensor.name.split(':')
+        try:
+          # the trailing `_<number>` of the placeholder name is used as the
+          # position of the input
+          input_id = int(input_name.split('_')[-1])
+        except ValueError:
+          # support for models that are not exported by easy_rec
+          # in which case, the order of inputs may not be the
+          # same as they are defined, therefore, list input
+          # could not be supported, only dict input could be supported
+          logging.warning('could not determine input_id from input_name: %s' %
+                          input_name)
+          input_id = gid
+        input_info.append((input_id, name, tensor.dtype))
+        self._inputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+    else:
+      # only one input, all features concatenate together
+      for name, tensor in inputs.items():
+        logging.info('Load input binding: %s -> %s' % (name, tensor.name))
+        input_info.append((0, name, tensor.dtype))
+        self._inputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+
+    # sort inputs by input_ids so as to match the order of csv data
+    input_info.sort(key=lambda t: t[0])
+    self._input_names = [t[1] for t in input_info]
+
+    outputs = signature_def.outputs
+    for name, tensor in outputs.items():
+      logging.info('Load output binding: %s -> %s' % (name, tensor.name))
+      self._outputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+
+    # get assets
+    # self._assets = {}
+    # asset_files = tf.get_collection(constants.ASSETS_KEY)
+    # for any_proto in asset_files:
+    #   asset_file = meta_graph_pb2.AssetFileDef()
+    #   any_proto.Unpack(asset_file)
+    #   type_name = asset_file.tensor_info.name.split(':')[0]
+    #   asset_path = os.path.join(model_path, constants.ASSETS_DIRECTORY,
+    #                             asset_file.filename)
+    #   assert gfile.Exists(
+    #     asset_path), '%s is missing in saved model' % asset_path
+    #   self._assets[type_name] = asset_path
+    # logging.info(self._assets)
+
+  def default_values(self):
+    """Build the baseline (reference) value of every model input.
+
+    Returns:
+      a list of numpy arrays, one per model input, ordered like
+      `self._input_names`
+
+    Raises:
+      AssertionError: if the (effective) input fields do not match the model
+        inputs in number or in name.
+    """
+    input_fields = self._input_fields if self._effective_fields is None else self._effective_fields
+    n = len(input_fields)
+    m = len(self._input_names)
+    assert m == n, 'the number input columns is not expected, %d given, %d expected\n' \
+                   'model inputs: %s\ninput fields: %s' % (n, m, ','.join(self._input_names), ','.join(input_fields))
+
+    default_value = []
+    for i, (field, name) in enumerate(zip(input_fields, self._input_names)):
+      assert field == name, 'input field `%d` has different names: <%s, %s>' % (
+          i, field, name)
+      value = self._get_defaults(field)
+      # default_value.append(np.array([value]))  # for deep_shap
+      default_value.append(np.array(value))  # for deep_shap
+    return default_value
+
+  def _get_defaults(self, col_name, col_type='string'):
+    """Return the default value of an input column.
+
+    Args:
+      col_name: name of the input column
+      col_type: fallback type ('string', 'double' or 'bigint'), only used when
+        the column is not described by the pipeline config
+
+    Returns:
+      the default value of the column
+    """
+    if col_name in self._input_fields_info:
+      col_type, default_val = self._input_fields_info[col_name]
+      default_val = get_type_defaults(col_type, default_val)
+      logging.info('col_name: %s, default_val: %s' % (col_name, default_val))
+    else:
+      defaults = {'string': '', 'double': 0.0, 'bigint': 0}
+      assert col_type in defaults, 'invalid col_name: %s, col_type: %s' % (
+          col_name, col_type)
+      default_val = defaults[col_type]
+      logging.info(
+          'col_name: %s, default_val: %s.[not defined in saved_model_dir/assets/pipeline.config]'
+          % (col_name, default_val))
+    return default_val
+
+  def str_to_number(self, values):
+    """Convert the string fields of one record to typed python values.
+
+    Args:
+      values: list of strings, one per entry of `self._input_fields`
+
+    Returns:
+      a list of typed values, ordered like `self._input_names`
+
+    Raises:
+      AssertionError: on a wrong field count, an input that is missing from
+        the pipeline config, or an unsupported input type.
+    """
+    assert len(values) == len(
+        self._input_fields
+    ), 'value count %d is not equal to the number of input fields %d' % (
+        len(values), len(self._input_fields))
+    # position of the first occurrence of every field; avoids a linear
+    # `list.index` scan per input
+    field_index = {}
+    for pos, field in enumerate(self._input_fields):
+      field_index.setdefault(field, pos)
+
+    result = []
+    for name in self._input_names:
+      assert name in self._input_fields_info, 'input `%s` not in pipeline config' % name
+      raw_value = values[field_index[name]]
+      input_type, _ = self._input_fields_info[name]
+      if input_type in {DatasetConfig.INT32, DatasetConfig.INT64}:
+        tmp_field = int(raw_value)
+      elif input_type in [DatasetConfig.FLOAT, DatasetConfig.DOUBLE]:
+        tmp_field = float(raw_value)
+      elif input_type in [DatasetConfig.BOOL]:
+        tmp_field = raw_value.lower() in ['true', '1', 't', 'y', 'yes']
+      elif input_type in [DatasetConfig.STRING]:
+        tmp_field = raw_value
+      else:
+        # explicit raise so that the check is not stripped under `python -O`
+        raise AssertionError('invalid types: %s' % str(input_type))
+      result.append(tmp_field)
+    return result
+
+  def get_explainer(self, output_cols=None):
+    """Create the attribution explainer for one model output.
+
+    Args:
+      output_cols: None or 'ALL_COLUMNS' to use the model outputs (sorted by
+        name), or a string such as `score float,embedding string`; only the
+        first column is explained.
+
+    Returns:
+      the object returned by `deep_explain.get_explainer`
+
+    Raises:
+      ValueError: if no output column can be determined.
+      AssertionError: if the selected column is not a model output.
+    """
+    if output_cols is None or output_cols == 'ALL_COLUMNS':
+      self._output_cols = sorted(self.output_names)
+      logging.info('predict output cols: %s' % self._output_cols)
+    else:
+      # specified as score float,embedding string
+      # (whitespace split, so a space after the comma does not yield '')
+      self._output_cols = [
+          x.split()[0] for x in output_cols.split(',') if x.strip()
+      ]
+    if not self._output_cols:
+      raise ValueError('no output column is specified, model outputs: `%s`' %
+                       ','.join(self.output_names))
+    if len(self._output_cols) > 1:
+      logging.warning(
+          'Only one output can be supported currently, use the first one: %s',
+          self._output_cols[0])
+
+    output_name = self._output_cols[0]
+    assert output_name in self.output_names, 'invalid output name `%s` not in model outputs `%s`' % (
+        output_name, ','.join(self.output_names))
+    output = self._outputs_map[output_name]
+
+    def_vals = self.default_values()
+    # print('default values (%d):' % len(def_vals), def_vals)
+    inputs = [self._inputs_map[name] for name in self._input_names]
+    # e = DeepShap(inputs, output, def_vals, session=self._session)
+    # self._explainer = e
+    e = self.deep_explain.get_explainer(
+        self.method, output, inputs, baseline=def_vals)
+    return e
+
+  @property
+  def input_names(self):
+    """Input names of the model.
+
+    Returns:
+      a list, which containing the name of input nodes available in model;
+      this is the internal list, callers must not modify it
+    """
+    return self._input_names
+
+  @property
+  def output_names(self):
+    """Output names of the model.
+
+    Returns:
+      a list, which containing the name of outputs nodes available in model
+    """
+    return list(self._outputs_map.keys())
+
+  @abc.abstractmethod
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    """Compute attributions for `input_path` and write them to `output_path`.
+
+    Args:
+      input_path: input table
+      output_path: output table
+      reserved_cols: comma separated input columns to copy to the output
+      output_cols: model output to explain, see `get_explainer`
+      batch_size: number of records processed at once
+      slice_id: index of the data slice handled by this worker
+      slice_num: total number of data slices
+    """
+    pass
+
+  # def create_output_table(self, reserved_cols=''):
+  #   reserved_cols = [x.strip() for x in reserved_cols.split(',') if x != '']
+  #   outputs = self.input_names
+  #   reserved_cols = filter(lambda r: r not in outputs, reserved_cols)
+  #   output_cols = reserved_cols + outputs
+  #   sql = 'create table output_table '
+  #   return sql
+
+
+class OdpsExplainer(Explainer):
+  """Explainer reading one table column per model input."""
+
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    """Compute attributions for a table and write them to another table.
+
+    Args:
+      input_path: input table, must contain one column per model input
+      output_path: output table
+      reserved_cols: comma separated input columns to copy to the output
+      output_cols: model output to explain, see `get_explainer`
+      batch_size: number of records processed at once
+      slice_id: index of the data slice handled by this worker
+      slice_num: total number of data slices
+    """
+    # Copy the list: `input_names` returns the internal list and extending it
+    # in place would make `default_values` see the reserved columns as inputs.
+    input_cols = list(self.input_names)
+    input_dim = len(input_cols)
+    reserved_names = [
+        x.strip() for x in (reserved_cols or '').split(',') if x.strip()
+    ]
+    # reserved columns which are not model inputs are read in addition
+    for col in reserved_names:
+      if col not in input_cols:
+        input_cols.append(col)
+    selected_cols = ','.join(input_cols)
+    print('selected_cols: ' + selected_cols)
+
+    # NOTE(review): `expected_value` / `shap_values` look like the DeepShap
+    # API; confirm that the object returned by `get_explainer` provides them.
+    explainer = self.get_explainer(output_cols)
+    print('reference value:', explainer.expected_value)
+
+    import common_io
+    reader = common_io.table.TableReader(
+        input_path,
+        selected_cols=selected_cols,
+        slice_id=slice_id,
+        slice_count=slice_num)
+    try:
+      reserved_cols_idx = []
+      if reserved_names:
+        schema = reader.get_schema()
+        columns = [str(x[0]) for x in schema]
+        reserved_cols_idx = [columns.index(x) for x in reserved_names]
+        print(reserved_cols_idx)
+
+      sum_t0, sum_t1, sum_t2 = 0, 0, 0
+      writer = common_io.table.TableWriter(output_path, slice_id=slice_id)
+      try:
+        total_records_num = reader.get_row_count()
+        offsets = moves.range(0, total_records_num, batch_size)
+        for batch_id, offset in enumerate(offsets):
+          t0 = time.time()
+          records = reader.read(batch_size, allow_smaller_final_batch=True)
+          t1 = time.time()
+          records = np.array(records)
+          inputs = list(records[:, :input_dim].T)
+          sv = explainer.shap_values(inputs, check_additivity=False)
+          # reserved columns first, then one column per attribution
+          outputs = [records[:, idx] for idx in reserved_cols_idx]
+          outputs.extend(sv[0])
+          indices = range(len(outputs))
+          t2 = time.time()
+          writer.write(np.array(outputs).T, indices, allow_type_cast=True)
+          t3 = time.time()
+          sum_t0 += (t1 - t0)
+          sum_t1 += (t2 - t1)
+          sum_t2 += (t3 - t2)
+          if batch_id % 100 == 0:
+            logging.info('progress: batch_num=%d sample_num=%d' %
+                         (batch_id + 1, offset + len(records)))
+            logging.info('time_stats: read: %.2f predict: %.2f write: %.2f' %
+                         (sum_t0, sum_t1, sum_t2))
+        logging.info('Final_time_stats: read: %.2f predict: %.2f write: %.2f' %
+                     (sum_t0, sum_t1, sum_t2))
+      finally:
+        # release the table handles even if a batch fails
+        writer.close()
+    finally:
+      reader.close()
+    logging.info('Explain %s done.' % input_path)
+
+
+class OdpsRtpExplainer(Explainer):
+  """Explainer reading all features from a single `features` table column."""
+
+  def __init__(self, deep_explain, model_path, method_name):
+    """Load the model and the feature separator from the pipeline config.
+
+    Args:
+      deep_explain: a deep explain context manager
+      model_path: saved_model directory
+      method_name: explain method name
+    """
+    super(OdpsRtpExplainer, self).__init__(deep_explain, model_path,
+                                           method_name)
+    pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+    if not gfile.Exists(pipeline_path):
+      # NOTE(review): `_fg_separator` stays undefined on this path although
+      # `feature_importance` reads it; confirm whether this should fail early.
+      logging.warning(
+          '%s not exists, default values maybe inconsistent with the values used in training.'
+          % pipeline_path)
+      return
+    pipeline_config = get_configs_from_pipeline_file(pipeline_path)
+    self._fg_separator = pipeline_config.data_config.separator
+
+    if pipeline_config.export_config.filter_inputs:
+      if len(pipeline_config.feature_configs) > 0:
+        feature_configs = pipeline_config.feature_configs
+      elif pipeline_config.feature_config and len(
+          pipeline_config.feature_config.features) > 0:
+        feature_configs = pipeline_config.feature_config.features
+      else:
+        assert False, 'One of feature_configs and feature_config.features must be configured.'
+
+      # inputs which are used by at least one feature
+      effective_fields = []
+      for fc in feature_configs:
+        for input_name in fc.input_names:
+          assert input_name in self._input_fields, 'invalid input_name in %s' % str(
+              fc)
+          if input_name not in effective_fields:
+            effective_fields.append(input_name)
+      # sort fids from small to large (a plain set has no guaranteed order)
+      self._effective_fids = sorted(
+          {self._input_fields.index(x) for x in effective_fields})
+      self._effective_fields = [
+          self._input_fields[x] for x in self._effective_fids
+      ]
+      logging.info('raw input fields: %d, effective fields: %d' %
+                   (len(self._input_fields), len(self._effective_fields)))
+
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    """Compute attributions for a table and write them to another table.
+
+    Args:
+      input_path: input table, must contain a `features` column
+      output_path: output table
+      reserved_cols: comma separated input columns to copy to the output
+      output_cols: model output to explain, see `get_explainer`
+      batch_size: number of records processed at once
+      slice_id: index of the data slice handled by this worker
+      slice_num: total number of data slices
+    """
+    input_cols = [
+        x.strip() for x in (reserved_cols or '').split(',') if x.strip()
+    ]
+    reserved_dim = len(input_cols)
+    if 'features' not in input_cols:
+      input_cols.append('features')
+    # `features` is not the last column when it is listed in reserved_cols
+    features_idx = input_cols.index('features')
+    selected_cols = ','.join(input_cols)
+    print('selected_cols: ' + selected_cols)
+
+    explainer = self.get_explainer(output_cols)
+    print('reference value:', explainer.expected_value)
+
+    import common_io
+    reader = common_io.table.TableReader(
+        input_path,
+        selected_cols=selected_cols,
+        slice_id=slice_id,
+        slice_count=slice_num)
+    try:
+      sum_t0, sum_t1, sum_t2 = 0, 0, 0
+      writer = common_io.table.TableWriter(output_path, slice_id=slice_id)
+      try:
+        total_records_num = reader.get_row_count()
+        offsets = moves.range(0, total_records_num, batch_size)
+        for batch_id, offset in enumerate(offsets):
+          t0 = time.time()
+          records = reader.read(batch_size, allow_smaller_final_batch=True)
+          t1 = time.time()
+          inputs = []
+          reserved = []
+          for record in records:
+            if reserved_dim > 0:
+              reserved.append(record[:reserved_dim])
+            inputs.append(
+                self.str_to_number(record[features_idx].decode('utf-8').split(
+                    self._fg_separator)))
+          inputs = list(np.array(inputs).T)
+          logging.debug('inputs: %s', inputs)
+          # sv = explainer.shap_values(inputs, check_additivity=False)
+          ret = explainer.run(inputs, batch_size=len(records))
+          ret = np.array(ret)
+          if reserved_dim > 0:
+            outputs = np.concatenate([np.array(reserved), ret], axis=1)
+          else:
+            outputs = ret
+          # NOTE(review): `indices` counts the columns of `outputs` while the
+          # transposed array is written; confirm the layout returned by
+          # `explainer.run` and expected by `writer.write`.
+          indices = range(outputs.shape[1])
+          t2 = time.time()
+          writer.write(outputs.T, indices, allow_type_cast=True)
+          t3 = time.time()
+          sum_t0 += (t1 - t0)
+          sum_t1 += (t2 - t1)
+          sum_t2 += (t3 - t2)
+          if batch_id % 2 == 0:
+            logging.info('progress: batch_num=%d sample_num=%d' %
+                         (batch_id + 1, offset + len(records)))
+            logging.info('time_stats: read: %.2f predict: %.2f write: %.2f' %
+                         (sum_t0, sum_t1, sum_t2))
+        logging.info('Final_time_stats: read: %.2f predict: %.2f write: %.2f' %
+                     (sum_t0, sum_t1, sum_t2))
+      finally:
+        # release the table handles even if a batch fails
+        writer.close()
+    finally:
+      reader.close()
+    logging.info('Explain %s done.' % input_path)
+
+
+def _get_input_fields_from_pipeline_config(model_path):
+  """Read the non-label input fields from `assets/pipeline.config`.
+
+  Args:
+    model_path: saved_model directory
+
+  Returns:
+    a tuple of (dict: input name -> (input type, default value), list of
+    input names in config order); label fields and the sample weight field
+    are left out. Both are empty if the pipeline config does not exist.
+  """
+  pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+  if not gfile.Exists(pipeline_path):
+    logging.warning(
+        '%s not exists, default values maybe inconsistent with the values used in training.'
+        % pipeline_path)
+    return {}, []
+
+  data_config = get_configs_from_pipeline_file(pipeline_path).data_config
+  # fields which are not model inputs
+  excluded = set(data_config.label_fields)
+  if data_config.HasField('sample_weight'):
+    excluded.add(data_config.sample_weight)
+
+  input_fields_info = {}
+  input_fields_list = []
+  for field in data_config.input_fields:
+    field_name = field.input_name
+    if field_name in excluded:
+      continue
+    input_fields_info[field_name] = (field.input_type, field.default_val)
+    input_fields_list.append(field_name)
+  return input_fields_info, input_fields_list
+
+
+def search_pb(directory, use_latest=False):
+  """Search recursively for the directory which contains the saved model.
+
+  Args:
+    directory: model directory.
+    use_latest: if several models are found, return the one whose directory
+      name is the largest integer instead of raising.
+
+  Returns:
+    directory contain pb file
+
+  Raises:
+    ValueError: if no model is found, or several are found while
+      `use_latest` is False.
+  """
+
+  def _version(path):
+    # numeric name of the last path component, ignoring one trailing '/'
+    parts = path.split('/')
+    return int(parts[-2] if path[-1] == '/' else parts[-1])
+
+  model_dirs = [
+      root for root, _, files in gfile.Walk(directory) for f in files
+      if f.endswith('saved_model.pb')
+  ]
+  if not model_dirs:
+    raise ValueError('savedmodel is not found in directory %s' % directory)
+  if len(model_dirs) == 1:
+    return model_dirs[0]
+  if not use_latest:
+    raise ValueError('multiple saved model found in directory %s' % directory)
+
+  logging.info('find %d models: %s' % (len(model_dirs), ','.join(model_dirs)))
+  return sorted(model_dirs, key=_version)[-1]
+
+
+# def create_explainer(model_path, use_latest=False):
+#   if gfile.IsDirectory(model_path):
+#     model_path = search_pb(model_path, use_latest)
+#   else:
+#     raise ValueError('model_path should be a directory, path:' + model_path)
+#   pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+#   if not gfile.Exists(pipeline_path):
+#     logging.warning('%s not exists' % pipeline_path)
+#     raise ValueError('%s not exists' % pipeline_path)
+#
+#   pipeline_config = get_configs_from_pipeline_file(pipeline_path)
+#   input_type = pipeline_config.data_config.input_type
+#   if input_type in {DatasetConfig.OdpsInput, DatasetConfig.OdpsInputV2, DatasetConfig.OdpsInputV3}:
+#     return OdpsExplainer(model_path)
+#   if input_type in {DatasetConfig.OdpsRTPInput, DatasetConfig.OdpsRTPInputV2}:
+#     return OdpsRtpExplainer(model_path)
+#   raise ValueError("currently unsupported input type: " + input_type)
+
+
+def run(FLAGS):
+  """Compute feature importance with the settings given by command line flags.
+
+  Args:
+    FLAGS: parsed flags; reads saved_model_dir, worker_hosts, explain_tables,
+      tables, outputs, reserved_cols, output_cols, batch_size and task_index.
+
+  Raises:
+    ValueError: if saved_model_dir is not a directory, or the model has no
+      `assets/pipeline.config`.
+  """
+  model_path = FLAGS.saved_model_dir
+  if gfile.IsDirectory(model_path):
+    model_path = search_pb(model_path, False)
+  else:
+    raise ValueError('model_path should be a directory, path:' + model_path)
+  pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+  if not gfile.Exists(pipeline_path):
+    logging.warning('%s not exists' % pipeline_path)
+    raise ValueError('%s not exists' % pipeline_path)
+
+  gpu_options = tf.GPUOptions(allow_growth=True)
+  session_config = tf.ConfigProto(
+      gpu_options=gpu_options, allow_soft_placement=True)
+  session = tf.Session(config=session_config)
+
+  # one data slice per worker
+  worker_count = len(FLAGS.worker_hosts.split(','))
+  try:
+    with DeepExplain(session=session) as de:
+      e = OdpsRtpExplainer(de, model_path, 'deeplift')
+      e.feature_importance(
+          FLAGS.explain_tables if FLAGS.explain_tables else FLAGS.tables,
+          FLAGS.outputs,
+          reserved_cols=FLAGS.reserved_cols,
+          output_cols=FLAGS.output_cols,
+          batch_size=FLAGS.batch_size,
+          slice_id=FLAGS.task_index,
+          slice_num=worker_count)
+  finally:
+    # release the session resources, also when explaining fails
+    session.close()
diff --git a/easy_rec/python/tools/explainer/feature_importance.py b/easy_rec/python/tools/explainer/feature_importance.py
new file mode 100644
index 000000000..7085274ab
--- /dev/null
+++ b/easy_rec/python/tools/explainer/feature_importance.py
@@ -0,0 +1,55 @@
+from __future__ import print_function
+
+import tensorflow as tf
+
+from easy_rec.python.tools.explainer.explainer import run
+
+# Command line flags of the feature importance tool.
+flags = tf.app.flags
+
+flags.DEFINE_string('saved_model_dir', '',
+                    'directory where saved_model.pb exists')
+# `explain_tables` takes precedence over `tables` when it is not empty (see
+# `run` in explainer.py).
+flags.DEFINE_string('explain_tables', '', 'tables used for explaination')
+# NOTE(review): `background_table` and `selected_cols` are not read by `run`
+# in explainer.py; confirm whether they are still needed.
+flags.DEFINE_string('background_table', '', 'tables used for expected value')
+flags.DEFINE_string('tables', '', 'tables passed by pai command')
+flags.DEFINE_string('outputs', '', 'output tables')
+# NOTE(review): the help text below is the same as the one of
+# `reserved_cols`, which looks like a copy/paste leftover.
+flags.DEFINE_string(
+    'selected_cols', '',
+    'columns to keep from input table,  they are separated with ,')
+flags.DEFINE_string(
+    'reserved_cols', '',
+    'columns to keep from input table,  they are separated with ,')
+flags.DEFINE_string(
+    'output_cols', None,
+    'output columns, such as: score float. multiple columns are separated by ,')
+flags.DEFINE_integer('batch_size', 1024, 'predict batch size')
+# the number of comma separated hosts is used as the number of data slices
+flags.DEFINE_string('worker_hosts', '',
+                    'Comma-separated list of hostname:port pairs')
+# used as the index of the data slice processed by this worker
+flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+  """Print every flag value (help flags excluded), then run the explainer."""
+  help_flags = ('h', 'help', 'helpshort', 'helpfull')
+  for flag_name in FLAGS:
+    if flag_name not in help_flags:
+      print('%s=%s' % (flag_name, FLAGS[flag_name].value))
+
+  run(FLAGS)
+
+
+if __name__ == '__main__':
+  tf.app.run(main=main)
diff --git a/easy_rec/python/tools/explainer/methods.py b/easy_rec/python/tools/explainer/methods.py
new file mode 100644
index 000000000..38c53be55
--- /dev/null
+++ b/easy_rec/python/tools/explainer/methods.py
@@ -0,0 +1,721 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+import sys
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from skimage.util import view_as_windows
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_grad
+from tensorflow.python.ops import nn_grad
+
+from easy_rec.python.tools.explainer.utils import make_batches
+from easy_rec.python.tools.explainer.utils import slice_arrays
+from easy_rec.python.tools.explainer.utils import to_list
+from easy_rec.python.tools.explainer.utils import unpack_singleton
+
+# Op types whose gradient is remapped to 'DeepExplainGrad' by
+# DeepExplain.get_override_map; activation() and original_grad() warn for any
+# op type outside this list.
+SUPPORTED_ACTIVATIONS = ['Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus']
+
+# Op types for which DeepExplain._check_ops emits a warning.
+UNSUPPORTED_ACTIVATIONS = ['CRelu', 'Relu6', 'Softsign']
+
+# Attribution class currently being constructed by DeepExplain.get_explainer;
+# deepexplain_grad reads it to pick the gradient override.
+_ENABLED_METHOD_CLASS = None
+# Set to 1 by deepexplain_grad each time it is invoked; get_explainer resets it
+# to 0 and uses it to detect that no override ran.
+_GRAD_OVERRIDE_CHECKFLAG = 0
+
+# -----------------------------------------------------------------------------
+# UTILITY FUNCTIONS
+# -----------------------------------------------------------------------------
+
+
+def activation(type):
+  """Look up the ``tf.nn`` function that corresponds to an activation op type.
+
+  A warning is issued when the type is not listed in SUPPORTED_ACTIVATIONS; the
+  lookup is attempted regardless.
+
+  :param type: string, op type such as 'Relu'
+  :return: the attribute of ``tf.nn`` named by the lower-cased type
+  """
+  is_supported = type in SUPPORTED_ACTIVATIONS
+  if not is_supported:
+    warnings.warn('Activation function (%s) not supported' % type)
+  return getattr(tf.nn, type.lower())
+
+
+def original_grad(op, grad):
+  """Apply the gradient function Tensorflow itself registers for an op.
+
+  The function named ``_<OpType>Grad`` is taken from ``nn_grad`` when it exists
+  there and from ``math_grad`` otherwise. A warning is issued for op types
+  outside SUPPORTED_ACTIVATIONS.
+
+  :param op: op
+  :param grad: Tensor
+  :return: Tensor
+  """
+  if op.type not in SUPPORTED_ACTIVATIONS:
+    warnings.warn('Activation function (%s) not supported' % op.type)
+  grad_name = '_%sGrad' % op.type
+  try:
+    grad_fn = getattr(nn_grad, grad_name)
+  except AttributeError:
+    grad_fn = getattr(math_grad, grad_name)
+  return grad_fn(op, grad)
+
+
+# -----------------------------------------------------------------------------
+# ATTRIBUTION METHODS BASE CLASSES
+# -----------------------------------------------------------------------------
+
+
+class AttributionMethod(object):
+  """Attribution method base class.
+
+  Holds the target tensor ``T``, the input tensor(s) ``X`` and the session used
+  to evaluate them, and offers batched session execution and baseline
+  validation to subclasses.
+  """
+
+  def __init__(self, T, X, session, keras_learning_phase=None):
+    """Builds the weighted target and the symbolic attribution.
+
+    :param T: target Tensor
+    :param X: input Tensor, or a list/tuple of input Tensors
+    :param session: session whose ``run`` evaluates the tensors
+    :param keras_learning_phase: optional tensor that is fed with 0 on each run
+    """
+    self.T = T  # target Tensor
+    self.X = X  # input Tensor
+    self.Y_shape = [
+        None,
+    ] + T.get_shape().as_list()[1:]
+    # Most often T contains multiple output units. In this case, it is often necessary to select
+    # a single unit to compute contributions for. This can be achieved passing 'ys' as weight for the output Tensor.
+    self.Y = tf.placeholder(tf.float32, self.Y_shape)
+    # self.Y is the Tensor that represents weights for T
+    self.T = self.T * self.Y
+    self.symbolic_attribution = None
+    self.session = session
+    self.keras_learning_phase = keras_learning_phase
+    self.has_multiple_inputs = type(self.X) is list or type(self.X) is tuple
+    logging.info('Model with multiple inputs: %s' % self.has_multiple_inputs)
+
+    # Set baseline
+    # TODO: now this sets a baseline also for those methods that does not require it
+    self._set_check_baseline()
+
+    # References
+    self._init_references()
+
+    # Create symbolic explanation once during construction (affects only gradient-based methods)
+    self.explain_symbolic()
+
+  def explain_symbolic(self):
+    """Hook for subclasses; the base implementation returns None."""
+    return None
+
+  def run(self, xs, ys=None, batch_size=None):
+    """Hook for subclasses; the base implementation does nothing."""
+    pass
+
+  def _init_references(self):
+    """Hook for subclasses; the base implementation does nothing."""
+    pass
+
+  @staticmethod
+  def _check_batch_dim(tensor, batch_size, kind):
+    """Raises RuntimeError if a static first dimension differs from batch_size.
+
+    :param tensor: Tensor whose ``shape[0].value`` is inspected
+    :param batch_size: requested batch size
+    :param kind: 'target' or 'input', only used in the error message
+    """
+    dim = tensor.shape[0].value
+    # Compare by value: identity comparison (`is not`) of two equal integers is
+    # not reliable and used to raise for matching sizes.
+    if dim is not None and dim != batch_size:
+      raise RuntimeError(
+          'When using batch evaluation, the first dimension of the %s tensor '
+          'must be compatible with the batch size. Found %s instead' %
+          (kind, dim))
+
+  def _check_input_compatibility(self, xs, ys=None, batch_size=None):
+    """Validates xs / ys / batch_size against each other and the tensors.
+
+    :param xs: input data, or a list of input data when X has multiple inputs
+    :param ys: optional weights for the target tensor
+    :param batch_size: optional batch size; ignored when None or <= 0
+    :raises RuntimeError: on a batch-size mismatch
+    """
+    if ys is not None:
+      if not self.has_multiple_inputs and len(xs) != len(ys):
+        raise RuntimeError(
+            'When provided, ys must have the same batch size as xs (xs has batch size {} and ys {})'
+            .format(len(xs), len(ys)))
+      # NOTE(review): this only raises when *every* element of xs mismatches
+      # ys, although the message talks about all elements; confirm intent.
+      elif self.has_multiple_inputs and np.all([len(i) != len(ys) for i in xs]):
+        raise RuntimeError(
+            'When provided, ys must have the same batch size as all elements of xs'
+        )
+    if batch_size is not None and batch_size > 0:
+      self._check_batch_dim(self.T, batch_size, 'target')
+      # has_multiple_inputs covers tuples as well as lists, matching __init__.
+      inputs = self.X if self.has_multiple_inputs else [self.X]
+      for x in inputs:
+        self._check_batch_dim(x, batch_size, 'input')
+
+  def _session_run_batch(self, T, xs, ys=None):
+    """Runs T in the session for one batch of inputs.
+
+    :param T: tensor(s) to evaluate
+    :param xs: input data (list of arrays when X has multiple inputs)
+    :param ys: optional weights fed to the self.Y placeholder
+    :return: the result of ``self.session.run``
+    """
+    feed_dict = {}
+    if self.has_multiple_inputs:
+      for k, v in zip(self.X, xs):
+        feed_dict[k] = v
+    else:
+      feed_dict[self.X] = xs
+
+    # If ys is not passed, produce a vector of ones that will be broadcasted to all batch samples
+    feed_dict[self.Y] = ys if ys is not None else np.ones([
+        1,
+    ] + self.Y_shape[1:])
+
+    if self.keras_learning_phase is not None:
+      feed_dict[self.keras_learning_phase] = 0
+    return self.session.run(T, feed_dict)
+
+  def _session_run(self, T, xs, ys=None, batch_size=None):
+    """Runs T on xs, splitting the samples into batches when requested.
+
+    :param T: tensor(s) to evaluate
+    :param xs: input data (list of arrays when X has multiple inputs)
+    :param ys: optional weights for the target tensor
+    :param batch_size: when None, <= 0, or not smaller than the number of
+      samples, everything is evaluated in a single session call
+    :return: session output; for batched runs the per-batch outputs are
+      written into pre-allocated arrays and passed through unpack_singleton
+    :raises RuntimeError: when the inputs are inconsistent
+    """
+    num_samples = len(xs)
+    if self.has_multiple_inputs is True:
+      num_samples = len(xs[0])
+      if len(xs) != len(self.X):
+        raise RuntimeError(
+            'List of input tensors and input data have different lengths (%s and %s)'
+            % (str(len(xs)), str(len(self.X))))
+      if batch_size is not None:
+        for xi in xs:
+          if len(xi) != num_samples:
+            raise RuntimeError(
+                'Evaluation in batches requires all inputs to have '
+                'the same number of samples')
+
+    if batch_size is None or batch_size <= 0 or num_samples <= batch_size:
+      return self._session_run_batch(T, xs, ys)
+    else:
+      outs = []
+      batches = make_batches(num_samples, batch_size)
+      for batch_index, (batch_start, batch_end) in enumerate(batches):
+        # Get a batch from data
+        xs_batch = slice_arrays(xs, batch_start, batch_end)
+        # If the target tensor has one entry for each sample, we need to batch it as well
+        ys_batch = None
+        if ys is not None:
+          ys_batch = slice_arrays(ys, batch_start, batch_end)
+        batch_outs = self._session_run_batch(T, xs_batch, ys_batch)
+        batch_outs = to_list(batch_outs)
+        if batch_index == 0:
+          # Pre-allocate the results arrays.
+          for batch_out in batch_outs:
+            shape = (num_samples,) + batch_out.shape[1:]
+            outs.append(np.zeros(shape, dtype=batch_out.dtype))
+        for i, batch_out in enumerate(batch_outs):
+          outs[i][batch_start:batch_end] = batch_out
+      return unpack_singleton(outs)
+
+  def _set_check_baseline(self):
+    """Creates a zero baseline or validates and expands the given one.
+
+    Does nothing unless the instance has a ``baseline`` attribute. A provided
+    baseline must have the input shape without the batch dimension; a leading
+    dimension of size 1 is then added.
+
+    :raises RuntimeError: when a provided baseline has an unexpected shape
+    """
+    # Do nothing for those methods that have no baseline required
+    if not hasattr(self, 'baseline'):
+      return
+
+    if self.baseline is None:
+      if self.has_multiple_inputs:
+        self.baseline = [
+            np.zeros([
+                1,
+            ] + xi.get_shape().as_list()[1:]) for xi in self.X
+        ]
+      else:
+        self.baseline = np.zeros([
+            1,
+        ] + self.X.get_shape().as_list()[1:])
+
+    else:
+      if self.has_multiple_inputs:
+        for i, xi in enumerate(self.X):
+          if list(self.baseline[i].shape) == xi.get_shape().as_list()[1:]:
+            self.baseline[i] = np.expand_dims(self.baseline[i], 0)
+          else:
+            raise RuntimeError(
+                'Baseline shape %s does not match expected shape %s' %
+                (self.baseline[i].shape, xi.get_shape().as_list()[1:]))
+      else:
+        if list(self.baseline.shape) == self.X.get_shape().as_list()[1:]:
+          self.baseline = np.expand_dims(self.baseline, 0)
+        else:
+          raise RuntimeError(
+              'Baseline shape %s does not match expected shape %s' %
+              (self.baseline.shape, self.X.get_shape().as_list()[1:]))
+
+
+class GradientBasedMethod(AttributionMethod):
+  """Common behaviour of the attribution methods that rely on gradients."""
+
+  def get_symbolic_attribution(self):
+    """Returns the gradients of the target with respect to the input(s)."""
+    return tf.gradients(self.T, self.X)
+
+  def explain_symbolic(self):
+    """Returns the symbolic attribution, building it on first use."""
+    cached = self.symbolic_attribution
+    if cached is not None:
+      return cached
+    self.symbolic_attribution = self.get_symbolic_attribution()
+    return self.symbolic_attribution
+
+  def run(self, xs, ys=None, batch_size=None):
+    """Evaluates the symbolic attribution on xs.
+
+    :return: the list of results for multiple inputs, else its first element
+    """
+    self._check_input_compatibility(xs, ys, batch_size)
+    attributions = self._session_run(self.explain_symbolic(), xs, ys,
+                                     batch_size)
+    if self.has_multiple_inputs:
+      return attributions
+    return attributions[0]
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    """Default override: delegate to the original Tensorflow gradient."""
+    return original_grad(op, grad)
+
+
+class PerturbationBasedMethod(AttributionMethod):
+  """Common behaviour of the attribution methods that perturb the input."""
+
+  def __init__(self, T, X, session, keras_learning_phase):
+    """Runs the base initialisation, then sets base_activation to None."""
+    super(PerturbationBasedMethod, self).__init__(
+        T, X, session, keras_learning_phase=keras_learning_phase)
+    self.base_activation = None
+
+
+# -----------------------------------------------------------------------------
+# ATTRIBUTION METHODS
+# -----------------------------------------------------------------------------
+"""
+Returns zero attributions. For testing only.
+"""
+
+
+class DummyZero(GradientBasedMethod):
+  """Gradient method whose nonlinearity override yields zeros."""
+
+  def get_symbolic_attribution(self):
+    """Returns the gradients of the target with respect to the input(s)."""
+    return tf.gradients(self.T, self.X)
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    """Returns a zero tensor shaped like the first input of op."""
+    return tf.zeros_like(op.inputs[0])
+
+
+"""
+Saliency maps
+https://arxiv.org/abs/1312.6034
+"""
+
+
+class Saliency(GradientBasedMethod):
+  """Attribution given by the absolute value of the gradients."""
+
+  def get_symbolic_attribution(self):
+    """Returns one ``tf.abs`` tensor per gradient of T w.r.t. X."""
+    gradients = tf.gradients(self.T, self.X)
+    return list(map(tf.abs, gradients))
+
+
+"""
+Gradient * Input
+https://arxiv.org/pdf/1704.02685.pdf - https://arxiv.org/abs/1611.07270
+"""
+
+
+class GradientXInput(GradientBasedMethod):
+  """Attribution given by the gradient multiplied by the input."""
+
+  def get_symbolic_attribution(self):
+    """Returns gradient * input for every input tensor."""
+    gradients = tf.gradients(self.T, self.X)
+    if self.has_multiple_inputs:
+      inputs = self.X
+    else:
+      inputs = [self.X]
+    return [grad * inp for grad, inp in zip(gradients, inputs)]
+
+
+"""
+Integrated Gradients
+https://arxiv.org/pdf/1703.01365.pdf
+"""
+
+
+class IntegratedGradients(GradientBasedMethod):
+  """Sums gradients along a straight path from the baseline to the input."""
+
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               steps=100,
+               baseline=None):
+    """Stores steps and baseline, then runs the base initialisation.
+
+    :param steps: number of interpolation points between baseline and input
+    :param baseline: optional baseline; validated by the base class
+    """
+    # Both attributes must exist before the base __init__ runs, because it
+    # validates self.baseline.
+    self.steps = steps
+    self.baseline = baseline
+    super(IntegratedGradients, self).__init__(T, X, session,
+                                              keras_learning_phase)
+
+  def run(self, xs, ys=None, batch_size=None):
+    """Returns summed gradients * (input - baseline) / steps.
+
+    :return: a list of results for multiple inputs, else a single result
+    """
+    self._check_input_compatibility(xs, ys, batch_size)
+
+    symbolic = self.explain_symbolic()
+    accumulated = None
+    for alpha in np.linspace(1. / self.steps, 1.0, self.steps):
+      if self.has_multiple_inputs:
+        xs_mod = [b + (x - b) * alpha for x, b in zip(xs, self.baseline)]
+      else:
+        xs_mod = self.baseline + (xs - self.baseline) * alpha
+      step_attr = self._session_run(symbolic, xs_mod, ys, batch_size)
+      if accumulated is None:
+        accumulated = step_attr
+      else:
+        accumulated = [g + a for g, a in zip(accumulated, step_attr)]
+
+    if self.has_multiple_inputs:
+      inputs, baselines = xs, self.baseline
+    else:
+      inputs, baselines = [xs], [self.baseline]
+    results = [
+        g * (x - b) / self.steps
+        for g, x, b in zip(accumulated, inputs, baselines)
+    ]
+
+    if self.has_multiple_inputs:
+      return results
+    return results[0]
+
+
+"""
+Layer-wise Relevance Propagation with epsilon rule
+http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0130140
+"""
+
+
+class EpsilonLRP(GradientBasedMethod):
+  """Layer-wise Relevance Propagation with the epsilon rule."""
+
+  # Stabiliser read by nonlinearity_grad_override; set by __init__.
+  eps = None
+
+  def __init__(self, T, X, session, keras_learning_phase, epsilon=1e-4):
+    """Stores epsilon on the class and runs the base initialisation.
+
+    :param epsilon: stabiliser added to the denominator, must be > 0
+    """
+    assert epsilon > 0.0, 'LRP epsilon must be greater than zero'
+    # Keep epsilon on the class attribute declared above instead of a module
+    # global. It has to be set before the base __init__, which builds the
+    # symbolic gradients.
+    EpsilonLRP.eps = epsilon
+    super(EpsilonLRP, self).__init__(T, X, session, keras_learning_phase)
+
+  def get_symbolic_attribution(self):
+    """Returns gradient * input for every input tensor."""
+    return [
+        g * x for g, x in zip(
+            tf.gradients(self.T, self.X),
+            self.X if self.has_multiple_inputs else [self.X])
+    ]
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    """Returns grad * output / (input + eps * sign), sign being +1 or -1."""
+    output = op.outputs[0]
+    input = op.inputs[0]
+    return grad * output / (
+        input + cls.eps *
+        tf.where(input >= 0, tf.ones_like(input), -1 * tf.ones_like(input)))
+
+
+"""
+DeepLIFT
+This reformulation only considers the "Rescale" rule
+https://arxiv.org/abs/1704.02685
+"""
+
+
+class DeepLIFTRescale(GradientBasedMethod):
+  """DeepLIFT restricted to the Rescale rule."""
+
+  # Maps op name -> value of the op's first input when the baseline is fed.
+  # Shared at class level so the classmethod override can read it.
+  _deeplift_ref = {}
+
+  def __init__(self, T, X, session, keras_learning_phase, baseline=None):
+    """Stores the baseline, then runs the base initialisation."""
+    self.baseline = baseline
+    super(DeepLIFTRescale, self).__init__(T, X, session, keras_learning_phase)
+
+  def get_symbolic_attribution(self):
+    """Returns gradient * (input - baseline) for every input tensor."""
+    gradients = tf.gradients(self.T, self.X)
+    if self.has_multiple_inputs:
+      inputs, baselines = self.X, self.baseline
+    else:
+      inputs, baselines = [self.X], [self.baseline]
+    return [g * (x - b) for g, x, b in zip(gradients, inputs, baselines)]
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    """Rescale rule: grad * delta_out / delta_in where |delta_in| > 1e-5.
+
+    Elsewhere the original gradient of the activation evaluated at the
+    midpoint between reference input and actual input is used.
+    """
+    actual_out = op.outputs[0]
+    actual_in = op.inputs[0]
+    reference_in = cls._deeplift_ref[op.name]
+    reference_out = activation(op.type)(reference_in)
+    delta_out = actual_out - reference_out
+    delta_in = actual_in - reference_in
+    midpoint_act = activation(op.type)(0.5 * (reference_in + actual_in))
+    use_rescale = tf.abs(delta_in) > 1e-5
+    return tf.where(use_rescale, grad * delta_out / delta_in,
+                    original_grad(midpoint_act.op, grad))
+
+  def _init_references(self):
+    """Evaluates the inputs of supported activation ops at the baseline."""
+    # print ('DeepLIFT: computing references...')
+    sys.stdout.flush()
+    self._deeplift_ref.clear()
+    graph = tf.get_default_graph()
+    activation_ops = [
+        op for op in graph.get_operations()
+        if len(op.inputs) > 0 and not op.name.startswith('gradients') and
+        op.type in SUPPORTED_ACTIVATIONS
+    ]
+    reference_values = self._session_run([o.inputs[0] for o in activation_ops],
+                                         self.baseline)
+    for value, act_op in zip(reference_values, activation_ops):
+      self._deeplift_ref[act_op.name] = value
+    # print('DeepLIFT: references ready')
+    sys.stdout.flush()
+
+
+"""
+Occlusion method
+Generalization of the grey-box method presented in https://arxiv.org/pdf/1311.2901.pdf
+This method performs a systematic perturbation of contiguous hyperpatches in the input,
+replacing each patch with a user-defined value (by default 0).
+window_shape : integer or tuple of length xs_ndim
+Defines the shape of the elementary n-dimensional orthotope the rolling window view.
+If an integer is given, the shape will be a hypercube of sidelength given by its value.
+step : integer or tuple of length xs_ndim
+Indicates step size at which extraction shall be performed.
+If integer is given, then the step is uniform in all dimensions.
+"""
+
+
+class Occlusion(PerturbationBasedMethod):
+  """Attribution by replacing input patches and measuring the output change."""
+
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               window_shape=None,
+               step=None):
+    """Validates and stores the window shape and the step.
+
+    :param window_shape: optional sequence with one entry per dimension of
+      ``X[0]``; defaults to a window of ones
+    :param step: optional int, or sequence with one entry per dimension of
+      ``X[0]``; defaults to 1
+    :raises RuntimeError: when X holds multiple inputs
+    """
+    super(Occlusion, self).__init__(T, X, session, keras_learning_phase)
+    if self.has_multiple_inputs:
+      raise RuntimeError(
+          'Multiple inputs not yet supported for perturbation methods')
+
+    input_shape = X[0].get_shape().as_list()
+    if window_shape is not None:
+      assert len(window_shape) == len(input_shape), \
+          'window_shape must have length of input (%d)' % len(input_shape)
+      self.window_shape = tuple(window_shape)
+    else:
+      self.window_shape = (1,) * len(input_shape)
+
+    if step is not None:
+      assert isinstance(step, int) or len(step) == len(input_shape), \
+          'step must be integer or tuple with the length of input (%d)' % len(input_shape)
+      self.step = step
+    else:
+      self.step = 1
+    self.replace_value = 0.0
+    logging.info('Input shape: %s; window_shape %s; step %s' %
+                 (input_shape, self.window_shape, self.step))
+
+  def run(self, xs, ys=None, batch_size=None):
+    """Computes the occlusion attribution for xs.
+
+    The batch_size argument is only used for the compatibility check; the
+    evaluation itself uses ``xs.shape[0]``.
+
+    :param xs: numpy array of inputs
+    :param ys: optional weights for the target tensor
+    :return: array with the shape of xs
+    """
+    self._check_input_compatibility(xs, ys, batch_size)
+    input_shape = xs.shape[1:]
+    batch_size = xs.shape[0]
+    # np.asscalar is no longer available in current NumPy; .item() is the
+    # equivalent conversion to a Python scalar.
+    total_dim = np.prod(input_shape).item()
+
+    # Create mask
+    index_matrix = np.arange(total_dim).reshape(input_shape)
+    idx_patches = view_as_windows(index_matrix, self.window_shape,
+                                  self.step).reshape((-1,) + self.window_shape)
+    heatmap = np.zeros_like(xs, dtype=np.float32).reshape((-1), total_dim)
+    w = np.zeros_like(heatmap)
+
+    # Compute original output
+    eval0 = self._session_run(self.T, xs, ys, batch_size)
+
+    # Start perturbation loop
+    for i, p in enumerate(idx_patches):
+      mask = np.ones(input_shape).flatten()
+      mask[p.flatten()] = self.replace_value
+      masked_xs = mask.reshape((1,) + input_shape) * xs
+      delta = eval0 - self._session_run(self.T, masked_xs, ys, batch_size)
+      delta_aggregated = np.sum(
+          delta.reshape((batch_size, -1)), -1, keepdims=True)
+      heatmap[:, p.flatten()] += delta_aggregated
+      w[:, p.flatten()] += p.size
+
+    # Positions never covered by a window have w == 0 and become nan here.
+    attribution = np.reshape(heatmap / w, xs.shape)
+    if np.isnan(attribution).any():
+      warnings.warn(
+          'Attributions generated by Occlusion method contain nans, '
+          'probably because window_shape and step do not allow to cover the all input.'
+      )
+    return attribution
+
+
+"""
+Shapley Value sampling
+Computes approximate Shapley Values using "Polynomial calculation of the Shapley value based on sampling",
+Castro et al, 2009 (https://www.sciencedirect.com/science/article/pii/S0305054808000804)
+samples : integer (default 5)
+Defined the number of samples for each input feature.
+Notice that evaluating a model samples * n_input_feature times might take a while.
+sampling_dims : list of dimension indexes to run sampling on (feature dimensions).
+By default, all dimensions except the batch dimension will be sampled.
+For example, with a 4-D tensor that contains color images, single color channels are sampled.
+To sample pixels, instead, use sampling_dims=[1,2]
+"""
+
+
+class ShapleySampling(PerturbationBasedMethod):
+  """Approximate Shapley values obtained from random feature permutations."""
+
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               samples=5,
+               sampling_dims=None):
+    """Validates and stores the sampling configuration.
+
+    :param samples: number of random permutations evaluated by run
+    :param sampling_dims: optional list of non-batch dimension indexes to
+      sample; defaults to every dimension except the batch dimension
+    :raises RuntimeError: for multiple inputs or invalid sampling_dims
+    """
+    super(ShapleySampling, self).__init__(T, X, session, keras_learning_phase)
+    if self.has_multiple_inputs:
+      raise RuntimeError(
+          'Multiple inputs not yet supported for perturbation methods')
+    dims = len(X.shape)
+    if sampling_dims is not None:
+      if not 0 < len(sampling_dims) <= (dims - 1):
+        raise RuntimeError(
+            'sampling_dims must be a list containing 1 to %d elements' %
+            (dims - 1))
+      if 0 in sampling_dims:
+        raise RuntimeError(
+            'Cannot sample batch dimension: remove 0 from sampling_dims')
+      if any([x < 1 or x > dims - 1 for x in sampling_dims]):
+        raise RuntimeError('Invalid value in sampling_dims')
+    else:
+      sampling_dims = list(range(1, dims))
+
+    self.samples = samples
+    self.sampling_dims = sampling_dims
+
+  def run(self, xs, ys=None, batch_size=None):
+    """Estimates the Shapley values of the sampled features of xs.
+
+    The batch_size argument is overwritten with ``xs.shape[0]``.
+
+    :param xs: numpy array of inputs
+    :param ys: optional weights for the target tensor
+    :return: array of shape [batch] + sizes of the sampled dimensions
+    """
+    xs_shape = list(xs.shape)
+    batch_size = xs.shape[0]
+    # np.asscalar is no longer available in current NumPy; int() performs the
+    # conversion on its own.
+    n_features = int(np.prod([xs.shape[i] for i in self.sampling_dims]))
+    result = np.zeros((xs_shape[0], n_features))
+
+    run_shape = list(xs_shape)  # a copy
+    run_shape = np.delete(run_shape, self.sampling_dims).tolist()
+    run_shape.insert(1, -1)
+
+    reconstruction_shape = [xs_shape[0]]
+    for j in self.sampling_dims:
+      reconstruction_shape.append(xs_shape[j])
+
+    for r in range(self.samples):
+      p = np.random.permutation(n_features)
+      x = xs.copy().reshape(run_shape)
+      y = None
+      for i in p:
+        if y is None:
+          y = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+        # Zero out feature i and attribute the change in output to it.
+        x[:, i] = 0
+        y0 = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+        delta = y - y0
+        delta_aggregated = np.sum(
+            delta.reshape((batch_size, -1)), -1, keepdims=False)
+        result[:, i] += delta_aggregated
+        y = y0
+
+    shapley = result / self.samples
+    return shapley.reshape(reconstruction_shape)
+
+
+# -----------------------------------------------------------------------------
+# END ATTRIBUTION METHODS
+# -----------------------------------------------------------------------------
+
+# Maps a method name to (method class, numeric flag). Built from a list of
+# pairs so that the insertion order does not depend on the ordering of a dict
+# literal, which is not guaranteed on older Python versions.
+attribution_methods = OrderedDict([
+    ('zero', (DummyZero, 0)),
+    ('saliency', (Saliency, 1)),
+    ('grad*input', (GradientXInput, 2)),
+    ('intgrad', (IntegratedGradients, 3)),
+    ('elrp', (EpsilonLRP, 4)),
+    ('deeplift', (DeepLIFTRescale, 5)),
+    ('occlusion', (Occlusion, 6)),
+    ('shapley_sampling', (ShapleySampling, 7)),
+])
+
+
+@ops.RegisterGradient('DeepExplainGrad')
+def deepexplain_grad(op, grad):
+  """Gradient registered as 'DeepExplainGrad'.
+
+  Marks that the override was invoked, then uses the enabled method class's
+  nonlinearity override when that class is gradient-based and the original
+  Tensorflow gradient otherwise.
+  """
+  global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+  _GRAD_OVERRIDE_CHECKFLAG = 1
+  method_class = _ENABLED_METHOD_CLASS
+  if method_class is None or not issubclass(method_class,
+                                            GradientBasedMethod):
+    return original_grad(op, grad)
+  return method_class.nonlinearity_grad_override(op, grad)
+
+
+class DeepExplain(object):
+  """Context manager that overrides activation gradients for attribution.
+
+  Inside the context the graph is the default graph and the op types returned
+  by get_override_map have their gradient mapped to 'DeepExplainGrad'.
+  """
+
+  def __init__(self, graph=None, session=None):
+    """Prepares the graph and gradient-override contexts.
+
+    :param graph: graph to use; defaults to ``session.graph``
+    :param session: session to use; defaults to the default session at the
+      time of the call
+    :raises RuntimeError: when no session is given and there is no default
+    """
+    # Resolve the default session at call time. A default argument of
+    # tf.get_default_session() would be evaluated only once, at import.
+    if session is None:
+      session = tf.get_default_session()
+    # Check before touching session.graph so that a missing session yields the
+    # intended RuntimeError rather than an AttributeError.
+    if session is None:
+      raise RuntimeError(
+          'DeepExplain: could not retrieve a session. Use DeepExplain(session=your_session).'
+      )
+    self.method = None
+    self.batch_size = None
+    self.session = session
+    self.graph = session.graph if graph is None else graph
+    self.graph_context = self.graph.as_default()
+    self.override_context = self.graph.gradient_override_map(
+        self.get_override_map())
+    self.keras_phase_placeholder = None
+    self.context_on = False
+
+  def __enter__(self):
+    """Enters the graph context and then the gradient-override context."""
+    # Override gradient of all ops created in context
+    self.graph_context.__enter__()
+    self.override_context.__enter__()
+    self.context_on = True
+    return self
+
+  def __exit__(self, type, value, traceback):
+    """Leaves both contexts in the reverse order of __enter__."""
+    self.override_context.__exit__(type, value, traceback)
+    self.graph_context.__exit__(type, value, traceback)
+    self.context_on = False
+
+  def get_explainer(self, method, T, X, **kwargs):
+    """Builds the attribution method object for target T and input(s) X.
+
+    :param method: key of attribution_methods
+    :param T: target Tensor
+    :param X: input Tensor or list of input Tensors
+    :param kwargs: forwarded to the method class constructor
+    :return: the constructed attribution method instance
+    :raises RuntimeError: outside the context, for an unknown method, or when
+      T / X are not Tensor objects
+    """
+    if not self.context_on:
+      raise RuntimeError(
+          'Explain can be called only within a DeepExplain context.')
+    global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+    self.method = method
+    if self.method in attribution_methods:
+      method_class, method_flag = attribution_methods[self.method]
+    else:
+      raise RuntimeError('Method must be in %s' %
+                         list(attribution_methods.keys()))
+    if isinstance(X, list):
+      for x in X:
+        if 'tensor' not in str(type(x)).lower():
+          raise RuntimeError(
+              'If a list, X must contain only Tensorflow Tensor objects')
+    else:
+      if 'tensor' not in str(type(X)).lower():
+        raise RuntimeError(
+            'X must be a Tensorflow Tensor object or a list of them')
+
+    if 'tensor' not in str(type(T)).lower():
+      raise RuntimeError('T must be a Tensorflow Tensor object')
+
+    logging.info('DeepExplain: running "%s" explanation method (%d)' %
+                 (self.method, method_flag))
+    self._check_ops()
+    _GRAD_OVERRIDE_CHECKFLAG = 0
+
+    _ENABLED_METHOD_CLASS = method_class
+    try:
+      method = method_class(
+          T,
+          X,
+          self.session,
+          keras_learning_phase=self.keras_phase_placeholder,
+          **kwargs)
+
+      if issubclass(method_class,
+                    GradientBasedMethod) and _GRAD_OVERRIDE_CHECKFLAG == 0:
+        warnings.warn(
+            'DeepExplain detected you are trying to use an attribution method that requires '
+            'gradient override but the original gradient was used instead. You might have forgot to '
+            '(re)create your graph within the DeepExlain context. Results are not reliable!'
+        )
+    finally:
+      # Always reset the module-level override state, even when construction
+      # fails, so that later gradient computations are not affected.
+      _ENABLED_METHOD_CLASS = None
+      _GRAD_OVERRIDE_CHECKFLAG = 0
+      self.keras_phase_placeholder = None
+    return method
+
+  def explain(self, method, T, X, xs, ys=None, batch_size=None, **kwargs):
+    """Builds the explainer for method and runs it on xs."""
+    explainer = self.get_explainer(method, T, X, **kwargs)
+    return explainer.run(xs, ys, batch_size)
+
+  @staticmethod
+  def get_override_map():
+    """Maps every supported activation op type to 'DeepExplainGrad'."""
+    return dict((a, 'DeepExplainGrad') for a in SUPPORTED_ACTIVATIONS)
+
+  def _check_ops(self):
+    """Heuristically check if any op is in the list of unsupported activation functions.
+
+    This does not cover all cases where explanation methods would fail, and must be improved in the future.
+    Also, check if the placeholder named 'keras_learning_phase' exists in the graph. This is used by Keras
+     and needs to be passed in feed_dict.
+    :return:
+    """
+    g = tf.get_default_graph()
+    for op in g.get_operations():
+      if len(op.inputs) > 0 and not op.name.startswith('gradients'):
+        if op.type in UNSUPPORTED_ACTIVATIONS:
+          warnings.warn('Detected unsupported activation (%s). '
+                        'This might lead to unexpected or wrong results.' %
+                        op.type)
+      elif 'keras_learning_phase' in op.name:
+        self.keras_phase_placeholder = op.outputs[0]
diff --git a/easy_rec/python/tools/explainer/utils.py b/easy_rec/python/tools/explainer/utils.py
new file mode 100644
index 000000000..574d067a8
--- /dev/null
+++ b/easy_rec/python/tools/explainer/utils.py
@@ -0,0 +1,70 @@
+import numpy as np
+import tensorflow as tf
+
+# Some of the following functions for batch processing have been borrowed and adapted from Keras
+# https://github.com/keras-team/keras/blob/master/keras/utils/generic_utils.py
+# https://github.com/keras-team/keras/blob/master/keras/engine/training_utils.py
+
+
+def make_batches(size, batch_size):
+  """Splits the index range [0, size) into consecutive batches.
+
+  # Arguments
+      size: Integer, total number of items.
+      batch_size: Integer, maximum number of items per batch.
+  # Returns
+      A list of (start, stop) tuples; the last batch may be shorter.
+  """
+  num_batches = (size + batch_size - 1) // batch_size  # ceiling division
+  batches = []
+  for index in range(num_batches):
+    start = index * batch_size
+    stop = min(size, start + batch_size)
+    batches.append((start, stop))
+  return batches
+
+
+def to_list(x, allow_tuple=False):
+  """Returns x as a list.
+
+  # Arguments
+      x: object to normalize.
+      allow_tuple: when true, a tuple is converted element-wise into a list;
+          otherwise a tuple is wrapped like any other non-list object.
+  # Returns
+      x itself when it already is a list, else a new list.
+  """
+  if isinstance(x, list):
+    return x
+  convert_tuple = allow_tuple and isinstance(x, tuple)
+  return list(x) if convert_tuple else [x]
+
+
+def unpack_singleton(x):
+  """Converts a one-element iterable into a np-array, else returns it as is.
+
+  # Argument
+      x: A list or tuple.
+  # Returns
+      ``np.array(x)`` when x has exactly one element, otherwise x unchanged.
+  """
+  return np.array(x) if len(x) == 1 else x
+
+
+def slice_arrays(arrays, start=None, stop=None):
+  """Applies [start:stop] to an array or to every array of a list.
+
+  None yields [None]; None entries inside a list are kept as None.
+  """
+  if arrays is None:
+    return [None]
+  if not isinstance(arrays, list):
+    return arrays[start:stop]
+  sliced = []
+  for item in arrays:
+    sliced.append(item if item is None else item[start:stop])
+  return sliced
+
+
+def placeholder_from_data(numpy_array):
+  """Creates a 'float' placeholder shaped like the array.
+
+  The first dimension of the placeholder is None. Returns None when the
+  argument is None.
+  """
+  if numpy_array is None:
+    return None
+  trailing_dims = list(numpy_array.shape[1:])
+  return tf.placeholder('float', [None] + trailing_dims)
diff --git a/easy_rec/python/tools/feature_selection.py b/easy_rec/python/tools/feature_selection.py
index 05b193897..bd31fef9b 100644
--- a/easy_rec/python/tools/feature_selection.py
+++ b/easy_rec/python/tools/feature_selection.py
@@ -10,6 +10,7 @@
 import tensorflow as tf
 from tensorflow.python.framework.meta_graph import read_meta_graph_file
 
+from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
 from easy_rec.python.utils import config_util
 
 if tf.__version__ >= '2.0':
@@ -19,8 +20,9 @@
 matplotlib.use('Agg')  # NOQA
 import matplotlib.pyplot as plt  # NOQA
 
-tf.app.flags.DEFINE_string('model_type', 'variational_dropout',
-                           'feature selection model type')
+tf.app.flags.DEFINE_enum('model_type', 'variational_dropout',
+                         ['variational_dropout', 'fscd'],
+                         'feature selection model type')
 tf.app.flags.DEFINE_string('config_path', '',
                            'feature selection model config path')
 tf.app.flags.DEFINE_string('checkpoint_path', None,
@@ -294,6 +296,159 @@ def _visualize_feature_importance(self, feature_importance, group_name):
       plt.savefig(f, format='png')
 
 
+class FSCD(object):
+  """Feature selection based on the feature importance of the FSCD layer."""
+
+  def __init__(self,
+               config_path,
+               output_dir,
+               topk,
+               checkpoint_path=None,
+               fg_path=None,
+               visualize=False):
+    self._config_path = config_path
+    self._output_dir = output_dir
+    self._topk = topk
+    if not tf.gfile.Exists(self._output_dir):
+      tf.gfile.MakeDirs(self._output_dir)
+    self._checkpoint_path = checkpoint_path
+    self._fg_path = fg_path
+    self._visualize = visualize
+
+  def process(self):
+    """Rank the features of every group, then write the pruned configs."""
+    tf.logging.info('Loading delta of FSCD layer ...')
+    config = config_util.get_configs_from_pipeline_file(self._config_path)
+    assert config.model_config.HasField(
+        'variational_dropout'), 'variational_dropout must be in model_config'
+
+    feature_importance_map = {}
+    white_feature_group = set()
+    from easy_rec.python.layers.fscd_layer import get_feature_importance
+    for feature_group in config.model_config.feature_groups:
+      group_name = feature_group.group_name
+      tf.logging.info('Calculating %s feature importance ...' % group_name)
+      feature_importance = get_feature_importance(config, group_name)
+      if len(feature_importance) == 0:
+        tf.logging.info('No feature importance in group %s' % group_name)
+        white_feature_group.add(group_name)
+        continue
+      feature_importance_map[group_name] = feature_importance
+      tf.logging.info('Dump %s  feature importance to csv ...' % group_name)
+      self._dump_to_csv(feature_importance, group_name)
+
+      if self._visualize:
+        tf.logging.info('Visualizing %s feature importance ...' % group_name)
+        self._visualize_feature_importance(feature_importance, group_name)
+
+    tf.logging.info('Processing model config ...')
+    self._process_config(feature_importance_map, white_feature_group)
+
+  def _dump_to_csv(self, feature_importance, group_name):
+    """Dump feature importance data to a csv file."""
+    with tf.gfile.Open(
+        os.path.join(self._output_dir,
+                     'feature_importance_%s.csv' % group_name), 'w') as f:
+      df = pd.DataFrame(
+          columns=['feature_name', 'importance'],
+          data=[list(kv) for kv in feature_importance.items()])
+      df.to_csv(f, encoding='gbk')
+
+  def _visualize_feature_importance(self, feature_importance, group_name):
+    """Draw feature importance histogram and save it as a png file."""
+    df = pd.DataFrame(
+        columns=['feature_name', 'importance'],
+        data=[list(kv) for kv in feature_importance.items()])
+    df.sort_values('importance', inplace=True, ascending=False)
+    df.reset_index(inplace=True)
+    # Draw plot
+    plt.figure(figsize=(90, 200), dpi=100)
+    plt.hlines(y=df.index, xmin=0, xmax=df.importance)
+    for x, y, tex in zip(df.importance, df.index, df.importance):
+      plt.text(
+          x, y, round(tex, 2),
+          horizontalalignment='right' if x < 0 else 'left',
+          verticalalignment='center',
+          fontdict={
+              'color': 'red' if x < 0 else 'green',
+              'size': 14
+          })
+    # Decorations
+    plt.yticks(df.index, df.feature_name, fontsize=20)
+    plt.title('Feature Importance', fontdict={'size': 30})
+    plt.grid(linestyle='--', alpha=0.5)
+    plt.xlim(0, 1)
+    with tf.gfile.GFile(
+        os.path.join(self._output_dir,
+                     'feature_importance_pic_%s.png' % group_name), 'wb') as f:
+      plt.savefig(f, format='png')
+    # release the figure, otherwise one huge canvas per group stays alive
+    plt.close()
+
+  def _process_config(self, feature_importance_map, white_feature_group):
+    """Process model config and fg config with feature selection."""
+    # Everything after the first topk entries of a group is excluded; this
+    # presumes each importance mapping is ordered by rank - TODO confirm.
+    excluded_features = set()
+    for feature_importance in feature_importance_map.values():
+      for i, (feature_name, _) in enumerate(feature_importance.items()):
+        if i >= self._topk:
+          excluded_features.add(feature_name)
+
+    config = config_util.get_configs_from_pipeline_file(self._config_path)
+    # keep sequence features and side-infos
+    sequence_features = set()
+    for feature_group in config.model_config.feature_groups:
+      for sequence_feature in feature_group.sequence_features:
+        for seq_att_map in sequence_feature.seq_att_map:
+          for key in seq_att_map.key:
+            sequence_features.add(key)
+          for hist_seq in seq_att_map.hist_seq:
+            sequence_features.add(hist_seq)
+    # compat with din
+    for sequence_feature in config.model_config.seq_att_groups:
+      for seq_att_map in sequence_feature.seq_att_map:
+        for key in seq_att_map.key:
+          sequence_features.add(key)
+        for hist_seq in seq_att_map.hist_seq:
+          sequence_features.add(hist_seq)
+    # features of the groups without importance scores are kept as well
+    for feature_group in config.model_config.feature_groups:
+      if feature_group.group_name in white_feature_group:
+        sequence_features.update(feature_group.feature_names)
+
+    excluded_features = excluded_features - sequence_features
+
+    # excluded features are turned into constants instead of being removed
+    for feature_config in config_util.get_compatible_feature_configs(config):
+      feature_name = feature_config.input_names[0]
+      if feature_config.HasField('feature_name'):
+        feature_name = feature_config.feature_name
+      if feature_name in excluded_features:
+        feature_config.feature_type = FeatureConfig.FeatureType.ConstFeature
+
+    config.model_config.ClearField('variational_dropout')
+    config_util.save_message(
+        config,
+        os.path.join(self._output_dir, os.path.basename(self._config_path)))
+
+    if self._fg_path is not None and len(self._fg_path) > 0:
+      with tf.gfile.Open(self._fg_path) as f:
+        fg_json = json.load(f, object_pairs_hook=OrderedDict)
+        features = []
+        for feature in fg_json['features']:
+          # entries without a feature_name are always kept
+          if ('feature_name' in feature and
+              feature['feature_name'] in excluded_features):
+            continue
+          features.append(feature)
+        fg_json['features'] = features
+
+      fg_file = os.path.join(self._output_dir, os.path.basename(self._fg_path))
+      with tf.gfile.Open(fg_file, 'w') as f:
+        json.dump(fg_json, f, indent=4)
+
+
 if __name__ == '__main__':
   if FLAGS.model_type == 'variational_dropout':
     fs = VariationalDropoutFS(
@@ -304,6 +459,15 @@ def _visualize_feature_importance(self, feature_importance, group_name):
         fg_path=FLAGS.fg_path,
         visualize=FLAGS.visualize)
     fs.process()
+  elif FLAGS.model_type == 'fscd':
+    fs = FSCD(
+        FLAGS.config_path,
+        FLAGS.output_dir,
+        FLAGS.topk,
+        checkpoint_path=FLAGS.checkpoint_path,
+        fg_path=FLAGS.fg_path,
+        visualize=FLAGS.visualize)
+    fs.process()
   else:
     raise ValueError('Unknown feature selection model type %s' %
                      FLAGS.model_type)
diff --git a/easy_rec/python/tools/view_saved_model.py b/easy_rec/python/tools/view_saved_model.py
new file mode 100644
index 000000000..022bcf1aa
--- /dev/null
+++ b/easy_rec/python/tools/view_saved_model.py
@@ -0,0 +1,39 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import argparse
+import logging
+
+from google.protobuf import text_format
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.platform.gfile import GFile
+
+logging.basicConfig(
+    format='[%(levelname)s] %(asctime)s %(filename)s:%(lineno)d : %(message)s',
+    level=logging.INFO)
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--input', type=str, required=True, help='saved model path')
+  parser.add_argument(
+      '--output', type=str, required=True, help='saved model save path')
+  args = parser.parse_args()
+
+  logging.info('saved_model_path: %s' % args.input)
+
+  # --input/--output are guaranteed by argparse (required=True) at this point;
+  # a '.pb' input is a binary proto, anything else is parsed as text format
+  saved_model = saved_model_pb2.SavedModel()
+  if args.input.endswith('.pb'):
+    with GFile(args.input, 'rb') as fin:
+      saved_model.ParseFromString(fin.read())
+  else:
+    with GFile(args.input, 'r') as fin:
+      text_format.Merge(fin.read(), saved_model)
+
+  if args.output.endswith('.pbtxt'):
+    with GFile(args.output, 'w') as fout:
+      fout.write(text_format.MessageToString(saved_model, as_utf8=True))
+  else:
+    with GFile(args.output, 'wb') as fout:
+      fout.write(saved_model.SerializeToString())
diff --git a/easy_rec/python/train_eval.py b/easy_rec/python/train_eval.py
index bdb65eb0a..f12784ac1 100644
--- a/easy_rec/python/train_eval.py
+++ b/easy_rec/python/train_eval.py
@@ -95,8 +95,12 @@
       help='is use check mode')
   parser.add_argument(
       '--selected_cols', type=str, default=None, help='select input columns')
+  parser.add_argument('--gpu', type=str, default=None, help='gpu id')
   args, extra_args = parser.parse_known_args()
 
+  if args.gpu is not None:
+    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+
   edit_config_json = {}
   if args.edit_config_json:
     edit_config_json = json.loads(args.edit_config_json)
diff --git a/easy_rec/python/utils/__init__.py b/easy_rec/python/utils/__init__.py
index e69de29bb..09dc89476 100644
--- a/easy_rec/python/utils/__init__.py
+++ b/easy_rec/python/utils/__init__.py
@@ -0,0 +1,15 @@
+class conditional(object):
+  """Context manager that delegates to another one only when enabled."""
+
+  def __init__(self, condition, contextmanager):
+    self.contextmanager = contextmanager
+    self.condition = condition
+
+  def __enter__(self):
+    """Enter the wrapped manager if enabled; otherwise yield None."""
+    if not self.condition:
+      return None
+    return self.contextmanager.__enter__()
+
+  def __exit__(self, *args):
+    return self.contextmanager.__exit__(*args) if self.condition else None
diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py
index b63a02f71..e35175be9 100644
--- a/easy_rec/python/utils/config_util.py
+++ b/easy_rec/python/utils/config_util.py
@@ -5,6 +5,7 @@
 Such as Hyper parameter tuning or automatic feature expanding.
 """
 
+import argparse
 import datetime
 import json
 import logging
@@ -605,3 +606,144 @@ def process_multi_file_input_path(sampler_config_input_path):
     input_path = sampler_config_input_path
 
   return input_path
+
+
+def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
+  """Set the embedding dimension of every feature in the given groups.
+
+  Args:
+    pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
+      proto.
+    groups: comma separated names of the feature groups to be changed
+    emb_dim: target embedding dimension
+
+  Returns:
+    The pipeline config loaded from `pipeline_config_path`, in which
+      `embedding_dim` of each feature config whose name is listed in one
+      of the target groups has been set to `emb_dim`.
+  """
+  pipeline_config = get_configs_from_pipeline_file(pipeline_config_path, False)
+
+  # collect the names of all features that belong to a requested group
+  wanted_groups = set(groups.split(','))
+  selected = set()
+  for group in pipeline_config.model_config.feature_groups:
+    if group.group_name in wanted_groups:
+      selected.update(group.feature_names)
+
+  for fea_conf in get_compatible_feature_configs(pipeline_config):
+    # a feature is identified by its first input name, unless an explicit
+    # feature_name is configured, which then takes precedence
+    fea_name = fea_conf.input_names[0]
+    if fea_conf.HasField('feature_name'):
+      fea_name = fea_conf.feature_name
+    if fea_name not in selected:
+      continue
+    fea_conf.embedding_dim = emb_dim
+
+  return pipeline_config
+
+
+def remove_redundant_config(pipeline_config_path, remove_input=False):
+  """Drop feature configs (and optionally input fields) that are never used.
+
+  Args:
+    pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
+      proto.
+    remove_input: whether to also remove the input fields that are neither
+      read by a kept feature config nor listed as a label field
+
+  Returns:
+    The pipeline config loaded from `pipeline_config_path`, without the
+      feature configs that are referenced by no feature group.
+  """
+  pipeline_config = get_configs_from_pipeline_file(pipeline_config_path, False)
+
+  # every feature name that is referenced by at least one feature group
+  used_features = set()
+  for group in pipeline_config.model_config.feature_groups:
+    used_features.update(group.feature_names)
+
+  used_inputs = set()
+  feature_configs = get_compatible_feature_configs(pipeline_config)
+  pos = 0
+  # walk by position, an entry is deleted in place without advancing
+  while pos < len(feature_configs):
+    fea_conf = feature_configs[pos]
+    fea_name = fea_conf.input_names[0]
+    if fea_conf.HasField('feature_name'):
+      fea_name = fea_conf.feature_name
+    if fea_name not in used_features:
+      logging.info("redundant feature:" + fea_name)
+      del feature_configs[pos]
+      continue
+    if remove_input:
+      used_inputs.update(fea_conf.input_names)
+    pos += 1
+
+  if not remove_input:
+    return pipeline_config
+  used_inputs.update(pipeline_config.data_config.label_fields)
+  input_fields = pipeline_config.data_config.input_fields
+  pos = 0
+  while pos < len(input_fields):
+    if input_fields[pos].input_name in used_inputs:
+      pos += 1
+    else:
+      del input_fields[pos]
+  return pipeline_config
+
+
+if __name__ == '__main__':
+  # small command line tool to format or rewrite a pipeline config file
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--cmd',
+      type=str,
+      choices=['format', 'set_emb_dim', 'rm_redundancy'],
+      required=True,
+      help='The operation to apply to the pipeline config.')
+  parser.add_argument(
+      '-c', '--pipeline_config_path',
+      type=str,
+      required=True,
+      help='Path to pipeline config file.')
+  parser.add_argument(
+      '-g', '--feature_groups',
+      type=str,
+      default=None,
+      help='The name of feature group to be changed.')
+  parser.add_argument(
+      '--rm_input',
+      # type=bool would turn every non-empty string, even 'false', into True
+      type=lambda value: value.lower() in ('1', 'true', 'yes', 'y'),
+      default=False,
+      help='Whether to remove redundancy input.')
+  parser.add_argument(
+      '-d', '--embedding_dim',
+      type=int,
+      default=None,
+      help='The embedding dim to be changed to.')
+  parser.add_argument(
+      '-o', '--save_config_path',
+      type=str,
+      required=True,
+      help='Path to save changed config.')
+
+  args, extra_args = parser.parse_known_args()
+  if args.cmd == 'format':
+    config = get_configs_from_pipeline_file(args.pipeline_config_path)
+    save_message(config, args.save_config_path)
+  elif args.cmd == 'set_emb_dim':
+    if args.feature_groups is None:
+      raise ValueError('--feature_groups must be set')
+    if args.embedding_dim is None:
+      raise ValueError('--embedding_dim must be set')
+    config = change_configured_embedding_dim(args.pipeline_config_path,
+                                             args.feature_groups,
+                                             args.embedding_dim)
+    save_message(config, args.save_config_path)
+  elif args.cmd == 'rm_redundancy':
+    # forward --rm_input, which used to be parsed but silently dropped
+    config = remove_redundant_config(args.pipeline_config_path, args.rm_input)
+    save_message(config, args.save_config_path)
diff --git a/easy_rec/python/utils/dag.py b/easy_rec/python/utils/dag.py
new file mode 100644
index 000000000..00646f732
--- /dev/null
+++ b/easy_rec/python/utils/dag.py
@@ -0,0 +1,205 @@
+from collections import OrderedDict
+from collections import defaultdict
+from copy import copy
+from copy import deepcopy
+
+
+class DAG(object):
+  """Directed acyclic graph implementation."""
+
+  def __init__(self):
+    """Construct a new DAG with no nodes or edges."""
+    self.reset_graph()
+
+  def add_node(self, node_name, graph=None):
+    """Add a node if it does not exist yet, or error out."""
+    if graph is None:
+      graph = self.graph
+    if node_name in graph:
+      raise KeyError('node %s already exists' % node_name)
+    graph[node_name] = set()
+
+  def add_node_if_not_exists(self, node_name, graph=None):
+    """Add a node; an already existing node is silently left as it is."""
+    graph = self.graph if graph is None else graph
+    if node_name not in graph:
+      self.add_node(node_name, graph=graph)
+
+  def delete_node(self, node_name, graph=None):
+    """Deletes this node and all edges referencing it."""
+    if graph is None:
+      graph = self.graph
+    if node_name not in graph:
+      raise KeyError('node %s does not exist' % node_name)
+    graph.pop(node_name)
+
+    for edges in graph.values():
+      if node_name in edges:
+        edges.remove(node_name)
+
+  def delete_node_if_exists(self, node_name, graph=None):
+    """Delete a node; a missing node is silently ignored."""
+    graph = self.graph if graph is None else graph
+    if node_name in graph:
+      self.delete_node(node_name, graph=graph)
+
+  def add_edge(self, ind_node, dep_node, graph=None):
+    """Add the edge ind_node -> dep_node unless it makes the graph invalid."""
+    if graph is None:
+      graph = self.graph
+    if ind_node not in graph or dep_node not in graph:
+      raise KeyError('one or more nodes do not exist in graph')
+    test_graph = deepcopy(graph)
+    test_graph[ind_node].add(dep_node)
+    is_valid, message = self.validate(test_graph)
+    if not is_valid:
+      raise Exception('can not add edge %s -> %s: %s' %
+                      (ind_node, dep_node, message))
+    graph[ind_node].add(dep_node)
+
+  def delete_edge(self, ind_node, dep_node, graph=None):
+    """Delete an edge from the graph."""
+    if graph is None:
+      graph = self.graph
+    if dep_node not in graph.get(ind_node, []):
+      raise KeyError('this edge does not exist in graph')
+    graph[ind_node].remove(dep_node)
+
+  def rename_edges(self, old_task_name, new_task_name, graph=None):
+    """Change references to a task in existing edges."""
+    if graph is None:
+      graph = self.graph
+    # Iterate over a snapshot: renaming the node itself inserts and deletes
+    # keys, which is not allowed while iterating over the dict directly.
+    for node, edges in list(graph.items()):
+      if node == old_task_name:
+        graph[new_task_name] = copy(edges)
+        del graph[old_task_name]
+      else:
+        if old_task_name in edges:
+          edges.remove(old_task_name)
+          edges.add(new_task_name)
+
+  def predecessors(self, node, graph=None):
+    """Returns a list of all predecessors of the given node."""
+    if graph is None:
+      graph = self.graph
+    return [key for key in graph if node in graph[key]]
+
+  def downstream(self, node, graph=None):
+    """Returns a list of all nodes this node has edges towards."""
+    if graph is None:
+      graph = self.graph
+    if node not in graph:
+      raise KeyError('node %s is not in graph' % node)
+    return list(graph[node])
+
+  def all_downstreams(self, node, graph=None):
+    """Returns a list of all nodes ultimately downstream of the given node.
+
+    The nodes are listed in topological order; `node` itself is left out.
+    """
+    if graph is None:
+      graph = self.graph
+    nodes = [node]
+    nodes_seen = set()
+    i = 0
+    while i < len(nodes):
+      downstreams = self.downstream(nodes[i], graph)
+      for downstream_node in downstreams:
+        if downstream_node not in nodes_seen:
+          nodes_seen.add(downstream_node)
+          nodes.append(downstream_node)
+      i += 1
+    return list(
+        filter(lambda node: node in nodes_seen,
+               self.topological_sort(graph=graph)))
+
+  def all_leaves(self, graph=None):
+    """Return a list of all leaves (nodes with no downstreams)."""
+    if graph is None:
+      graph = self.graph
+    return [key for key in graph if not graph[key]]
+
+  def from_dict(self, graph_dict):
+    """Reset the graph and build it from the passed dictionary.
+
+    The dictionary takes the form of {node_name: [directed edges]}
+    """
+    self.reset_graph()
+    for new_node in graph_dict.keys():
+      self.add_node(new_node)
+    for ind_node, dep_nodes in graph_dict.items():
+      if not isinstance(dep_nodes, list):
+        raise TypeError('dict values must be lists')
+      for dep_node in dep_nodes:
+        self.add_edge(ind_node, dep_node)
+
+  def reset_graph(self):
+    """Restore the graph to an empty state."""
+    self.graph = OrderedDict()
+
+  def ind_nodes(self, graph=None):
+    """Returns a list of all nodes in the graph with no dependencies."""
+    if graph is None:
+      graph = self.graph
+
+    dependent_nodes = set(
+        node for dependents in graph.values() for node in dependents)
+    return [node for node in graph.keys() if node not in dependent_nodes]
+
+  def validate(self, graph=None):
+    """Returns (Boolean, message) of whether DAG is valid."""
+    graph = graph if graph is not None else self.graph
+    if len(self.ind_nodes(graph)) == 0:
+      return False, 'no independent nodes detected'
+    try:
+      self.topological_sort(graph)
+    except ValueError:
+      return False, 'failed topological sort'
+    return True, 'valid'
+
+  def topological_sort(self, graph=None):
+    """Returns a topological ordering of the DAG.
+
+    Raises an error if this is not possible (graph is not valid).
+    """
+    if graph is None:
+      graph = self.graph
+    result = []
+    in_degree = defaultdict(lambda: 0)
+
+    for u in graph:
+      for v in graph[u]:
+        in_degree[v] += 1
+    ready = [node for node in graph if not in_degree[node]]
+
+    while ready:
+      u = ready.pop()
+      result.append(u)
+      for v in graph[u]:
+        in_degree[v] -= 1
+        if in_degree[v] == 0:
+          ready.append(v)
+
+    if len(result) == len(graph):
+      return result
+    else:
+      raise ValueError('graph is not acyclic')
+
+  def size(self):
+    return len(self.graph)
+
+
+if __name__ == '__main__':
+  # Smoke demo on the graph a -> b -> c, a -> d.
+  dag = DAG()
+  for node_name in ('a', 'b', 'c', 'd'):
+    dag.add_node(node_name)
+  for ind_node, dep_node in (('a', 'b'), ('a', 'd'), ('b', 'c')):
+    dag.add_edge(ind_node, dep_node)
+  # a valid topological ordering of all four nodes
+  print(dag.topological_sort())
+  # the raw adjacency mapping: node -> set of direct downstream nodes
+  print(dag.graph)
+  print(dag.all_downstreams('b'))
diff --git a/easy_rec/python/utils/load_class.py b/easy_rec/python/utils/load_class.py
index 2da1e4e41..9ac749c76 100644
--- a/easy_rec/python/utils/load_class.py
+++ b/easy_rec/python/utils/load_class.py
@@ -220,3 +220,30 @@ def create_class(cls, name):
       return newclass
 
   return RegisterABCMeta
+
+
+def load_keras_layer(name):
+  """Load keras layer class.
+
+  Args:
+    name: keras layer name
+
+  Return:
+    (layer_class, is_customize); layer_class is None if nothing was found
+  """
+  # test for None before calling strip, and keep the documented tuple shape
+  if name is None or name.strip() == '':
+    return None, False
+  name = name.strip()
+  path = 'easy_rec.python.layers.keras.' + name
+  try:
+    cls = pydoc.locate(path)
+    if cls is not None:
+      return cls, True
+    path = 'tensorflow.keras.layers.' + name
+    return pydoc.locate(path), False
+  except pydoc.ErrorDuringImport:
+    print('load keras layer %s failed' % name)
+    logging.error('load keras layer %s failed: %s' %
+                  (name, traceback.format_exc()))
+    return None, False
diff --git a/easy_rec/python/utils/tf_utils.py b/easy_rec/python/utils/tf_utils.py
index 20e19496c..160a2f67a 100644
--- a/easy_rec/python/utils/tf_utils.py
+++ b/easy_rec/python/utils/tf_utils.py
@@ -33,3 +33,51 @@ def get_col_type(tf_type):
   }
   assert tf_type in type_map, 'invalid type: %s' % tf_type
   return type_map[tf_type]
+
+
+def get_config_type(tf_type):
+  """Map a tensorflow dtype to the corresponding DatasetConfig field type."""
+  # an unsupported dtype trips the assertion below
+  config_types = {
+      tf.int32: DatasetConfig.INT32, tf.int64: DatasetConfig.INT64,
+      tf.string: DatasetConfig.STRING, tf.bool: DatasetConfig.BOOL,
+      tf.float32: DatasetConfig.FLOAT, tf.double: DatasetConfig.DOUBLE
+  }
+  is_known = tf_type in config_types
+  assert is_known, 'invalid type: %s' % tf_type
+  return config_types[tf_type]
+
+
+# def add_op(inputs):
+#   if not isinstance(inputs, list):
+#     return inputs
+#   if len(inputs) == 1:
+#     if isinstance(inputs[0], list):
+#       return tf.keras.layers.Add()(inputs[0])
+#     return inputs[0]
+#   return tf.keras.layers.Add()(inputs)
+
+# def dot_op(features):
+#   """Compute inner dot between any two pair tensors.
+#
+#   Args:
+#     features: must be one of
+#     - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+#     - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+#   Return:
+#     - 2D tensor with shape: ``(batch_size, 1)``.
+#   """
+#   if isinstance(features, (list, tuple)):
+#     features = tf.stack(features, axis=1)
+#   assert features.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+#
+#   batch_size = tf.shape(features)[0]
+#   matrixdot = tf.matmul(features, features, transpose_b=True)
+#   feature_dim = matrixdot.shape[-1]
+#
+#   ones_mat = tf.ones_like(matrixdot)
+#   lower_tri_mat = ones_mat - tf.linalg.band_part(ones_mat, 0, -1)
+#   lower_tri_mask = tf.cast(lower_tri_mat, tf.bool)
+#   result = tf.boolean_mask(matrixdot, lower_tri_mask)
+#   output_dim = feature_dim * (feature_dim - 1) // 2
+#   return tf.reshape(result, (batch_size, output_dim))
diff --git a/easy_rec/version.py b/easy_rec/version.py
index f70f1bfba..520cefe3d 100644
--- a/easy_rec/version.py
+++ b/easy_rec/version.py
@@ -1,3 +1,3 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-__version__ = '0.6.3'
+__version__ = '1.0.0'
diff --git a/examples/configs/dcn_backbone_on_movielens.config b/examples/configs/dcn_backbone_on_movielens.config
new file mode 100644
index 000000000..3376db96f
--- /dev/null
+++ b/examples/configs/dcn_backbone_on_movielens.config
@@ -0,0 +1,203 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/dcn_on_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: false
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_name: 'DCN v2'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "deep"
+      inputs {
+        feature_group_name: 'all'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
+      }
+    }
+    blocks {
+      name: "dcn"
+      inputs {
+        feature_group_name: 'all'
+        input_fn: 'lambda x: [x, x]'
+      }
+      recurrent {
+        num_steps: 3
+        fixed_input_index: 0
+        keras_layer {
+          class_name: 'Cross'
+        }
+      }
+    }
+    concat_blocks: ['deep', 'dcn']
+    top_mlp {
+      hidden_units: [64, 32, 16]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
new file mode 100644
index 000000000..06c60f966
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -0,0 +1,635 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_backbone_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    embedding_dim: 16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_name: 'DeepFM'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "deep_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
+  backbone {
+    blocks {
+      name: 'wide_logit'
+      inputs {
+        feature_group_name: 'wide_features'
+      }
+      lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'deep_features'
+      inputs {
+        feature_group_name: 'deep_features'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs {
+        block_name: 'deep_features'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'FM'
+        st_params {
+          fields {
+            key: 'use_variant'
+            value { bool_value: true }
+          }
+        }
+      }
+    }
+    blocks {
+      name: 'deep'
+      inputs {
+        block_name: 'deep_features'
+        input_fn: 'lambda x: x[0]'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
+      }
+    }
+    concat_blocks: ['wide_logit', 'fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+    wide_output_dim: 1
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
new file mode 100644
index 000000000..9d1856cae
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -0,0 +1,751 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_autodis_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    feature_name: "D1"
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    feature_name: "D2"
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    feature_name: "D3"
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    feature_name: "D4"
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    feature_name: "D5"
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    feature_name: "D6"
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    feature_name: "D7"
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    feature_name: "D8"
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    feature_name: "D9"
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    feature_name: "D10"
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    feature_name: "D11"
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    feature_name: "D12"
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    feature_name: "D13"
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+}
+model_config: {
+  model_name: 'DeepFM with AutoDis'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "numerical_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "categorical_features"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "D1"
+    feature_names: "D2"
+    feature_names: "D3"
+    feature_names: "D4"
+    feature_names: "D5"
+    feature_names: "D6"
+    feature_names: "D7"
+    feature_names: "D8"
+    feature_names: "D9"
+    feature_names: "D10"
+    feature_names: "D11"
+    feature_names: "D12"
+    feature_names: "D13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
+  backbone {
+    blocks {
+      name: 'wide_logit'
+      inputs {
+        feature_group_name: 'wide_features'
+      }
+      lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'num_emb'
+      inputs {
+        feature_group_name: 'numerical_features'
+      }
+      keras_layer {
+        class_name: 'AutoDisEmbedding'
+        auto_dis_embedding {
+          embedding_dim: 16
+          num_bins: 20
+          temperature: 0.815
+          output_tensor_list: true
+        }
+      }
+    }
+    blocks {
+      name: 'categorical_features'
+      inputs {
+        feature_group_name: 'categorical_features'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs {
+        block_name: 'categorical_features'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'FM'
+        fm {
+          use_variant: true
+        }
+      }
+    }
+    blocks {
+      name: 'deep'
+      inputs {
+        block_name: 'categorical_features'
+        input_fn: 'lambda x: x[0]'
+      }
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
+      }
+    }
+    # NOTE: omitting 'wide_logit' from concat_blocks may give better performance
+    concat_blocks: ['wide_logit', 'fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+    wide_output_dim: 1
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
new file mode 100644
index 000000000..3ce65c8bf
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
@@ -0,0 +1,749 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_periodic_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    feature_name: "D1"
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    feature_name: "D2"
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    feature_name: "D3"
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    feature_name: "D4"
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    feature_name: "D5"
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    feature_name: "D6"
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    feature_name: "D7"
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    feature_name: "D8"
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    feature_name: "D9"
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    feature_name: "D10"
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    feature_name: "D11"
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    feature_name: "D12"
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    feature_name: "D13"
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+}
+model_config: {
+  model_name: 'DeepFM with Periodic'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "numerical_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "categorical_features"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "D1"
+    feature_names: "D2"
+    feature_names: "D3"
+    feature_names: "D4"
+    feature_names: "D5"
+    feature_names: "D6"
+    feature_names: "D7"
+    feature_names: "D8"
+    feature_names: "D9"
+    feature_names: "D10"
+    feature_names: "D11"
+    feature_names: "D12"
+    feature_names: "D13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
+  backbone {
+    blocks {
+      name: 'wide_logit'
+      inputs {
+        feature_group_name: 'wide_features'
+      }
+      lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'num_emb'
+      inputs {
+        feature_group_name: 'numerical_features'
+      }
+      keras_layer {
+        class_name: 'PeriodicEmbedding'
+        periodic_embedding {
+          embedding_dim: 16
+          sigma: 0.005
+          output_tensor_list: true
+        }
+      }
+    }
+    blocks {
+      name: 'categorical_features'
+      inputs {
+        feature_group_name: 'categorical_features'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs {
+        block_name: 'categorical_features'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'FM'
+        fm {
+          use_variant: true
+        }
+      }
+    }
+    blocks {
+      name: 'deep'
+      inputs {
+        block_name: 'categorical_features'
+        input_fn: 'lambda x: x[0]'
+      }
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
+      }
+    }
+    concat_blocks: ['wide_logit', 'fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+    wide_output_dim: 1
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/deepfm_backbone_on_movielens.config b/examples/configs/deepfm_backbone_on_movielens.config
new file mode 100644
index 000000000..36ef7ace3
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_movielens.config
@@ -0,0 +1,246 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/deepfm_backbone_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [8, 4, 4]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_name: 'DeepFM'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'wide'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: WIDE
+  }
+  feature_groups: {
+    group_name: 'features'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    feature_names: 'title'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'wide_logit'
+      inputs {
+        feature_group_name: 'wide'
+      }
+      lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'features'
+      inputs {
+        feature_group_name: 'features'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs {
+        block_name: 'features'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'FM'
+      }
+    }
+    blocks {
+      name: 'deep'
+      inputs {
+        block_name: 'features'
+        input_fn: 'lambda x: x[0]'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64, 1]
+          use_final_bn: false
+          final_activation: 'linear'
+        }
+      }
+    }
+    blocks {
+      name: 'add'
+      inputs {
+        block_name: 'wide_logit'
+      }
+      inputs {
+        block_name: 'fm'
+      }
+      inputs {
+        block_name: 'deep'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Add'
+      }
+    }
+    concat_blocks: 'add'
+  }
+  rank_model {
+    l2_regularization: 1e-4
+    wide_output_dim: 1
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/deepfm_on_criteo.config b/examples/configs/deepfm_on_criteo.config
index c482cf246..fc8537f0d 100644
--- a/examples/configs/deepfm_on_criteo.config
+++ b/examples/configs/deepfm_on_criteo.config
@@ -241,91 +241,91 @@ data_config {
 feature_config: {
   features: {
     input_names: "F1"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val:0.0
     max_val: 5775.0
   }
   features: {
     input_names: "F2"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: -3.0
     max_val: 257675.0
   }
   features: {
     input_names: "F3"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 65535.0
   }
   features: {
     input_names: "F4"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 969.0
   }
   features: {
     input_names: "F5"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 23159456.0
   }
   features: {
     input_names: "F6"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 431037.0
   }
   features: {
     input_names: "F7"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 56311.0
   }
   features: {
     input_names: "F8"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 6047.0
   }
   features: {
     input_names: "F9"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 29019.0
   }
   features: {
     input_names: "F10"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 46.0
   }
   features: {
     input_names: "F11"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 231.0
   }
   features: {
     input_names: "F12"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 4008.0
   }
   features: {
     input_names: "F13"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 7393.0
diff --git a/examples/configs/deepfm_on_movielens.config b/examples/configs/deepfm_on_movielens.config
index cab092c20..0468ae12f 100644
--- a/examples/configs/deepfm_on_movielens.config
+++ b/examples/configs/deepfm_on_movielens.config
@@ -137,7 +137,7 @@ feature_config: {
     sequence_combiner: {
       text_cnn: {
         filter_sizes: [2, 3, 4]
-        num_filters: [16, 8, 8]
+        num_filters: [8, 4, 4]
       }
     }
   }
diff --git a/examples/configs/dlrm_backbone_on_criteo.config b/examples/configs/dlrm_backbone_on_criteo.config
new file mode 100644
index 000000000..6dc5dd41e
--- /dev/null
+++ b/examples/configs/dlrm_backbone_on_criteo.config
@@ -0,0 +1,578 @@
+# Aligned with the original DLRM model.
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_backbone_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_name: 'DLRM'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'bottom_mlp'
+      inputs {
+        feature_group_name: 'dense'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [64, 32, 16]
+        }
+      }
+    }
+    blocks {
+      name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        block_name: 'bottom_mlp'
+        input_fn: 'lambda x: [x]'
+      }
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
+    }
+    blocks {
+      name: 'sparse_2d'
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    concat_blocks: ['sparse_2d', 'dot']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_on_criteo.config b/examples/configs/dlrm_on_criteo.config
new file mode 100644
index 000000000..e6c45d574
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo.config
@@ -0,0 +1,534 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_criteo_ckpt"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'DLRM'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  dlrm {
+    bot_dnn {
+      hidden_units: [64, 32, 16]
+    }
+    top_dnn {
+      hidden_units: [256, 128, 64]
+    }
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_on_criteo_with_autodis.config b/examples/configs/dlrm_on_criteo_with_autodis.config
new file mode 100644
index 000000000..c6f522f95
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo_with_autodis.config
@@ -0,0 +1,587 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_autodis_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_name: 'DLRM with autodis'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'num_emb'
+      inputs {
+        feature_group_name: 'dense'
+      }
+      keras_layer {
+        class_name: 'AutoDisEmbedding'
+        auto_dis_embedding {
+          embedding_dim: 16
+          num_bins: 40
+          temperature: 0.815
+          output_tensor_list: true
+        }
+      }
+    }
+    blocks {
+      name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
+    }
+    blocks {
+      name: 'sparse_2d'
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    blocks {
+      name: 'num_emb_2d'
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    concat_blocks: ['num_emb_2d', 'dot', 'sparse_2d']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_on_criteo_with_periodic.config b/examples/configs/dlrm_on_criteo_with_periodic.config
new file mode 100644
index 000000000..c42e8252b
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo_with_periodic.config
@@ -0,0 +1,595 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_periodic_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_name: 'dlrm with periodic'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'num_emb'
+      inputs {
+        feature_group_name: 'dense'
+      }
+      keras_layer {
+        class_name: 'PeriodicEmbedding'
+        st_params {
+          fields {
+            key: "output_tensor_list"
+            value { bool_value: true }
+          }
+          fields {
+            key: "embedding_dim"
+            value { number_value: 16 }
+          }
+          fields {
+            key: "sigma"
+            value { number_value: 0.005 }
+          }
+        }
+      }
+    }
+    blocks {
+      name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
+    }
+    blocks {
+      name: 'sparse_2d'
+      inputs {
+        block_name: 'sparse'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    blocks {
+      name: 'num_emb_2d'
+      inputs {
+        block_name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    concat_blocks: ['num_emb_2d', 'dot', 'sparse_2d']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_standard_on_criteo.config b/examples/configs/dlrm_standard_on_criteo.config
new file mode 100644
index 000000000..df82e7990
--- /dev/null
+++ b/examples/configs/dlrm_standard_on_criteo.config
@@ -0,0 +1,569 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_standard_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_name: 'Standard DLRM'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'bottom_mlp'
+      inputs {
+        feature_group_name: 'dense'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [64, 32, 16]
+        }
+      }
+    }
+    blocks {
+      name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
+      input_layer {
+        only_output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        block_name: 'bottom_mlp'
+        input_fn: 'lambda x: [x]'
+      }
+      inputs {
+        block_name: 'sparse'
+      }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
+    }
+    concat_blocks: ['bottom_mlp', 'dot']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/fibinet_on_movielens.config b/examples/configs/fibinet_on_movielens.config
new file mode 100644
index 000000000..1fe36aac3
--- /dev/null
+++ b/examples/configs/fibinet_on_movielens.config
@@ -0,0 +1,204 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/fibinet_on_movieslen_ckpt"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: False
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_name: 'FiBiNet'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "all"
+      inputs {
+        feature_group_name: "all"
+      }
+      input_layer {
+        do_batch_norm: true
+        only_output_feature_list: true
+      }
+    }
+    blocks {
+      name: "fibinet"
+      inputs {
+        block_name: "all"
+      }
+      keras_layer {
+        class_name: 'FiBiNet'
+        fibinet {
+          senet {
+            reduction_ratio: 4
+          }
+          bilinear {
+            type: 'each'
+            num_output_units: 512
+          }
+          mlp {
+            hidden_units: [512, 256]
+          }
+        }
+      }
+    }
+    concat_blocks: ['fibinet']
+  }
+  rank_model {
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/masknet_on_movielens.config b/examples/configs/masknet_on_movielens.config
new file mode 100644
index 000000000..fd3dc1342
--- /dev/null
+++ b/examples/configs/masknet_on_movielens.config
@@ -0,0 +1,199 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/masknet_on_movieslen_ckpt"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_name: 'MaskNet'
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "mask_net"
+      inputs {
+        feature_group_name: "all"
+      }
+      keras_layer {
+        class_name: 'MaskNet'
+        masknet {
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mlp {
+            hidden_units: [512, 256]
+          }
+        }
+      }
+    }
+    concat_blocks: ['mask_net']
+  }
+  rank_model {
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/mlp_on_movielens.config b/examples/configs/mlp_on_movielens.config
new file mode 100644
index 000000000..038b02a51
--- /dev/null
+++ b/examples/configs/mlp_on_movielens.config
@@ -0,0 +1,239 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/mlp_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'features'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'mlp'
+      inputs {
+        feature_group_name: 'features'
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 256 }
+            }
+            fields {
+              key: 'activation'
+              value: { string_value: 'relu' }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dropout'
+          st_params {
+            fields {
+              key: 'rate'
+              value: { number_value: 0.5 }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 256 }
+            }
+            fields {
+              key: 'activation'
+              value: { string_value: 'relu' }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dropout'
+          st_params {
+            fields {
+              key: 'rate'
+              value: { number_value: 0.5 }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 1 }
+            }
+          }
+        }
+      }
+    }
+    concat_blocks: 'mlp'
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/configs/multi_tower_on_movielens.config b/examples/configs/multi_tower_on_movielens.config
new file mode 100644
index 000000000..a502922ae
--- /dev/null
+++ b/examples/configs/multi_tower_on_movielens.config
@@ -0,0 +1,224 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/multi_tower_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_name: "multi tower"
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'user'
+    feature_names: 'user_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    wide_deep: DEEP
+  }
+  feature_groups: {
+    group_name: 'item'
+    feature_names: 'movie_id'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    packages {
+      name: 'user'
+      blocks {
+        name: 'mlp'
+        inputs {
+          feature_group_name: 'user'
+        }
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [256, 128]
+          }
+        }
+      }
+      concat_blocks: 'mlp'
+    }
+    packages {
+      name: 'item'
+      blocks {
+        name: 'mlp'
+        inputs {
+          feature_group_name: 'item'
+        }
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [256, 128]
+          }
+        }
+      }
+      concat_blocks: 'mlp'
+    }
+    blocks {
+      name: 'top_mlp'
+      inputs {
+        package_name: 'user'
+      }
+      inputs {
+        package_name: 'item'
+      }
+      layers {
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [128, 64]
+          }
+        }
+      }
+    }
+    concat_blocks: 'top_mlp'
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/configs/wide_and_deep_backbone_on_movielens.config b/examples/configs/wide_and_deep_backbone_on_movielens.config
new file mode 100644
index 000000000..0f13a0511
--- /dev/null
+++ b/examples/configs/wide_and_deep_backbone_on_movielens.config
@@ -0,0 +1,219 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/wide_and_deep_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'wide'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: WIDE
+  }
+  feature_groups: {
+    group_name: 'deep'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'wide'
+      inputs {
+        feature_group_name: 'wide'
+      }
+      input_layer {
+        only_output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'deep_logit'
+      inputs {
+        feature_group_name: 'deep'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 256, 256, 1]
+          use_final_bn: false
+          final_activation: 'linear'
+        }
+      }
+    }
+    blocks {
+      name: 'final_logit'
+      inputs {
+        block_name: 'wide'
+        input_fn: 'lambda x: tf.add_n(x)'
+      }
+      inputs {
+        block_name: 'deep_logit'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Add'
+      }
+    }
+    concat_blocks: 'final_logit'
+  }
+  rank_model {
+    wide_output_dim: 1
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/data/criteo/download_and_process.sh b/examples/data/criteo/download_and_process.sh
index 30061a862..f0cc8aef9 100644
--- a/examples/data/criteo/download_and_process.sh
+++ b/examples/data/criteo/download_and_process.sh
@@ -1,6 +1,7 @@
 #! /bin/bash
 if [ "$(uname)" == "Darwin" ]; then
-    curl -O https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
+    # Alternative when wget is unavailable: curl -O https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
+    wget -c https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
 elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
     wget -c https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
 elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then
diff --git a/examples/data/criteo/process_criteo_kaggle.py b/examples/data/criteo/process_criteo_kaggle.py
index 60b7d9776..e610e33a6 100644
--- a/examples/data/criteo/process_criteo_kaggle.py
+++ b/examples/data/criteo/process_criteo_kaggle.py
@@ -5,14 +5,21 @@
 target_columns = ['label']
 columns = target_columns + dense_features + category_features
 
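+# Optional sanity check on an already generated `criteo_train_data` file:
+# uncomment to print the number of distinct values per categorical column.
+# data_train = pd.read_csv('criteo_train_data', sep='\t', names=columns)
+# for col in category_features:
+#     print(col, data_train[col].nunique())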
+
 data_train = pd.read_csv(
     'criteo_kaggle_display/train.txt', sep='\t', names=columns)
 
 samples_num = data_train.shape[0]
 print('samples_num:', samples_num, round(samples_num * 0.9))
 
-data_train[:round(samples_num * 0.9)].to_csv(
+train_num = int(round(samples_num * 0.9))
+data_train[:train_num].to_csv(
     r'criteo_train_data', index=False, sep='\t', mode='a', header=False)
-data_train[round(samples_num * 0.9):].to_csv(
+data_train[train_num:].to_csv(
     r'criteo_test_data', index=False, sep='\t', mode='a', header=False)
 print('Done.')
diff --git a/examples/rank_model/readme.md b/examples/rank_model/readme.md
index 15d3f4dca..f6a2ba791 100644
--- a/examples/rank_model/readme.md
+++ b/examples/rank_model/readme.md
@@ -32,10 +32,12 @@
 | MovieLens-1M | DeepFM    | 0.8688 |
 | MovieLens-1M | DCN       | 0.8576 |
 | MovieLens-1M | AutoInt   | 0.8513 |
+| MovieLens-1M | MaskNet   | 0.8872 |
+| MovieLens-1M | FibiNet   | 0.8879 |
 
 # Criteo Research Kaggle 数据集
 
-在MovieLens-1M 数据集中, 我们提供了2个模型上的demo示例。
+在 `Criteo Research Kaggle` 数据集中, 我们提供了2个模型上的demo示例。
 
 [FM](fm.md) / [DeepFM](deepfm.md)
 
diff --git a/examples/readme.md b/examples/readme.md
index 4861b0b42..f2c337431 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -73,14 +73,22 @@ EasyRec的模型训练和评估都是基于config配置文件的，配置文件
 
 - [deepfm_on_movielens.config](configs/deepfm_on_movielens.config)
 
+- [deepfm_backbone_on_movielens.config](configs/deepfm_backbone_on_movielens.config)
+
 - [dcn_on_movielens.config](configs/dcn_on_movielens.config)
 
 - [autoint_on_movielens.config](configs/autoint_on_movielens.config)
 
+- [masknet_on_movielens.config](configs/masknet_on_movielens.config)
+
+- [fibinet_on_movielens.config](configs/fibinet_on_movielens.config)
+
 - [fm_on_criteo.config](configs/fm_on_criteo.config)
 
 - [deepfm_on_criteo.config](configs/deepfm_on_criteo.config)
 
+- [deepfm_backbone_on_criteo.config](configs/deepfm_backbone_on_criteo.config)
+
 **召回任务**
 
 - [dssm_on_books.config](configs/dssm_on_books.config)
@@ -201,19 +209,35 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
 
 - MovieLens-1M
 
-  | Model     | Epoch | AUC    |
-  | --------- | ----- | ------ |
-  | Wide&Deep | 1     | 0.8558 |
-  | DeepFM    | 1     | 0.8688 |
-  | DCN       | 1     | 0.8576 |
-  | AutoInt   | 1     | 0.8513 |
+  | Model               | Epoch | AUC    |
+  | ------------------- | ----- | ------ |
+  | MLP                 | 1     | 0.8616 |
+  | Wide&Deep           | 1     | 0.8558 |
+  | Wide&Deep(Backbone) | 1     | 0.8854 |
+  | DeepFM              | 1     | 0.8867 |
+  | DeepFM(Backbone)    | 1     | 0.8872 |
+  | DCN                 | 1     | 0.8576 |
+  | DCN_v2              | 1     | 0.8770 |
+  | AutoInt             | 1     | 0.8513 |
+  | MaskNet             | 1     | 0.8872 |
+  | FibiNet             | 1     | 0.8893 |
+
+  备注：`MovieLens-1M` 数据集较小，评估指标方差较大，以上结果仅供参考。
 
 - Criteo-Research
 
-  | Model  | Epoch | AUC    |
-  | ------ | ----- | ------ |
-  | FM     | 1     | 0.7577 |
-  | DeepFM | 1     | 0.7967 |
+  | Model             | Epoch | AUC     |
+  | ----------------- | ----- | ------- |
+  | FM                | 1     | 0.7577  |
+  | DeepFM            | 1     | 0.7970  |
+  | DeepFM (backbone) | 1     | 0.7970  |
+  | DeepFM (periodic) | 1     | 0.7980  |
+  | DeepFM (autodis)  | 1     | 0.7979  |
+  | DLRM              | 1     | 0.79785 |
+  | DLRM (backbone)   | 1     | 0.7993  |
+  | DLRM (standard)   | 1     | 0.7949  |
+  | DLRM (autodis)    | 1     | 0.7989  |
+  | DLRM (periodic)   | 1     | 0.7998  |
 
 ### 召回模型
 
diff --git a/pai_jobs/run.py b/pai_jobs/run.py
index 41c61ad31..986731d36 100644
--- a/pai_jobs/run.py
+++ b/pai_jobs/run.py
@@ -166,6 +166,8 @@
 tf.app.flags.DEFINE_string('oss_embedding_version', '', 'oss embedding version')
 
 tf.app.flags.DEFINE_bool('verbose', False, 'print more debug information')
+tf.app.flags.DEFINE_bool('place_embedding_on_cpu', False,
+                         'whether to place embedding variables on cpu')
 
 # for automl hyper parameter tuning
 tf.app.flags.DEFINE_string('model_dir', None, 'model directory')
@@ -434,7 +436,10 @@ def main(argv):
   elif FLAGS.cmd == 'export':
     check_param('export_dir')
     check_param('config')
-
+    if FLAGS.place_embedding_on_cpu:
+      os.environ['place_embedding_on_cpu'] = 'True'
+    else:
+      os.environ['place_embedding_on_cpu'] = 'False'
     redis_params = {}
     if FLAGS.redis_url:
       redis_params['redis_url'] = FLAGS.redis_url
diff --git a/samples/model_config/bst_cl_on_taobao.config b/samples/model_config/bst_cl_on_taobao.config
new file mode 100644
index 000000000..77529db5e
--- /dev/null
+++ b/samples/model_config/bst_cl_on_taobao.config
@@ -0,0 +1,304 @@
+train_input_path: "data/test/tb_data/taobao_train_data"
+eval_input_path: "data/test/tb_data/taobao_test_data"
+model_dir: "experiments/dbmtl_taobao_ckpt"
+
+train_config {
+  optimizer_config {
+    adam_optimizer {
+      learning_rate {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 1e-07
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  num_steps: 100
+  sync_replicas: true
+  save_checkpoints_steps: 100
+  log_step_count_steps: 100
+}
+
+eval_config {
+  metrics_set {
+    auc {
+    }
+  }
+}
+
+data_config {
+  batch_size: 4096
+  label_fields: "clk"
+  label_fields: "buy"
+  prefetch_size: 32
+  input_type: CSVInput
+  input_fields {
+    input_name: "clk"
+    input_type: INT32
+  }
+  input_fields {
+    input_name: "buy"
+    input_type: INT32
+  }
+  input_fields {
+    input_name: "pid"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "adgroup_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cate_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "campaign_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "customer"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "brand"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "user_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cms_segid"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cms_group_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "final_gender_code"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "age_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "pvalue_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "shopping_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "occupation"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "new_user_class_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "tag_category_list"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "tag_brand_list"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "price"
+    input_type: INT32
+  }
+}
+
+feature_config: {
+  features {
+    input_names: "pid"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "adgroup_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "cate_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    embedding_name: 'category'
+  }
+  features {
+    input_names: "campaign_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "customer"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "brand"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+    embedding_name: 'brand'
+  }
+  features {
+    input_names: "user_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "cms_segid"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features {
+    input_names: "cms_group_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features {
+    input_names: "final_gender_code"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "age_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "pvalue_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "shopping_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "occupation"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "new_user_class_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features : {
+    input_names: 'tag_category_list'
+    feature_type: SequenceFeature
+    separator: '|'
+    hash_bucket_size: 10000
+    embedding_dim: 16
+    embedding_name: 'category'
+  }
+  features : {
+    input_names: 'tag_brand_list'
+    feature_type: SequenceFeature
+    separator: '|'
+    hash_bucket_size: 100000
+    embedding_dim: 16
+    embedding_name: 'brand'
+  }
+  features {
+    input_names: "price"
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 50
+  }
+}
+
+model_config {
+  model_class: "DBMTL"
+  feature_groups {
+    group_name: "all"
+    feature_names: "user_id"
+    feature_names: "cms_segid"
+    feature_names: "cms_group_id"
+    feature_names: "age_level"
+    feature_names: "pvalue_level"
+    feature_names: "shopping_level"
+    feature_names: "occupation"
+    feature_names: "new_user_class_level"
+    feature_names: "adgroup_id"
+    feature_names: "cate_id"
+    feature_names: "campaign_id"
+    feature_names: "customer"
+    feature_names: "brand"
+    feature_names: "price"
+    feature_names: "pid"
+    wide_deep: DEEP
+  }
+
+  feature_groups {
+    group_name: "seq"
+    feature_names: "brand"
+    feature_names: "cate_id"
+    feature_names: "tag_category_list"
+    feature_names: "tag_brand_list"
+    sequence_encoders {
+      bst {
+        hidden_size: 256
+        num_attention_heads: 4
+        num_hidden_layers: 1
+        intermediate_size: 512
+        hidden_act: 'gelu'
+        max_position_embeddings: 50
+        hidden_dropout_prob: 0.1
+        attention_probs_dropout_prob: 0
+        need_contrastive_learning: true
+      }
+    }
+    wide_deep: DEEP
+  }
+
+  dbmtl {
+    bottom_dnn {
+      hidden_units: [1024, 512, 256]
+    }
+    task_towers {
+      tower_name: "ctr"
+      label_name: "clk"
+      loss_type: CLASSIFICATION
+      metrics_set: {
+        auc {}
+      }
+      dnn {
+        hidden_units: [256, 128, 64, 32]
+      }
+      relation_dnn {
+        hidden_units: [32]
+      }
+      weight: 1.0
+    }
+    l2_regularization: 1e-6
+    use_sequence_encoder: true
+  }
+  embedding_regularization: 5e-6
+}