Skip to content

Model Definition Summary

brightcoder01 edited this page Dec 13, 2019 · 14 revisions

Model Definition Summary

Use Feature Column

Feature Column Definition

# Name of the label column. It is not referenced by the model-building
# snippets on this page.
LABEL_KEY = "label"

# Numeric columns: get_feature_columns() wraps each one in a numeric_column and
# get_feature_input_layers() gives each one a float32 input.
NUMERIC_FEATURE_KEYS = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Numeric column declared separately as optional. None of the snippets on this
# page read this list.
OPTIONAL_NUMERIC_FEATURE_KEYS = [
    "education-num",
]

# String-valued columns: get_feature_columns() hashes and embeds each one and
# get_feature_input_layers() gives each one a string input.
CATEGORICAL_FEATURE_KEYS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

def get_feature_columns():
    """Return the list of feature columns for the model.

    The list holds one ``numeric_column`` per key in ``NUMERIC_FEATURE_KEYS``,
    followed by one ``embedding_column`` per key in
    ``CATEGORICAL_FEATURE_KEYS``. Each categorical key is first hashed into
    64 buckets and then embedded with dimension 16.

    Returns:
        A list of feature columns, numeric columns first.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(key) for key in NUMERIC_FEATURE_KEYS
    ]

    embedding_columns = [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_hash_bucket(
                key, hash_bucket_size=64
            ),
            dimension=16,
        )
        for key in CATEGORICAL_FEATURE_KEYS
    ]

    return numeric_columns + embedding_columns

def get_feature_input_layers():
    """Return a dict mapping each feature name to its ``tf.keras.Input``.

    Every input has shape ``(1,)`` and is named after its feature key. Keys in
    ``NUMERIC_FEATURE_KEYS`` get ``tf.float32`` inputs; keys in
    ``CATEGORICAL_FEATURE_KEYS`` get ``tf.string`` inputs.

    Returns:
        A dict from feature name to input layer, numeric features first.
    """
    # Pair each group of keys with the dtype its inputs use.
    keys_and_dtypes = (
        (NUMERIC_FEATURE_KEYS, tf.float32),
        (CATEGORICAL_FEATURE_KEYS, tf.string),
    )

    inputs_by_name = {}
    for keys, dtype in keys_and_dtypes:
        for key in keys:
            inputs_by_name[key] = tf.keras.Input(
                shape=(1,), name=key, dtype=dtype
            )

    return inputs_by_name

Sequential

def custom_model(feature_columns):
    """Build the model as a ``tf.keras.Sequential`` stack.

    Args:
        feature_columns: feature columns handed to the ``DenseFeatures`` layer.

    Returns:
        A ``tf.keras.Sequential`` made of a ``DenseFeatures`` layer, two
        16-unit ReLU ``Dense`` layers and a 1-unit sigmoid ``Dense`` layer.
    """
    layer_stack = [
        tf.keras.layers.DenseFeatures(feature_columns=feature_columns),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]

    return tf.keras.Sequential(layer_stack)

# Usage: the Sequential model only needs the feature columns.
feature_columns = get_feature_columns()
model = custom_model(feature_columns)

Functional

def custom_model(feature_columns, feature_inputs):
    """Build the model with the Keras functional API.

    Args:
        feature_columns: feature columns handed to the ``DenseFeatures`` layer.
        feature_inputs: inputs fed to the ``DenseFeatures`` layer and used as
            the ``inputs`` of the resulting model.

    Returns:
        A ``tf.keras.Model`` whose output is a 1-unit sigmoid ``Dense`` layer
        on top of two 16-unit ReLU ``Dense`` layers.
    """
    hidden = tf.keras.layers.DenseFeatures(feature_columns)(feature_inputs)

    # Two identical hidden layers.
    for _ in range(2):
        hidden = tf.keras.layers.Dense(16, activation="relu")(hidden)

    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)

    return tf.keras.Model(inputs=feature_inputs, outputs=output)

# Usage: the Functional model needs the feature columns and the input layers.
feature_columns = get_feature_columns()
feature_inputs = get_feature_input_layers()
model = custom_model(feature_columns, feature_inputs)

Subclass

class CustomModel(tf.keras.Model):
    """Subclassed Keras model named ``"census_model"``.

    It applies a ``DenseFeatures`` layer, then two 16-unit ReLU ``Dense``
    layers, then a 1-unit sigmoid ``Dense`` layer.
    """

    def __init__(self, feature_columns):
        """Create the layers.

        Args:
            feature_columns: feature columns handed to ``DenseFeatures``.
        """
        super(CustomModel, self).__init__(name="census_model")
        dense = tf.keras.layers.Dense
        self.dense_features = tf.keras.layers.DenseFeatures(feature_columns)
        self.dense_1 = dense(16, activation="relu")
        self.dense_2 = dense(16, activation="relu")
        self.dense_3 = dense(1, activation="sigmoid")

    def call(self, inputs, training=False):
        """Run ``inputs`` through the feature layer and the dense layers.

        Args:
            inputs: passed directly to the ``DenseFeatures`` layer.
            training: accepted but not used by this method.

        Returns:
            The output of the final sigmoid ``Dense`` layer.
        """
        hidden = self.dense_features(inputs)
        for layer in (self.dense_1, self.dense_2, self.dense_3):
            hidden = layer(hidden)

        return hidden

# Usage: the Subclass model only needs the feature columns.
feature_columns = get_feature_columns()
model = CustomModel(feature_columns)

Model Definition Entry point

From the examples above, the Sequential and Subclass models only need feature_columns, which is a list of feature column elements. The Functional model additionally needs feature_inputs, which is a dict from the feature name to its tf.keras.Input layer.
In order to unify the signature of the model creation entry point, we will choose the superset of the parameters. For Sequential and Functional models:

def custom_model(feature_columns, feature_inputs=None):

For Subclass models:

class CustomModel(tf.keras.Model):
    def __init__(self, feature_columns, feature_inputs=None):

Open questions

  1. Single shared embedding vs several separate embeddings? Generally, the same value in different columns of the source data should be treated as different values.
  2. For the feature column input, the instances of a minibatch should be the same size. How do we handle variable-length input, such as a list of clicked item ids?
    • Truncate/Padding in dataset_fn
    • Truncate/Padding in feature column normalizer_fn
  3. For sparse input, use the embedding_column or dense embedding layer?
  4. How to decide the hash_bucket_size for the categorical input?
  5. Integration with SQLFlow, how to generate the code of the feature column definition and dataset_fn using code_gen?