Skip to content

Model Definition Summary

brightcoder01 edited this page Dec 15, 2019 · 14 revisions

Model Definition Summary

Use Feature Column

Feature Column Definition

# Key under which the label is stored. It is not referenced by the
# feature-column helpers shown on this page.
LABEL_KEY = "label"

# Numeric features: each becomes a `numeric_column` in
# `get_feature_columns` and a float32 input in `get_feature_input_layers`.
NUMERIC_FEATURE_KEYS = ["age", "capital-gain", "capital-loss", "hours-per-week"]

# Declared separately from the required numeric keys; none of the helpers
# shown on this page read this list.
OPTIONAL_NUMERIC_FEATURE_KEYS = ["education-num"]

# String-valued features: each is hashed into buckets and embedded in
# `get_feature_columns`, and gets a string input in
# `get_feature_input_layers`.
CATEGORICAL_FEATURE_KEYS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

def get_feature_columns():
    """Build the list of feature columns consumed by `DenseFeatures`.

    Returns:
        A new list holding one `numeric_column` per key in
        `NUMERIC_FEATURE_KEYS`, followed by one 16-dimensional
        `embedding_column` per key in `CATEGORICAL_FEATURE_KEYS`. Each
        embedding wraps a hash-bucket categorical column with 64 buckets.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(key) for key in NUMERIC_FEATURE_KEYS
    ]

    # Categorical values are hashed into buckets, so no vocabulary is needed.
    embedding_columns = [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_hash_bucket(
                key, hash_bucket_size=64
            ),
            dimension=16,
        )
        for key in CATEGORICAL_FEATURE_KEYS
    ]

    # Numeric columns come first, then the embeddings.
    return numeric_columns + embedding_columns

def get_feature_input_layers():
    """Create one Keras `Input` per feature, keyed by feature name.

    Returns:
        A dict mapping each key in `NUMERIC_FEATURE_KEYS` to a float32
        `tf.keras.Input` and each key in `CATEGORICAL_FEATURE_KEYS` to a
        string `tf.keras.Input`. Every input has shape `(1,)` and is named
        after its key. Numeric entries are inserted before categorical ones.
    """
    # Pair every key with its dtype, numeric features first.
    keyed_dtypes = [(key, tf.float32) for key in NUMERIC_FEATURE_KEYS]
    keyed_dtypes += [(key, tf.string) for key in CATEGORICAL_FEATURE_KEYS]

    return {
        key: tf.keras.Input(shape=(1,), name=key, dtype=dtype)
        for key, dtype in keyed_dtypes
    }

Sequential

def custom_model(feature_columns):
    """Build the model with the Keras Sequential API.

    Args:
        feature_columns: Feature columns handed to the leading
            `DenseFeatures` layer.

    Returns:
        A `tf.keras.Sequential` made of a `DenseFeatures` layer, two
        16-unit ReLU `Dense` layers and a single-unit sigmoid `Dense`
        output layer.
    """
    layer_stack = [
        tf.keras.layers.DenseFeatures(feature_columns=feature_columns),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    return tf.keras.Sequential(layer_stack)

# Usage: build the feature columns, then pass them to the Sequential-style
# `custom_model` builder.
feature_columns = get_feature_columns()
model = custom_model(feature_columns)

Functional

def custom_model(feature_columns, feature_inputs):
    """Build the model with the Keras Functional API.

    Args:
        feature_columns: Feature columns handed to the `DenseFeatures` layer.
        feature_inputs: Keras inputs fed to the `DenseFeatures` layer and
            used as the model's `inputs`.

    Returns:
        A `tf.keras.Model` that applies `DenseFeatures`, two 16-unit ReLU
        `Dense` layers and a single-unit sigmoid `Dense` output layer.
    """
    hidden = tf.keras.layers.DenseFeatures(feature_columns)(feature_inputs)

    # Two identical hidden layers, applied one after the other.
    for _ in range(2):
        hidden = tf.keras.layers.Dense(16, activation="relu")(hidden)

    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    return tf.keras.Model(inputs=feature_inputs, outputs=output)

# Usage: the Functional-style builder needs both the feature columns and the
# matching Keras input layers.
feature_columns = get_feature_columns()
feature_inputs = get_feature_input_layers()
model = custom_model(feature_columns, feature_inputs)

Subclass

class CustomModel(tf.keras.Model):
    """The same network written as a `tf.keras.Model` subclass.

    The layers are a `DenseFeatures` layer, two 16-unit ReLU `Dense` layers
    and a single-unit sigmoid `Dense` output layer. The model is registered
    under the name "census_model".
    """

    def __init__(self, feature_columns):
        """Create the layers.

        Args:
            feature_columns: Feature columns handed to `DenseFeatures`.
        """
        super().__init__(name="census_model")
        self.dense_features = tf.keras.layers.DenseFeatures(feature_columns)
        self.dense_1 = tf.keras.layers.Dense(16, activation="relu")
        self.dense_2 = tf.keras.layers.Dense(16, activation="relu")
        self.dense_3 = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs, training=False):
        """Run `inputs` through the layers in order and return the result.

        Args:
            inputs: Features passed to the `DenseFeatures` layer.
            training: Accepted but not used by this method.

        Returns:
            The output of the final sigmoid `Dense` layer.
        """
        pipeline = (
            self.dense_features,
            self.dense_1,
            self.dense_2,
            self.dense_3,
        )
        result = inputs
        for layer in pipeline:
            result = layer(result)
        return result

# Usage: the subclassed model takes the feature columns in its constructor.
feature_columns = get_feature_columns()
model = CustomModel(feature_columns)

Integration with SQLFlow

Open Questions

  1. Single shared embedding vs. several separate embeddings? Generally, the same value appearing in different columns of the source data should be treated as different values.
    • How to aggregate multiple categorical inputs into an embedding table?
  2. For the feature column input, the instances of a minibatch should be the same size. How should variable-length input, such as a list of clicked item IDs, be handled?
    • Truncate/Padding in dataset_fn
    • Truncate/Padding in feature column normalizer_fn
  3. For sparse input, use the embedding_column or a dense embedding layer?
  4. How to decide the hash_bucket_size for the categorical input?
    • statistical
  5. Integration with SQLFlow: how to generate the code for the feature column definition and dataset_fn using code_gen?
    • TBD