-
Notifications
You must be signed in to change notification settings - Fork 115
Model Definition Summary
brightcoder01 edited this page Dec 13, 2019
·
14 revisions
# String-valued columns; get_feature_columns() hashes and embeds each of them.
CATEGORICAL_FEATURE_KEYS = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Float-valued columns; get_feature_columns() feeds them in as numeric columns.
NUMERIC_FEATURE_KEYS = ["age", "capital-gain", "capital-loss", "hours-per-week"]

# NOTE(review): not referenced by any of the examples on this page.
OPTIONAL_NUMERIC_FEATURE_KEYS = ["education-num"]

# NOTE(review): presumably the name of the target column - confirm against
# the dataset_fn, which is not shown here.
LABEL_KEY = "label"
def get_feature_columns():
    """Build the feature columns that the example models feed to DenseFeatures.

    Each key in NUMERIC_FEATURE_KEYS becomes a numeric column. Each key in
    CATEGORICAL_FEATURE_KEYS is hashed into 64 buckets and wrapped in a
    16-dimensional embedding column.

    Returns:
        A list holding the numeric columns first, followed by the embedding
        columns, each group in the order of its key list.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(key) for key in NUMERIC_FEATURE_KEYS
    ]

    embedding_columns = []
    for key in CATEGORICAL_FEATURE_KEYS:
        hashed_column = tf.feature_column.categorical_column_with_hash_bucket(
            key, hash_bucket_size=64
        )
        embedding_columns.append(
            tf.feature_column.embedding_column(hashed_column, dimension=16)
        )

    return numeric_columns + embedding_columns
def get_feature_input_layers():
    """Create one ``tf.keras.Input`` per feature, keyed by feature name.

    Numeric features get a ``tf.float32`` input and categorical features get a
    ``tf.string`` input. Every input has shape ``(1,)`` and is named after its
    feature key.

    Returns:
        A dict mapping each feature key to its ``tf.keras.Input``, numeric
        keys first and categorical keys after them.
    """
    key_groups = (
        (NUMERIC_FEATURE_KEYS, tf.float32),
        (CATEGORICAL_FEATURE_KEYS, tf.string),
    )

    inputs_by_key = {}
    for keys, dtype in key_groups:
        for key in keys:
            inputs_by_key[key] = tf.keras.Input(shape=(1,), name=key, dtype=dtype)
    return inputs_by_key
def custom_model(feature_columns):
    """Define the example model with the Keras Sequential API.

    Args:
        feature_columns: Feature columns handed to the leading
            ``DenseFeatures`` layer.

    Returns:
        A ``tf.keras.Sequential`` stacking ``DenseFeatures``, two 16-unit
        ReLU ``Dense`` layers and a single-unit sigmoid ``Dense`` layer.
    """
    stack = [tf.keras.layers.DenseFeatures(feature_columns=feature_columns)]
    # Two identical hidden layers, created in order.
    stack.extend(tf.keras.layers.Dense(16, activation="relu") for _ in range(2))
    stack.append(tf.keras.layers.Dense(1, activation="sigmoid"))
    return tf.keras.Sequential(stack)
# Usage: the Sequential variant only needs the feature columns.
feature_columns = get_feature_columns()
model = custom_model(feature_columns)
def custom_model(feature_columns, feature_inputs):
    """Define the example model with the Keras Functional API.

    Args:
        feature_columns: Feature columns handed to the ``DenseFeatures`` layer.
        feature_inputs: The model inputs; they are passed both to the
            ``DenseFeatures`` layer and as ``inputs`` of the resulting model.

    Returns:
        A ``tf.keras.Model`` that applies ``DenseFeatures``, two 16-unit ReLU
        ``Dense`` layers and a single-unit sigmoid ``Dense`` layer to
        ``feature_inputs``.
    """
    hidden = tf.keras.layers.DenseFeatures(feature_columns)(feature_inputs)
    # Two identical hidden layers, applied one after the other.
    for _ in range(2):
        hidden = tf.keras.layers.Dense(16, activation="relu")(hidden)
    prediction = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    return tf.keras.Model(inputs=feature_inputs, outputs=prediction)
# Usage: the Functional variant needs the input layers in addition to the
# feature columns.
feature_columns = get_feature_columns()
feature_inputs = get_feature_input_layers()
model = custom_model(feature_columns, feature_inputs)
class CustomModel(tf.keras.Model):
    """The example model written as a ``tf.keras.Model`` subclass.

    The model is named "census_model" and chains a ``DenseFeatures`` layer,
    two 16-unit ReLU ``Dense`` layers and a single-unit sigmoid ``Dense``
    layer.
    """

    def __init__(self, feature_columns):
        """Create the layers.

        Args:
            feature_columns: Feature columns handed to the ``DenseFeatures``
                layer.
        """
        super(CustomModel, self).__init__(name="census_model")
        keras_layers = tf.keras.layers
        self.dense_features = keras_layers.DenseFeatures(feature_columns)
        self.dense_1 = keras_layers.Dense(16, activation="relu")
        self.dense_2 = keras_layers.Dense(16, activation="relu")
        self.dense_3 = keras_layers.Dense(1, activation="sigmoid")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Features passed to the ``DenseFeatures`` layer.
            training: Accepted but not forwarded to any layer.

        Returns:
            The output of the final sigmoid layer.
        """
        outputs = self.dense_features(inputs)
        for dense in (self.dense_1, self.dense_2, self.dense_3):
            outputs = dense(outputs)
        return outputs
# Usage: the Subclass variant only needs the feature columns.
feature_columns = get_feature_columns()
model = CustomModel(feature_columns)
From the examples above, the Sequential and Subclass models only need feature_columns, which is a list of feature column elements. The Functional model additionally needs feature_inputs, which is a dict from the feature name to its tf.keras.Input layer.
In order to unify the signature of the model-creation entry point, we will choose the superset of the parameters.
For Sequential and Functional models:
def custom_model(feature_columns, feature_inputs=None):
For Subclass models:
class CustomModel(tf.keras.Model):
def __init__(self, feature_columns, feature_inputs=None):
- Single shared embedding vs. several separate embeddings? Generally, the same value appearing in different columns of the source data should be treated as different values.
- How to aggregate multiple categorical inputs into an embedding table?
- For the feature column input, all instances in a minibatch should have the same size. How do we handle variable-length input, such as a list of clicked item ids?
- Truncate/Padding in dataset_fn
- Truncate/Padding in feature column normalizer_fn
- For sparse input, use the embedding_column or dense embedding layer?
- How to decide the hash_bucket_size for the categorical input?
- statistical
- Integration with SQLFlow: how to generate the code of the feature column definition and dataset_fn using code_gen?
- TBD