From 93427dc62d250827fce5b9d4e33eb3ca5f18f226 Mon Sep 17 00:00:00 2001
From: Pritam Dodeja <pritam.dodeja@gmail.com>
Date: Fri, 6 May 2022 05:08:09 -0400
Subject: [PATCH 1/3] Fix to only transform raw data when requested.

When main() is invoked with read_raw_data_for_training set to False,
common.transform_data was still being called on the raw train and test
data.  This fix moves the call into the branch where
read_raw_data_for_training is True.  The motivating scenario is that
the data has already been preprocessed and the user wishes to reuse
that preprocessed data.
---
 examples/census_example_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/census_example_v2.py b/examples/census_example_v2.py
index 66d37680..4c3ac8a4 100644
--- a/examples/census_example_v2.py
+++ b/examples/census_example_v2.py
@@ -237,11 +237,11 @@ def main(input_data_dir,
   train_data_file = os.path.join(input_data_dir, 'adult.data')
   test_data_file = os.path.join(input_data_dir, 'adult.test')
 
-  common.transform_data(train_data_file, test_data_file, working_dir)
 
   if read_raw_data_for_training:
     raw_train_and_eval_patterns = (train_data_file, test_data_file)
     transformed_train_and_eval_patterns = None
+    common.transform_data(train_data_file, test_data_file, working_dir)
   else:
     train_pattern = os.path.join(working_dir,
                                  common.TRANSFORMED_TRAIN_DATA_FILEBASE + '*')

From 6e24dc30f7d4593944e0914bfebc9b8bdd45366a Mon Sep 17 00:00:00 2001
From: Pritam Dodeja <pritam.dodeja@gmail.com>
Date: Tue, 10 May 2022 04:29:02 -0400
Subject: [PATCH 2/3] Shape related code is simplified by using (1,).

Since this is tabular data, every feature is now declared as a
FixedLenFeature of shape (1,).  The net result is simpler
shape-related code.  Education-num is now treated as dense instead of
sparse as it was before, and its scale_to_0_1 scaling is replaced with
tf.identity.  The feature may have missing values in the data, which
may call for some form of imputation.
---
 examples/census_example_common.py | 19 +++++++------------
 examples/census_example_v2.py     | 13 +++++++++----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/examples/census_example_common.py b/examples/census_example_common.py
index 9f578bcc..ffc8c318 100644
--- a/examples/census_example_common.py
+++ b/examples/census_example_common.py
@@ -56,14 +56,14 @@
 ]
 
 
-RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], tf.string))
+RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.string))
                               for name in CATEGORICAL_FEATURE_KEYS] +
-                             [(name, tf.io.FixedLenFeature([], tf.float32))
+                             [(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32))
                               for name in NUMERIC_FEATURE_KEYS] +
-                             [(name, tf.io.VarLenFeature(tf.float32))
+                             [(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32))
                               for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
                              [(LABEL_KEY,
-                               tf.io.FixedLenFeature([], tf.string))])
+                               tf.io.FixedLenFeature(shape=(1,), dtype=tf.string))])
 
 _SCHEMA = dataset_metadata.DatasetMetadata(
     schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)).schema
@@ -125,14 +125,9 @@ def preprocessing_fn(inputs):
       outputs[key] = tft.scale_to_0_1(inputs[key])
 
     for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
-      # This is a SparseTensor because it is optional. Here we fill in a default
-      # value when it is missing.
-      sparse = tf.sparse.SparseTensor(inputs[key].indices, inputs[key].values,
-                                      [inputs[key].dense_shape[0], 1])
-      dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
-      # Reshaping from a batch of vectors of size 1 to a batch to scalars.
-      dense = tf.squeeze(dense, axis=1)
-      outputs[key] = tft.scale_to_0_1(dense)
+      # This is being treated as a dense tensor that might be missing
+      # values. Might call for some sort of imputation.
+      outputs[key] = tf.identity(inputs[key])
 
     # For all categorical columns except the label column, we generate a
     # vocabulary, and convert the string feature to a one-hot encoding.
diff --git a/examples/census_example_v2.py b/examples/census_example_v2.py
index 4c3ac8a4..458a708e 100644
--- a/examples/census_example_v2.py
+++ b/examples/census_example_v2.py
@@ -93,7 +93,12 @@ def transform_dataset(data):
         raw_features[key] = tf.RaggedTensor.from_tensor(
             tf.expand_dims(val, -1)).to_sparse()
         continue
-      raw_features[key] = val
+      # Each raw feature arrives as a rank-1 tensor of shape (batch,),
+      # i.e. one scalar per observation, but we need a batch of tensors
+      # of shape (1,).  expand_dims appends a trailing axis of size 1
+      # (it does not add a batch dimension), giving shape (batch, 1) so
+      # that each observation is treated as a vector of length 1.
+      raw_features[key] = tf.expand_dims(val, -1)
     transformed_features = tft_layer(raw_features)
     data_labels = transformed_features.pop(common.LABEL_KEY)
     return (transformed_features, data_labels)
@@ -128,7 +133,7 @@ def serve_tf_examples_fn(serialized_tf_examples):
     return {'classes': classes, 'scores': outputs}
 
   concrete_serving_fn = serve_tf_examples_fn.get_concrete_function(
-      tf.TensorSpec(shape=[None], dtype=tf.string, name='inputs'))
+      tf.TensorSpec(shape=(1,), dtype=tf.string, name='inputs'))
   signatures = {'serving_default': concrete_serving_fn}
 
   # This is required in order to make this model servable with model_server.
@@ -191,12 +196,12 @@ def train_and_evaluate(raw_train_eval_data_path_pattern,
   for key, spec in feature_spec.items():
     if isinstance(spec, tf.io.VarLenFeature):
       inputs[key] = tf.keras.layers.Input(
-          shape=[None], name=key, dtype=spec.dtype, sparse=True)
+          shape=(1,), name=key, dtype=spec.dtype, sparse=True)
     elif isinstance(spec, tf.io.FixedLenFeature):
       # TODO(b/208879020): Move into schema such that spec.shape is [1] and not
       # [] for scalars.
       inputs[key] = tf.keras.layers.Input(
-          shape=spec.shape or [1], name=key, dtype=spec.dtype)
+          shape=spec.shape, name=key, dtype=spec.dtype)
     else:
       raise ValueError('Spec type is not supported: ', key, spec)
 

From cd280975eb633f2e6ae65b7f25b4288c9f841369 Mon Sep 17 00:00:00 2001
From: Pritam Dodeja <pritam.dodeja@gmail.com>
Date: Tue, 10 May 2022 05:07:02 -0400
Subject: [PATCH 3/3] Scale education-num to [0, 1] with tft.scale_to_0_1

---
 examples/census_example_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/census_example_common.py b/examples/census_example_common.py
index ffc8c318..160274fe 100644
--- a/examples/census_example_common.py
+++ b/examples/census_example_common.py
@@ -127,7 +127,7 @@ def preprocessing_fn(inputs):
     for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
       # This is being treated as a dense tensor that might be missing
       # values. Might call for some sort of imputation.
-      outputs[key] = tf.identity(inputs[key])
+      outputs[key] = tft.scale_to_0_1(inputs[key])
 
     # For all categorical columns except the label column, we generate a
     # vocabulary, and convert the string feature to a one-hot encoding.