Select dimension towards creating transects #224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft · wants to merge 2 commits into base: develop
58 changes: 55 additions & 3 deletions src/eva/data/soca_restart.py
@@ -74,8 +74,9 @@ def execute(self, dataset_config, data_collections, timing):
         # -------------------------
         soca_vars = get(dataset_config, self.logger, 'variables', default=[])
         coord_vars = get(dataset_config, self.logger, 'coordinate variables', default=None)
+        opt_vars = get(dataset_config, self.logger, 'optional variables', default=None)

-        # Read orographic fields first
+        # Read grid fields first
         # -------------------------
         var_dict = {}
         group_name = 'SOCAgrid'
@@ -90,6 +91,17 @@ def execute(self, dataset_config, data_collections, timing):
             dims, data = read_soca(soca_filenames, var, self.logger)
             var_dict[group_name + '::' + var] = (dims, data)

+        # Create opt_vars if requested
+        # -------------------------
+        if opt_vars is not None:
+            for var in opt_vars:
+                if var == 'depth':
+                    dims, data = create_depth_variable(soca_filenames, self.logger)
+                    var_dict[group_name + '::' + var] = (dims, data)
+                else:
+                    self.logger.abort(f"{var} is not a valid optional variable. Valid optional "
+                                      f"variables are: depth")
+
         # Create dataset_config from data dictionary
         # -------------------------
         ds = xr.Dataset(var_dict)
@@ -149,13 +161,53 @@ def read_soca(file, variable, logger):

     with Dataset(file, mode='r') as f:
         try:
-            dims = ["lon", "lat"]
+            dims = ["lat", "lon"]
             if len(f.variables[variable].dimensions) > 3:
-                dims = ["lev", "lon", "lat"]
+                dims = ["lev", "lat", "lon"]
             var = np.squeeze(f.variables[variable][:])
         except KeyError:
             logger.abort(f"{variable} is not a valid variable. \nExiting ...")

     return dims, var

 # --------------------------------------------------------------------------------------------------
+
+def create_depth_variable(file, logger):
+    """
+    Create the time dependent depth variable by cumulatively summing the `h` variable over the `zl`
+    dimension.
+
+    Args:
+        file (str): Path to the SOCA file.
+        logger (Logger): Logger for logging messages.
+
+    Returns:
+        tuple: A tuple containing dimensions (list) and data (numpy.ndarray) for the depth variable.
+    """
+
+    with Dataset(file, mode='r') as f:
+        h_dims = f.variables['h'].dimensions
+        h_data = f.variables['h'][:]
+
+    # Ensure `h` has the expected dimensions, which should be (time, zl, yh, xh)
+    if len(h_dims) != 4 or list(h_dims) != ['time', 'zl', 'yh', 'xh']:
+        logger.abort(f"The `h` variable does not have the expected dimensions or "
+                     f"shape: {h_dims}. Expected (time, zl, yh, xh).")
+
+    # Handle the `time` dimension
+    if h_data.shape[0] > 1:
+        logger.info(f"The `h` variable has a time dimension of size {h_data.shape[0]}. "
+                    f"Using the first time step for depth calculation.")
+        h_data = h_data[0, ...]
+    else:
+        h_data = np.squeeze(h_data, axis=0)
+
+    # Compute the cumulative sum over the `zl` dimension
+    depth_data = np.cumsum(h_data, axis=0)
+
+    # Define the dimensions for the depth variable
+    depth_dims = ['lev', 'lat', 'lon']
+
+    return depth_dims, depth_data
+
+# --------------------------------------------------------------------------------------------------
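For reference, a minimal standalone sketch of the depth construction performed by create_depth_variable above, using a synthetic layer-thickness array in place of the `h` variable read from the SOCA restart file (the array shape and the 10 m layer thicknesses are illustrative assumptions, not values from a real file):

import numpy as np

# Synthetic layer thicknesses with the expected (time, zl, yh, xh) shape; values are made up.
h = np.full((1, 4, 2, 3), 10.0)

# Drop the leading time dimension, as create_depth_variable does for a single time step.
h = np.squeeze(h, axis=0)          # shape (zl, yh, xh)

# Cumulative sum over the vertical (zl) axis gives the depth at the bottom of each layer.
depth = np.cumsum(h, axis=0)

print(depth[:, 0, 0])              # [10. 20. 30. 40.]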
118 changes: 118 additions & 0 deletions src/eva/transforms/select_dimension.py
@@ -0,0 +1,118 @@
# (C) Copyright 2025- NOAA/NWS/EMC
#
# (C) Copyright 2025- United States Government as represented by the Administrator of the
# National Aeronautics and Space Administration. All Rights Reserved.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.

from eva.transforms.transform_utils import parse_for_dict, split_collectiongroupvariable
from eva.transforms.transform_utils import replace_cgv
from eva.utilities.config import get
from eva.utilities.logger import Logger

def select_dimension(config, data_collections):
"""
Selects and processes data variables for a specified dimension (e.g., latitude, longitude, depth).

Args:
config (dict): A configuration dictionary containing transformation parameters.
data_collections (DataCollections): An instance of the DataCollections class containing
input data.

Returns:
None

This function selects and processes data variables within the provided data collections based
on a specified dimension (e.g., latitude, longitude, depth) and either a single value or a range
of values for that dimension. The resulting processed variables are added to the data collections.

Example:
::

config = {
'collections': [...],
'groups': [...],
'variables': [...],
'new name': 'dimension_selected_variable',
'starting field': 'original_variable',
'dimension': 'lat', # Dimension to select (e.g., lat, lon, depth)
'value': 5, # Single value
# OR
'start value': -5,
'end value': 5 # Range of values
}
select_dimension(config, data_collections)
"""

# Create a logger
logger = Logger('SelectDimensionTransform')

# Parse config for dictionary
[collections, groups, variables] = parse_for_dict(config, logger)

# Parse config for the expression and new collection/group/variable naming
new_name_template = get(config, logger, 'new name')
starting_field_template = get(config, logger, 'starting field')

# Get the dimension to select
dimension = get(config, logger, 'dimension', abort_on_failure=True)

# Get value or start value and end value from yaml file
values = [None]
value = get(config, logger, 'value', abort_on_failure=False)
if value is not None:
values = [value]
else:
start_value = get(config, logger, 'start value', abort_on_failure=False)
end_value = get(config, logger, 'end value', abort_on_failure=False)
if start_value is not None and end_value is not None:
values = [start_value, end_value]

if None in values:
logger.abort(f'{dimension} value(s) for transformation not specified in yaml file. '
f'This should be either value: value or '
f'start value: value and end value: value')
if len(values) > 2:
logger.info(f'WARNING: More than 2 {dimension} values specified in yaml file. '
f'Only the first 2 values will be used.')

# Loop over the templates
for collection in collections:
for group in groups:
for variable in variables:

if variable is not None:
# Replace collection, group, variable in template
[new_name, starting_field] = replace_cgv(logger, collection, group, variable,
new_name_template,
starting_field_template)
else:
new_name = new_name_template
starting_field = starting_field_template

# Get a dataset for the requested collection::group::variable
cgv = split_collectiongroupvariable(logger, starting_field)
select_dim = data_collections.get_variable_data_array(cgv[0], cgv[1], cgv[2])

# Get the collection, group, var for new dataset
cgv_new = split_collectiongroupvariable(logger, new_name)

# If a range of values is provided, select the slice; otherwise, select a single value
if len(values) == 2:
if dimension in ['lat', 'lon'] and select_dim[dimension].ndim == 2:
# Create a mask for the range
mask = (select_dim[dimension] >= values[0]) & (select_dim[dimension] <= values[1])
select_dim = select_dim.where(mask, drop=True)
else:
select_dim = select_dim.sel({dimension: slice(values[0], values[1])})
else:
if dimension in ['lat', 'lon'] and select_dim[dimension].ndim == 2:
# Create a mask for the single value
mask = select_dim[dimension] == values[0]
select_dim = select_dim.where(mask, drop=True)
else:
select_dim = select_dim.sel({dimension: values[0]}, drop=True)

data_collections.add_variable_to_collection(cgv_new[0], cgv_new[1],
cgv_new[2], select_dim)
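As a usage illustration, a hypothetical configuration for pulling a fixed-longitude transect out of a temperature field, following the shape of the docstring example above (the collection, group, and variable names are made up, and the transform is called directly here only for illustration; in eva it would normally be driven from the transforms section of a YAML configuration):

config = {
    'collections': ['experiment'],
    'groups': ['SOCAgrid'],
    'variables': ['Temp'],
    'new name': 'experiment::SOCAgrid::Temp_transect',
    'starting field': 'experiment::SOCAgrid::Temp',
    'dimension': 'lon',
    'value': -140              # or 'start value'/'end value' for a band of longitudes
}
select_dimension(config, data_collections)

On the curvilinear SOCA grid the lat/lon coordinates are two dimensional, so the transform takes the boolean-mask branch (where(..., drop=True)) rather than a positional sel along a 1-D coordinate.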