diff --git a/soil_id/us_soil.py b/soil_id/us_soil.py
index 6850938..c0ec692 100644
--- a/soil_id/us_soil.py
+++ b/soil_id/us_soil.py
@@ -29,11 +29,10 @@
 import soil_id.config
 
 from .color import getProfileLAB, lab2munsell, munsell2rgb
-from .services import get_soil_series_data, get_soilweb_data, sda_return
+from .services import get_soil_series_data, get_soilweb_data, sda_return, get_elev_data
 from .soil_sim import soil_sim
 from .utils import (
     aggregate_data,
-    compute_site_similarity,
     drop_cokey_horz,
     extract_mucompdata_STATSGO,
     extract_muhorzdata_STATSGO,
@@ -1574,12 +1573,10 @@ def rank_soils(
     # Adjust the bottom depth based on bedrock depth
     if bedrock is not None:
         if bedrock is not soil_df.empty and soil_df["bottom"].iloc[-1] > bedrock:
-            # Find the last valid index where bottom depth is less than or equal to bedrock
-            last_valid_index = soil_df.loc[soil_df["bottom"] <= bedrock].index[-1]
-            # Filter the DataFrame up to the last valid index
-            soil_df = soil_df.loc[:last_valid_index].copy()
-            # Set the bottom depth of the last row to the bedrock depth
-            soil_df.at[last_valid_index, "bottom"] = bedrock
+            # Remove rows where top depth is already below bedrock
+            soil_df = soil_df[soil_df["top"] < bedrock].copy()
+            # If any remaining row has bottom depth exceeding bedrock, truncate it
+            soil_df.loc[soil_df["bottom"] > bedrock, "bottom"] = bedrock
 
     # Drop the original horizonDepth column
     soil_df.drop(columns=["horizonDepth"], inplace=True)
@@ -1610,60 +1607,73 @@ def rank_soils(
     rfvDepth = soil_df.rfvDepth.tolist()
     horizonDepthB = [int(x) for x in soil_df.bottom.tolist()]
     horizonDepthT = [int(x) for x in soil_df.top.tolist()]
-    lab_Color = soil_df.lab_Color
+    lab_series = soil_df.lab_Color
+
+    # Determine the maximum depth for user specified horizons
+    if not horizonDepthB:
+        max_user_depth = 0
+    else:
+        max_user_depth = max(horizonDepthB)
+    if bedrock is not None:
+        max_user_depth = min(bedrock, max_user_depth)
 
     # Generate user specified percent clay, sand, and rfv distributions
     spt = [getSand(sh) for sh in soilHorizon]
     cpt = [getClay(sh) for sh in soilHorizon]
     p_cfg = [getCF_fromClass(rf) for rf in rfvDepth]
 
-    p_sandpct_intpl = [
-        spt[i]
-        for i in range(len(soilHorizon))
-        for _ in range(horizonDepthT[i], horizonDepthB[i])
-    ]
-    p_claypct_intpl = [
-        cpt[i]
-        for i in range(len(soilHorizon))
-        for _ in range(horizonDepthT[i], horizonDepthB[i])
-    ]
-    p_cfg_intpl = [
-        p_cfg[i]
-        for i in range(len(soilHorizon))
-        for _ in range(horizonDepthT[i], horizonDepthB[i])
-    ]
+    # Initialize full-length property arrays with NaNs
+    sand_array = [np.nan] * max_user_depth
+    clay_array = [np.nan] * max_user_depth
+    cfg_array = [np.nan] * max_user_depth
+
+    for i in range(len(soilHorizon)):
+        t = horizonDepthT[i]
+        b = horizonDepthB[i]
+        if t >= max_user_depth:
+            continue
+        b = min(b, max_user_depth)
+        for d in range(t, b):
+            sand_array[d] = spt[i]
+            clay_array[d] = cpt[i]
+            cfg_array[d] = p_cfg[i]
+
+    p_sandpct_intpl = pd.DataFrame(sand_array)
+    p_claypct_intpl = pd.DataFrame(clay_array)
+    p_cfg_intpl = pd.DataFrame(cfg_array)
 
     # Length of interpolated texture and RF depth
-    p_bottom_depth = pd.DataFrame([-999, "sample_pedon", soil_df_slice.bottom.iloc[-1]]).T
+    p_bottom_depth = pd.DataFrame([-999, "sample_pedon", max_user_depth]).T
     p_bottom_depth.columns = ["cokey", "compname", "bottom_depth"]
 
     # Pedon color data
-    if not isinstance(lab_Color, pd.DataFrame):
-        lab_Color = pd.DataFrame(lab_Color)
-    if not lab_Color.isnull().all().all():  # Use all().all() to check the entire DataFrame
-        lab_Color = lab_Color.apply(
-            lambda x: [np.nan, np.nan, np.nan] if x.isnull().all() else x, axis=1
-        )
-        p_lab_intpl = [
+    lab_array = [[np.nan, np.nan, np.nan] for _ in range(max_user_depth)]
+
+    # Force correct structure
+    lab_cleaned = [
+        row if (isinstance(row, (list, tuple)) and len(row) == 3) else [np.nan, np.nan, np.nan]
+        for row in lab_series
+    ]
+
+    # now unpack into three columns
+    lab_Color = pd.DataFrame(lab_cleaned, columns=["L", "A", "B"])
+
+    # Interpolate colors across depth
+    for i in range(len(lab_Color)):
+        t = horizonDepthT[i]
+        b = horizonDepthB[i]
+        if t >= max_user_depth:
+            continue
+        b = min(b, max_user_depth)
+        color_val = (
             lab_Color.iloc[i].tolist()
-            for i in range(len(lab_Color))
-            for _ in range(horizonDepthT[i], horizonDepthB[i])
-        ]
-        p_lab_intpl_list = [item[0] for item in p_lab_intpl]  # Access the inner list
-        p_lab_intpl = pd.DataFrame(p_lab_intpl_list, columns=["L", "A", "B"]).reset_index(
-            drop=True
+            if not pd.isnull(lab_Color.iloc[i]).all()
+            else [np.nan, np.nan, np.nan]
         )
-    else:
-        lab_Color = lab_Color.dropna()  # Remove rows where all elements are None
-        p_lab_intpl = pd.DataFrame(
-            np.nan, index=np.arange(200), columns=["L", "A", "B"]
-        ).reset_index(drop=True)
+        for d in range(t, b):
+            lab_array[d] = color_val
 
-    # Adjust depth interval for each dataset
-    p_sandpct_intpl = adjust_depth_interval(p_sandpct_intpl)
-    p_claypct_intpl = adjust_depth_interval(p_claypct_intpl)
-    p_cfg_intpl = adjust_depth_interval(p_cfg_intpl)
-    p_lab_intpl = adjust_depth_interval(p_lab_intpl)
+    p_lab_intpl = pd.DataFrame(lab_array, columns=["L", "A", "B"]).reset_index(drop=True)
 
     # Construct final dataframe with adjusted data
     p_compname = pd.Series("sample_pedon", index=np.arange(len(p_sandpct_intpl)))
@@ -1798,6 +1808,20 @@ def rank_soils(
     horz_vars = [p_hz_data]
     horz_vars.extend([group.reset_index(drop=True).loc[pedon_slice_index] for group in groups])
 
+    global_prop_bounds = {
+        "sandpct_intpl": (10.0, 92.0),
+        "claypct_intpl": (5.0, 70.0),
+        "rfv_intpl": (0.0, 80.0),
+        "l": (10.0, 95.0),
+        "a": (-10.0, 35.0),
+        "b": (-10.0, 60.0),
+    }
+
+    # numeric range (upper – lower):
+    global_prop_ranges = {
+        prop: upper - lower for prop, (lower, upper) in global_prop_bounds.items()
+    }
+
     # Calculate similarity for each depth slice
     dis_mat_list = []
     for i in (
@@ -1825,9 +1849,13 @@ def rank_soils(
             .drop(["compname"], axis=1)
            .columns.tolist()
         )
-        sliceT = sliceT[sample_pedon_slice_vars]
+        sliceT = sliceT[sample_pedon_slice_vars]
+
+        theoretical_prop_ranges = [global_prop_ranges[c] for c in sample_pedon_slice_vars]
+        D = gower_distances(
+            sliceT, theoretical_ranges=theoretical_prop_ranges
+        )  # Equal weighting given to all soil variables
-        D = gower_distances(sliceT)  # Equal weighting given to all soil variables
         dis_mat_list.append(D)
 
     # Determine if any components have all NaNs at every slice
         "Not Ranked" if np.ma.is_masked(x) else "Ranked" for x in D_check[0][1:]
     ]
 
-    # Calculate maximum dissimilarity
-    dis_max_slice = [
-        np.nanmax(matrix) if not np.isnan(matrix).all() else np.nan for matrix in dis_mat_list
-    ]
-    dis_max = np.nanmax(dis_max_slice)
-    dis_max_slice = [dis_max if np.isnan(x) else x for x in dis_max_slice]
+    # Maximum dissimilarity
+    dis_max = 1.0
 
     # Apply depth weight
     depth_weight = np.concatenate((np.repeat(0.2, 20), np.repeat(1.0, 180)), axis=0)
     depth_weight = depth_weight[pedon_slice_index]
 
-    # Infill Nan data
-
-    # Update dis_mat_list using numpy operations
+    # Infill Nan data: soil vs non‑soil logic
     for i, dis_mat in enumerate(dis_mat_list):
-        soil_slice = soil_matrix.iloc[i, :]
+        soil_slice = soil_matrix.iloc[i, :].values.astype(bool)
+        pedon_idx = 0  # assuming sample_pedon is always row 0
 
-        # Identify where NaN values exist and where the corresponding soil_slice is 1
-        nan_and_soil_slice_is_one = np.isnan(dis_mat) & np.isin(
-            np.arange(dis_mat.shape[1]), np.where(soil_slice == 1)[0]
-        )
-
-        # Identify where NaN values exist and both corresponding values in soil_slice are 0
-        rows, cols = np.where(np.isnan(dis_mat))
-        both_zero_rows_cols = [
-            (row, col)
-            for row, col in zip(rows, cols)
-            if soil_slice[row] == 0 and soil_slice[col] == 0
-        ]
+        # 1) If pedon has data here, any NaN in pedon↔component → max dissimilarity
+        if soil_slice[pedon_idx]:
+            # components are those indices where soil_slice[j] is False
+            nonsoil_j = np.where(~soil_slice)[0]
+            for j in nonsoil_j:
+                if np.isnan(dis_mat[pedon_idx, j]):
+                    dis_mat[pedon_idx, j] = dis_max
+                    dis_mat[j, pedon_idx] = dis_max
 
-        # Assign max dissimilarity or 0 based on conditions
-        dis_mat[nan_and_soil_slice_is_one] = dis_max
-        for row, col in both_zero_rows_cols:
-            dis_mat[row, col] = 0
+        # 2) Every other NaN (component–component or missing–missing) → zero
+        dis_mat[np.isnan(dis_mat)] = 0.0
 
         dis_mat_list[i] = dis_mat
@@ -1888,32 +1905,69 @@ def rank_soils(
         D_horz = None
 
     # ---Site Data Similarity---
+    if pElev is None:
+        pElev_dict = get_elev_data(lon, lat)
+        try:
+            pElev = float(pElev_dict["value"])
+        except (KeyError, TypeError, ValueError):
+            pElev = None  # or some default
+
+    # 1) “Raw” guard on the three possible site inputs:
+    provided = {
+        "slope_r": pSlope,
+        "elev_r": pElev,
+        "bottom_depth": bedrock,  # this determines if bottom_depth is set
+    }
 
-    # Initialize variables for site similarity
-    p_slope = pd.DataFrame(["sample_pedon", pSlope, pElev]).T
-    p_slope.columns = ["compname", "slope_r", "elev_r"]
-
-    # Check conditions to determine the data columns and feature weights
-    if (pSlope is not None) and (p_bottom_depth.bottom_depth.any() > 0):
-        D_site = compute_site_similarity(
-            p_slope,
-            mucompdata_pd,
-            slices_of_soil,
-            ["slope_r", "elev_r", "bottom_depth"],
-            feature_weight=np.array([1.0, 0.5, 0.5]),
-        )
+    # 2) Figure out which of those are actually non-null
+    features = [
+        name
+        for name, val in provided.items()
+        if val is not None and not (isinstance(val, float) and np.isnan(val))
+    ]
+
+    # 3) If fewer than two features, skip entirely
+    if len(features) < 2:
+        D_site = None
     else:
-        D_site = compute_site_similarity(
-            p_slope, mucompdata_pd, slices_of_soil, feature_weight=np.array([1.0, 0.5])
-        )
+        # 4) Build your one‐row pedon DataFrame (only slope/elev, depth comes from merge)
+        pedon_dict = {"compname": "sample_pedon"}
+        if "slope_r" in features:
+            pedon_dict["slope_r"] = pSlope
+        if "elev_r" in features:
+            pedon_dict["elev_r"] = pElev
+        pedon_df = pd.DataFrame([pedon_dict])
+
+        # 5) Build your map‐unit library (only the columns you need)
+        lib_cols = ["compname"] + [f for f in features if f in ("slope_r", "elev_r")]
+        lib_df = mucompdata_pd[lib_cols].copy()
+
+        # 6) Stack them together
+        full_df = pd.concat([pedon_df, lib_df], ignore_index=True)
+
+        # 7) If you need bottom_depth, merge it in for *all* rows
+        if "bottom_depth" in features:
+            full_df = full_df.merge(
+                slices_of_soil[["compname", "bottom_depth"]], on="compname", how="left"
+            )
+
+        # 8) Build your weight vector
+        DEFAULT_WEIGHTS = {"slope_r": 1.0, "elev_r": 0.5, "bottom_depth": 1.5}
+        weights = np.array([DEFAULT_WEIGHTS[f] for f in features])
 
-    # Adjust the distances and apply weight
-    site_wt = 0.5
-    D_site = (1 - D_site) * site_wt
+        # 9) Compute Gower distances on exactly those feature columns
+        site_mat = full_df.set_index("compname")[features]
+        D_raw = gower_distances(site_mat, feature_weight=weights)
+
+        # 10) Replace any NaNs with the max distance, then (optionally) convert to similarity
+        D_site = np.where(np.isnan(D_raw), np.nanmax(D_raw), D_raw)
+
+        site_wt = 0.5
+        D_site = (1 - D_site) * site_wt
 
     # Create the D_final dataframe based on the availability of D_horz and D_site data
 
-    # When both D_horz and D_site are available
+    # When both D_horz and D_site are available (relative weights: 66% horz, 33% site)
     if D_horz is not None and D_site is not None:
         D_site_hz = np.sum([D_site, D_horz], axis=0) / (1 + site_wt)
         D_final = pd.concat(
@@ -2166,34 +2220,6 @@ def rank_soils(
     return output_data
 
 
-def adjust_depth_interval(data, target_length=200):
-    """Adjusts the depth interval of user data."""
-
-    # Convert input to a DataFrame
-    if isinstance(data, list):
-        data = pd.DataFrame(data)
-    elif isinstance(data, pd.Series):
-        data = data.to_frame()
-
-    # Ensure data is a DataFrame at this point
-    if not isinstance(data, pd.DataFrame):
-        raise TypeError("Data must be a list, Series, or DataFrame")
-
-    length = len(data)
-
-    if length > target_length:
-        # Truncate data if it exceeds the target length
-        data = data.iloc[:target_length]
-    elif length < target_length:
-        # Extend data if it's shorter than the target length
-        add_length = target_length - length
-        add_data = pd.DataFrame(np.nan, index=np.arange(add_length), columns=data.columns)
-        data = pd.concat([data, add_data])
-
-    data.reset_index(drop=True, inplace=True)
-    return data
-
-
 # Helper function to update dataframes based on depth conditions
 def update_intpl_data(
     df, col_names, values, very_bottom, OSD_depth_add, OSD_depth_remove, OSD_max_bottom_int
diff --git a/soil_id/utils.py b/soil_id/utils.py
index 1888293..6c7815d 100644
--- a/soil_id/utils.py
+++ b/soil_id/utils.py
@@ -814,231 +814,148 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
     return X, Y
 
 
-def gower_distances(X, Y=None, feature_weight=None, categorical_features=None):
+def gower_distances(
+    X, Y=None, feature_weight=None, categorical_features=None, theoretical_ranges=None
+):
     """
-    Computes the gower distances between X and Y.
-    Gower is a similarity measure for categorical, boolean, and numerical mixed data.
+    Computes the Gower distances between X and Y using mixed-type data,
+    with optional hybrid normalization: per-slice min/max with a floor
+    based on theoretical feature ranges.
 
     Parameters:
     ----------
-    X : array-like, or pd.DataFrame
-        Shape (n_samples, n_features)
-    Y : array-like, or pd.DataFrame, optional
-        Shape (n_samples, n_features)
-    feature_weight : array-like, optional
-        Shape (n_features). According to the Gower formula, it's an attribute weight.
-    categorical_features : array-like, optional
-        Shape (n_features). Indicates whether a column is a categorical attribute.
-
-    Returns:
-    -------
-    ndarray : Gower distances. Shape (n_samples, n_samples)
+    X : array-like or pd.DataFrame, shape (n_samples, n_features)
+    Y : array-like or pd.DataFrame, shape (m_samples, n_features), optional
+    feature_weight : array-like, shape (n_features,), optional
+    categorical_features : array-like of bools or indices, optional
+    theoretical_ranges : array-like, shape (n_numeric_features,), optional
+        If provided, the floor for each numeric feature's denominator is
+        set to 10% of its theoretical span.
""" - + # Reject sparse inputs if issparse(X) or (Y is not None and issparse(Y)): raise TypeError("Sparse matrices are not supported for gower distance") - # Ensure arrays are numpy arrays + # Ensure numpy arrays and pairwise shape X = np.asarray(X) - dtype = ( object - if not np.issubdtype(X.dtype, np.number) or np.isnan(X.sum()) + if not np.issubdtype(X.dtype, np.number) or np.isnan(X).any() else type(np.zeros(1, X.dtype).flat[0]) ) X, Y = check_pairwise_arrays(X, Y, dtype=dtype) - n_rows, n_cols = X.shape + # Determine categorical features mask if categorical_features is None: - categorical_features = np.array( - [not np.issubdtype(type(val), np.number) for val in X[0, :]] - ) + mask = [not np.issubdtype(type(val), np.number) for val in X[0]] + categorical_features = np.array(mask, dtype=bool) else: categorical_features = np.array(categorical_features) + if categorical_features.dtype == int: + mask = np.zeros(n_cols, dtype=bool) + mask[categorical_features] = True + categorical_features = mask - if np.issubdtype(categorical_features.dtype, int): - new_categorical_features = np.zeros(n_cols, dtype=bool) - new_categorical_features[categorical_features] = True - categorical_features = new_categorical_features - - # Split data into categorical and numeric + # Split data X_cat = X[:, categorical_features] X_num = X[:, ~categorical_features] - # Calculate ranges and max values for normalization - max_of_numeric = np.nanmax(X_num, axis=0) - min_of_numeric = np.nanmin(X_num, axis=0) - ranges_of_numeric = max_of_numeric - min_of_numeric - ranges_of_numeric[ranges_of_numeric == 0] = 1 + # Hybrid numeric normalization + slice_max = np.nanmax(X_num, axis=0) + slice_min = np.nanmin(X_num, axis=0) + slice_range = slice_max - slice_min - # Normalize numeric data - X_num = (X_num - min_of_numeric) / ranges_of_numeric + if theoretical_ranges is None: + # avoid division by zero + denom = np.where(slice_range == 0, 1.0, slice_range) + else: + theo = np.array(theoretical_ranges, dtype=float) + floor = 0.1 * theo + denom = np.maximum(slice_range, floor) + + X_num = (X_num - slice_min) / denom # Handle feature weights if feature_weight is None: - feature_weight = np.ones(X_num.shape[1]) - + feature_weight = np.ones(X_num.shape[1], dtype=float) feature_weight_num = feature_weight[~categorical_features] + feature_weight_cat = feature_weight[categorical_features] - # Conditional processing for Y + # Process Y if Y is not None: Y_cat = Y[:, categorical_features] Y_num = Y[:, ~categorical_features] - # Normalize numeric data safely for Y_num - Y_num = np.where( - ranges_of_numeric != 0, (Y_num - min_of_numeric) / ranges_of_numeric, Y_num - ) + if theoretical_ranges is None: + Y_num = np.where(denom != 0, (Y_num - slice_min) / denom, Y_num) + else: + Y_num = (Y_num - slice_min) / denom else: Y_cat = X_cat.copy() Y_num = X_num.copy() - # Ensure feature_weight_cat is defined - feature_weight_cat = feature_weight[categorical_features] - - # # Handle feature weights - # if feature_weight is None: - # feature_weight = np.ones(n_cols) - - # feature_weight_cat = feature_weight[categorical_features] - # feature_weight_num = feature_weight[~categorical_features] - - # Y_cat = X_cat if Y is None else Y[:, categorical_features] - # Y_num = X_num if Y is None else Y[:, ~categorical_features] - # Y_num /= max_of_numeric - + # Compute pairwise distances dm = np.zeros((n_rows, Y.shape[0]), dtype=np.float32) - - # Calculate pairwise gower distances - for i in range(X_num.shape[0]): + total_weight = feature_weight.sum() + for i in 
         start = i if Y is None else 0
-        result = _gower_distance_row(
-            X_cat[i, :],
-            X_num[i, :],
-            Y_cat[start:, :],
-            Y_num[start:, :],
+        row = _gower_distance_row(
+            X_cat[i],
+            X_num[i],
+            Y_cat[start:],
+            Y_num[start:],
             feature_weight_cat,
             feature_weight_num,
-            feature_weight.sum(),
-            categorical_features,
-            ranges_of_numeric,
-            max_of_numeric,
+            total_weight,
         )
-        dm[i, start:] = result
-        if Y is None:  # If Y is not provided, the matrix is symmetric
-            dm[start:, i] = result
-
+        dm[i, start:] = row
+        if Y is None:
+            dm[start:, i] = row
     return dm
 
-    # # Calculate pairwise gower distances
-    # for i in range(n_rows):
-    #     start = i if Y is None else 0
-    #     result = _gower_distance_row(
-    #         X_cat[i, :],
-    #         X_num[i, :],
-    #         Y_cat[start:, :],
-    #         Y_num[start:, :],
-    #         feature_weight_cat,
-    #         feature_weight_num,
-    #         feature_weight.sum(),
-    #         categorical_features,
-    #         ranges_of_numeric,
-    #         max_of_numeric,
-    #     )
-    #     dm[i, start:] = result
-    #     if Y is None:  # If Y is not provided, the matrix is symmetric
-    #         dm[start:, i] = result
-
-    # return dm
 
 
 def _gower_distance_row(
-    xi_cat,
-    xi_num,
-    xj_cat,
-    xj_num,
-    feature_weight_cat,
-    feature_weight_num,
-    feature_weight_sum,
-    categorical_features,
-    ranges_of_numeric,
-    max_of_numeric,
+    xi_cat, xi_num, xj_cat, xj_num, feature_weight_cat, feature_weight_num, feature_weight_sum
 ):
     """
-    Compute the Gower distance between a single row and a set of rows.
-
-    This function calculates the Gower distance between a single data point (xi)
-    and a set of data points (xj). Both categorical and numerical features are
-    considered in the calculation.
-
-    Parameters:
-    - xi_cat: Categorical data for xi.
-    - xi_num: Numerical data for xi.
-    - xj_cat: Categorical data for xj.
-    - xj_num: Numerical data for xj.
-    - feature_weight_cat: Weights for categorical features.
-    - feature_weight_num: Weights for numerical features.
-    - feature_weight_sum: Sum of all feature weights.
-    - ranges_of_numeric: Normalized ranges for numeric features.
-
-    Returns:
-    - Gower distance between xi and each row in xj.
+    Compute Gower distance between one row xi and rows xj. xi_num and xj_num
+    are already normalized to [0,1] using the hybrid approach.
     """
+    # Categorical distance (0 if equal, 1 if not)
+    sij_cat = (xi_cat != xj_cat).astype(int)
+    sum_cat = np.dot(sij_cat, feature_weight_cat)
 
-    # Calculate distance for categorical data
-    sij_cat = np.where(xi_cat == xj_cat, 0, 1)
-    sum_cat = np.sum(feature_weight_cat * sij_cat, axis=1)
-
-    # Calculate distance for numerical data
-    abs_delta = np.abs(xi_num - xj_num)
-    sij_num = np.divide(
-        abs_delta,
-        ranges_of_numeric,
-        out=np.zeros_like(abs_delta),
-        where=ranges_of_numeric != 0,
-    )
-    sum_num = np.sum(feature_weight_num * sij_num, axis=1)
-
-    # Combine distances for categorical and numerical data
-    sum_sij = (sum_cat + sum_num) / feature_weight_sum
+    # Numeric distance (absolute difference, already normalized)
+    sum_num = np.dot(np.abs(xi_num - xj_num), feature_weight_num)
 
-    return sum_sij
+    # Combined
+    return (sum_cat + sum_num) / feature_weight_sum
 
 
 def compute_site_similarity(
-    p_slope, mucompdata, slices, additional_columns=None, feature_weight=None
-):
+    data: pd.DataFrame, features: list[str], feature_weight: np.ndarray
+) -> np.ndarray:
     """
-    Compute gower distances for site similarity based on the provided feature weights.
+    Compute Gower distances among the rows of `data` using only `features`
+    and the matching weights in `feature_weight`.
 
     Parameters:
-    - p_slope: DataFrame containing sample_pedon information.
-    - mucompdata: DataFrame containing component data.
-    - slices: DataFrame containing slices of soil.
-    - additional_columns: List of additional columns to consider.
-    - feature_weight: Array of weights for features.
+    - data: DataFrame with columns "compname" + at least all names in `features`.
+    - features: list of column names to include in the distance.
+    - feature_weight: 1D array of floats, same length as `features`.
 
     Returns:
-    - D_site: Gower distances array for site similarity.
+    - (n×n) numpy array of Gower distances, with NaNs replaced by the max distance.
     """
-    # Combine pedon slope data with component data
-    site_vars = pd.concat([p_slope, mucompdata[["compname", "slope_r", "elev_r"]]], axis=0)
-
-    # If additional columns are specified, merge them
-    if additional_columns:
-        site_vars = pd.merge(slices, site_vars, on="compname", how="left")
-        site_mat = site_vars[additional_columns]
-    else:
-        site_mat = site_vars[["slope_r", "elev_r"]]
-
-    site_mat = site_mat.set_index(slices.compname.values)
-
-    # Compute the gower distances
-    D_site = gower_distances(site_mat, feature_weight=feature_weight)
+    # 1) subset & index by compname
+    site_mat = data.set_index("compname")[features]
 
-    # Replace NaN values with the maximum value in the array
-    D_site = np.where(np.isnan(D_site), np.nanmax(D_site), D_site)
+    # 2) compute
+    D = gower_distances(site_mat, feature_weight=feature_weight)
 
-    return D_site
+    # 3) fill NaNs
+    D = np.where(np.isnan(D), np.nanmax(D), D)
+    return D
 
 
 def compute_text_comp(bedrock, p_sandpct_intpl, soilHorizon):
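
Reviewer note on the new `theoretical_ranges` parameter: the key numeric change in `gower_distances` is the hybrid denominator, `denom = max(per-slice range, 0.1 * theoretical range)`, so a depth slice where a property barely varies is no longer stretched to fill the whole [0, 1] interval. The snippet below is a minimal standalone sketch of that normalization only (plain numpy; the helper name `hybrid_normalize` and the toy values are illustrative, not part of the patch):

    import numpy as np

    def hybrid_normalize(X_num, theoretical_ranges=None, floor_frac=0.1):
        """Scale numeric columns by the per-slice range, floored at a fraction
        of each feature's theoretical span (mirroring the patched gower_distances)."""
        slice_min = np.nanmin(X_num, axis=0)
        slice_range = np.nanmax(X_num, axis=0) - slice_min
        if theoretical_ranges is None:
            # plain Gower scaling; guard against zero-range columns
            denom = np.where(slice_range == 0, 1.0, slice_range)
        else:
            denom = np.maximum(slice_range, floor_frac * np.asarray(theoretical_ranges, dtype=float))
        return (X_num - slice_min) / denom

    # Toy slice: clay % barely varies (29-31), sand % varies a lot (20-70).
    X = np.array([[30.0, 55.0], [29.0, 20.0], [31.0, 70.0]])
    plain = hybrid_normalize(X)  # clay spread is stretched to fill [0, 1]
    # Theoretical spans taken from global_prop_bounds: clay 70-5=65, sand 92-10=82
    hybrid = hybrid_normalize(X, theoretical_ranges=[65.0, 82.0])  # clay spread stays small
    print(plain[:, 0], hybrid[:, 0])

Because every normalized numeric difference stays within [0, 1] under this scheme, a fixed upper bound on slice dissimilarity is available, which appears to be why the patch replaces the per-slice `dis_max_slice` computation with `dis_max = 1.0` in `rank_soils`.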