
fix: gower normalization and site-based score_data return with no user data input #262


Closed · wants to merge 5 commits
274 changes: 150 additions & 124 deletions soil_id/us_soil.py
@@ -29,11 +29,10 @@
import soil_id.config

from .color import getProfileLAB, lab2munsell, munsell2rgb
from .services import get_soil_series_data, get_soilweb_data, sda_return
from .services import get_soil_series_data, get_soilweb_data, sda_return, get_elev_data
from .soil_sim import soil_sim
from .utils import (
aggregate_data,
compute_site_similarity,
drop_cokey_horz,
extract_mucompdata_STATSGO,
extract_muhorzdata_STATSGO,
@@ -1574,12 +1573,10 @@ def rank_soils(
# Adjust the bottom depth based on bedrock depth
if bedrock is not None:
if bedrock is not soil_df.empty and soil_df["bottom"].iloc[-1] > bedrock:
# Find the last valid index where bottom depth is less than or equal to bedrock
last_valid_index = soil_df.loc[soil_df["bottom"] <= bedrock].index[-1]
# Filter the DataFrame up to the last valid index
soil_df = soil_df.loc[:last_valid_index].copy()
# Set the bottom depth of the last row to the bedrock depth
soil_df.at[last_valid_index, "bottom"] = bedrock
# Remove rows where top depth is already below bedrock
soil_df = soil_df[soil_df["top"] < bedrock].copy()
# If any remaining row has bottom depth exceeding bedrock, truncate it
soil_df.loc[soil_df["bottom"] > bedrock, "bottom"] = bedrock
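A minimal sketch of the new truncation rule with hypothetical data (column names as in the diff): horizons starting at or below bedrock are dropped, and the last remaining bottom depth is clamped.

import pandas as pd

# Hypothetical profile; bedrock at 50 cm
soil_df = pd.DataFrame({"top": [0, 20, 40, 60], "bottom": [20, 40, 60, 80]})
bedrock = 50

soil_df = soil_df[soil_df["top"] < bedrock].copy()            # drops the 60-80 horizon
soil_df.loc[soil_df["bottom"] > bedrock, "bottom"] = bedrock  # 40-60 becomes 40-50
# Remaining horizons: (0, 20), (20, 40), (40, 50)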

# Drop the original horizonDepth column
soil_df.drop(columns=["horizonDepth"], inplace=True)
@@ -1610,60 +1607,73 @@ def rank_soils(
rfvDepth = soil_df.rfvDepth.tolist()
horizonDepthB = [int(x) for x in soil_df.bottom.tolist()]
horizonDepthT = [int(x) for x in soil_df.top.tolist()]
lab_Color = soil_df.lab_Color
lab_series = soil_df.lab_Color

# Determine the maximum depth for user-specified horizons
if not horizonDepthB:
max_user_depth = 0
else:
max_user_depth = max(horizonDepthB)
if bedrock is not None:
max_user_depth = min(bedrock, max_user_depth)

# Generate user-specified percent clay, sand, and rfv distributions
spt = [getSand(sh) for sh in soilHorizon]
cpt = [getClay(sh) for sh in soilHorizon]
p_cfg = [getCF_fromClass(rf) for rf in rfvDepth]

p_sandpct_intpl = [
spt[i]
for i in range(len(soilHorizon))
for _ in range(horizonDepthT[i], horizonDepthB[i])
]
p_claypct_intpl = [
cpt[i]
for i in range(len(soilHorizon))
for _ in range(horizonDepthT[i], horizonDepthB[i])
]
p_cfg_intpl = [
p_cfg[i]
for i in range(len(soilHorizon))
for _ in range(horizonDepthT[i], horizonDepthB[i])
]
# Initialize full-length property arrays with NaNs
sand_array = [np.nan] * max_user_depth
clay_array = [np.nan] * max_user_depth
cfg_array = [np.nan] * max_user_depth

for i in range(len(soilHorizon)):
t = horizonDepthT[i]
b = horizonDepthB[i]
if t >= max_user_depth:
continue
b = min(b, max_user_depth)
for d in range(t, b):
sand_array[d] = spt[i]
clay_array[d] = cpt[i]
cfg_array[d] = p_cfg[i]

p_sandpct_intpl = pd.DataFrame(sand_array)
p_claypct_intpl = pd.DataFrame(clay_array)
p_cfg_intpl = pd.DataFrame(cfg_array)
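For illustration, a self-contained sketch (hypothetical two-horizon profile) of how the per-horizon values are expanded into per-centimeter arrays capped at max_user_depth:

import numpy as np
import pandas as pd

# Hypothetical horizons 0-20 cm and 20-60 cm with sand percentages, bedrock at 50 cm
horizonDepthT, horizonDepthB, spt = [0, 20], [20, 60], [65, 40]
max_user_depth = 50

sand_array = [np.nan] * max_user_depth
for i in range(len(spt)):
    t, b = horizonDepthT[i], min(horizonDepthB[i], max_user_depth)
    if t >= max_user_depth:
        continue
    for d in range(t, b):
        sand_array[d] = spt[i]

p_sandpct_intpl = pd.DataFrame(sand_array)
# sand_array[0:20] == 65, sand_array[20:50] == 40; nothing past bedrock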

# Length of interpolated texture and RF depth
p_bottom_depth = pd.DataFrame([-999, "sample_pedon", soil_df_slice.bottom.iloc[-1]]).T
p_bottom_depth = pd.DataFrame([-999, "sample_pedon", max_user_depth]).T
p_bottom_depth.columns = ["cokey", "compname", "bottom_depth"]

# Pedon color data
if not isinstance(lab_Color, pd.DataFrame):
    lab_Color = pd.DataFrame(lab_Color)
if not lab_Color.isnull().all().all():  # Use all().all() to check the entire DataFrame
    lab_Color = lab_Color.apply(
        lambda x: [np.nan, np.nan, np.nan] if x.isnull().all() else x, axis=1
    )
    p_lab_intpl = [
        lab_Color.iloc[i].tolist()
        for i in range(len(lab_Color))
        for _ in range(horizonDepthT[i], horizonDepthB[i])
    ]
    p_lab_intpl_list = [item[0] for item in p_lab_intpl]  # Access the inner list
    p_lab_intpl = pd.DataFrame(p_lab_intpl_list, columns=["L", "A", "B"]).reset_index(
        drop=True
    )
else:
    lab_Color = lab_Color.dropna()  # Remove rows where all elements are None
    p_lab_intpl = pd.DataFrame(
        np.nan, index=np.arange(200), columns=["L", "A", "B"]
    ).reset_index(drop=True)

lab_array = [[np.nan, np.nan, np.nan] for _ in range(max_user_depth)]

# Force correct structure
lab_cleaned = [
    row if (isinstance(row, (list, tuple)) and len(row) == 3) else [np.nan, np.nan, np.nan]
    for row in lab_series
]

# now unpack into three columns
lab_Color = pd.DataFrame(lab_cleaned, columns=["L", "A", "B"])

# Interpolate colors across depth
for i in range(len(lab_Color)):
    t = horizonDepthT[i]
    b = horizonDepthB[i]
    if t >= max_user_depth:
        continue
    b = min(b, max_user_depth)
    color_val = (
        lab_Color.iloc[i].tolist()
        if not pd.isnull(lab_Color.iloc[i]).all()
        else [np.nan, np.nan, np.nan]
    )
    for d in range(t, b):
        lab_array[d] = color_val

# Adjust depth interval for each dataset
p_sandpct_intpl = adjust_depth_interval(p_sandpct_intpl)
p_claypct_intpl = adjust_depth_interval(p_claypct_intpl)
p_cfg_intpl = adjust_depth_interval(p_cfg_intpl)
p_lab_intpl = adjust_depth_interval(p_lab_intpl)
p_lab_intpl = pd.DataFrame(lab_array, columns=["L", "A", "B"]).reset_index(drop=True)
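The cleaning step in isolation, with hypothetical inputs: anything that is not a 3-element CIELAB triplet is coerced to [nan, nan, nan] before being unpacked into L/A/B columns, so a missing or malformed color can no longer break the DataFrame construction.

import numpy as np
import pandas as pd

# Hypothetical user colors: one valid triplet, one None, one malformed pair
lab_series = [[41.2, 5.3, 12.8], None, [50.0, 4.1]]

lab_cleaned = [
    row if (isinstance(row, (list, tuple)) and len(row) == 3) else [np.nan, np.nan, np.nan]
    for row in lab_series
]
lab_Color = pd.DataFrame(lab_cleaned, columns=["L", "A", "B"])
# Rows 1 and 2 become all-NaN; row 0 keeps its values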

# Construct final dataframe with adjusted data
p_compname = pd.Series("sample_pedon", index=np.arange(len(p_sandpct_intpl)))
Expand Down Expand Up @@ -1798,6 +1808,20 @@ def rank_soils(
horz_vars = [p_hz_data]
horz_vars.extend([group.reset_index(drop=True).loc[pedon_slice_index] for group in groups])

global_prop_bounds = {
"sandpct_intpl": (10.0, 92.0),
"claypct_intpl": (5.0, 70.0),
"rfv_intpl": (0.0, 80.0),
"l": (10.0, 95.0),
"a": (-10.0, 35.0),
"b": (-10.0, 60.0),
}

# numeric range (upper - lower):
global_prop_ranges = {
prop: upper - lower for prop, (lower, upper) in global_prop_bounds.items()
}
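These fixed bounds are what make the Gower normalization in the PR title deterministic: each property is scaled by a theoretical range rather than the observed min/max of whichever components fall in a slice, so every per-variable term lands in [0, 1], and the later dis_max can simply be the constant 1.0. A small worked example:

global_prop_bounds = {"claypct_intpl": (5.0, 70.0)}
global_prop_ranges = {prop: hi - lo for prop, (lo, hi) in global_prop_bounds.items()}

# claypct range = 65.0, so a 13-point clay difference always scores
# 13 / 65 = 0.2, no matter which components happen to share the slice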

# Calculate similarity for each depth slice
dis_mat_list = []
for i in (
@@ -1825,9 +1849,13 @@
.drop(["compname"], axis=1)
.columns.tolist()
)
sliceT = sliceT[sample_pedon_slice_vars]

D = gower_distances(sliceT)  # Equal weighting given to all soil variables

theoretical_prop_ranges = [global_prop_ranges[c] for c in sample_pedon_slice_vars]
D = gower_distances(
    sliceT, theoretical_ranges=theoretical_prop_ranges
)  # Equal weighting given to all soil variables

dis_mat_list.append(D)
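Only the signature of gower_distances with theoretical_ranges is taken from the diff; as a hedged sketch of what a range-normalized Gower computation looks like (internals assumed, not the repo's implementation):

import numpy as np

def gower_distances_sketch(X, theoretical_ranges):
    """Pairwise Gower distances with fixed per-variable normalizers (assumed logic)."""
    X = np.asarray(X, dtype=float)
    ranges = np.asarray(theoretical_ranges, dtype=float)
    n = X.shape[0]
    D = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            # Per-variable terms clipped to [0, 1]; NaN inputs leave the
            # pair NaN so the infill step below can decide what to do
            terms = np.clip(np.abs(X[i] - X[j]) / ranges, 0.0, 1.0)
            D[i, j] = terms.mean()
    return D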

# Determine if any components have all NaNs at every slice
@@ -1837,40 +1865,29 @@
"Not Ranked" if np.ma.is_masked(x) else "Ranked" for x in D_check[0][1:]
]

# Calculate maximum dissimilarity
dis_max_slice = [
np.nanmax(matrix) if not np.isnan(matrix).all() else np.nan for matrix in dis_mat_list
]
dis_max = np.nanmax(dis_max_slice)
dis_max_slice = [dis_max if np.isnan(x) else x for x in dis_max_slice]
# Maximum dissimilarity
dis_max = 1.0

# Apply depth weight
depth_weight = np.concatenate((np.repeat(0.2, 20), np.repeat(1.0, 180)), axis=0)
depth_weight = depth_weight[pedon_slice_index]

# Infill Nan data

# Update dis_mat_list using numpy operations
# Infill Nan data: soil vs non-soil logic
for i, dis_mat in enumerate(dis_mat_list):
soil_slice = soil_matrix.iloc[i, :]
soil_slice = soil_matrix.iloc[i, :].values.astype(bool)
pedon_idx = 0 # assuming sample_pedon is always row 0

# Identify where NaN values exist and where the corresponding soil_slice is 1
nan_and_soil_slice_is_one = np.isnan(dis_mat) & np.isin(
np.arange(dis_mat.shape[1]), np.where(soil_slice == 1)[0]
)

# Identify where NaN values exist and both corresponding values in soil_slice are 0
rows, cols = np.where(np.isnan(dis_mat))
both_zero_rows_cols = [
(row, col)
for row, col in zip(rows, cols)
if soil_slice[row] == 0 and soil_slice[col] == 0
]
# 1) If pedon has data here, any NaN in pedon↔component → max dissimilarity
if soil_slice[pedon_idx]:
# components are those indices where soil_slice[j] is False
nonsoil_j = np.where(~soil_slice)[0]
for j in nonsoil_j:
if np.isnan(dis_mat[pedon_idx, j]):
dis_mat[pedon_idx, j] = dis_max
dis_mat[j, pedon_idx] = dis_max

# Assign max dissimilarity or 0 based on conditions
dis_mat[nan_and_soil_slice_is_one] = dis_max
for row, col in both_zero_rows_cols:
dis_mat[row, col] = 0
# 2) Every other NaN (component–component or missing–missing) → zero
dis_mat[np.isnan(dis_mat)] = 0.0

dis_mat_list[i] = dis_mat
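A toy 3x3 slice makes the infill rule concrete (row 0 is the pedon, as the new pedon_idx comment assumes): a missing pedon-to-component comparison is penalized at dis_max, while component-to-component gaps are zeroed out.

import numpy as np

dis_mat = np.array([
    [0.0,    0.4,    np.nan],   # pedon vs itself, comp A, comp B
    [0.4,    0.0,    np.nan],
    [np.nan, np.nan, 0.0],
])
soil_slice = np.array([True, True, False])  # comp B has no data at this slice
dis_max, pedon_idx = 1.0, 0

if soil_slice[pedon_idx]:
    for j in np.where(~soil_slice)[0]:
        if np.isnan(dis_mat[pedon_idx, j]):
            dis_mat[pedon_idx, j] = dis_max
            dis_mat[j, pedon_idx] = dis_max
dis_mat[np.isnan(dis_mat)] = 0.0
# pedon vs comp B -> 1.0 (penalized); comp A vs comp B -> 0.0 (ignored)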

@@ -1888,32 +1905,69 @@
D_horz = None

# ---Site Data Similarity---
if pElev is None:
pElev_dict = get_elev_data(lon, lat)
try:
pElev = float(pElev_dict["value"])
except (KeyError, TypeError, ValueError):
pElev = None # or some default

# 1) “Raw” guard on the three possible site inputs:
provided = {
"slope_r": pSlope,
"elev_r": pElev,
"bottom_depth": bedrock, # this determines if bottom_depth is set
}

# Initialize variables for site similarity
p_slope = pd.DataFrame(["sample_pedon", pSlope, pElev]).T
p_slope.columns = ["compname", "slope_r", "elev_r"]

# Check conditions to determine the data columns and feature weights
if (pSlope is not None) and (p_bottom_depth.bottom_depth.any() > 0):
D_site = compute_site_similarity(
p_slope,
mucompdata_pd,
slices_of_soil,
["slope_r", "elev_r", "bottom_depth"],
feature_weight=np.array([1.0, 0.5, 0.5]),
)
# 2) Figure out which of those are actually non-null
features = [
name
for name, val in provided.items()
if val is not None and not (isinstance(val, float) and np.isnan(val))
]

# 3) If fewer than two features, skip entirely
if len(features) < 2:
D_site = None
else:
D_site = compute_site_similarity(
p_slope, mucompdata_pd, slices_of_soil, feature_weight=np.array([1.0, 0.5])
)
# 4) Build your one-row pedon DataFrame (only slope/elev, depth comes from merge)
pedon_dict = {"compname": "sample_pedon"}
if "slope_r" in features:
pedon_dict["slope_r"] = pSlope
if "elev_r" in features:
pedon_dict["elev_r"] = pElev
pedon_df = pd.DataFrame([pedon_dict])

# 5) Build your map-unit library (only the columns you need)
lib_cols = ["compname"] + [f for f in features if f in ("slope_r", "elev_r")]
lib_df = mucompdata_pd[lib_cols].copy()

# 6) Stack them together
full_df = pd.concat([pedon_df, lib_df], ignore_index=True)

# 7) If you need bottom_depth, merge it in for *all* rows
if "bottom_depth" in features:
full_df = full_df.merge(
slices_of_soil[["compname", "bottom_depth"]], on="compname", how="left"
)

# 8) Build your weight vector
DEFAULT_WEIGHTS = {"slope_r": 1.0, "elev_r": 0.5, "bottom_depth": 1.5}
weights = np.array([DEFAULT_WEIGHTS[f] for f in features])

# Adjust the distances and apply weight
site_wt = 0.5
D_site = (1 - D_site) * site_wt
# 9) Compute Gower distances on exactly those feature columns
site_mat = full_df.set_index("compname")[features]
D_raw = gower_distances(site_mat, feature_weight=weights)

# 10) Replace any NaNs with the max distance, then (optionally) convert to similarity
D_site = np.where(np.isnan(D_raw), np.nanmax(D_raw), D_raw)

site_wt = 0.5
D_site = (1 - D_site) * site_wt
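Putting the site branch together with hypothetical inputs (slope from the user, elevation fetched, no bedrock): only two features survive the guard, so the matrix and weight vector shrink to match instead of silently mixing in a missing column.

import numpy as np
import pandas as pd

pSlope, pElev, bedrock = 8.0, 1250.0, None  # hypothetical site inputs
provided = {"slope_r": pSlope, "elev_r": pElev, "bottom_depth": bedrock}
features = [
    name for name, val in provided.items()
    if val is not None and not (isinstance(val, float) and np.isnan(val))
]
# features == ["slope_r", "elev_r"] -> two features, so D_site is computed

pedon_df = pd.DataFrame([{"compname": "sample_pedon", "slope_r": pSlope, "elev_r": pElev}])
lib_df = pd.DataFrame({                     # stand-in for mucompdata_pd
    "compname": ["comp_a", "comp_b"],
    "slope_r": [5.0, 30.0],
    "elev_r": [1100.0, 2400.0],
})
full_df = pd.concat([pedon_df, lib_df], ignore_index=True)

DEFAULT_WEIGHTS = {"slope_r": 1.0, "elev_r": 0.5, "bottom_depth": 1.5}
weights = np.array([DEFAULT_WEIGHTS[f] for f in features])  # [1.0, 0.5]
site_mat = full_df.set_index("compname")[features]
# gower_distances(site_mat, feature_weight=weights) then yields a 3x3 matrix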

# Create the D_final dataframe based on the availability of D_horz and D_site data

# When both D_horz and D_site are available
# When both D_horz and D_site are available (relative weights: 66% horz, 33% site)
if D_horz is not None and D_site is not None:
D_site_hz = np.sum([D_site, D_horz], axis=0) / (1 + site_wt)
D_final = pd.concat(
@@ -2166,34 +2220,6 @@ def rank_soils(
return output_data


def adjust_depth_interval(data, target_length=200):
"""Adjusts the depth interval of user data."""

# Convert input to a DataFrame
if isinstance(data, list):
data = pd.DataFrame(data)
elif isinstance(data, pd.Series):
data = data.to_frame()

# Ensure data is a DataFrame at this point
if not isinstance(data, pd.DataFrame):
raise TypeError("Data must be a list, Series, or DataFrame")

length = len(data)

if length > target_length:
# Truncate data if it exceeds the target length
data = data.iloc[:target_length]
elif length < target_length:
# Extend data if it's shorter than the target length
add_length = target_length - length
add_data = pd.DataFrame(np.nan, index=np.arange(add_length), columns=data.columns)
data = pd.concat([data, add_data])

data.reset_index(drop=True, inplace=True)
return data


# Helper function to update dataframes based on depth conditions
def update_intpl_data(
df, col_names, values, very_bottom, OSD_depth_add, OSD_depth_remove, OSD_max_bottom_int