Skip to content

Commit 8f4e7ff

Browse files
committed
perf: optimize color similarity step
1 parent 8daa864 commit 8f4e7ff

File tree

1 file changed

+35
-19
lines changed

1 file changed

+35
-19
lines changed

soil_id/global_soil.py

Lines changed: 35 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1007,33 +1007,49 @@ def rank_soils_global(
10071007

10081008
# Calculate color similarity
10091009
if not cr_df.isnull().values.any():
1010-
color_sim = []
10111010
w_df, r_df, y_df = cr_df.iloc[0], cr_df.iloc[1], cr_df.iloc[2]
1012-
fao_list = [item.lower() for item in fao90]
10131011

1014-
for compname in D_final_horz.compname:
1015-
soilgroup = re.sub(r"\d+$", "", " ".join(compname.split()[1:])).lower()
1012+
# Vectorized computation of color probabilities
1013+
def norm_pdf_vec(x, mean_arr, std_arr):
    """Evaluate the normal probability density at ``x`` for many parameters.

    Args:
        x: Point (scalar or array-like broadcastable against the
            parameters) at which the density is evaluated.
        mean_arr: Array-like of distribution means.
        std_arr: Array-like of distribution standard deviations. Only the
            magnitude matters because the value is squared into a variance.

    Returns:
        Float64 NumPy result with the broadcast shape of the inputs. Entries
        whose standard deviation is zero are ``nan``.
    """
    mean_arr = np.asarray(mean_arr, dtype=np.float64)
    std_arr = np.asarray(std_arr, dtype=np.float64)
    var = np.square(std_arr)
    # A zero variance divides by zero in both the exponent and the scale
    # factor. Those entries come out as nan with or without the warnings,
    # so only the floating-point warnings are silenced here.
    with np.errstate(divide="ignore", invalid="ignore"):
        num = np.exp(-np.square(x - mean_arr) / (2 * var))
        denom = np.sqrt(2 * np.pi * var)
        return num / denom
1018+
1019+
# Convert to numpy arrays
1020+
wmf, wsf = np.array(wmf, dtype=np.float64), np.array(wsf, dtype=np.float64)
1021+
rmf, rsf = np.array(rmf, dtype=np.float64), np.array(rsf, dtype=np.float64)
1022+
ymf, ysf = np.array(ymf, dtype=np.float64), np.array(ysf, dtype=np.float64)
10161023

1017-
prob_w, prob_r, prob_y = [], [], []
1024+
prob_w = norm_pdf_vec(float(w_df), wmf, wsf)
1025+
prob_r = norm_pdf_vec(float(r_df), rmf, rsf)
1026+
prob_y = norm_pdf_vec(float(y_df), ymf, ysf)
10181027

1019-
idx = fao_list.index(soilgroup) if soilgroup in fao_list else -1
1028+
# Normalize probabilities
1029+
def normalize(arr):
    """Min-max scale ``arr`` onto the interval [0, 1].

    Args:
        arr: Array-like of numeric values.

    Returns:
        Float64 ``numpy.ndarray`` with the same shape as ``arr``. If every
        value is identical the result is all ones. An empty input yields an
        empty array. A NaN anywhere in the input propagates to the whole
        result, because ``np.min`` / ``np.max`` return NaN in that case.
    """
    arr = np.asarray(arr, dtype=np.float64)
    if arr.size == 0:
        # np.min / np.max raise ValueError on an empty array.
        return arr
    min_val, max_val = np.min(arr), np.max(arr)
    if max_val == min_val:
        # Zero range: return ones instead of computing 0 / 0.
        return np.ones_like(arr)
    return (arr - min_val) / (max_val - min_val)
10201032

1021-
for mw, sw, mr, sr, my, sy in zip(wmf, wsf, rmf, rsf, ymf, ysf):
1022-
prob_w.append(norm(float(mw), float(sw)).pdf(float(w_df)))
1023-
prob_r.append(norm(float(mr), float(sr)).pdf(float(r_df)))
1024-
prob_y.append(norm(float(my), float(sy)).pdf(float(y_df)))
1033+
prob_w = normalize(prob_w)
1034+
prob_r = normalize(prob_r)
1035+
prob_y = normalize(prob_y)
10251036

1026-
max_prob_w, min_prob_w = max(prob_w), min(prob_w)
1027-
max_prob_r, min_prob_r = max(prob_r), min(prob_r)
1028-
max_prob_y, min_prob_y = max(prob_y), min(prob_y)
1037+
# Prepare FAO soil groups for lookup
1038+
fao_list = [item.lower() for item in fao90]
1039+
fao_index_map = {name: i for i, name in enumerate(fao_list)}
10291040

1030-
for j in range(len(fao_list)):
1031-
prob_w[j] = (prob_w[j] - min_prob_w) / (max_prob_w - min_prob_w)
1032-
prob_r[j] = (prob_r[j] - min_prob_r) / (max_prob_r - min_prob_r)
1033-
prob_y[j] = (prob_y[j] - min_prob_y) / (max_prob_y - min_prob_y)
1041+
# Score each component by FAO soil-group lookup (plain per-name loop; the probabilities above are precomputed)
1042+
compnames = D_final_horz.compname.str.lower()
1043+
color_sim = []
10341044

1035-
crsr = (prob_w[idx] + prob_r[idx] + prob_y[idx]) / 3.0 if idx != -1 else 1.0
1036-
color_sim.append(crsr)
1045+
for name in compnames:
1046+
soilgroup = re.sub(r"\d+$", "", " ".join(name.split()[1:])).strip()
1047+
idx = fao_index_map.get(soilgroup, -1)
1048+
if idx != -1:
1049+
score = (prob_w[idx] + prob_r[idx] + prob_y[idx]) / 3.0
1050+
else:
1051+
score = 1.0
1052+
color_sim.append(score)
10371053

10381054
color_sim = pd.Series(color_sim)
10391055

0 commit comments

Comments
 (0)