Skip to content

Commit fc3de1d

Browse files
committed
perf: optimize getProfile
1 parent d06f80f commit fc3de1d

File tree

1 file changed

+56
-140
lines changed

1 file changed

+56
-140
lines changed

soil_id/utils.py

Lines changed: 56 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -354,153 +354,69 @@ def aggregate_data(data, bottom_depths, sd=2):
354354

355355

356356
# Number of rows in every returned profile (one row per integer depth step).
_PROFILE_LENGTH = 200

# Variables that also produce a grouped/classified column.
_GROUPED_VARIABLES = ("sandtotal_r", "claytotal_r", "total_frag_volume")

# Variables that produce only the value column.
_UNGROUPED_VARIABLES = ("CEC", "pH", "EC")


def _nan_profile(columns, length=_PROFILE_LENGTH):
    """Return a ``length``-row float DataFrame with ``columns``, all NaN."""
    return pd.DataFrame(np.nan, index=pd.RangeIndex(length), columns=columns)


def _fit_profile(df, columns, length=_PROFILE_LENGTH):
    """Truncate or NaN-pad ``df`` so that it has exactly ``length`` rows."""
    if df.empty:
        # An empty frame carries object-dtype columns; build the NaN frame
        # directly so the result is float and no empty frame is concatenated.
        return _nan_profile(columns, length)
    trimmed = df.iloc[:length].reset_index(drop=True)
    # Reindexing onto 0..length-1 appends NaN rows when the frame is short
    # and is a no-op when it already has ``length`` rows.
    return trimmed.reindex(pd.RangeIndex(length))


def getProfile(data, variable):
    """Expand per-horizon values into a fixed-length, per-depth-step profile.

    Each horizon's value is repeated ``int(hzdepb_r) - int(hzdept_r)`` times,
    horizons are laid end to end in row order, and the result is truncated or
    NaN-padded to exactly 200 rows.

    Args:
        data: DataFrame with ``hzdept_r`` and ``hzdepb_r`` columns (horizon
            top and bottom depths) plus the column for ``variable``; the
            ``texture`` column is also required for ``sandtotal_r`` and
            ``claytotal_r``. Rows are presumably ordered from the surface
            downwards -- verify against callers.
        variable: One of ``"sandtotal_r"``, ``"claytotal_r"``,
            ``"total_frag_volume"``, ``"CEC"``, ``"pH"`` or ``"EC"``.

    Returns:
        A 200-row DataFrame with column ``var_pct_intpl`` and, for the first
        three variables above, ``var_pct_intpl_grp`` (the output of
        ``getSand``/``getClay``/``getCF`` for each horizon). The frame is all
        NaN when ``data`` is empty, when any depth is missing, when the first
        top depth is not 0, or when there is a gap between consecutive
        horizons.

    Raises:
        ValueError: If ``variable`` is not one of the supported names.
    """
    if variable in _GROUPED_VARIABLES:
        columns = ["var_pct_intpl", "var_pct_intpl_grp"]
    elif variable in _UNGROUPED_VARIABLES:
        columns = ["var_pct_intpl"]
    else:
        raise ValueError(f"Unsupported variable: {variable}")

    # Validate depths before doing any per-row work: an unusable profile
    # yields the all-NaN frame regardless of the variable's values.
    if len(data) == 0:
        return _nan_profile(columns)

    tops = data["hzdept_r"].to_numpy()
    bottoms = data["hzdepb_r"].to_numpy()

    if pd.isnull(tops).any() or pd.isnull(bottoms).any():
        return _nan_profile(columns)
    if tops[0] != 0:
        return _nan_profile(columns)
    # A horizon starting below the previous horizon's bottom is a gap.
    # NOTE: overlapping horizons (next top above previous bottom) are not
    # adjusted; each horizon still contributes its full thickness.
    if np.any(tops[1:] > bottoms[:-1]):
        return _nan_profile(columns)

    # Negative thickness contributes no rows (np.repeat rejects negatives).
    thickness = [
        max(int(bottom) - int(top), 0) for top, bottom in zip(tops, bottoms)
    ]

    groups = None
    if variable == "sandtotal_r":
        values = data["sandtotal_r"].to_numpy()
        groups = data["texture"].apply(
            lambda texture: getSand(texture) if pd.notnull(texture) else np.nan
        ).to_numpy()
    elif variable == "claytotal_r":
        values = data["claytotal_r"].to_numpy()
        groups = data["texture"].apply(
            lambda texture: getClay(texture) if pd.notnull(texture) else np.nan
        ).to_numpy()
    elif variable == "total_frag_volume":
        values = data["total_frag_volume"].apply(
            lambda value: value if pd.notnull(value) else np.nan
        ).to_numpy()
        groups = data["total_frag_volume"].apply(
            lambda value: getCF(value) if pd.notnull(value) else np.nan
        ).to_numpy()
    else:  # CEC, pH, EC
        values = data[variable].apply(
            lambda value: value if pd.notnull(value) else np.nan
        ).to_numpy()

    # ``tolist`` lets pandas infer column dtypes exactly as it would from a
    # plain Python list of per-step values.
    profile = {"var_pct_intpl": np.repeat(values, thickness).tolist()}
    if groups is not None:
        profile["var_pct_intpl_grp"] = np.repeat(groups, thickness).tolist()

    return _fit_profile(pd.DataFrame(profile), columns)
504420

505421

506422
def max_comp_depth(data):

0 commit comments

Comments
 (0)