diff --git a/.gitignore b/.gitignore index 29c5fd7a..d4ef1064 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,4 @@ prof/ Data/ bulk_test_result* -get-pip.py* \ No newline at end of file +get-pip.py* diff --git a/Makefile b/Makefile index 3c2227a8..eac84691 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ setup-git-hooks: lint: ruff check soil_id + ruff format soil_id --diff format: ruff format soil_id @@ -73,7 +74,6 @@ process_bulk_test_results_legacy: python -m soil_id.tests.legacy.process_bulk_test_results $(RESULTS_FILE) # Donwload Munsell CSV, SHX, SHP, SBX, SBN, PRJ, DBF -# 1tN23iVe6X1fcomcfveVp4w3Pwd0HJuTe: LandPKS_munsell_rgb_lab.csv # 1WUa9e3vTWPi6G8h4OI3CBUZP5y7tf1Li: gsmsoilmu_a_us.shx # 1l9MxC0xENGmI_NmGlBY74EtlD6SZid_a: gsmsoilmu_a_us.shp # 1asGnnqe0zI2v8xuOszlsNmZkOSl7cJ2n: gsmsoilmu_a_us.sbx @@ -84,7 +84,6 @@ process_bulk_test_results_legacy: download-soil-data: mkdir -p Data cd Data; \ - gdown 1tN23iVe6X1fcomcfveVp4w3Pwd0HJuTe; \ gdown 1WUa9e3vTWPi6G8h4OI3CBUZP5y7tf1Li; \ gdown 1l9MxC0xENGmI_NmGlBY74EtlD6SZid_a; \ gdown 1asGnnqe0zI2v8xuOszlsNmZkOSl7cJ2n; \ @@ -119,6 +118,7 @@ dump_soil_id_db: pg_dump --format=custom $(DATABASE_URL) -t hwsd2_segment -t hwsd2_data -t landpks_munsell_rgb_lab -t normdist1 -t normdist2 -t wise_soil_data -t wrb2006_to_fao90 -t wrb_fao90_desc -f $(DATABASE_DUMP_FILE) restore_soil_id_db: - pg_restore --dbname=$(DATABASE_URL) --single-transaction --clean --if-exists --no-owner $(DATABASE_DUMP_FILE) + psql $(DATABASE_URL) -c "CREATE EXTENSION IF NOT EXISTS postgis;" + pg_restore --dbname=$(DATABASE_URL) --clean --if-exists --no-owner --verbose $(DATABASE_DUMP_FILE) psql $(DATABASE_URL) -c "CLUSTER hwsd2_segment USING hwsd2_segment_shape_idx;" psql $(DATABASE_URL) -c "ANALYZE;" diff --git a/soil_id/config.py b/soil_id/config.py index f228a462..29eecddb 100644 --- a/soil_id/config.py +++ b/soil_id/config.py @@ -13,33 +13,20 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see https://www.gnu.org/licenses/. import os -import tempfile - -from platformdirs import user_cache_dir DATA_PATH = os.environ.get("DATA_PATH", "Data") # Numpy seeding RANDOM_SEED = os.environ.get("RANDOM_SEED", 19) -# Output -APP_NAME = os.environ.get("APP_NAME", "org.terraso.soilid") -TEMP_DIR = tempfile.TemporaryDirectory() -CACHE_DIR = user_cache_dir(APP_NAME) -OUTPUT_PATH = TEMP_DIR.name -SOIL_ID_RANK_PATH = f"{OUTPUT_PATH}/soil_id_rank.csv" -SOIL_ID_PROB_PATH = f"{OUTPUT_PATH}/soil_id_cond_prob.csv" -REQUESTS_CACHE_PATH = f"{CACHE_DIR}/requests_cache" - # Determines if in/out of US US_AREA_PATH = f"{DATA_PATH}/SoilID_US_Areas.shz" # US Soil ID STATSGO_PATH = f"{DATA_PATH}/gsmsoilmu_a_us.shp" -MUNSELL_RGB_LAB_PATH = f"{DATA_PATH}/LandPKS_munsell_rgb_lab.csv" # Database -DB_NAME = os.environ.get("DB_NAME", "terraso_backend") +DB_NAME = os.environ.get("DB_NAME", "soil_id") DB_HOST = os.environ.get("DB_HOST") DB_USERNAME = os.environ.get("DB_USERNAME") DB_PASSWORD = os.environ.get("DB_PASSWORD") diff --git a/soil_id/db.py b/soil_id/db.py index 7d20042c..0db5d956 100644 --- a/soil_id/db.py +++ b/soil_id/db.py @@ -32,133 +32,18 @@ def get_datastore_connection(): Returns: Connection object if successful, otherwise exits the program. """ - conn = None try: - conn = psycopg.connect( + return psycopg.connect( host=soil_id.config.DB_HOST, user=soil_id.config.DB_USERNAME, password=soil_id.config.DB_PASSWORD, dbname=soil_id.config.DB_NAME, ) - return conn except Exception as err: logging.error(f"Database connection failed: {err}") raise -# us, global -def save_model_output( - plot_id, model_version, result_blob, soilIDRank_output_pd, mucompdata_cond_prob -): - """ - Save the output of the model to the 'landpks_soil_model' table. - """ - conn = None - try: - conn = get_datastore_connection() - cur = conn.cursor() - - sql = """ - INSERT INTO landpks_soil_model - (plot_id, model_version, result_blob, soilIDRank_output_pd, mucompdata_cond_prob) - VALUES (%s, %s, %s, %s, %s) - """ - cur.execute( - sql, - ( - plot_id, - model_version, - result_blob, - soilIDRank_output_pd, - mucompdata_cond_prob, - ), - ) - conn.commit() - - except Exception as err: - logging.error(err) - conn.rollback() - return None - finally: - conn.close() - - -# us, global -def save_rank_output(record_id, model_version, rank_blob): - """ - Update the rank of the soil model in the 'landpks_soil_model' table. - """ - conn = None - try: - conn = get_datastore_connection() - cur = conn.cursor() - - sql = """UPDATE landpks_soil_model - SET soilrank = %s - WHERE ID = %s AND model_version = %s""" - cur.execute(sql, (rank_blob, record_id, model_version)) - conn.commit() - - except Exception as err: - logging.error(err) - conn.rollback() - return None - finally: - conn.close() - - -# us, global -def load_model_output(plot_id): - """ - Load model output based on plot ID and model version. - """ - conn = None - try: - conn = get_datastore_connection() - cur = conn.cursor() - model_version = 2 - sql = """SELECT ID, result_blob, soilIDRank_output_pd, mucompdata_cond_prob - FROM landpks_soil_model - WHERE plot_id = %s AND model_version = %s - ORDER BY ID DESC LIMIT 1""" - cur.execute(sql, plot_id, model_version) - results = cur.fetchall() - for row in results: - model_run = [row[0], row[1], row[2], row[3]] - return model_run - except Exception as err: - logging.error(err) - return None - finally: - conn.close() - - -# global only -def save_soilgrids_output(plot_id, model_version, soilgrids_blob): - """ - Save the output of the soil grids to the 'landpks_soilgrids_model' table. - """ - conn = None - try: - conn = get_datastore_connection() - cur = conn.cursor() - - sql = """ - INSERT INTO landpks_soilgrids_model - (plot_id, model_version, soilgrids_blob) - VALUES (%s, %s, %s) - """ - cur.execute(sql, (plot_id, model_version, soilgrids_blob)) - conn.commit() - - except Exception as err: - logging.error(err) - conn.rollback() - return None - finally: - conn.close() - - def get_hwsd2_profile_data(connection, hwsd2_mu_select): """ Retrieve HWSD v2 data based on selected hwsd2 (map unit) values. @@ -219,7 +104,7 @@ def get_hwsd2_profile_data(connection, hwsd2_mu_select): return pd.DataFrame() -def extract_hwsd2_data(connection, lon, lat, buffer_dist, table_name): +def extract_hwsd2_data(connection, lon, lat, buffer_dist): """ Fetches HWSD soil data from a PostGIS table within a given buffer around a point, performing distance and intersection calculations directly on geographic coordinates. @@ -228,156 +113,42 @@ def extract_hwsd2_data(connection, lon, lat, buffer_dist, table_name): lon (float): Longitude of the problem point. lat (float): Latitude of the problem point. buffer_dist (int): Buffer distance in meters. - table_name (str): Name of the PostGIS table (e.g., "hwsdv2"). Returns: - DataFrame: Merged data from hwsdv2 and hwsdv2_data. + DataFrame: Merged data from hwsd2_segment and hwsd2_data. + """ + point = f"ST_SetSRID(ST_Point({lon}, {lat}), 4326)::geography" + main_query = f""" + SELECT + hwsd2_id as hwsd2, + MIN(ST_Distance( + shape, + {point} + )) AS distance, + BOOL_OR(ST_Intersects(shape, {point})) AS pt_intersect + FROM hwsd2_segment + WHERE ST_DWithin(shape, {point}, {buffer_dist}) + GROUP BY hwsd2_id; """ - # Compute the buffer polygon (in WKT) around the problem point. - # Here, we use the geography type to compute a buffer in meters, - # then cast it back to geometry in EPSG:4326. - # buffer_query = """ - # WITH buffer AS ( - # SELECT ST_AsText( - # ST_Buffer( - # ST_SetSRID(ST_Point(%s, %s), 4326)::geography, - # %s - # )::geometry - # ) AS wkt - # ) - # SELECT wkt FROM buffer; - # """ - with connection.cursor(): - # cur.execute(buffer_query, (lon, lat, buffer_dist)) - # buffer_wkt = cur.fetchone()[0] - # print("Buffer WKT:", buffer_wkt) - - # Build the main query that uses the computed buffer. - # Distance is computed by casting geometries to geography, - # which returns the geodesic distance in meters. - # Q1 - # main_query = f""" - # WITH - # -- Step 1: Get the polygon that contains the point - # point_poly AS ( - # SELECT geom - # FROM {table_name} - # WHERE ST_Intersects( - # geom, - # ST_SetSRID(ST_Point({lon}, {lat}), 4326) - # ) - # ), - - # -- Step 2: Get polygons that intersect the buffer - # valid_geom AS ( - # SELECT - # hwsd2, - # geom - # FROM {table_name} - # WHERE geom && ST_GeomFromText('{buffer_wkt}', 4326) - # AND ST_Intersects(geom, ST_GeomFromText('{buffer_wkt}', 4326)) - # ) - - # -- Step 3: Filter to those that either contain the point or border the point's polygon - # SELECT - # vg.hwsd2, - # ST_AsEWKB(vg.geom) AS geom, - # ST_Distance( - # vg.geom::geography, - # ST_SetSRID(ST_Point({lon}, {lat}), 4326)::geography - # ) AS distance, - # ST_Intersects( - # vg.geom, - # ST_SetSRID(ST_Point({lon}, {lat}), 4326) - # ) AS pt_intersect - # FROM valid_geom vg, point_poly pp - # WHERE - # ST_Intersects(vg.geom, ST_SetSRID(ST_Point({lon}, {lat}), 4326)) - # OR ST_Intersects(vg.geom, pp.geom); - # """ - - # # Q2 - # main_query = f""" - # WITH - # inputs AS ( - # SELECT - # ST_GeomFromText('{buffer_wkt}', 4326) AS buffer_geom, - # ST_SetSRID(ST_Point({lon}, {lat}), 4326) AS pt_geom - # ), - - # valid_geom AS ( - # SELECT - # hwsd2, - # geom - # FROM {table_name}, inputs - # WHERE geom && inputs.buffer_geom - # AND ST_Intersects(geom, inputs.buffer_geom) - # ) - - # SELECT - # vg.hwsd2, - # ST_AsEWKB(vg.geom) AS geom, - # ST_Distance( - # ST_ClosestPoint(vg.geom::geography, inputs.pt_geom::geography), - # inputs.pt_geom::geography - # ) AS distance, - # ST_Intersects(vg.geom, inputs.pt_geom) AS pt_intersect - # FROM valid_geom vg, inputs; - # """ - - # Q3 - # point = f"ST_SetSRID(ST_Point({lon}, {lat}), 4326)" - # main_query = f""" - # SELECT - # geom, - # hwsd2, - # ST_Distance( - # geom::geography, - # {point}::geography - # ) AS distance, - # ST_Intersects(geom, {point}) AS pt_intersect - # FROM {table_name} - # WHERE ST_DWithin(geom::geography, {point}::geography, {buffer_dist}); - # """ - - - # Q4 - point = f"ST_SetSRID(ST_Point({lon}, {lat}), 4326)::geography" - main_query = f""" - SELECT - hwsd2_id as hwsd2, - MIN(ST_Distance( - shape, - {point} - )) AS distance, - BOOL_OR(ST_Intersects(shape, {point})) AS pt_intersect - FROM hwsd2_segment - WHERE ST_DWithin(shape, {point}, {buffer_dist}) - GROUP BY hwsd2_id; - """ - - # Use GeoPandas to execute the main query and load results into a GeoDataFrame. - hwsd = pd.read_sql_query(main_query, connection) - - # Get the list of hwsd2 identifiers. - hwsd2_mu_select = hwsd["hwsd2"].tolist() - # Call get_hwsd2_profile_data using the same connection. - hwsd_data = get_hwsd2_profile_data(connection, hwsd2_mu_select) + # Use GeoPandas to execute the main query and load results into a GeoDataFrame. + hwsd = pd.read_sql_query(main_query, connection) - # Merge the two datasets. - merged = pd.merge(hwsd_data, hwsd, on="hwsd2", how="left").drop_duplicates() - return merged + # Get the list of hwsd2 identifiers. + hwsd2_mu_select = hwsd["hwsd2"].tolist() + # Call get_hwsd2_profile_data using the same connection. + hwsd_data = get_hwsd2_profile_data(connection, hwsd2_mu_select) -# global + # Merge the two datasets. + merged = pd.merge(hwsd_data, hwsd, on="hwsd2", how="left").drop_duplicates() + return merged # Function to fetch data from a PostgreSQL table -def fetch_table_from_db(connection, table_name): +def query_db(connection, query): try: with connection.cursor() as cur: - query = f"SELECT * FROM {table_name} ORDER BY id ASC;" cur.execute(query) rows = cur.fetchall() @@ -388,44 +159,8 @@ def fetch_table_from_db(connection, table_name): return None -def get_WRB_descriptions(connection, WRB_Comp_List): - """ - Retrieve WRB descriptions based on provided WRB component list. - """ - try: - with connection.cursor() as cur: - - # Create placeholders for the SQL IN clause - placeholders = ", ".join(["%s"] * len(WRB_Comp_List)) - sql = f"""SELECT WRB_tax, Description_en, Management_en, Description_es, Management_es, - Description_ks, Management_ks, Description_fr, Management_fr - FROM wrb_fao90_desc - WHERE WRB_tax IN ({placeholders})""" - - # Execute the query with the parameters - cur.execute(sql, tuple(WRB_Comp_List)) - results = cur.fetchall() - - # Convert the results to a pandas DataFrame - data = pd.DataFrame( - results, - columns=[ - "WRB_tax", - "Description_en", - "Management_en", - "Description_es", - "Management_es", - "Description_ks", - "Management_ks", - "Description_fr", - "Management_fr", - ], - ) - - return data - except Exception as err: - logging.error(f"Error querying PostgreSQL: {err}") - return None +def fetch_normdist(connection): + return query_db(connection, "SELECT * FROM normdist2 ORDER BY id ASC;") # global only @@ -516,3 +251,7 @@ def execute_query(query, params): except Exception as err: logging.error(f"Error querying PostgreSQL: {err}") return None + + +def fetch_munsell_rgb_lab(connection): + return pd.read_sql_query("SELECT * FROM landpks_munsell_rgb_lab;", connection) diff --git a/soil_id/global_soil.py b/soil_id/global_soil.py index c55a90c3..a417a015 100644 --- a/soil_id/global_soil.py +++ b/soil_id/global_soil.py @@ -26,7 +26,7 @@ import pandas as pd from .color import calculate_deltaE2000 -from .db import extract_hwsd2_data, fetch_table_from_db, get_WRB_descriptions, getSG_descriptions +from .db import extract_hwsd2_data, fetch_normdist, getSG_descriptions from .services import get_soilgrids_classification_data, get_soilgrids_property_data from .utils import ( adjust_depth_interval, @@ -45,8 +45,6 @@ silt_calc, ) -# local libraries - @dataclass class SoilListOutputData: @@ -56,21 +54,13 @@ class SoilListOutputData: # entry points -# getSoilLocationBasedGlobal -# list_soils -# rank_soils -# rankPredictionGlobal -# getSoilGridsGlobal -# getSoilGridsUS +# list_soils_global +# rank_soils_global -# when a site is created, call list_soils/getSoilLocationBasedGlobal. -# when a site is created, call getSoilGridsGlobal -# after user has collected data, call rank_soils/rankPredictionGlobal. +# when a site is created, call list_soils_global +# after user has collected data, call rank_soils_global -################################################################################################## -# getSoilLocationBasedGlobal # -################################################################################################## def list_soils_global(connection, lon, lat, buffer_dist=100000): # Extract HWSD2 Data try: @@ -78,7 +68,6 @@ def list_soils_global(connection, lon, lat, buffer_dist=100000): connection, lon, lat, - table_name="hwsdv2", buffer_dist=buffer_dist, ) except KeyError: @@ -437,16 +426,6 @@ def list_soils_global(connection, lon, lat, buffer_dist=100000): # Replace NaN values with an empty string mucompdata_cond_prob = mucompdata_cond_prob.fillna("") - # Merge component descriptions - WRB_Comp_Desc = get_WRB_descriptions( - connection, - mucompdata_cond_prob["compname_grp"].drop_duplicates().tolist() - ) - - mucompdata_cond_prob = pd.merge( - mucompdata_cond_prob, WRB_Comp_Desc, left_on="compname_grp", right_on="WRB_tax", how="left" - ) - mucompdata_cond_prob = mucompdata_cond_prob.drop_duplicates().reset_index(drop=True) mucomp_index = mucompdata_cond_prob.index @@ -461,19 +440,6 @@ def list_soils_global(connection, lon, lat, buffer_dist=100000): "minCompDistance": row.min_dist, "soilDepth": row.c_very_bottom, }, - "siteDescription": { - key: row[key] - for key in [ - "Description_en", - "Management_en", - "Description_es", - "Management_es", - "Description_ks", - "Management_ks", - "Description_fr", - "Management_fr", - ] - }, } for _, row in mucompdata_cond_prob.iterrows() ] @@ -574,13 +540,8 @@ def convert_to_serializable(obj): ) -############################################################################################## -# rankPredictionGlobal # -############################################################################################## def rank_soils_global( connection, - lon, - lat, list_output_data: SoilListOutputData, soilHorizon, topDepth, @@ -954,7 +915,7 @@ def rank_soils_global( ysf = [] # Load color distribution data from NormDist2 (FAO90) table - rows = fetch_table_from_db(connection, "NormDist2") + rows = fetch_normdist(connection) row_id = 0 for row in rows: # row is a tuple; iterate over its values. @@ -1027,7 +988,9 @@ def norm_pdf_vec(x, mean_arr, std_arr): # Normalize probabilities def normalize(arr): min_val, max_val = np.min(arr), np.max(arr) - return (arr - min_val) / (max_val - min_val) if max_val != min_val else np.ones_like(arr) + return ( + (arr - min_val) / (max_val - min_val) if max_val != min_val else np.ones_like(arr) + ) prob_w = normalize(prob_w) prob_r = normalize(prob_r) @@ -1255,11 +1218,6 @@ def normalize(arr): return output_data -################################################################################################## -# getSoilGridsGlobal # -################################################################################################## - - def sg_list(connection, lon, lat): """ Query the SoilGrids API (via get_soilgrids_property_data) and post-process diff --git a/soil_id/tests/global/generate_bulk_test_results.py b/soil_id/tests/global/generate_bulk_test_results.py index 8604ff1d..e51a0f2c 100644 --- a/soil_id/tests/global/generate_bulk_test_results.py +++ b/soil_id/tests/global/generate_bulk_test_results.py @@ -57,14 +57,11 @@ else: start_time = time.perf_counter() try: - list_result = list_soils_global(connection=connection, lat=lat, lon=lon) result_record["list_result"] = list_result.soil_list_json result_record["rank_result"] = rank_soils_global( connection=connection, - lat=lat, - lon=lon, list_output_data=list_result, topDepth=pedon["TOPDEP"].values.tolist(), bottomDepth=pedon["BOTDEP"].values.tolist(), diff --git a/soil_id/tests/global/process_bulk_test_results.py b/soil_id/tests/global/process_bulk_test_results.py index 874acc49..852c78fe 100644 --- a/soil_id/tests/global/process_bulk_test_results.py +++ b/soil_id/tests/global/process_bulk_test_results.py @@ -44,13 +44,14 @@ def last_word(s): return s.split()[-1].lower() + secondary_index = [ i for i, match in enumerate(matches) if last_word(match["component"]) == last_word(result_record["pedon_name"]) or last_word(match["component"]) == (last_word(result_record["pedon_name"]) + "s") ] - + if len(secondary_index) == 0: result_record["secondary_result"] = "missing" else: @@ -71,7 +72,11 @@ def last_word(s): print(result_groups.count()["pedon_key"] / len(df) * 100) print("# Secondary result proportions:\n") -print(secondary_result_groups.count()["pedon_key"] / (len(df) - df["secondary_result"].isnull().sum()) * 100) +print( + secondary_result_groups.count()["pedon_key"] + / (len(df) - df["secondary_result"].isnull().sum()) + * 100 +) if len(df) < 11: print("\n# Execution times:\n") @@ -99,4 +104,6 @@ def last_word(s): indented_lines = [" " + line for line in lines] print("\n".join(indented_lines) + "\n") -df[['pedon_key', 'pedon_name', 'lat', 'lon', 'result', 'secondary_result', 'all_soils']].to_csv('global_algorithm_results.csv', index=False) \ No newline at end of file +df[["pedon_key", "pedon_name", "lat", "lon", "result", "secondary_result", "all_soils"]].to_csv( + "global_algorithm_results.csv", index=False +) diff --git a/soil_id/tests/global/test_global.py b/soil_id/tests/global/test_global.py index bdaabe45..4403c9ab 100644 --- a/soil_id/tests/global/test_global.py +++ b/soil_id/tests/global/test_global.py @@ -27,17 +27,16 @@ {"lat": -10.07856, "lon": 15.107436}, ] + @pytest.mark.parametrize("location", test_locations) def test_soil_location(location): with get_datastore_connection() as connection: logging.info(f"Testing {location['lon']}, {location['lat']}") start_time = time.perf_counter() list_soils_result = list_soils_global(connection, location["lon"], location["lat"]) - logging.info(f"...time: {(time.perf_counter()-start_time):.2f}s") + logging.info(f"...time: {(time.perf_counter() - start_time):.2f}s") rank_soils_global( connection, - location["lon"], - location["lat"], list_output_data=list_soils_result, soilHorizon=["Loam"], topDepth=[0], diff --git a/soil_id/tests/legacy/generate_bulk_test_results.py b/soil_id/tests/legacy/generate_bulk_test_results.py index e4728a0b..dc4276ef 100644 --- a/soil_id/tests/legacy/generate_bulk_test_results.py +++ b/soil_id/tests/legacy/generate_bulk_test_results.py @@ -23,6 +23,7 @@ import pandas import numpy + def get_rfv(cf): if 0 <= cf < 2: return "0-1%" @@ -35,14 +36,36 @@ def get_rfv(cf): elif 61 <= cf <= 100: return ">60%" + def call_legacy(lat, lon, pedon): requests.get(f"https://api.landpotential.org/soilidlist?longitude={lon}&latitude={lat}") base = f"https://api.landpotential.org/soilidrank?longitude={lon}&latitude={lat}" - depths = "&".join([f"soilHorizon{idx + 1}_Depth={botdep}" for idx, botdep in enumerate(pedon["BOTDEP"].values.tolist())]) - rfv = "&".join([f"soilHorizon{idx + 1}_RFV={get_rfv(rfv)}" for idx, rfv in enumerate(pedon["RFV"].values.tolist()) if not numpy.isnan(rfv) ]) - texture = "&".join([f"soilHorizon{idx + 1}={textClass.upper()}" for idx, textClass in enumerate(pedon["textClass"].values.tolist())]) - color = "&".join([f"soilHorizon{idx + 1}_LAB={",".join([f"{v}" for v in lab])}" for idx, lab in enumerate(pedon[["L", "A", "B"]].values.tolist())]) + depths = "&".join( + [ + f"soilHorizon{idx + 1}_Depth={botdep}" + for idx, botdep in enumerate(pedon["BOTDEP"].values.tolist()) + ] + ) + rfv = "&".join( + [ + f"soilHorizon{idx + 1}_RFV={get_rfv(rfv)}" + for idx, rfv in enumerate(pedon["RFV"].values.tolist()) + if not numpy.isnan(rfv) + ] + ) + texture = "&".join( + [ + f"soilHorizon{idx + 1}={textClass.upper()}" + for idx, textClass in enumerate(pedon["textClass"].values.tolist()) + ] + ) + color = "&".join( + [ + f"soilHorizon{idx + 1}_LAB={','.join([f'{v}' for v in lab])}" + for idx, lab in enumerate(pedon[["L", "A", "B"]].values.tolist()) + ] + ) req = "&".join([base, depths, rfv, texture, color]) print(req) @@ -51,6 +74,7 @@ def call_legacy(lat, lon, pedon): return resp.json() + test_data_df = pandas.read_csv( os.path.join(os.path.dirname(__file__), "../global/global_pedon_test_dataset.csv") ) @@ -66,7 +90,7 @@ def call_legacy(lat, lon, pedon): # buffering=1 is line buffering with open(result_file_name, "w", buffering=1) as result_file: result_agg = {} - + for pedon_key, pedon in pedons: lat = pedon["Y_LatDD"].values[0] lon = pedon["X_LonDD"].values[0] diff --git a/soil_id/tests/legacy/process_bulk_test_results.py b/soil_id/tests/legacy/process_bulk_test_results.py index df39181b..c7f009ae 100644 --- a/soil_id/tests/legacy/process_bulk_test_results.py +++ b/soil_id/tests/legacy/process_bulk_test_results.py @@ -44,13 +44,14 @@ def last_word(s): return s.split()[-1].lower() + secondary_index = [ i for i, match in enumerate(matches) if last_word(match["component"]) == last_word(result_record["pedon_name"]) or last_word(match["component"]) == (last_word(result_record["pedon_name"]) + "s") ] - + if len(secondary_index) == 0: result_record["secondary_result"] = "missing" else: @@ -71,7 +72,11 @@ def last_word(s): print(result_groups.count()["pedon_key"] / len(df) * 100) print("# Secondary result proportions:\n") -print(secondary_result_groups.count()["pedon_key"] / (len(df) - df["secondary_result"].isnull().sum()) * 100) +print( + secondary_result_groups.count()["pedon_key"] + / (len(df) - df["secondary_result"].isnull().sum()) + * 100 +) if len(df) < 11: print("\n# Execution times:\n") @@ -79,19 +84,3 @@ def last_word(s): else: print("\n# Execution time deciles:\n") print(pandas.qcut(df["execution_time_s"], q=10, retbins=True)[1]) - - -if "crash" in result_groups.groups: - crashes = result_groups.get_group(("crash",)) - # counts = df.value_counts(subset="traceback").sort_values(ascending=False) - - # print(f"\n# Unique crash tracebacks ({len(counts)} unique, {len(crashes)} total):\n") - - # for idx, (traceback, count) in enumerate(counts.to_dict().items()): - # example = crashes.loc[crashes["traceback"] == traceback].iloc[0] - # print( - # f"Traceback #{idx + 1}, occurred {count} times. Example pedon: {example['pedon_key']}, lat: {example['lat']}, lon: {example['lon']}" - # ) - # lines = traceback.splitlines() - # indented_lines = [" " + line for line in lines] - # print("\n".join(indented_lines) + "\n") \ No newline at end of file diff --git a/soil_id/tests/test_find_region.py b/soil_id/tests/test_find_region.py index 635ee6d9..06be9f43 100644 --- a/soil_id/tests/test_find_region.py +++ b/soil_id/tests/test_find_region.py @@ -17,13 +17,14 @@ import pytest test_locations = [ - (38.968984, -103.625974, 'US'), - (-6.708697, -69.306646, 'Global'), - (-3.521766, -136.995712, 'Global') + (38.968984, -103.625974, "US"), + (-6.708697, -69.306646, "Global"), + (-3.521766, -136.995712, "Global"), ] + @pytest.mark.parametrize("location", test_locations) def test_find_region(location): - lat, lon, region = location + lat, lon, region = location assert find_region_for_location(lat=lat, lon=lon) == region diff --git a/soil_id/tests/us/generate_bulk_test_results.py b/soil_id/tests/us/generate_bulk_test_results.py index 2264c005..a771fb16 100644 --- a/soil_id/tests/us/generate_bulk_test_results.py +++ b/soil_id/tests/us/generate_bulk_test_results.py @@ -22,6 +22,7 @@ import pandas +from soil_id.db import get_datastore_connection from soil_id.us_soil import list_soils, rank_soils @@ -34,6 +35,7 @@ def clean_soil_list_json(obj): return list(map(clean_soil_list_json, obj)) return obj + test_data_df = pandas.read_csv( os.path.join(os.path.dirname(__file__), "US_SoilID_KSSL_LPKS_Testing.csv") ) @@ -61,7 +63,7 @@ def clean_soil_list_json(obj): start_time = time.perf_counter() try: - list_result = list_soils(lat=lat, lon=lon) + list_result = list_soils(get_datastore_connection(), lat=lat, lon=lon) result_record["list_result"] = list_result.soil_list_json result_record["rank_result"] = rank_soils( diff --git a/soil_id/tests/us/process_bulk_test_results.py b/soil_id/tests/us/process_bulk_test_results.py index c4c91a5f..1de6e2cb 100644 --- a/soil_id/tests/us/process_bulk_test_results.py +++ b/soil_id/tests/us/process_bulk_test_results.py @@ -78,4 +78,6 @@ indented_lines = [" " + line for line in lines] print("\n".join(indented_lines) + "\n") -df[['pedon_key', 'pedon_name', 'lat', 'lon', 'result', 'all_soils']].to_csv('us_algorithm_results.csv', index=False) +df[["pedon_key", "pedon_name", "lat", "lon", "result", "all_soils"]].to_csv( + "us_algorithm_results.csv", index=False +) diff --git a/soil_id/tests/us/test_us.py b/soil_id/tests/us/test_us.py index 1a104d35..5d8ade87 100644 --- a/soil_id/tests/us/test_us.py +++ b/soil_id/tests/us/test_us.py @@ -18,6 +18,7 @@ import pytest +from soil_id.db import get_datastore_connection from soil_id.us_soil import list_soils, rank_soils test_locations = [ @@ -55,7 +56,7 @@ def test_soil_location(location): cracks = False start_time = time.perf_counter() - list_soils_result = list_soils(location["lon"], location["lat"]) + list_soils_result = list_soils(get_datastore_connection(), location["lon"], location["lat"]) logging.info(f"...time: {(time.perf_counter() - start_time):.2f}s") rank_soils( location["lon"], @@ -74,7 +75,9 @@ def test_soil_location(location): def test_empty_rank(): - SoilListOutputData = list_soils(test_locations[0]["lon"], test_locations[0]["lat"]) + SoilListOutputData = list_soils( + get_datastore_connection(), test_locations[0]["lon"], test_locations[0]["lat"] + ) rank_soils( test_locations[0]["lon"], test_locations[0]["lat"], diff --git a/soil_id/us_soil.py b/soil_id/us_soil.py index fd69378d..2d0acb8e 100644 --- a/soil_id/us_soil.py +++ b/soil_id/us_soil.py @@ -25,10 +25,8 @@ import pandas as pd from pandas import json_normalize -# local libraries -import soil_id.config - from .color import getProfileLAB, lab2munsell, munsell2rgb +from .db import fetch_munsell_rgb_lab from .services import get_soil_series_data, get_soilweb_data, sda_return, get_elev_data from .soil_sim import soil_sim from .utils import ( @@ -55,15 +53,11 @@ ) # entry points -# getSoilLocationBasedGlobal # list_soils # rank_soils -# rankPredictionGlobal -# getSoilGridsGlobal -# when a site is created, call list_soils/getSoilLocationBasedGlobal. -# when a site is created, call getSoilGridsGlobal -# after user has collected data, call rank_soils/rankPredictionGlobal. +# when a site is created, call list_soils +# after user has collected data, call rank_soils # set Pandas dataframe options pd.set_option("future.no_silent_downcasting", True) @@ -76,12 +70,9 @@ class SoilListOutputData: map_unit_component_data_csv: str -############################################################################################ -# list_soils # -############################################################################################ -def list_soils(lon, lat): +def list_soils(connection, lon, lat): # Load in LAB to Munsell conversion look-up table - color_ref = pd.read_csv(soil_id.config.MUNSELL_RGB_LAB_PATH) + color_ref = fetch_munsell_rgb_lab(connection) LAB_ref = color_ref[["cielab_l", "cielab_a", "cielab_b"]] munsell_ref = color_ref[["hue", "value", "chroma"]] @@ -1538,9 +1529,6 @@ def list_soils(lon, lat): ) -############################################################################################## -# rank_soils # -############################################################################################## def rank_soils( lon, lat, @@ -2046,7 +2034,7 @@ def rank_soils( # Concatenate the sorted and ranked groups D_final = pd.concat(soilIDList_data).reset_index(drop=True) - + # Merge with the Rank_Filter data D_final = pd.merge(D_final, Rank_Filter, on="compname", how="left") diff --git a/soil_id/utils.py b/soil_id/utils.py index 1909758c..ebb19f1d 100644 --- a/soil_id/utils.py +++ b/soil_id/utils.py @@ -945,32 +945,6 @@ def _gower_distance_row( return (sum_cat + sum_num) / feature_weight_sum -def compute_site_similarity( - data: pd.DataFrame, features: list[str], feature_weight: np.ndarray -) -> np.ndarray: - """ - Compute Gower distances among the rows of `data` using only `features` - and the matching weights in `feature_weight`. - - Parameters: - - data: DataFrame with columns "compname" + at least all names in `features`. - - features: list of column names to include in the distance. - - feature_weight: 1D array of floats, same length as `features`. - - Returns: - - (n×n) numpy array of Gower distances, with NaNs replaced by the max distance. - """ - # 1) subset & index by compname - site_mat = data.set_index("compname")[features] - - # 2) compute - D = gower_distances(site_mat, feature_weight=feature_weight) - - # 3) fill NaNs - D = np.where(np.isnan(D), np.nanmax(D), D) - return D - - def compute_text_comp(bedrock, p_sandpct_intpl, soilHorizon): """ Computes a value based on the depth of bedrock and length of sand percentages. @@ -1065,48 +1039,6 @@ def compute_lab_comp(cr_df): return 20 if not cr_df.dropna().empty else 0 -def compute_data_completeness( - bedrock, p_sandpct_intpl, soilHorizon, p_cfg_intpl, rfvDepth, cracks, cr_df -): - text_comp = compute_text_comp(bedrock, p_sandpct_intpl, soilHorizon) - rf_comp = compute_rf_comp(bedrock, p_cfg_intpl, rfvDepth) - crack_comp = compute_crack_comp(cracks) - lab_comp = compute_lab_comp(cr_df) - - data_completeness = text_comp + rf_comp + crack_comp + lab_comp - - # Generate data completeness comment - if text_comp < 45: - text_comment = " soil texture," - else: - text_comment = "" - if rf_comp < 30: - rf_comment = " soil rock fragments," - else: - rf_comment = "" - if lab_comp < 20: - lab_comment = " soil color (20-50cm)," - else: - lab_comment = "" - if crack_comp < 5: - crack_comment = " soil cracking," - else: - crack_comment = "" - if data_completeness < 100: - text_completeness = ( - "To improve predictions, complete data entry for:" - + crack_comment - + text_comment - + rf_comment - + lab_comment - + " and re-sync." - ) - else: - text_completeness = "SoilID data entry for this site is complete." - - return data_completeness, text_completeness - - def trim_fraction(text): """ Removes trailing ".0" from a given text string. @@ -1782,35 +1714,6 @@ def pedon_color(lab_Color, top, bottom): return [pedon_l_mean, pedon_a_mean, pedon_b_mean] -def extract_values(obj, key): - """ - Pull all values of the specified key from a nested dictionary or list. - - Parameters: - - obj (dict or list): The nested dictionary or list to search. - - key: The key to look for. - - Returns: - - list: A list of values associated with the specified key. - """ - - arr = [] - - def extract(obj, key): - if isinstance(obj, dict): - if key in obj: - arr.append(obj[key]) - for k, v in obj.items(): - if isinstance(v, (dict, list)): - extract(v, key) - elif isinstance(obj, list): - for item in obj: - extract(item, key) - - extract(obj, key) - return arr - - # Functions for AWC simulation @@ -2235,17 +2138,6 @@ def entropy_score(series): return sorted_information_gains -# function to get data and aggregate SG data -def sg_get_and_agg(variable, sg_data_w, bottom, return_depth=False): - pd_int = getProfile_SG(sg_data_w, variable, c_bot=False) - if return_depth: - pd_lpks, lpks_depths = agg_data_layer(data=pd_int.var_pct_intpl, bottom=bottom, depth=True) - return (pd_lpks.replace(np.nan, ""), lpks_depths) - else: - pd_lpks = agg_data_layer(data=pd_int.var_pct_intpl, bottom=bottom, depth=False) - return pd_lpks.replace(np.nan, "") - - def adjust_depth_interval(data, target_length=200): """Adjusts the depth interval of user data."""