
Commit 84fa133

Merge pull request #318 from hubmapconsortium/yuanzhou/optimization
Yuanzhou/optimization
2 parents dbfcd99 + 61e4708 commit 84fa133

File tree

10 files changed: +281 -122 lines changed


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ src/VERSION
 # Rotated log files
 **/*.log*
 
+# requests-cache generated sqlite
+**/*.sqlite
+
 #ignore eclipse files
 .project
 .pydevproject

docker/docker-compose.yml

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ services:
     volumes:
       # Mount the app config to container in order to keep it outside of the image
       - "../src/instance:/usr/src/app/src/instance"
+      # Mount the directory of requests_cache generated sqlite database
+      - "../src/requests_cache:/usr/src/app/requests_cache"
      # Mount the logging to container
      - "../log:/usr/src/app/log"
      # Mount the schema yaml file

src/requests_cache/README.md

Whitespace-only changes.

src/requirements.txt

Lines changed: 4 additions & 2 deletions
@@ -6,9 +6,8 @@
 # Add this itsdangerous dependency and downgrade to 2.0.1 as a temporary solution
 itsdangerous==2.0.1
 
-cachetools==4.2.1
 Flask==1.1.2
-neo4j==4.2.1
+neo4j==4.4
 prov==2.0.0
 Werkzeug==1.0.1
 
@@ -19,6 +18,9 @@ nested-lookup==0.2.22
 requests==2.25.1
 PyYAML==5.4.1
 
+# 0.5.2 compatible with Python 3.6
+requests_cache==0.5.2
+
 # The branch name of commons to be used during image build
 # Default is master branch specified in docker-compose.yml if not set
 git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons

src/schema/provenance_schema.yaml

Lines changed: 12 additions & 3 deletions
@@ -192,12 +192,19 @@ ENTITIES:
       type: string
       description: "Free text description of the collection"
     ###### Transient properties ######
-    datasets:
+    # Causing performance issue
+    # datasets:
+    #   type: list
+    #   transient: true
+    #   generated: true
+    #   description: "The datasets that are contained in the collection."
+    #   on_read_trigger: get_collection_datasets
+    dataset_uuids:
       type: list
       transient: true
       generated: true
-      description: "The datasets that are contained in the collection."
-      on_read_trigger: get_collection_datasets
+      description: "The dataset uuids that are contained in the collection."
+      on_read_trigger: get_collection_dataset_uuids
 
 ############################################# Dataset #############################################
 Dataset:
@@ -766,9 +773,11 @@ ENTITIES:
       description: 'List of datasets to remove from an Upload. Provide as a json array of the dataset uuids like: ["232934234234234234234270c0ea6c51d604a850558ef2247d0b4", "230948203482234234234a57bfe9c056d08a0f8e6cd612baa3bfa"]'
       # Use after_update_trigger instead of before_update_trigger since we are not updating this property
       after_update_trigger: unlink_datasets_from_upload
+    # Different from the handling of Collection (only returns dataset_uuids)
     datasets:
       type: list
       generated: true # Disallow user input from request json when being created
       transient: true
       description: "The datasets that are contained in this Upload."
+      # A few time-consuming read triggers are excluded
       on_read_trigger: get_upload_datasets
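
The read triggers named in this file (get_collection_dataset_uuids, get_upload_datasets) are implemented in src/schema/schema_triggers.py, which is not part of this diff. A hypothetical sketch of the shape the new Collection trigger might take — the signature and the schema_manager helper are assumptions for illustration, not the repo's actual code:

from schema import schema_manager
from schema import schema_neo4j_queries

# Hypothetical read trigger; fired when the transient dataset_uuids
# property of a Collection is read
def get_collection_dataset_uuids(property_key, normalized_type, user_token,
                                 existing_data_dict, new_data_dict):
    # Delegate to the lightweight query added in schema_neo4j_queries.py,
    # returning bare uuids so no per-dataset read triggers fire
    dataset_uuids = schema_neo4j_queries.get_collection_dataset_uuids(
        schema_manager.get_neo4j_driver_instance(),
        existing_data_dict['uuid'])
    return property_key, dataset_uuids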

src/schema/schema_constants.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+class SchemaConstants(object):
+
+    # File path to the requests_cache generated sqlite (without extension) within docker container, DO NOT MODIFY
+    # Expire the cache after the time-to-live (seconds)
+    REQUESTS_CACHE_BACKEND = 'sqlite'
+    REQUESTS_CACHE_SQLITE_NAME = '/usr/src/app/requests_cache/entity-api'
+    REQUESTS_CACHE_TTL = 7200
+
+    # Constants used by validators
+    INGEST_API_APP = 'ingest-api'
+    INGEST_PIPELINE_APP = 'ingest-pipeline'
+    HUBMAP_APP_HEADER = 'X-Hubmap-Application'
+    DATASET_STATUS_PUBLISHED = 'published'
+
+    # Used by triggers, all lowercase for easy comparison
+    ACCESS_LEVEL_PUBLIC = 'public'
+    ACCESS_LEVEL_CONSORTIUM = 'consortium'
+    ACCESS_LEVEL_PROTECTED = 'protected'
+
+    # Yaml file to parse organ description
+    ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
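
For context, the three REQUESTS_CACHE_* constants are consumed by schema_manager.py in the next file; a condensed sketch of that wiring:

import requests_cache
from schema.schema_constants import SchemaConstants

# Install a process-wide HTTP cache for the requests library: once active,
# every requests.get() is transparently served from the sqlite file
# whenever a fresh-enough (younger than the 7200s TTL) entry exists
requests_cache.install_cache(SchemaConstants.REQUESTS_CACHE_SQLITE_NAME,
                             backend=SchemaConstants.REQUESTS_CACHE_BACKEND,
                             expire_after=SchemaConstants.REQUESTS_CACHE_TTL)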

src/schema/schema_manager.py

Lines changed: 24 additions & 11 deletions
@@ -1,20 +1,20 @@
 import ast
+import time
 import yaml
 import logging
 import requests
+import requests_cache
 from cachetools import cached, TTLCache
 
 # Don't confuse urllib (Python native library) with urllib3 (3rd-party library, requests also uses urllib3)
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
 from flask import Response
 
-# Use the current_app proxy, which points to the application handling the current activity
-from flask import current_app as app
-
 # Local modules
 from schema import schema_errors
 from schema import schema_triggers
 from schema import schema_validators
+from schema.schema_constants import SchemaConstants
 
 # HuBMAP commons
 from hubmap_commons.hm_auth import AuthHelper
@@ -25,12 +25,10 @@
 # Suppress InsecureRequestWarning warning when requesting status on https with ssl cert verify disabled
 requests.packages.urllib3.disable_warnings(category = InsecureRequestWarning)
 
-# LRU Cache implementation with per-item time-to-live (TTL) value
-# with a memoizing callable that saves up to maxsize results based on a Least Frequently Used (LFU) algorithm
-# with a per-item time-to-live (TTL) value
-# The maximum integer number of entries in the cache queue: 128
-# Expire the cache after the time-to-live (seconds): two hours, 7200 seconds
-cache = TTLCache(128, ttl=7200)
+# Requests cache generates the sqlite file
+# File path without the .sqlite extension
+# Expire the cache after the time-to-live (7200 seconds)
+requests_cache.install_cache(SchemaConstants.REQUESTS_CACHE_SQLITE_NAME, backend=SchemaConstants.REQUESTS_CACHE_BACKEND, expire_after=SchemaConstants.REQUESTS_CACHE_TTL)
 
 # In Python, "privacy" depends on "consenting adults'" levels of agreement, we can't force it.
 # A single leading underscore means you're not supposed to access it "from the outside"
@@ -41,6 +39,7 @@
 _auth_helper = None
 _neo4j_driver = None
 
+
 ####################################################################################################
 ## Provenance yaml schema initialization
 ####################################################################################################
@@ -97,7 +96,6 @@ def initialize(valid_yaml_file,
     dict
         A dict containing the schema details
     """
-    @cached(cache)
    def load_provenance_schema(valid_yaml_file):
        with open(valid_yaml_file) as file:
            schema_dict = yaml.safe_load(file)
@@ -1060,7 +1058,10 @@ def get_hubmap_ids(id, user_token):
     request_headers = _create_request_headers(user_token)
 
     # Disable ssl certificate verification
-    response = requests.get(url = target_url, headers = request_headers, verify = False)
+    response = requests.get(url = target_url, headers = request_headers, verify = False)
+
+    # Verify if the cached response is being used
+    _verify_request_cache(target_url, response.from_cache)
 
     # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
     response.raise_for_status()
@@ -1536,4 +1537,16 @@ def _create_request_headers(user_token):
 
     return headers_dict
 
+"""
+Verify if the cached response is being used
 
+Parameters
+----------
+url : str
+    The request url
+response_from_cache : bool
+    If response.from_cache is used or not
+"""
+def _verify_request_cache(url, response_from_cache):
+    now = time.ctime(int(time.time()))
+    logger.info(f"Time: {now} / GET request URL: {url} / Requests cache used: {response_from_cache}")
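
The new _verify_request_cache() helper relies on the from_cache attribute that requests-cache 0.5.x attaches to responses once install_cache() is active. A minimal standalone sketch of that behavior — the cache name and URL here are illustrative, not from the codebase:

import requests
import requests_cache

# Any GET issued after install_cache() is transparently cached in sqlite
requests_cache.install_cache('demo_cache', backend='sqlite', expire_after=7200)

for attempt in range(2):
    response = requests.get('https://httpbin.org/get')
    # First iteration hits the network (from_cache == False);
    # the second is served from the cache (from_cache == True)
    print(attempt, response.from_cache)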

src/schema/schema_neo4j_queries.py

Lines changed: 82 additions & 22 deletions
@@ -76,22 +76,47 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
     organ_name = None
     donor_metadata = None
 
-    query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
-             # Filter out the Lab entities
-             f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
-             # COLLECT() returns a list
-             # apoc.coll.toSet() returns a set containing unique nodes
-             f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
+    with neo4j_driver.session() as session:
+        # Old time-consuming single query, it takes a significant amount of DB hits
+        # query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
+        #          f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+        #          f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
 
-    logger.info("======get_dataset_organ_and_donor_info() query======")
-    logger.info(query)
+        # logger.info("======get_dataset_organ_and_donor_info() query======")
+        # logger.info(query)
 
-    with neo4j_driver.session() as session:
-        record = session.read_transaction(_execute_readonly_tx, query)
+        # with neo4j_driver.session() as session:
+        #     record = session.read_transaction(_execute_readonly_tx, query)
+
+        # if record:
+        #     organ_name = record['organ_name']
+        #     donor_metadata = record['donor_metadata']
+
+        # To improve the query performance, we implement the two-step queries to drastically reduce the DB hits
+        sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) "
+                        f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+                        f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid")
+
+        logger.info("======get_dataset_organ_and_donor_info() sample_query======")
+        logger.info(sample_query)
+
+        sample_record = session.read_transaction(_execute_readonly_tx, sample_query)
+
+        if sample_record:
+            organ_name = sample_record['organ_name']
+            sample_uuid = sample_record['sample_uuid']
+
+            donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) "
+                           f"WHERE s.uuid='{sample_uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+                           f"RETURN DISTINCT d.metadata AS donor_metadata")
+
+            logger.info("======get_dataset_organ_and_donor_info() donor_query======")
+            logger.info(donor_query)
+
+            donor_record = session.read_transaction(_execute_readonly_tx, donor_query)
 
-    if record:
-        organ_name = record['organ_name']
-        donor_metadata = record['donor_metadata']
+            if donor_record:
+                donor_metadata = donor_record['donor_metadata']
 
     return organ_name, donor_metadata
@@ -336,8 +361,43 @@ def get_dataset_upload(neo4j_driver, uuid, property_key = None):
     return result
 
 
+# """
+# Get a list of associated dataset dicts for a given collection
+
+# Parameters
+# ----------
+# neo4j_driver : neo4j.Driver object
+#     The neo4j database connection pool
+# uuid : str
+#     The uuid of collection
+
+# Returns
+# -------
+# list
+#     The list containing associated dataset dicts
+# """
+# def get_collection_datasets(neo4j_driver, uuid):
+#     results = []
+
+#     query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+#              f"WHERE c.uuid = '{uuid}' "
+#              f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
+
+#     logger.info("======get_collection_datasets() query======")
+#     logger.info(query)
+
+#     with neo4j_driver.session() as session:
+#         record = session.read_transaction(_execute_readonly_tx, query)
+
+#         if record and record[record_field_name]:
+#             # Convert the list of nodes to a list of dicts
+#             results = _nodes_to_dicts(record[record_field_name])
+
+#     return results
+
+
 """
-Get a list of associated dataset dicts for a given collection
+Get a list of associated dataset uuids for a given Collection
 
 Parameters
 ----------
@@ -349,24 +409,24 @@ def get_dataset_upload(neo4j_driver, uuid, property_key = None):
 Returns
 -------
 list
-    The list containing associated dataset dicts
+    The list of associated dataset uuids
 """
-def get_collection_datasets(neo4j_driver, uuid):
+def get_collection_dataset_uuids(neo4j_driver, uuid):
     results = []
 
-    query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+    query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
              f"WHERE c.uuid = '{uuid}' "
             f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
 
-    logger.info("======get_collection_datasets() query======")
+    logger.info("======get_collection_dataset_uuids() query======")
     logger.info(query)
 
     with neo4j_driver.session() as session:
         record = session.read_transaction(_execute_readonly_tx, query)
 
         if record and record[record_field_name]:
-            # Convert the list of nodes to a list of dicts
-            results = _nodes_to_dicts(record[record_field_name])
+            # Just return the list of uuids
+            results = record[record_field_name]
 
     return results
@@ -467,7 +527,7 @@ def unlink_datasets_from_upload(neo4j_driver, upload_uuid, dataset_uuids_list):
 
 
 """
-Get a list of associated dataset dicts for a given collection
+Get a list of associated dataset dicts for a given Upload
 
 Parameters
 ----------
484544
def get_upload_datasets(neo4j_driver, uuid):
485545
results = []
486546

487-
query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) "
547+
query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
488548
f"WHERE s.uuid = '{uuid}' "
489549
f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
490550
