Skip to content

Commit 99b5cd5

Browse files
authored
Merge pull request #297 from hubmapconsortium/test-release
v2.0.24 release
2 parents cfc710e + 672d31f commit 99b5cd5

File tree

9 files changed

+117
-63
lines changed

9 files changed

+117
-63
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.23
1+
2.0.24

docker/docker-compose.dev.yml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,9 @@ services:
88
# since nginx is running under non-root user hubmap
99
ports:
1010
- "3333:8080"
11-
environment:
12-
- HOST_GID=${HOST_GID:-1000}
13-
- HOST_UID=${HOST_UID:-1000}
14-
init: true
15-
restart: always
1611
volumes:
1712
# Mount the VERSION file and BUILD file
1813
- "../VERSION:/usr/src/app/VERSION"
1914
- "../BUILD:/usr/src/app/BUILD"
2015
# Mount the source code
21-
- "../src:/usr/src/app/src"
22-
# Mount conf.d on host machine to the nginx conf.d on container
23-
- "./entity-api/nginx/conf.d:/etc/nginx/conf.d"
16+
- "../src:/usr/src/app/src"

docker/docker-compose.localhost.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,9 @@ version: "3.7"
33
services:
44

55
entity-api:
6-
environment:
7-
- HOST_GID=${HOST_GID:-1000}
8-
- HOST_UID=${HOST_UID:-1000}
96
volumes:
107
# Mount the VERSION file and BUILD file
118
- "../VERSION:/usr/src/app/VERSION"
129
- "../BUILD:/usr/src/app/BUILD"
1310
# Mount the source code to container
14-
- "../src:/usr/src/app/src"
11+
- "../src:/usr/src/app/src"

docker/docker-compose.prod.yml

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,4 @@ services:
77
# Only root can listen on ports below 1024, we use higher-numbered ports
88
# since nginx is running under non-root user hubmap
99
ports:
10-
- "3333:8080"
11-
environment:
12-
- HOST_GID=${HOST_GID:-1000}
13-
- HOST_UID=${HOST_UID:-1000}
14-
init: true
15-
restart: always
16-
volumes:
17-
# Mount conf.d on host machine to the nginx conf.d on container
18-
- "./entity-api/nginx/conf.d:/etc/nginx/conf.d"
19-
20-
10+
- "3333:8080"

docker/docker-compose.stage.yml

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,4 @@ services:
77
# Only root can listen on ports below 1024, we use higher-numbered ports
88
# since nginx is running under non-root user hubmap
99
ports:
10-
- "3333:8080"
11-
environment:
12-
- HOST_GID=${HOST_GID:-1000}
13-
- HOST_UID=${HOST_UID:-1000}
14-
init: true
15-
restart: always
16-
volumes:
17-
# Mount conf.d on host machine to the nginx conf.d on container
18-
- "./entity-api/nginx/conf.d:/etc/nginx/conf.d"
19-
20-
10+
- "3333:8080"

docker/docker-compose.test.yml

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,4 @@ services:
77
# Only root can listen on ports below 1024, we use higher-numbered ports
88
# since nginx is running under non-root user hubmap
99
ports:
10-
- "3333:8080"
11-
environment:
12-
- HOST_GID=${HOST_GID:-1000}
13-
- HOST_UID=${HOST_UID:-1000}
14-
init: true
15-
restart: always
16-
volumes:
17-
# Mount conf.d on host machine to the nginx conf.d on container
18-
- "./entity-api/nginx/conf.d:/etc/nginx/conf.d"
19-
10+
- "3333:8080"

docker/docker-compose.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,20 @@ services:
2020
timeout: 10s
2121
retries: 3
2222
start_period: 40s
23+
environment:
24+
- HOST_GID=${HOST_GID:-1000}
25+
- HOST_UID=${HOST_UID:-1000}
26+
init: true
27+
restart: always
2328
volumes:
2429
# Mount the app config to container in order to keep it outside of the image
2530
- "../src/instance:/usr/src/app/src/instance"
2631
# Mount the logging to container
2732
- "../log:/usr/src/app/log"
2833
# Mount the schema yaml file
2934
- "../src/schema/provenance_schema.yaml:/usr/src/app/src/schema/provenance_schema.yaml"
35+
# Mount conf.d on host machine to the nginx conf.d on container
36+
- "./entity-api/nginx/conf.d:/etc/nginx/conf.d"
3037
networks:
3138
- gateway_hubmap
3239

src/app.py

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,9 @@ def get_entity_types():
544544
Result filtering is supported based on query string
545545
For example: /<entity_type>/entities?property=uuid
546546
547+
NOTE: this endpoint is NOT exposed via AWS API Gateway due to performance consideration
548+
It's only used by search-api, which makes internal calls during index/reindex time, bypassing AWS API Gateway
549+
547550
Parameters
548551
----------
549552
entity_type : str
@@ -2209,7 +2212,29 @@ def get_associated_organs_from_dataset(id):
22092212
"""
22102213
Get the complete provenance info for all datasets
22112214
2212-
Authorization handled by gateway. HuBMAP-Read group is required for this call.
2215+
Authentication
2216+
-------
2217+
No token is required, however if a token is given it must be valid or an error will be raised. If no token with HuBMAP
2218+
Read Group access is given, only datasets designated as "published" will be returned
2219+
2220+
Query Parameters
2221+
-------
2222+
format : string
2223+
Designates the output format of the returned data. Accepted values are "json" and "tsv". If none provided, by
2224+
default will return a tsv.
2225+
group_uuid : string
2226+
Filters returned datasets by a given group uuid.
2227+
organ : string
2228+
Filters returned datasets related to samples of the given organ. Accepts 2 character organ codes. These codes
2229+
must match the organ types yaml at https://raw.githubusercontent.com/hubmapconsortium/search-api/test-release/src/search-schema/data/definitions/enums/organ_types.yaml
2230+
or an error will be raised
2231+
has_rui_info : string
2232+
Accepts strings "true" or "false". Any other value will result in an error. If true, only datasets connected to
2233+
a sample that contains rui info will be returned. If false, only datasets that are NOT connected to samples
2234+
containing rui info will be returned. By default, no filtering is performed.
2235+
dataset_status : string
2236+
Filters results by dataset status. Accepted values are "Published", "QA", and "NEW". If a user only has access
2237+
to published datasets and enters QA or New, an error will be raised. By default, no filtering is performed
22132238
22142239
Returns
22152240
-------
@@ -2258,6 +2283,7 @@ def get_prov_info():
22582283
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
22592284
ASSAY_TYPES_URL = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/assay_types.yaml'
22602285
ORGAN_TYPES_URL = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
2286+
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'
22612287

22622288
headers = [
22632289
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
@@ -2270,8 +2296,16 @@ def get_prov_info():
22702296
HEADER_DONOR_GROUP_NAME, HEADER_RUI_LOCATION_HUBMAP_ID, HEADER_RUI_LOCATION_SUBMISSION_ID,
22712297
HEADER_RUI_LOCATION_UUID, HEADER_SAMPLE_METADATA_HUBMAP_ID, HEADER_SAMPLE_METADATA_SUBMISSION_ID,
22722298
HEADER_SAMPLE_METADATA_UUID, HEADER_PROCESSED_DATASET_UUID, HEADER_PROCESSED_DATASET_HUBMAP_ID,
2273-
HEADER_PROCESSED_DATASET_STATUS, HEADER_PROCESSED_DATASET_PORTAL_URL
2299+
HEADER_PROCESSED_DATASET_STATUS, HEADER_PROCESSED_DATASET_PORTAL_URL, HEADER_PREVIOUS_VERSION_HUBMAP_IDS
22742300
]
2301+
published_only = True
2302+
2303+
# Token is not required, but if an invalid token is provided,
2304+
# we need to tell the client with a 401 error
2305+
validate_token_if_auth_header_exists(request)
2306+
2307+
if user_in_hubmap_read_group(request):
2308+
published_only = False
22752309

22762310
# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
22772311
# because that would require using a urllib request for each dataset
@@ -2328,13 +2362,19 @@ def get_prov_info():
23282362
if dataset_status is not None:
23292363
if dataset_status.lower() not in ['new', 'qa', 'published']:
23302364
bad_request_error("Invalid Dataset Status. Must be 'new', 'qa', or 'published' Case-Insensitive")
2331-
param_dict['dataset_status'] = dataset_status
2365+
if published_only and dataset_status.lower() != 'published':
2366+
bad_request_error(f"Invalid Dataset Status. No auth token given or token is not a member of HuBMAP-Read"
2367+
" Group. If no token with HuBMAP-Read Group access is given, only datasets marked "
2368+
"'Published' are available. Try again with a proper token, or change/remove "
2369+
"dataset_status")
2370+
if not published_only:
2371+
param_dict['dataset_status'] = dataset_status
23322372

23332373
# Instantiation of the list dataset_prov_list
23342374
dataset_prov_list = []
23352375

23362376
# Call to app_neo4j_queries to prepare and execute the database query
2337-
prov_info = app_neo4j_queries.get_prov_info(neo4j_driver_instance, param_dict)
2377+
prov_info = app_neo4j_queries.get_prov_info(neo4j_driver_instance, param_dict, published_only)
23382378

23392379
# Each dataset's provenance info is placed into a dictionary
23402380
for dataset in prov_info:
@@ -2497,6 +2537,15 @@ def get_prov_info():
24972537
internal_dict[HEADER_PROCESSED_DATASET_UUID] = ",".join(processed_dataset_status_list)
24982538
internal_dict[HEADER_PROCESSED_DATASET_UUID] = ",".join(processed_dataset_portal_url_list)
24992539

2540+
2541+
if dataset['previous_version_hubmap_ids'] is not None:
2542+
previous_version_hubmap_ids_list = []
2543+
for item in dataset['previous_version_hubmap_ids']:
2544+
previous_version_hubmap_ids_list.append(item)
2545+
internal_dict[HEADER_PREVIOUS_VERSION_HUBMAP_IDS] = previous_version_hubmap_ids_list
2546+
if return_json is False:
2547+
internal_dict[HEADER_PREVIOUS_VERSION_HUBMAP_IDS] = ",".join(previous_version_hubmap_ids_list)
2548+
25002549
# Each dataset's dictionary is added to the list to be returned
25012550
dataset_prov_list.append(internal_dict)
25022551

@@ -2515,6 +2564,33 @@ def get_prov_info():
25152564
output.headers['Content-Disposition'] = 'attachment; filename=prov-info.tsv'
25162565
return output
25172566

2567+
2568+
"""
2569+
Get the complete provenance info for a given dataset
2570+
2571+
Authentication
2572+
-------
2573+
No token is required, however if a token is given it must be valid or an error will be raised. If no token with HuBMAP
2574+
Read Group access is given, only datasets designated as "published" will be returned
2575+
2576+
Query Parameters
2577+
-------
2578+
format : string
2579+
Designates the output format of the returned data. Accepted values are "json" and "tsv". If none provided, by
2580+
default will return a tsv.
2581+
2582+
Path Parameters
2583+
-------
2584+
id : string
2585+
A HuBMAP_ID or UUID for a dataset. If an invalid dataset id is given, an error will be raised
2586+
2587+
Returns
2588+
-------
2589+
json
2590+
an array of each dataset's provenance info
2591+
tsv
2592+
a text file of tab separated values where each row is a dataset and the columns include all its prov info
2593+
"""
25182594
@app.route('/datasets/<id>/prov-info', methods=['GET'])
25192595
def get_prov_info_for_dataset(id):
25202596
# Token is not required, but if an invalid token provided,

src/app_neo4j_queries.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -932,50 +932,55 @@ def get_associated_organs_from_dataset(neo4j_driver, dataset_uuid):
932932
The neo4j database connection pool
933933
param_dict : dictionary
934934
Dictionary containing any parameters desired to filter for certain results
935+
published_only : boolean
936+
If a user does not have a token with HuBMAP-Read Group access, published_only is set to true. This will cause only
937+
datasets with status = 'Published' to be included in the result.
935938
"""
936-
def get_prov_info(neo4j_driver, param_dict):
939+
def get_prov_info(neo4j_driver, param_dict, published_only):
937940
group_uuid_query_string = ''
938941
organ_query_string = 'OPTIONAL MATCH'
939942
organ_where_clause = ""
940943
rui_info_query_string = 'OPTIONAL MATCH (ds)<-[*]-(ruiSample:Sample)'
941944
rui_info_where_clause = "WHERE NOT ruiSample.rui_location IS NULL AND NOT trim(ruiSample.rui_location) = '' "
942945
dataset_status_query_string = ''
943-
first_param = True
946+
published_only_query_string = ''
944947
if 'group_uuid' in param_dict:
945-
first_param = False
946-
group_uuid_query_string = f" WHERE toUpper(ds.group_uuid) = '{param_dict['group_uuid'].upper()}'"
948+
group_uuid_query_string = f" AND toUpper(ds.group_uuid) = '{param_dict['group_uuid'].upper()}'"
947949
if 'organ' in param_dict:
948950
organ_query_string = 'MATCH'
949951
# organ_where_clause = f", organ: '{param_dict['organ'].upper()}'"
950-
organ_where_clause = f" WHERE toUPPER(organ.organ) = '{param_dict['organ'].upper()}'"
952+
organ_where_clause = f" WHERE toUpper(organ.organ) = '{param_dict['organ'].upper()}'"
951953
if 'has_rui_info' in param_dict:
952954
rui_info_query_string = 'MATCH (ds)<-[*]-(ruiSample:Sample)'
953955
if param_dict['has_rui_info'].lower() == 'false':
954956
rui_info_query_string = 'MATCH (ds:Dataset)'
955957
rui_info_where_clause = "WHERE NOT EXISTS {MATCH (ds)<-[*]-(ruiSample:Sample) WHERE NOT ruiSample.rui_location IS NULL AND NOT TRIM(ruiSample.rui_location) = ''} MATCH (ds)<-[*]-(ruiSample:Sample)"
956958
if 'dataset_status' in param_dict:
957-
if first_param:
958-
dataset_status_query_string = f" WHERE toUpper(ds.status) = '{param_dict['dataset_status'].upper()}'"
959-
else:
960-
dataset_status_query_string = f" AND toUpper(ds.status) = '{param_dict['dataset_status'].upper()}'"
959+
dataset_status_query_string = f" AND toUpper(ds.status) = '{param_dict['dataset_status'].upper()}'"
960+
if published_only:
961+
published_only_query_string = f" AND toUpper(ds.status) = 'PUBLISHED'"
961962
query = (f"MATCH (ds:Dataset)<-[:ACTIVITY_OUTPUT]-(a)<-[:ACTIVITY_INPUT]-(firstSample:Sample)<-[*]-(donor:Donor)"
963+
f"WHERE not (ds)-[:REVISION_OF]->(:Dataset)"
962964
f"{group_uuid_query_string}"
963965
f"{dataset_status_query_string}"
966+
f"{published_only_query_string}"
964967
f" WITH ds, COLLECT(distinct donor) AS DONOR, COLLECT(distinct firstSample) AS FIRSTSAMPLE"
968+
f" OPTIONAL MATCH (ds)<-[:REVISION_OF]-(rev:Dataset)"
969+
f" WITH ds, DONOR, FIRSTSAMPLE, COLLECT(rev.hubmap_id) as REVISIONS"
965970
f" OPTIONAL MATCH (ds)<-[*]-(metaSample:Sample)"
966971
f" WHERE NOT metaSample.metadata IS NULL AND NOT TRIM(metaSample.metadata) = ''"
967-
f" WITH ds, FIRSTSAMPLE, DONOR, collect(distinct metaSample) as METASAMPLE"
972+
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, collect(distinct metaSample) as METASAMPLE"
968973
f" {rui_info_query_string}"
969974
f" {rui_info_where_clause}"
970-
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, collect(distinct ruiSample) as RUISAMPLE"
975+
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, collect(distinct ruiSample) as RUISAMPLE"
971976
f" {organ_query_string} (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{specimen_type:'organ'}})-[*]->(ds)"
972977
f" {organ_where_clause}"
973-
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, COLLECT(DISTINCT organ) AS ORGAN "
978+
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, COLLECT(DISTINCT organ) AS ORGAN "
974979
f" OPTIONAL MATCH (ds)-[:ACTIVITY_INPUT]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
975-
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
980+
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
976981
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
977982
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
978-
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET")
983+
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS")
979984
logger.debug("======get_prov_info() query======")
980985
logger.debug(query)
981986
with neo4j_driver.session() as session:
@@ -1033,6 +1038,11 @@ def get_prov_info(neo4j_driver, param_dict):
10331038
node_dict = _node_to_dict(entry)
10341039
content_sixteen.append(node_dict)
10351040
record_dict['processed_dataset'] = content_sixteen
1041+
content_seventeen = []
1042+
for entry in record_contents[17]:
1043+
node_dict = _node_to_dict(entry)
1044+
content_seventeen.append(node_dict)
1045+
record_dict['previous_version_hubmap_ids'] = content_seventeen
10361046
list_of_dictionaries.append(record_dict)
10371047
return list_of_dictionaries
10381048

0 commit comments

Comments
 (0)