Merge pull request #405 from hubmapconsortium/Derek-Furst/identify-paired-datasets

yuanzhou · web-flow · commit 8c951d44ef17 · 2023-01-04T14:42:22.000-05:00
Derek furst/identify paired datasets
diff --git a/entity-api-spec.yaml b/entity-api-spec.yaml
@@ -2014,5 +2014,47 @@ paths:
           description: The user's token has expired or the user did not supply a valid token
         '403':
           description: THe user is not authorized to use this method
+        '500':
+          description: Internal error
+  # Path item for the paired-dataset lookup. The 200 response is a flat JSON array of
+  # uuid strings, so the array schema uses `items` (arrays have no `properties`).
+  '/datasets/{id}/paired-dataset':
+    get:
+      summary: Retrieve the uuids of the datasets of the given data_type which share a sample ancestor with the dataset of the given id
+      parameters:
+        - name: id
+          in: path
+          description: The unique identifier of entity.  This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID (32 digit hexadecimal number)
+          required: true
+          schema:
+            type: string
+        - name: data_type
+          in: query
+          description: The desired data_type to be searched for
+          required: true
+          schema:
+            type: string
+        - name: search_depth
+          in: query
+          description: The maximum number of generations of datasets beneath the sample to search for the paired dataset
+          required: false
+          schema:
+            type: integer
+      responses:
+        '200':
+          description: Returns a list (json array) of the uuids of the matching paired datasets.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  type: string
+                  description: The unique identifier (uuid) of a matching paired dataset
+        '400':
+          description: Bad request. The required data_type argument is missing, an unrecognized argument was given, search_depth is not an integer, or the given id is not for a Dataset.
+        '404':
+          description: Not found. No matching datasets were found, or none were found that the user is authorized to see.
+        '401':
+          description: The user's token has expired or the user did not supply a valid token
+        '403':
+          description: The given dataset is unpublished and the user does not have the authorization to view it.
         '500':
           description: Internal error
diff --git a/src/app.py b/src/app.py
@@ -3399,6 +3399,114 @@ def unpublished():
     else:
         return jsonify(unpublished_info)
 
+"""
+Retrieve uuids for associated dataset of given data_type which 
+shares a sample ancestor of given dataset id
+
+Returns
+--------
+json array
+    List of uuids of all datasets (if any) of the specified data_type
+     who share a sample ancestor with the dataset with the given id
+
+Authorization
+-------------
+This endpoint is publicly accessible, however if a token is provided, 
+it must be valid. If the given dataset uuid is for an unpublished dataset,
+the user must be part of the HuBMAP-Read-Group. If not, a 403 will be raised.
+
+Path Parameters
+---------------
+id : str
+    The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
+
+Required Query Paramters
+------------------------
+data_type : str
+    The data type to be searched for.
+    
+Optional Query Paramters
+------------------------
+search_depth : int
+    The max number of generations of datasets to search for associated paired 
+    dataset. This number is the number of generations between the shared sample
+    ancestor and the target dataset (if any) rather than the starting dataset. 
+    This number counts dataset generations and not activity nodes or any other 
+    intermediate steps between 2 datasets. If no search_depth is given, the 
+    search will traverse all descendants of the sample ancestor.  
+
+If the associated datasets (if any exist) returned are unpublished, they    
+"""
+@app.route('/datasets/<id>/paired-dataset', methods=['GET'])
+def paired_dataset(id):
+    if request.headers.get('Authorization') is not None:
+        try:
+            user_token = auth_helper_instance.getAuthorizationTokens(request.headers)
+        except Exception:
+            msg = "Failed to parse the Authorization token by calling commons.auth_helper.getAuthorizationTokens()"
+            # Log the full stack trace, prepend a line with our message
+            logger.exception(msg)
+            internal_server_error(msg)
+        # When the Authoriztion header provided but the user_token is a flask.Response instance,
+        # it MUST be a 401 error with message.
+        # That's how commons.auth_helper.getAuthorizationTokens() was designed
+        if isinstance(user_token, Response):
+            # We wrap the message in a json and send back to requester as 401 too
+            # The Response.data returns binary string, need to decode
+            unauthorized_error(user_token.get_data().decode())
+        # Also check if the parased token is invalid or expired
+        # Set the second paremeter as False to skip group check
+        user_info = auth_helper_instance.getUserInfo(user_token, False)
+        if isinstance(user_info, Response):
+            unauthorized_error(user_info.get_data().decode())
+
+    accepted_arguments = ['data_type', 'search_depth']
+    if not bool(request.args):
+        bad_request_error(f"'data_type' is a required argument")
+    else:
+        for argument in request.args:
+            if argument not in accepted_arguments:
+                bad_request_error(f"{argument} is an unrecognized argument.")
+        if 'data_type' not in request.args:
+            bad_request_error(f"'data_type' is a required argument")
+        else:
+            data_type = request.args.get('data_type')
+        if 'search_depth' in request.args:
+            try:
+                search_depth = int(request.args.get('search_depth'))
+            except ValueError:
+                bad_request_error(f"'search_depth' must be an integer")
+        else:
+            search_depth = None
+    # Use the internal token to query the target entity
+    # since public entities don't require user token
+    token = get_internal_token()
+
+    # Query target entity against uuid-api and neo4j and return as a dict if exists
+    # Then retrieve the allowable data access level (public, protected or consortium)
+    # for the dataset and HuBMAP Component ID that the dataset belongs to
+    entity_dict = query_target_entity(id, token)
+    uuid = entity_dict['uuid']
+    normalized_entity_type = entity_dict['entity_type']
+
+    # Only for Dataset and Upload
+    if normalized_entity_type != 'Dataset':
+        bad_request_error("The target entity of the specified id is not a Dataset")
+
+    if entity_dict['status'].lower() != DATASET_STATUS_PUBLISHED:
+        if not user_in_hubmap_read_group(request):
+            forbidden_error("Access not granted")
+
+    paired_dataset = app_neo4j_queries.get_paired_dataset(neo4j_driver_instance, uuid, data_type, search_depth)
+    out_list = []
+    for result in paired_dataset:
+        if user_in_hubmap_read_group(request) or result['status'].lower() == 'published':
+            out_list.append(result['uuid'])
+    if len(out_list) < 1:
+        not_found_error(f"Search for paired datasets of type {data_type} for dataset with id {uuid} returned no results")
+    else:
+        return jsonify(out_list), 200
+
 
 ####################################################################################################
 ## Internal Functions
@@ -4138,7 +4246,7 @@ def query_target_entity(id, user_token):
                 not_found_error(e.response.text)
             else:
                 internal_server_error(e.response.text)
-    
+
     # Final return
     return entity_dict
 
diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
@@ -1327,6 +1327,49 @@ def get_unpublished(neo4j_driver):
         rval = session.run(query).data()
         return rval
 
+"""
+Returns a list of dictionaries corresponding to matches to the neo4j query
+containing uuid of matched datasets (if any) and their status
+
+Paramters
+---------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+
+uuid
+----
+The id of the dataset who's paired datasets will be returned
+
+data_type
+---------
+The datatype of the paired datasets being searched for
+
+search_depth (optional)
+----------------------
+The max number of generations that will be searched beneath the 
+sample ancestor of the given dataset uuid. The value given will be
+doubled in the query so that it only counts dataset nodes, not 
+activity nodes between datasets. For example, a value of "2" will search
+(s:Sample)-[r1]->(a1:Activity)-[r2]->(d1:Dataset)-[r3]->(a2:Activity)-[r4]->(d2:Dataset)
+or 2*2 nodes beyond the sample
+"""
+def get_paired_dataset(neo4j_driver, uuid, data_type, search_depth):
+    # search depth is doubled because there is an activity node between each entity node
+    number_of_jumps = f"*"
+    if search_depth is not None:
+        search_depth = 2 * search_depth
+        number_of_jumps = f"*..{search_depth}"
+    data_type = f"['{data_type}']"
+    query = (
+        f'MATCH (ds:Dataset)<-[*]-(s:Sample) WHERE ds.uuid = "{uuid}" AND (:Dataset)<-[]-()<-[]-(s)'
+        f'MATCH (ods)<-[{number_of_jumps}]-(s) WHERE ods.data_types = "{data_type}"'
+        f'return ods.uuid as uuid, ods.status as status'
+    )
+    paired_datasets = []
+    with neo4j_driver.session() as session:
+        rval = session.run(query).data()
+        return rval
+
 ####################################################################################################
 ## Internal Functions
 ####################################################################################################