Skip to content

Commit 8c951d4

Browse files
authored
Merge pull request #405 from hubmapconsortium/Derek-Furst/identify-paired-datasets
Derek furst/identify paired datasets
2 parents ac01369 + 7134ff7 commit 8c951d4

File tree

3 files changed

+194
-1
lines changed

3 files changed

+194
-1
lines changed

entity-api-spec.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2014,5 +2014,47 @@ paths:
20142014
description: The user's token has expired or the user did not supply a valid token
20152015
'403':
20162016
description: The user is not authorized to use this method
2017+
'500':
2018+
description: Internal error
2019+
'/datasets/{id}/paired-dataset':
  get:
    summary: Retrieve uuids for associated dataset of given data_type which shares a sample ancestor of given dataset id
    parameters:
      - name: id
        in: path
        description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID (32 digit hexadecimal number)
        required: true
        schema:
          type: string
      - name: data_type
        in: query
        description: The desired data_type to be searched for
        required: true
        schema:
          type: string
      - name: search_depth
        in: query
        description: The maximum number of generations of datasets beneath the sample to search for the paired dataset
        required: false
        schema:
          type: integer
    responses:
      '200':
        description: Returns a list (json array) of the uuids (if any) of the matching paired datasets.
        content:
          application/json:
            schema:
              # The endpoint returns a plain array of uuid strings, so the
              # element schema belongs under `items` (arrays have no `properties`).
              type: array
              items:
                type: string
                description: The unique identifier (uuid) of a matching paired dataset
      '400':
        description: Bad request. The required 'data_type' argument is missing, an unrecognized argument was supplied, 'search_depth' is not an integer, or the given id does not belong to a Dataset.
      '404':
        description: Not found. No matching datasets were found, or none were found that the user is authorized to see.
      '401':
        description: The user's token has expired or the user did not supply a valid token
      '403':
        description: The given dataset is unpublished and the user does not have the authorization to view it.
      '500':
        description: Internal error

src/app.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3399,6 +3399,114 @@ def unpublished():
33993399
else:
34003400
return jsonify(unpublished_info)
34013401

3402+
"""
3403+
Retrieve uuids for associated dataset of given data_type which
3404+
shares a sample ancestor of given dataset id
3405+
3406+
Returns
3407+
--------
3408+
json array
3409+
List of uuids of all datasets (if any) of the specified data_type
3410+
who share a sample ancestor with the dataset with the given id
3411+
3412+
Authorization
3413+
-------------
3414+
This endpoint is publicly accessible, however if a token is provided,
3415+
it must be valid. If the given dataset uuid is for an unpublished dataset,
3416+
the user must be part of the HuBMAP-Read-Group. If not, a 403 will be raised.
3417+
3418+
Path Parameters
3419+
---------------
3420+
id : str
3421+
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
3422+
3423+
Required Query Parameters
3424+
------------------------
3425+
data_type : str
3426+
The data type to be searched for.
3427+
3428+
Optional Query Parameters
3429+
------------------------
3430+
search_depth : int
3431+
The max number of generations of datasets to search for associated paired
3432+
dataset. This number is the number of generations between the shared sample
3433+
ancestor and the target dataset (if any) rather than the starting dataset.
3434+
This number counts dataset generations and not activity nodes or any other
3435+
intermediate steps between 2 datasets. If no search_depth is given, the
3436+
search will traverse all descendants of the sample ancestor.
3437+
3438+
If the associated datasets (if any exist) returned are unpublished, they are only included in the response when the user is a member of the HuBMAP-Read-Group.
3439+
"""
3440+
@app.route('/datasets/<id>/paired-dataset', methods=['GET'])
def paired_dataset(id):
    """Return the uuids of datasets of a given data_type that share a sample ancestor with dataset `id`.

    Query arguments: 'data_type' (required) and 'search_depth' (optional,
    non-negative integer). Responds with a json array of uuids and a 200,
    or delegates to the bad_request/unauthorized/forbidden/not_found
    error helpers.

    NOTE(review): like the rest of this handler, the flow below relies on
    the *_error helpers aborting the request rather than returning.
    """
    # The endpoint is public, but a token that IS supplied must be valid
    if request.headers.get('Authorization') is not None:
        try:
            user_token = auth_helper_instance.getAuthorizationTokens(request.headers)
        except Exception:
            msg = "Failed to parse the Authorization token by calling commons.auth_helper.getAuthorizationTokens()"
            # Log the full stack trace, prepend a line with our message
            logger.exception(msg)
            internal_server_error(msg)

        # When the Authorization header is provided but the user_token is a flask.Response instance,
        # it MUST be a 401 error with message.
        # That's how commons.auth_helper.getAuthorizationTokens() was designed
        if isinstance(user_token, Response):
            # We wrap the message in a json and send back to requester as 401 too
            # The Response.data returns binary string, need to decode
            unauthorized_error(user_token.get_data().decode())

        # Also check if the parsed token is invalid or expired
        # Set the second parameter as False to skip group check
        user_info = auth_helper_instance.getUserInfo(user_token, False)
        if isinstance(user_info, Response):
            unauthorized_error(user_info.get_data().decode())

    # Validate the query string: reject unknown arguments first, then
    # require 'data_type' (an empty query string falls through to this check)
    accepted_arguments = ('data_type', 'search_depth')
    for argument in request.args:
        if argument not in accepted_arguments:
            bad_request_error(f"{argument} is an unrecognized argument.")
    if 'data_type' not in request.args:
        bad_request_error("'data_type' is a required argument")
    data_type = request.args.get('data_type')

    search_depth = None
    if 'search_depth' in request.args:
        try:
            search_depth = int(request.args.get('search_depth'))
        except ValueError:
            bad_request_error("'search_depth' must be an integer")
        # A negative depth would be turned into an invalid Cypher range
        # (e.g. "*..-2"), so reject it here as a client error instead
        if search_depth < 0:
            bad_request_error("'search_depth' must not be negative")

    # Use the internal token to query the target entity
    # since public entities don't require user token
    token = get_internal_token()

    # Query target entity against uuid-api and neo4j and return as a dict if exists
    entity_dict = query_target_entity(id, token)
    uuid = entity_dict['uuid']
    normalized_entity_type = entity_dict['entity_type']

    # Only Datasets are supported
    if normalized_entity_type != 'Dataset':
        bad_request_error("The target entity of the specified id is not a Dataset")

    # An unpublished starting dataset is only visible to the read group
    if entity_dict['status'].lower() != DATASET_STATUS_PUBLISHED:
        if not user_in_hubmap_read_group(request):
            forbidden_error("Access not granted")

    matches = app_neo4j_queries.get_paired_dataset(neo4j_driver_instance, uuid, data_type, search_depth)

    # Resolve group membership once (and only when there is something to
    # filter) instead of re-checking it for every matched dataset
    can_see_unpublished = bool(matches) and user_in_hubmap_read_group(request)
    out_list = [
        match['uuid']
        for match in matches
        if can_see_unpublished or match['status'].lower() == 'published'
    ]

    if not out_list:
        not_found_error(f"Search for paired datasets of type {data_type} for dataset with id {uuid} returned no results")
    return jsonify(out_list), 200
3509+
34023510

34033511
####################################################################################################
34043512
## Internal Functions
@@ -4138,7 +4246,7 @@ def query_target_entity(id, user_token):
41384246
not_found_error(e.response.text)
41394247
else:
41404248
internal_server_error(e.response.text)
4141-
4249+
41424250
# Final return
41434251
return entity_dict
41444252

src/app_neo4j_queries.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1327,6 +1327,49 @@ def get_unpublished(neo4j_driver):
13271327
rval = session.run(query).data()
13281328
return rval
13291329

1330+
"""
1331+
Returns a list of dictionaries corresponding to matches to the neo4j query
1332+
containing uuid of matched datasets (if any) and their status
1333+
1334+
Parameters
1335+
---------
1336+
neo4j_driver : neo4j.Driver object
1337+
The neo4j database connection pool
1338+
1339+
uuid
1340+
----
1341+
The id of the dataset whose paired datasets will be returned
1342+
1343+
data_type
1344+
---------
1345+
The datatype of the paired datasets being searched for
1346+
1347+
search_depth (optional)
1348+
----------------------
1349+
The max number of generations that will be searched beneath the
1350+
sample ancestor of the given dataset uuid. The value given will be
1351+
doubled in the query so that it only counts dataset nodes, not
1352+
activity nodes between datasets. For example, a value of "2" will search
1353+
(s:Sample)-[r1]->(a1:Activity)-[r2]->(d1:Dataset)-[r3]->(a2:Activity)-[r4]->(d2:Dataset)
1354+
or 2*2 nodes beyond the sample
1355+
"""
1356+
def get_paired_dataset(neo4j_driver, uuid, data_type, search_depth):
    """Query neo4j for datasets of `data_type` that descend from a sample ancestor of dataset `uuid`.

    Parameters
    ----------
    neo4j_driver : neo4j.Driver object
        The neo4j database connection pool
    uuid : str
        The uuid of the dataset whose paired datasets are searched for
    data_type : str
        The data type of the paired datasets; matched against the
        `data_types` property as the string form of a one-element list,
        e.g. "['<data_type>']"
    search_depth : int or None
        Max number of dataset generations beneath the sample to search.
        Doubled in the query because an activity node sits between entity
        nodes. None searches without a depth limit.

    Returns
    -------
    list of dict
        One dict per match with the keys 'uuid' and 'status'

    Raises
    ------
    ValueError
        If search_depth is negative or can not be converted to an integer
    """
    # The bounds of a variable-length relationship can not be supplied as a
    # Cypher parameter, so the range is interpolated - but only after being
    # coerced to a non-negative int so nothing else can reach the query text
    number_of_jumps = "*"
    if search_depth is not None:
        depth = int(search_depth)
        if depth < 0:
            raise ValueError("search_depth must not be negative")
        number_of_jumps = f"*..{2 * depth}"

    # uuid and data_type originate from the request, so they are passed as
    # query parameters instead of being formatted into the Cypher string.
    # Each fragment ends with a space so the clauses stay separated.
    query = (
        'MATCH (ds:Dataset)<-[*]-(s:Sample) '
        'WHERE ds.uuid = $uuid AND (:Dataset)<-[]-()<-[]-(s) '
        f'MATCH (ods)<-[{number_of_jumps}]-(s) '
        'WHERE ods.data_types = $data_type '
        'RETURN ods.uuid AS uuid, ods.status AS status'
    )
    parameters = {
        'uuid': uuid,
        'data_type': f"['{data_type}']",
    }

    with neo4j_driver.session() as session:
        return session.run(query, parameters).data()
1372+
13301373
####################################################################################################
13311374
## Internal Functions
13321375
####################################################################################################

0 commit comments

Comments
 (0)