
Commit 84fa133

Merge pull request #318 from hubmapconsortium/yuanzhou/optimization
Yuanzhou/optimization
2 parents dbfcd99 + 61e4708 commit 84fa133

File tree

10 files changed: +281 -122 lines changed


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ src/VERSION
 # Rotated log files
 **/*.log*
 
+# requests-cache generated sqlite
+**/*.sqlite
+
 #ignore eclipse files
 .project
 .pydevproject

docker/docker-compose.yml

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ services:
     volumes:
       # Mount the app config to container in order to keep it outside of the image
       - "../src/instance:/usr/src/app/src/instance"
+      # Mount the directory of requests_cache generated sqlite database
+      - "../src/requests_cache:/usr/src/app/requests_cache"
      # Mount the logging to container
      - "../log:/usr/src/app/log"
      # Mount the schema yaml file

src/requests_cache/README.md

Whitespace-only changes.

src/requirements.txt

Lines changed: 4 additions & 2 deletions
@@ -6,9 +6,8 @@
 # Add this itsdangerous dependency and downgrade to 2.0.1 as a temporary solution
 itsdangerous==2.0.1
 
-cachetools==4.2.1
 Flask==1.1.2
-neo4j==4.2.1
+neo4j==4.4
 prov==2.0.0
 Werkzeug==1.0.1
 
@@ -19,6 +18,9 @@ nested-lookup==0.2.22
 requests==2.25.1
 PyYAML==5.4.1
 
+# 0.5.2 compatible with Python 3.6
+requests_cache==0.5.2
+
 # The branch name of commons to be used during image build
 # Default is master branch specified in docker-compose.yml if not set
 git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons

src/schema/provenance_schema.yaml

Lines changed: 12 additions & 3 deletions
@@ -192,12 +192,19 @@ ENTITIES:
       type: string
       description: "Free text description of the collection"
     ###### Transient properties ######
-    datasets:
+    # Causing performance issue
+    # datasets:
+    #   type: list
+    #   transient: true
+    #   generated: true
+    #   description: "The datasets that are contained in the collection."
+    #   on_read_trigger: get_collection_datasets
+    dataset_uuids:
       type: list
       transient: true
       generated: true
-      description: "The datasets that are contained in the collection."
-      on_read_trigger: get_collection_datasets
+      description: "The dataset uuids that are contained in the collection."
+      on_read_trigger: get_collection_dataset_uuids
 
 ############################################# Dataset #############################################
 Dataset:
@@ -766,9 +773,11 @@ ENTITIES:
       description: 'List of datasets to remove from an Upload. Provide as a json array of the dataset uuids like: ["232934234234234234234270c0ea6c51d604a850558ef2247d0b4", "230948203482234234234a57bfe9c056d08a0f8e6cd612baa3bfa"]'
       # Use after_update_trigger instead of before_update_trigger since we are not updating this property
       after_update_trigger: unlink_datasets_from_upload
+    # Different from the handling of Collection (only returns dataset_uuids)
     datasets:
       type: list
       generated: true # Disallow user input from request json when being created
       transient: true
       description: "The datasets that are contained in this Upload."
+      # A few time-consuming read triggers are excluded
       on_read_trigger: get_upload_datasets
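
The read triggers named in this file (get_collection_dataset_uuids, get_upload_datasets) are implemented in src/schema/schema_triggers.py, which is not part of this diff. A hypothetical sketch of the shape the new Collection trigger might take — the signature and the schema_manager helper are assumptions for illustration, not the repo's actual code:

from schema import schema_manager
from schema import schema_neo4j_queries

# Hypothetical read trigger; fired when the transient dataset_uuids
# property of a Collection is read
def get_collection_dataset_uuids(property_key, normalized_type, user_token,
                                 existing_data_dict, new_data_dict):
    # Delegate to the lightweight query added in schema_neo4j_queries.py,
    # returning bare uuids so no per-dataset read triggers fire
    dataset_uuids = schema_neo4j_queries.get_collection_dataset_uuids(
        schema_manager.get_neo4j_driver_instance(),
        existing_data_dict['uuid'])
    return property_key, dataset_uuids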

src/schema/schema_constants.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+class SchemaConstants(object):
+
+    # File path to the requests_cache generated sqlite (without extension) within docker container, DO NOT MODIFY
+    # Expire the cache after the time-to-live (seconds)
+    REQUESTS_CACHE_BACKEND = 'sqlite'
+    REQUESTS_CACHE_SQLITE_NAME = '/usr/src/app/requests_cache/entity-api'
+    REQUESTS_CACHE_TTL = 7200
+
+    # Constants used by validators
+    INGEST_API_APP = 'ingest-api'
+    INGEST_PIPELINE_APP = 'ingest-pipeline'
+    HUBMAP_APP_HEADER = 'X-Hubmap-Application'
+    DATASET_STATUS_PUBLISHED = 'published'
+
+    # Used by triggers, all lowercase for easy comparison
+    ACCESS_LEVEL_PUBLIC = 'public'
+    ACCESS_LEVEL_CONSORTIUM = 'consortium'
+    ACCESS_LEVEL_PROTECTED = 'protected'
+
+    # Yaml file to parse organ description
+    ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
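
For context, the three REQUESTS_CACHE_* constants are consumed by schema_manager.py in the next file; a condensed sketch of that wiring:

import requests_cache
from schema.schema_constants import SchemaConstants

# Install a process-wide HTTP cache for the requests library: once active,
# every requests.get() is transparently served from the sqlite file
# whenever a fresh-enough (younger than the 7200s TTL) entry exists
requests_cache.install_cache(SchemaConstants.REQUESTS_CACHE_SQLITE_NAME,
                             backend=SchemaConstants.REQUESTS_CACHE_BACKEND,
                             expire_after=SchemaConstants.REQUESTS_CACHE_TTL)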

src/schema/schema_manager.py

Lines changed: 24 additions & 11 deletions
@@ -1,20 +1,20 @@
 import ast
+import time
 import yaml
 import logging
 import requests
+import requests_cache
 from cachetools import cached, TTLCache
 
 # Don't confuse urllib (Python native library) with urllib3 (3rd-party library, requests also uses urllib3)
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
 from flask import Response
 
-# Use the current_app proxy, which points to the application handling the current activity
-from flask import current_app as app
-
 # Local modules
 from schema import schema_errors
 from schema import schema_triggers
 from schema import schema_validators
+from schema.schema_constants import SchemaConstants
 
 # HuBMAP commons
 from hubmap_commons.hm_auth import AuthHelper
@@ -25,12 +25,10 @@
 # Suppress InsecureRequestWarning warning when requesting status on https with ssl cert verify disabled
 requests.packages.urllib3.disable_warnings(category = InsecureRequestWarning)
 
-# LRU Cache implementation with per-item time-to-live (TTL) value
-# with a memoizing callable that saves up to maxsize results based on a Least Frequently Used (LFU) algorithm
-# with a per-item time-to-live (TTL) value
-# The maximum integer number of entries in the cache queue: 128
-# Expire the cache after the time-to-live (seconds): two hours, 7200 seconds
-cache = TTLCache(128, ttl=7200)
+# Requests cache generates the sqlite file
+# File path without the .sqlite extension
+# Expire the cache after the time-to-live (7200 seconds)
+requests_cache.install_cache(SchemaConstants.REQUESTS_CACHE_SQLITE_NAME, backend=SchemaConstants.REQUESTS_CACHE_BACKEND, expire_after=SchemaConstants.REQUESTS_CACHE_TTL)
 
 # In Python, "privacy" depends on "consenting adults'" levels of agreement, we can't force it.
 # A single leading underscore means you're not supposed to access it "from the outside"
@@ -41,6 +39,7 @@
 _auth_helper = None
 _neo4j_driver = None
 
+
 ####################################################################################################
 ## Provenance yaml schema initialization
 ####################################################################################################
@@ -97,7 +96,6 @@ def initialize(valid_yaml_file,
     dict
         A dict containing the schema details
     """
-    @cached(cache)
    def load_provenance_schema(valid_yaml_file):
        with open(valid_yaml_file) as file:
            schema_dict = yaml.safe_load(file)
@@ -1060,7 +1058,10 @@ def get_hubmap_ids(id, user_token):
     request_headers = _create_request_headers(user_token)
 
     # Disable ssl certificate verification
-    response = requests.get(url = target_url, headers = request_headers, verify = False)
+    response = requests.get(url = target_url, headers = request_headers, verify = False)
+
+    # Verify if the cached response is being used
+    _verify_request_cache(target_url, response.from_cache)
 
     # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
     response.raise_for_status()
@@ -1536,4 +1537,16 @@ def _create_request_headers(user_token):
 
     return headers_dict
 
+"""
+Verify if the cached response is being used
 
+Parameters
+----------
+url : str
+    The request url
+response_from_cache : bool
+    If response.from_cache is used or not
+"""
+def _verify_request_cache(url, response_from_cache):
+    now = time.ctime(int(time.time()))
+    logger.info(f"Time: {now} / GET request URL: {url} / Requests cache used: {response_from_cache}")
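
The new _verify_request_cache() helper relies on the from_cache attribute that requests-cache 0.5.x attaches to responses once install_cache() is active. A minimal standalone sketch of that behavior — the cache name and URL here are illustrative, not from the codebase:

import requests
import requests_cache

# Any GET issued after install_cache() is transparently cached in sqlite
requests_cache.install_cache('demo_cache', backend='sqlite', expire_after=7200)

for attempt in range(2):
    response = requests.get('https://httpbin.org/get')
    # First iteration hits the network (from_cache == False);
    # the second is served from the cache (from_cache == True)
    print(attempt, response.from_cache)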

src/schema/schema_neo4j_queries.py

Lines changed: 82 additions & 22 deletions
@@ -76,22 +76,47 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
     organ_name = None
     donor_metadata = None
 
-    query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
-             # Filter out the Lab entities
-             f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
-             # COLLECT() returns a list
-             # apoc.coll.toSet() returns a set containing unique nodes
-             f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
+    with neo4j_driver.session() as session:
+        # Old time-consuming single query, it takes a significant amount of DB hits
+        # query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
+        #          f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+        #          f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
 
-    logger.info("======get_dataset_organ_and_donor_info() query======")
-    logger.info(query)
+        # logger.info("======get_dataset_organ_and_donor_info() query======")
+        # logger.info(query)
 
-    with neo4j_driver.session() as session:
-        record = session.read_transaction(_execute_readonly_tx, query)
+        # with neo4j_driver.session() as session:
+        #     record = session.read_transaction(_execute_readonly_tx, query)
+
+        # if record:
+        #     organ_name = record['organ_name']
+        #     donor_metadata = record['donor_metadata']
+
+        # To improve the query performance, we implement the two-step queries to drastically reduce the DB hits
+        sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) "
+                        f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+                        f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid")
+
+        logger.info("======get_dataset_organ_and_donor_info() sample_query======")
+        logger.info(sample_query)
+
+        sample_record = session.read_transaction(_execute_readonly_tx, sample_query)
+
+        if sample_record:
+            organ_name = sample_record['organ_name']
+            sample_uuid = sample_record['sample_uuid']
+
+            donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) "
+                           f"WHERE s.uuid='{sample_uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+                           f"RETURN DISTINCT d.metadata AS donor_metadata")
+
+            logger.info("======get_dataset_organ_and_donor_info() donor_query======")
+            logger.info(donor_query)
+
+            donor_record = session.read_transaction(_execute_readonly_tx, donor_query)
 
-    if record:
-        organ_name = record['organ_name']
-        donor_metadata = record['donor_metadata']
+            if donor_record:
+                donor_metadata = donor_record['donor_metadata']
 
     return organ_name, donor_metadata
@@ -336,8 +361,43 @@ def get_dataset_upload(neo4j_driver, uuid, property_key = None):
     return result
 
 
+# """
+# Get a list of associated dataset dicts for a given collection
+
+# Parameters
+# ----------
+# neo4j_driver : neo4j.Driver object
+#     The neo4j database connection pool
+# uuid : str
+#     The uuid of collection
+
+# Returns
+# -------
+# list
+#     The list containing associated dataset dicts
+# """
+# def get_collection_datasets(neo4j_driver, uuid):
+#     results = []
+
+#     query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+#              f"WHERE c.uuid = '{uuid}' "
+#              f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
+
+#     logger.info("======get_collection_datasets() query======")
+#     logger.info(query)
+
+#     with neo4j_driver.session() as session:
+#         record = session.read_transaction(_execute_readonly_tx, query)
+
+#         if record and record[record_field_name]:
+#             # Convert the list of nodes to a list of dicts
+#             results = _nodes_to_dicts(record[record_field_name])
+
+#     return results
+
+
 """
-Get a list of associated dataset dicts for a given collection
+Get a list of associated dataset uuids for a given Collection
 
 Parameters
 ----------
@@ -349,24 +409,24 @@ def get_dataset_upload(neo4j_driver, uuid, property_key = None):
 Returns
 -------
 list
-    The list containing associated dataset dicts
+    The list of associated dataset uuids
 """
-def get_collection_datasets(neo4j_driver, uuid):
+def get_collection_dataset_uuids(neo4j_driver, uuid):
     results = []
 
-    query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+    query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
              f"WHERE c.uuid = '{uuid}' "
             f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
 
-    logger.info("======get_collection_datasets() query======")
+    logger.info("======get_collection_dataset_uuids() query======")
     logger.info(query)
 
     with neo4j_driver.session() as session:
         record = session.read_transaction(_execute_readonly_tx, query)
 
         if record and record[record_field_name]:
-            # Convert the list of nodes to a list of dicts
-            results = _nodes_to_dicts(record[record_field_name])
+            # Just return the list of uuids
+            results = record[record_field_name]
 
     return results
@@ -467,7 +527,7 @@ def unlink_datasets_from_upload(neo4j_driver, upload_uuid, dataset_uuids_list):
 
 
 """
-Get a list of associated dataset dicts for a given collection
+Get a list of associated dataset dicts for a given Upload
 
 Parameters
 ----------
484544
def get_upload_datasets(neo4j_driver, uuid):
485545
results = []
486546

487-
query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) "
547+
query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
488548
f"WHERE s.uuid = '{uuid}' "
489549
f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")
490550
