Skip to content

Commit 5c66102

Browse files
authored
Merge pull request #507 from hubmapconsortium/yuanzhou/cache-improvement
Cache for complete entity and normalized entity
2 parents 42d07ee + 584feac commit 5c66102

File tree

4 files changed

+382
-142
lines changed

4 files changed

+382
-142
lines changed

src/app.py

Lines changed: 103 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -163,15 +163,15 @@ def http_internal_server_error(e):
163163
if MEMCACHED_MODE:
164164
try:
165165
# Use client pool to maintain a pool of already-connected clients for improved performance
166-
# The uwsgi config launches the app across multiple threads (2) inside each process (4), making essentially 8 processes
166+
# The uwsgi config launches the app across multiple threads (8) inside each process (32), making essentially 256 processes
167167
# Set the connect_timeout and timeout to avoid blocking the process when memcached is slow, defaults to "forever"
168168
# connect_timeout: seconds to wait for a connection to the memcached server
169169
# timeout: seconds to wait for send or receive calls on the socket connected to memcached
170170
# Use the ignore_exc flag to treat memcache/network errors as cache misses on calls to the get* methods
171171
# Set the no_delay flag to set TCP_NODELAY (disable Nagle's algorithm to improve TCP/IP networks and decrease the number of packets)
172172
# If you intend to use anything but str as a value, it is a good idea to use a serializer
173173
memcached_client_instance = PooledClient(app.config['MEMCACHED_SERVER'],
174-
max_pool_size = 8,
174+
max_pool_size = 256,
175175
connect_timeout = 1,
176176
timeout = 30,
177177
ignore_exc = True,
@@ -369,7 +369,7 @@ def flush_all_cache():
369369
Parameters
370370
----------
371371
id : str
372-
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target entity (Donor/Dataset/Sample/Upload)
372+
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target entity (Donor/Dataset/Sample/Upload/Collection/Publication)
373373
374374
Returns
375375
-------
@@ -381,12 +381,8 @@ def flush_cache(id):
381381
msg = ''
382382

383383
if MEMCACHED_MODE:
384-
msg = f'No cache found from Memcached for entity {id}'
385-
cache_key = f'{MEMCACHED_PREFIX}{id}'
386-
387-
if memcached_client_instance.get(cache_key) is not None:
388-
memcached_client_instance.delete(cache_key)
389-
msg = f'The cached data has been deleted from Memcached for entity {id}'
384+
delete_cache(id)
385+
msg = f'The cached data has been deleted from Memcached for entity {id}'
390386
else:
391387
msg = 'No caching is being used because Memcached mode is not enabled at all'
392388

@@ -633,14 +629,16 @@ def get_entity_by_id(id):
633629
# Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
634630
entity_dict = query_target_entity(id, token)
635631
normalized_entity_type = entity_dict['entity_type']
636-
# To verify if a Collection is public, it is necessary to have its Datasets, which
637-
# are populated as triggered data. So pull back the complete entity for
638-
# _get_entity_visibility() to check.
632+
633+
# Get the generated complete entity result from cache if exists
634+
# Otherwise re-generate on the fly
639635
complete_dict = schema_manager.get_complete_entity_result(token, entity_dict)
640636

641637
# Determine if the entity is publicly visible based on its data only.
642-
entity_scope = _get_entity_visibility( normalized_entity_type=normalized_entity_type
643-
,entity_dict=complete_dict)
638+
# To verify if a Collection is public, it is necessary to have its Datasets, which
639+
# are populated as triggered data. So pull back the complete entity for
640+
# _get_entity_visibility() to check.
641+
entity_scope = _get_entity_visibility(normalized_entity_type=normalized_entity_type, entity_dict=complete_dict)
644642

645643
# Initialize the user as authorized if the data is public. Otherwise, the
646644
# user is not authorized and credentials must be checked.
@@ -1441,34 +1439,18 @@ def update_entity(id):
14411439
if (return_all_properties is not None) and (return_all_properties.lower() == 'true'):
14421440
properties_to_skip = []
14431441

1444-
# Generate the filtered or complete entity dict to send back
1445-
complete_dict = schema_manager.get_complete_entity_result(user_token, merged_updated_dict, properties_to_skip)
1446-
1447-
# Will also filter the result based on schema
1448-
normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)
1449-
14501442
# Remove the cached entities if Memcached is being used
1443+
# DO NOT update the cache with new entity dict because the returned dict from PUT (some properties may be skipped)
1444+
# can be different from the one generated by GET call
14511445
if MEMCACHED_MODE:
1452-
# Delete the old cache data of this entity
1453-
# DO NOT update the cache with new entity dict because the returned dict from PUT (some properties maybe skipped)
1454-
# can be different from the one generated by GET call
1455-
cache_key = f'{MEMCACHED_PREFIX}{id}'
1456-
memcached_client_instance.delete(cache_key)
1457-
1458-
logger.info(f"Deleted cache of key: {cache_key} after entity update via PUT call")
1459-
1460-
# Also delete the cache of all the direct descendants (children)
1461-
# Otherwise they'll have old cached data for the `direct_ancestor` (Sample) `direct_ancestors` (Dataset) fields
1462-
# Note: must use uuid in the Neo4j query
1463-
children_uuid_list = schema_neo4j_queries.get_children(neo4j_driver_instance, entity_dict['uuid'] , 'uuid')
1464-
1465-
logger.info(f"Also delete the cache of all the direct descendants (children) of {id} if exist")
1446+
delete_cache(id)
14661447

1467-
for child_uuid in children_uuid_list:
1468-
cache_key = f'{MEMCACHED_PREFIX}{child_uuid}'
1469-
memcached_client_instance.delete(cache_key)
1448+
# Do not return the updated dict to avoid computing overhead - 7/14/2023 by Zhou
1449+
# # Generate the complete entity dict
1450+
# complete_dict = schema_manager.get_complete_entity_result(user_token, merged_updated_dict, properties_to_skip)
14701451

1471-
logger.info(f"Deleted direct descendant cache of key: {cache_key}")
1452+
# # Will also filter the result based on schema
1453+
# normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)
14721454

14731455
# Also reindex the updated entity node in elasticsearch via search-api
14741456
if entity_dict['entity_type'] in ['Collection']:
@@ -1480,9 +1462,13 @@ def update_entity(id):
14801462
logger.log(logging.INFO
14811463
,f"Re-indexing for creation of {entity_dict['entity_type']}"
14821464
f" with UUID {entity_dict['uuid']}")
1465+
14831466
reindex_entity(entity_dict['uuid'], user_token)
14841467

1485-
return jsonify(normalized_complete_dict)
1468+
# Do not return the updated dict to avoid computing overhead - 7/14/2023 by Zhou
1469+
# return jsonify(normalized_complete_dict)
1470+
1471+
return jsonify({'message': f"{normalized_entity_type} of {id} has been updated"})
14861472

14871473

14881474
"""
@@ -3372,11 +3358,9 @@ def sankey_data():
33723358
if memcached_client_instance.get(cache_key) is not None:
33733359
dataset_sankey_list = memcached_client_instance.get(cache_key)
33743360

3375-
current_datetime = datetime.now()
3376-
33773361
if not dataset_sankey_list:
33783362
if MEMCACHED_MODE:
3379-
logger.info(f'Sankey data cache not found or expired. Making a new data fetch at time {current_datetime}')
3363+
logger.info(f'Sankey data cache not found or expired. Making a new data fetch at time {datetime.now()}')
33803364

33813365
# Call to app_neo4j_queries to prepare and execute the database query
33823366
sankey_info = app_neo4j_queries.get_sankey_info(neo4j_driver_instance)
@@ -3415,7 +3399,7 @@ def sankey_data():
34153399
# Cache the result
34163400
memcached_client_instance.set(cache_key, dataset_sankey_list, expire = SchemaConstants.MEMCACHED_TTL)
34173401
else:
3418-
logger.info(f'Using the cached sankey data at time {current_datetime}')
3402+
logger.info(f'Using the cached sankey data at time {datetime.now()}')
34193403

34203404
return jsonify(dataset_sankey_list)
34213405

@@ -4409,7 +4393,7 @@ def after_update(normalized_entity_type, user_token, entity_dict):
44094393

44104394

44114395
"""
4412-
Get target entity dict for the given id
4396+
Get target entity dict from Neo4j query for the given id
44134397
44144398
Parameters
44154399
----------
@@ -4425,20 +4409,20 @@ def after_update(normalized_entity_type, user_token, entity_dict):
44254409
"""
44264410
def query_target_entity(id, user_token):
44274411
entity_dict = None
4428-
4429-
cache_key = f'{MEMCACHED_PREFIX}{id}'
4412+
cache_result = None
44304413

4431-
if MEMCACHED_MODE:
4414+
if MEMCACHED_MODE and MEMCACHED_PREFIX and memcached_client_instance:
4415+
# If this id is hubmap_id rather than uuid, there won't be a cache
4416+
# Only uuid is used in the cache key
4417+
cache_key = f'{MEMCACHED_PREFIX}_neo4j_{id}'
44324418
# Memcached returns None if no cached data or expired
4433-
entity_dict = memcached_client_instance.get(cache_key)
4419+
cache_result = memcached_client_instance.get(cache_key)
44344420

4435-
current_datetime = datetime.now()
4436-
4437-
# Use the cached data if found and still valid
4438-
# Otherwise, make a fresh query and add to cache
4439-
if entity_dict is None:
4440-
if MEMCACHED_MODE:
4441-
logger.info(f'Cache not found or expired. Making a new query to retrieve {id} at time {current_datetime}')
4421+
# Use the cached data if the id is an uuid and we found a valid cache
4422+
# Otherwise, either the id is a hubmap_id or we don't have a cache for it even if it's uuid
4423+
if cache_result is None:
4424+
if MEMCACHED_MODE and MEMCACHED_PREFIX and memcached_client_instance:
4425+
logger.info(f'Neo4j entity cache of {id} not found or expired at time {datetime.now()}')
44424426

44434427
try:
44444428
"""
@@ -4462,15 +4446,31 @@ def query_target_entity(id, user_token):
44624446

44634447
# Get the target uuid if all good
44644448
uuid = hubmap_ids['hm_uuid']
4465-
entity_dict = schema_neo4j_queries.get_entity(neo4j_driver_instance, uuid)
44664449

4467-
# The uuid exists via uuid-api doesn't mean it's also in Neo4j
4468-
if not entity_dict:
4469-
not_found_error(f"Entity of id: {id} not found in Neo4j")
4470-
4471-
if MEMCACHED_MODE:
4472-
# Cache the result
4473-
memcached_client_instance.set(cache_key, entity_dict, expire = SchemaConstants.MEMCACHED_TTL)
4450+
# Look up the cache again by the uuid since we only use uuid in the cache key
4451+
if MEMCACHED_MODE and MEMCACHED_PREFIX and memcached_client_instance:
4452+
cache_key = f'{MEMCACHED_PREFIX}_neo4j_{uuid}'
4453+
cache_result = memcached_client_instance.get(cache_key)
4454+
4455+
if cache_result is None:
4456+
logger.info(f'Neo4j entity cache of {uuid} not found or expired at time {datetime.now()}')
4457+
4458+
# Make a new query against neo4j
4459+
entity_dict = schema_neo4j_queries.get_entity(neo4j_driver_instance, uuid)
4460+
4461+
# The uuid exists via uuid-api doesn't mean it also exists in Neo4j
4462+
if not entity_dict:
4463+
not_found_error(f"Entity of id: {uuid} not found in Neo4j")
4464+
4465+
logger.info(f'Creating neo4j entity result cache of {uuid} at time {datetime.now()}')
4466+
4467+
cache_key = f'{MEMCACHED_PREFIX}_neo4j_{uuid}'
4468+
memcached_client_instance.set(cache_key, entity_dict, expire = SchemaConstants.MEMCACHED_TTL)
4469+
else:
4470+
logger.info(f'Using neo4j entity cache of {uuid} at time {datetime.now()}')
4471+
logger.debug(entity_dict)
4472+
4473+
entity_dict = cache_result
44744474
except requests.exceptions.RequestException as e:
44754475
# Due to the use of response.raise_for_status() in schema_manager.get_hubmap_ids()
44764476
# we can access the status codes from the exception
@@ -4483,10 +4483,12 @@ def query_target_entity(id, user_token):
44834483
else:
44844484
internal_server_error(e.response.text)
44854485
else:
4486-
logger.info(f'Using the cache data of entity {id} at time {current_datetime}')
4486+
logger.info(f'Using neo4j entity cache of {id} at time {datetime.now()}')
44874487
logger.debug(entity_dict)
44884488

4489-
# Final return
4489+
entity_dict = cache_result
4490+
4491+
# One final return
44904492
return entity_dict
44914493

44924494

@@ -4501,6 +4503,42 @@ def require_json(request):
45014503
bad_request_error("A json body and appropriate Content-Type header are required")
45024504

45034505

4506+
4507+
"""
Delete the cached data of all possible keys used for the given entity id

The given id is first resolved to the target entity's uuid, then the uuids of the
related entities queried below (children, Collection/Upload associations) are gathered
so that all of their cached data gets deleted in one call.
Nothing happens when Memcached mode is not enabled.

Parameters
----------
id : str
    The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target entity (Donor/Dataset/Sample/Upload/Collection/Publication)
"""
def delete_cache(id):
    if not MEMCACHED_MODE:
        return

    # First resolve the target entity, only its uuid is used in the cache keys
    entity_dict = query_target_entity(id, get_internal_token())
    entity_uuid = entity_dict['uuid']

    # If the target entity is Sample (`direct_ancestor`) or Dataset/Publication (`direct_ancestors`)
    # Delete the cache of all the direct descendants (children)
    child_uuids = schema_neo4j_queries.get_children(neo4j_driver_instance, entity_uuid, 'uuid')

    # If the target entity is Collection, delete the cache for each of its associated
    # Datasets and Publications (via [:IN_COLLECTION] relationship) as well as just Publications (via [:USES_DATA] relationship)
    collection_dataset_uuids = schema_neo4j_queries.get_collection_associated_datasets(neo4j_driver_instance, entity_uuid, 'uuid')

    # If the target entity is Upload, delete the cache for each of its associated Datasets (via [:IN_UPLOAD] relationship)
    upload_dataset_uuids = schema_neo4j_queries.get_upload_datasets(neo4j_driver_instance, entity_uuid, 'uuid')

    # If the target entity is Datasets/Publication, delete the associated Collections cache, Upload cache
    collection_uuids = schema_neo4j_queries.get_dataset_collections(neo4j_driver_instance, entity_uuid, 'uuid')
    collection_dict = schema_neo4j_queries.get_publication_associated_collection(neo4j_driver_instance, entity_uuid)
    upload_dict = schema_neo4j_queries.get_dataset_upload(neo4j_driver_instance, entity_uuid)

    # We only use uuid in the cache key across all the cache types
    uuids_list = [entity_uuid] + child_uuids + collection_dataset_uuids + upload_dataset_uuids + collection_uuids

    # It's possible that there is no linked Collection or Upload for the target entity.
    # NOTE(review): presumably these two lookups return an empty dict (or None) in that case - confirm
    # against schema_neo4j_queries. Indexing ['uuid'] unconditionally would raise and abort the deletion.
    for linked_entity_dict in (collection_dict, upload_dict):
        if linked_entity_dict:
            uuids_list.append(linked_entity_dict['uuid'])

    schema_manager.delete_memcached_cache(uuids_list)
4540+
4541+
45044542
"""
45054543
Make a call to each search-api instance to reindex this entity node in elasticsearch
45064544

0 commit comments

Comments
 (0)