From e8b0426941e22381ddb2304b67e5dd311c7cd138 Mon Sep 17 00:00:00 2001 From: Dave MacFarlane Date: Tue, 24 Sep 2024 14:00:17 -0400 Subject: [PATCH] [dataquery] Optimize filtering of valid candidates for large datasets (#9344) This optimizes the filtering of inaccessible candidates, building on PR#9334 Previously, the entire query was loaded into memory, and then manipulated in a way that made it easier to see which candidates are valid, then returned as an array by putting the valid ones in a temporary table and selecting from it. With this change, instead of an array, a generator is used so that only one candidate (and its sessions) need to be loaded into memory at a time. This is done on an unbuffered connection, which necessitates a different database connection since only one query can be run at a time on a connection and the caller may make other queries before the generator finished. This result should be significantly less memory required for filtering candidates. --- .../endpoints/queries/query/count.class.inc | 2 +- modules/dataquery/php/query.class.inc | 75 ++++++++----------- modules/dataquery/php/queryrun.class.inc | 2 +- src/LorisInstance.php | 30 ++++++++ 4 files changed, 65 insertions(+), 44 deletions(-) diff --git a/modules/dataquery/php/endpoints/queries/query/count.class.inc b/modules/dataquery/php/endpoints/queries/query/count.class.inc index 4d09bce2e96..5ca16c04745 100644 --- a/modules/dataquery/php/endpoints/queries/query/count.class.inc +++ b/modules/dataquery/php/endpoints/queries/query/count.class.inc @@ -59,7 +59,7 @@ class Count extends \LORIS\Http\Endpoint try { $query = new \LORIS\dataquery\Query($this->loris, $queryID); - $candidates = $query->matchCandidates($user); + $candidates = \iterator_to_array($query->matchCandidates($user)); return new \LORIS\Http\Response\JSON\OK( [ 'count' => count($candidates), diff --git a/modules/dataquery/php/query.class.inc b/modules/dataquery/php/query.class.inc index 71f257fb3a9..ea1461f529f 100644 --- a/modules/dataquery/php/query.class.inc +++ b/modules/dataquery/php/query.class.inc @@ -537,7 +537,7 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, * * @param \User $user The user running the query. * - * @return CandID[] + * @return iterable */ public function matchCandidates(\User $user) : iterable { @@ -557,7 +557,7 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, * @param CandID[] $candIDs The CandIDs to be filtered * @param \User $user The user whose access should check. * - * @return CandID[] + * @return iterable */ private function _filterInaccessibleCandidates( array $candIDs, @@ -566,7 +566,8 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, if (count($candIDs) == 0) { return []; } - $DB = $this->loris->getDatabaseConnection(); + $DB = $this->loris->getNewDatabaseConnection(); + $DB->setBuffering(false); // Put candidates into a temporary table so that it can be used in a join // clause. Directly using "c.CandID IN (candid1, candid2, candid3, etc)" is @@ -574,7 +575,8 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, $DB->run("DROP TEMPORARY TABLE IF EXISTS accesscandidates"); $DB->run( "CREATE TEMPORARY TABLE accesscandidates( - CandID int(6) + CandID int(6) NOT NULL, + PRIMARY KEY(CandID) );" ); $insertstmt = "INSERT INTO accesscandidates VALUES" @@ -593,25 +595,35 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, ORDER BY c.CandID", [] ); - $organized = []; + $curCandID = ''; + $canddata = null; // Foreach CandID, create the TimePointData object for all timepoints // that were returned. foreach ($rows as $row) { - $candid = $row['CandID']; - if (!isset($organized[$candid])) { - $organized[$candid] = [ - 'CandID' => new CandID($row['CandID']), - 'RegistrationProject' => $row['RegistrationProjectID'], - 'RegistrationCenter' => $row['RegistrationCenterID'], - 'Timepoints' => [], - ]; + if ($curCandID != $row['CandID']) { + if ($canddata !== null) { + $candidate = new \Candidate($canddata); + if ($candidate->isAccessibleBy($user)) { + yield $canddata->CandID; + } + } + $canddata = new \CandidateData( + candID: new CandID(strval($row['CandID'])), + registrationProjectID: $row['RegistrationProjectID'] !== null + ? \ProjectID::singleton($row['RegistrationProjectID']) + : null, + registrationCenterID: $row['RegistrationCenterID'] !== null + ? \CenterID::singleton($row['RegistrationCenterID']) + : null, + timepoints: [], + ); + $curCandID = $row['CandID']; } - if ($row['sessionID'] !== null) { - $organized[$candid]['Timepoints'][] = new \TimePoint( + $canddata->timepoints[] = new \TimePoint( new \TimePointData( - new \SessionID($row['sessionID']), + new \SessionID(strval($row['sessionID'])), \ProjectID::singleton($row['SProjectID']), \CenterID::singleton($row['SCenterID']), ) @@ -619,35 +631,14 @@ class Query implements \LORIS\StudyEntities\AccessibleResource, } } - // Remove inaccessible candidates from the temp table - foreach ($organized as $vals) { - $canddata = new \CandidateData( - candID: $vals['CandID'], - registrationProjectID: \ProjectID::singleton( - $vals['RegistrationProject'] - ), - registrationCenterID: \CenterID::singleton( - $vals['RegistrationCenter'] - ), - timepoints: $vals['Timepoints'] - ); + // check the last candidate that wouldn't have entered the check in the + // loop + if ($canddata !== null) { $candidate = new \Candidate($canddata); - - if (!$candidate->isAccessibleBy($user)) { - // Not accessible, so delete from accesscandidates - $DB->delete("accesscandidates", ['CandID' => $vals['CandID']]); - continue; + if ($candidate->isAccessibleBy($user)) { + yield $canddata->CandID; } } - - $candidates = $DB->pselectCol("SELECT CandID from accesscandidates", []); - $DB->run("DROP TEMPORARY TABLE accesscandidates"); - return array_map( - function ($row) { - return new CandID($row); - }, - $candidates - ); } /** diff --git a/modules/dataquery/php/queryrun.class.inc b/modules/dataquery/php/queryrun.class.inc index 7baa61808fc..3d24e31e024 100644 --- a/modules/dataquery/php/queryrun.class.inc +++ b/modules/dataquery/php/queryrun.class.inc @@ -69,7 +69,7 @@ class QueryRun implements \LORIS\StudyEntities\AccessibleResource, public function insertCandidates(\User $user) : void { $DB = $this->loris->getDatabaseConnection(); - $candIDs = $this->query->matchCandidates($user); + $candIDs = \iterator_to_array($this->query->matchCandidates($user)); if (count($candIDs) == 0) { return; } diff --git a/src/LorisInstance.php b/src/LorisInstance.php index fef2cd4cd1e..abe40c433a8 100644 --- a/src/LorisInstance.php +++ b/src/LorisInstance.php @@ -50,6 +50,36 @@ public function getDatabaseConnection() : \Database return $this->DB; } + /** + * Return a new database connection to this LORIS instance. + * + * @return \Database + */ + public function getNewDatabaseConnection() : \Database + { + $settings = \NDB_Factory::singleton()->settings(); + + // Pass the credentials in environment variables, so that they + // don't potentially show up in a stack trace if something goes + // wrong. + $dbname = $settings->dbName(); + putenv("LORIS_{$dbname}_USERNAME=" . $settings->dbUserName()); + putenv("LORIS_{$dbname}_PASSWORD=" . $settings->dbPassword()); + putenv("LORIS_{$dbname}_HOST=" . $settings->dbHost()); + + $db = new \Database(); + $db->connect( + $settings->dbName(), + true, + ); + + // Unset the variables now that they're no longer needed. + putenv("LORIS_{$dbname}_USERNAME="); + putenv("LORIS_{$dbname}_PASSWORD="); + putenv("LORIS_{$dbname}_HOST="); + return $db; + } + /** * Return a list of directories on the filesystem which * may contain modules.