Skip to content

Commit

Permalink
[dataquery] Optimize filtering of valid candidates for large datasets (
Browse files Browse the repository at this point in the history
…aces#9344)

This optimizes the filtering of inaccessible candidates, building on
PR#9334

Previously, the entire query was loaded into memory, and then
manipulated in a way that made it easier to see which candidates are
valid, then returned as an array by putting the valid ones in a
temporary table and selecting from it.

With this change, instead of an array, a generator is used so that only
one candidate (and its sessions) need to be loaded into memory at a
time. This is done on an unbuffered connection, which necessitates a
different database connection since only one query can be run at a time
on a connection and the caller may make other queries before the
generator finished.

This result should be significantly less memory required for filtering
candidates.
  • Loading branch information
driusan authored and ZhichGaming committed Nov 25, 2024
1 parent 256a850 commit e8b0426
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class Count extends \LORIS\Http\Endpoint
try {
$query = new \LORIS\dataquery\Query($this->loris, $queryID);

$candidates = $query->matchCandidates($user);
$candidates = \iterator_to_array($query->matchCandidates($user));
return new \LORIS\Http\Response\JSON\OK(
[
'count' => count($candidates),
Expand Down
75 changes: 33 additions & 42 deletions modules/dataquery/php/query.class.inc
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ class Query implements \LORIS\StudyEntities\AccessibleResource,
*
* @param \User $user The user running the query.
*
* @return CandID[]
* @return iterable
*/
public function matchCandidates(\User $user) : iterable
{
Expand All @@ -557,7 +557,7 @@ class Query implements \LORIS\StudyEntities\AccessibleResource,
* @param CandID[] $candIDs The CandIDs to be filtered
* @param \User $user The user whose access should check.
*
* @return CandID[]
* @return iterable
*/
private function _filterInaccessibleCandidates(
array $candIDs,
Expand All @@ -566,15 +566,17 @@ class Query implements \LORIS\StudyEntities\AccessibleResource,
if (count($candIDs) == 0) {
return [];
}
$DB = $this->loris->getDatabaseConnection();
$DB = $this->loris->getNewDatabaseConnection();
$DB->setBuffering(false);

// Put candidates into a temporary table so that it can be used in a join
// clause. Directly using "c.CandID IN (candid1, candid2, candid3, etc)" is
// too slow to be useable.
$DB->run("DROP TEMPORARY TABLE IF EXISTS accesscandidates");
$DB->run(
"CREATE TEMPORARY TABLE accesscandidates(
CandID int(6)
CandID int(6) NOT NULL,
PRIMARY KEY(CandID)
);"
);
$insertstmt = "INSERT INTO accesscandidates VALUES"
Expand All @@ -593,61 +595,50 @@ class Query implements \LORIS\StudyEntities\AccessibleResource,
ORDER BY c.CandID",
[]
);
$organized = [];
$curCandID = '';
$canddata = null;

// Foreach CandID, create the TimePointData object for all timepoints
// that were returned.
foreach ($rows as $row) {
$candid = $row['CandID'];
if (!isset($organized[$candid])) {
$organized[$candid] = [
'CandID' => new CandID($row['CandID']),
'RegistrationProject' => $row['RegistrationProjectID'],
'RegistrationCenter' => $row['RegistrationCenterID'],
'Timepoints' => [],
];
if ($curCandID != $row['CandID']) {
if ($canddata !== null) {
$candidate = new \Candidate($canddata);
if ($candidate->isAccessibleBy($user)) {
yield $canddata->CandID;
}
}
$canddata = new \CandidateData(
candID: new CandID(strval($row['CandID'])),
registrationProjectID: $row['RegistrationProjectID'] !== null
? \ProjectID::singleton($row['RegistrationProjectID'])
: null,
registrationCenterID: $row['RegistrationCenterID'] !== null
? \CenterID::singleton($row['RegistrationCenterID'])
: null,
timepoints: [],
);
$curCandID = $row['CandID'];
}

if ($row['sessionID'] !== null) {
$organized[$candid]['Timepoints'][] = new \TimePoint(
$canddata->timepoints[] = new \TimePoint(
new \TimePointData(
new \SessionID($row['sessionID']),
new \SessionID(strval($row['sessionID'])),
\ProjectID::singleton($row['SProjectID']),
\CenterID::singleton($row['SCenterID']),
)
);
}
}

// Remove inaccessible candidates from the temp table
foreach ($organized as $vals) {
$canddata = new \CandidateData(
candID: $vals['CandID'],
registrationProjectID: \ProjectID::singleton(
$vals['RegistrationProject']
),
registrationCenterID: \CenterID::singleton(
$vals['RegistrationCenter']
),
timepoints: $vals['Timepoints']
);
// check the last candidate that wouldn't have entered the check in the
// loop
if ($canddata !== null) {
$candidate = new \Candidate($canddata);

if (!$candidate->isAccessibleBy($user)) {
// Not accessible, so delete from accesscandidates
$DB->delete("accesscandidates", ['CandID' => $vals['CandID']]);
continue;
if ($candidate->isAccessibleBy($user)) {
yield $canddata->CandID;
}
}

$candidates = $DB->pselectCol("SELECT CandID from accesscandidates", []);
$DB->run("DROP TEMPORARY TABLE accesscandidates");
return array_map(
function ($row) {
return new CandID($row);
},
$candidates
);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion modules/dataquery/php/queryrun.class.inc
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class QueryRun implements \LORIS\StudyEntities\AccessibleResource,
public function insertCandidates(\User $user) : void
{
$DB = $this->loris->getDatabaseConnection();
$candIDs = $this->query->matchCandidates($user);
$candIDs = \iterator_to_array($this->query->matchCandidates($user));
if (count($candIDs) == 0) {
return;
}
Expand Down
30 changes: 30 additions & 0 deletions src/LorisInstance.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,36 @@ public function getDatabaseConnection() : \Database
return $this->DB;
}

/**
* Return a new database connection to this LORIS instance.
*
* @return \Database
*/
public function getNewDatabaseConnection() : \Database
{
$settings = \NDB_Factory::singleton()->settings();

// Pass the credentials in environment variables, so that they
// don't potentially show up in a stack trace if something goes
// wrong.
$dbname = $settings->dbName();
putenv("LORIS_{$dbname}_USERNAME=" . $settings->dbUserName());
putenv("LORIS_{$dbname}_PASSWORD=" . $settings->dbPassword());
putenv("LORIS_{$dbname}_HOST=" . $settings->dbHost());

$db = new \Database();
$db->connect(
$settings->dbName(),
true,
);

// Unset the variables now that they're no longer needed.
putenv("LORIS_{$dbname}_USERNAME=");
putenv("LORIS_{$dbname}_PASSWORD=");
putenv("LORIS_{$dbname}_HOST=");
return $db;
}

/**
* Return a list of directories on the filesystem which
* may contain modules.
Expand Down

0 comments on commit e8b0426

Please sign in to comment.