Skip to content

Commit c2cfce6

Browse files
authored
Add OpenNeuro.py and OpenNeuroCache class (#65)
* Add OpenNeuroCache class * Allows downloading and caching OpenNeuro data. * Download is specified by a set of entities such as dataset accession number, subject, session, run, task etc. * Data can then be replayed through a BidsStream using the BidsInterface to test an analysis pipeline
1 parent 5578219 commit c2cfce6

File tree

11 files changed

+298
-74
lines changed

11 files changed

+298
-74
lines changed

docs/how-to-wrap-your-project.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ RT-Cloud uses RPC (Remote Procedure Calls) to send command requests from the res
113113

114114
The rpyc timeout can be set when the ClientInterface is created in the experiment script, such as in the sample.py project. Simply include the rpyc_timeout= parameter (e.g. ClientInterface(rpyc_timeout=60)), the default is 60 seconds. Rpyc also has a timed() function which can be used to adjust the timeout of individual rpc calls.
115115

116-
The websocket timeout can be set using the setRPCTimeout() of remoteable objects. For example to increase the timeout of the dataInterface in the experiment script, call dataInterface.setRPCTimeout(5). The default websocket timeout is 5 seconds.
116+
The websocket timeout can be set in one of two ways. Method 1 is to set a larger timeout for all calls using the setRPCTimeout() of remoteable objects. For example to increase the timeout of the dataInterface in the experiment script, call dataInterface.setRPCTimeout(5). The default websocket timeout is 5 seconds. Method 2 is to set a larger timeout for one specific call by including a "rpc_timeout" kwarg in that call. For example dataInterface.getFile("BigFile", rpc_timeout=60). Note that before setting an RPC timeout you should check that the interface you are using is actually running over RPC because sometimes interfaces will run locally. To check that use the isRunningRemote() command, such as dataInterface.isRunningRemote(), see the openNeuroClient project for an example of this usage.
117117

118118
## **Some Alternate Configurations For Your Experiment**
119119
### **Running everything on the same computer**

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ dependencies:
77
- awscli
88
- bcrypt
99
- brainiak
10+
- boto3
1011
- dcm2niix
1112
- flake8
1213
- indexed_gzip # for efficient random access of gzipped files with Nibabel

projects/openNeuroClient/openNeuroClient.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# import important modules
22
import os
3-
from rtCommon.bidsRun import BidsRun
43
import sys
54
import numpy
65
import uuid
@@ -15,6 +14,7 @@
1514
from rtCommon.utils import loadConfigFile, stringPartialFormat
1615
from rtCommon.clientInterface import ClientInterface
1716
from rtCommon.bidsArchive import BidsArchive
17+
from rtCommon.bidsRun import BidsRun
1818

1919
# path for default configuration toml file
2020
defaultConfig = os.path.join(currPath, 'conf', 'openNeuroClient.toml')
@@ -41,8 +41,11 @@ def doRuns(cfg, bidsInterface, subjInterface, webInterface):
4141
print(f'BIDS Archive will be written to {bidsArchivePath}')
4242
newArchive = BidsArchive(bidsArchivePath)
4343
newRun = BidsRun(**entities)
44+
extraKwargs = {}
45+
if bidsInterface.isRunningRemote():
46+
extraKwargs = {"rpc_timeout": 60}
4447
# Initialize the bids stream
45-
streamId = bidsInterface.initOpenNeuroStream(cfg.dsAccessionNumber, **entities)
48+
streamId = bidsInterface.initOpenNeuroStream(cfg.dsAccessionNumber, **entities, **extraKwargs)
4649
numVols = bidsInterface.getNumVolumes(streamId)
4750
for idx in range(numVols):
4851
bidsIncremental = bidsInterface.getIncremental(streamId, idx)

rtCommon/bidsInterface.py

Lines changed: 10 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,14 @@
1010
one instance of dataInterface, as part of the projectServer with dataRemote=False.
1111
"""
1212
import os
13-
import glob
1413
import time
15-
import tempfile
16-
import nibabel as nib
1714
from rtCommon.remoteable import RemoteableExtensible
1815
from rtCommon.bidsArchive import BidsArchive
19-
from rtCommon.bidsRun import BidsRun
2016
from rtCommon.bidsIncremental import BidsIncremental
2117
from rtCommon.bidsCommon import getDicomMetadata
2218
from rtCommon.imageHandling import convertDicomImgToNifti
2319
from rtCommon.dataInterface import DataInterface
20+
from rtCommon.openNeuro import OpenNeuroCache
2421
from rtCommon.errors import RequestError, MissingMetadataError
2522

2623

@@ -58,6 +55,7 @@ def __init__(self, dataRemote=False, allowedDirs=[], scannerClockSkew=0):
5855
# Store the allowed directories to be used by the DicomToBidsStream class
5956
self.allowedDirs = allowedDirs
6057
self.scannerClockSkew = scannerClockSkew
58+
self.openNeuroCache = OpenNeuroCache(cachePath="/tmp/openneuro")
6159

6260

6361
def initDicomBidsStream(self, dicomDir, dicomFilePattern, dicomMinSize, **entities) -> int:
@@ -109,10 +107,13 @@ def initOpenNeuroStream(self, dsAccessionNumber, **entities) -> int:
109107
Returns:
110108
streamId: An identifier used when calling stream functions, such as getIncremental()
111109
"""
110+
if 'subject' not in entities or 'run' not in entities:
111+
raise RequestError("initOpenNeuroStream: Must specify subject and run number")
112+
archivePath = self.openNeuroCache.downloadData(dsAccessionNumber, **entities)
112113
# TODO - allow multiple simultaneous streams to be instantiated
113114
streamId = 1
114-
openNeuroStream = OpenNeuroStream(dsAccessionNumber, **entities)
115-
self.streamMap[streamId] = openNeuroStream
115+
bidsStream = BidsStream(archivePath, **entities)
116+
self.streamMap[streamId] = bidsStream
116117
return streamId
117118

118119
def getIncremental(self, streamId, volIdx=-1) -> BidsIncremental:
@@ -141,6 +142,9 @@ def getNumVolumes(self, streamId) -> int:
141142
stream = self.streamMap[streamId]
142143
return stream.getNumVolumes()
143144

145+
def closeStream(self, streamId):
146+
# remove the stream from the map
147+
self.streamMap.pop(streamId, None)
144148

145149
def getClockSkew(self, callerClockTime: float, roundTripTime: float) -> float:
146150
"""
@@ -299,55 +303,3 @@ def getIncremental(self, volIdx=-1) -> BidsIncremental:
299303
return incremental
300304
else:
301305
return None
302-
303-
304-
class OpenNeuroStream(BidsStream):
305-
"""
306-
A BidsStream from an OpenNeuro dataset. The OpenNeuro dataset will be automatically
307-
downloaded, as needed, on the computer where this stream is intialized.
308-
"""
309-
def __init__(self, dsAccessionNumber, **entities):
310-
"""
311-
Args:
312-
dsAccessionNumber: The OpenNeruo specific accession number for the dataset
313-
to stream.
314-
entities: BIDS entities (subject, session, task, run, suffix, datatype) that
315-
define the particular subject/run of the data to stream
316-
"""
317-
subject = entities.get('subject')
318-
run = entities.get('run')
319-
if subject is None or run is None:
320-
raise RequestError("OpenNeuroStream: Must specify subject and run number")
321-
# TODO - Use OpenNeuroService when it is available, to download
322-
# and access the dataset and get dataset entities
323-
# OpenNeuroService to provide path to dataset
324-
datasetPath = tmpDownloadOpenNeuro(dsAccessionNumber, subject, run)
325-
super().__init__(datasetPath, **entities)
326-
327-
328-
def tmpDownloadOpenNeuro(dsAccessNumber, subject, run) -> str:
329-
"""
330-
Temporary function used until we integrate in the OpenNeuro service. Downloads
331-
a portion of an OpenNeuro dataset corresponding to the subject/run.
332-
Args:
333-
dsAccessionNumber: The OpenNeruo specific accession number for the dataset
334-
to stream.
335-
subject: the specific subject name within the OpenNeuro dataset to download
336-
run: the specific run within the subject's data to download.
337-
Returns:
338-
Absolute path to where the dataset has been downloaded.
339-
340-
"""
341-
tmpDir = tempfile.gettempdir()
342-
print(f'OpenNeuro Data cached to {tmpDir}')
343-
datasetDir = os.path.join(tmpDir, dsAccessNumber)
344-
# check if already downloaded
345-
includePattern = f'sub-{subject}/func/*run-{run:02d}*'
346-
files = glob.glob(os.path.join(datasetDir, includePattern))
347-
if len(files) == 0:
348-
os.makedirs(datasetDir, exist_ok = True)
349-
awsCmd = f'aws s3 sync --no-sign-request s3://openneuro.org/{dsAccessNumber} ' \
350-
f'{datasetDir} --exclude "*/*" --include "{includePattern}"'
351-
print(f'run {awsCmd}')
352-
os.system(awsCmd)
353-
return datasetDir

rtCommon/dataInterface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def initScannerStream(self, imgDir: str, filePattern: str,
120120

121121
def getImageData(self, streamId: int, imageIndex: int=None, timeout: int=5) -> pydicom.dataset.FileDataset:
122122
"""
123-
Get data from a stream initialized with initScannerStream or initOpenNeuroStream
123+
Get data from a stream initialized with initScannerStream
124124
125125
Args:
126126
streamId: Id of a previously opened stream.

rtCommon/openNeuro.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
"""
2+
An interface to access OpenNeuro data and metadata. It can download
3+
and cache OpenNeuro data for playback.
4+
"""
5+
import os
6+
import json
7+
import boto3
8+
from botocore.config import Config
9+
from botocore import UNSIGNED
10+
import rtCommon.utils as utils
11+
12+
13+
class OpenNeuroCache():
14+
def __init__(self, cachePath="/tmp/openneuro/"):
15+
self.cachePath = cachePath
16+
self.datasetList = None
17+
self.s3Client = None
18+
os.makedirs(cachePath, exist_ok = True)
19+
20+
def getCachePath(self):
21+
return self.cachePath
22+
23+
def getS3Client(self):
24+
"""Returns an s3 client in order to reuse the same s3 client without
25+
always creating a new one. Not thread safe currently.
26+
"""
27+
if self.s3Client is None:
28+
self.s3Client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
29+
return self.s3Client
30+
31+
def getDatasetList(self, refresh=False):
32+
"""
33+
Returns a list of all datasets available in OpenNeuro S3 storage
34+
"See https://openneuro.org/public/datasets for datasets info"
35+
Alternate method to access from a command line call:
36+
aws s3 --no-sign-request ls s3://openneuro.org/
37+
"""
38+
if self.datasetList is None or len(self.datasetList)==0 or refresh is True:
39+
s3Client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
40+
all_datasets = s3Client.list_objects(Bucket='openneuro.org', Delimiter="/")
41+
self.datasetList = []
42+
for dataset in all_datasets.get('CommonPrefixes'):
43+
dsetName = dataset.get('Prefix')
44+
# strip trailing slash characters
45+
dsetName = dsetName.rstrip('/\\')
46+
self.datasetList.append(dsetName)
47+
return self.datasetList
48+
49+
def isValidAccessionNumber(self, dsAccessionNum):
50+
if dsAccessionNum not in self.getDatasetList():
51+
print(f"{dsAccessionNum} not in the OpenNeuro S3 datasets.")
52+
return False
53+
return True
54+
55+
def getSubjectList(self, dsAccessionNum):
56+
"""
57+
Returns a list of all the subjects in a dataset
58+
Args:
59+
dsAccessionNum - accession number of dataset to lookup
60+
Returns:
61+
list of subjects in that dataset
62+
"""
63+
if not self.isValidAccessionNumber(dsAccessionNum):
64+
return None
65+
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
66+
prefix = dsAccessionNum + '/sub-'
67+
dsSubjDirs = s3.list_objects(Bucket='openneuro.org', Delimiter="/", Prefix=prefix)
68+
subjects = []
69+
for info in dsSubjDirs.get('CommonPrefixes'):
70+
subj = info.get('Prefix')
71+
if subj is not None:
72+
subj = subj.split('sub-')[1]
73+
if subj is not None:
74+
subj = subj.rstrip('/\\')
75+
subjects.append(subj)
76+
return subjects
77+
78+
def getDescription(self, dsAccessionNum):
79+
"""
80+
Returns the dataset description file as a python dictionary
81+
"""
82+
if not self.isValidAccessionNumber(dsAccessionNum):
83+
return None
84+
dsDir = self.downloadData(dsAccessionNum, downloadWholeDataset=False)
85+
filePath = os.path.join(dsDir, 'dataset_description.json')
86+
descDict = None
87+
try:
88+
with open(filePath, 'r') as fp:
89+
descDict = json.load(fp)
90+
except Exception as err:
91+
print(f"Failed to load dataset_description.json: {err}")
92+
return descDict
93+
94+
def getReadme(self, dsAccessionNum):
95+
"""
96+
Return the contents of the dataset README file.
97+
Downloads toplevel dataset files if needed.
98+
"""
99+
if not self.isValidAccessionNumber(dsAccessionNum):
100+
return None
101+
dsDir = self.downloadData(dsAccessionNum, downloadWholeDataset=False)
102+
filePath = os.path.join(dsDir, 'README')
103+
readme = None
104+
try:
105+
readme = utils.readFile(filePath)
106+
except Exception as err:
107+
print(f"Failed to load README: {err}")
108+
return readme
109+
110+
111+
def getArchivePath(self, dsAccessionNum):
112+
"""Returns the directory path to the cached dataset files"""
113+
archivePath = os.path.join(self.cachePath, dsAccessionNum)
114+
return archivePath
115+
116+
117+
def downloadData(self, dsAccessionNum, downloadWholeDataset=False, **entities):
118+
"""
119+
This command will sync the specified portion of the dataset to the cache directory.
120+
Note: if only the accessionNum is supplied then it will just sync the top-level files.
121+
Sync doesn't re-download files that are already present in the directory.
122+
Consider using --delete which removes local cache files no longer on the remote.
123+
Args:
124+
dsAccessionNum: accession number of the dataset to download data for.
125+
downloadWholeDataset: boolean, if true all files in the dataset
126+
will be downloaded.
127+
entities: BIDS entities (subject, session, task, run, suffix) that
128+
define the particular subject/run of the data to download.
129+
Returns:
130+
Path to the directory containing the downloaded dataset data.
131+
"""
132+
if not self.isValidAccessionNumber(dsAccessionNum):
133+
print(f"{dsAccessionNum} not in the OpenNeuro S3 datasets.")
134+
return False
135+
136+
includePattern = ''
137+
if 'subject' in entities:
138+
subject = entities['subject']
139+
if type(subject) is int:
140+
subject = f'{subject:02d}'
141+
includePattern += f'sub-{subject}/'
142+
if 'session' in entities:
143+
session = entities['session']
144+
if includePattern == '':
145+
includePattern = '*'
146+
if type(session) is int:
147+
session = f'{session:02d}'
148+
includePattern += f'ses-{session}/'
149+
if 'task' in entities:
150+
task = entities['task']
151+
includePattern += f'*task-{task}'
152+
if 'run' in entities:
153+
run = entities['run']
154+
if type(run) is int:
155+
run = f'{run:02d}'
156+
includePattern += f'*run-{run}'
157+
if 'suffix' in entities:
158+
suffix = entities['suffix']
159+
includePattern += f'*{suffix}'
160+
if includePattern != '' or downloadWholeDataset is True:
161+
includePattern += '*'
162+
163+
datasetDir = os.path.join(self.cachePath, dsAccessionNum)
164+
awsCmd = f'aws s3 sync --no-sign-request s3://openneuro.org/{dsAccessionNum} ' \
165+
f'{datasetDir} --exclude "*/*" --include "{includePattern}"'
166+
print(f'run {awsCmd}')
167+
os.system(awsCmd)
168+
return datasetDir
169+

rtCommon/openNeuroService.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
A command-line service to be run where the where OpenNeuro data is downloaded and cached.
2+
A command-line service to be run where the OpenNeuro data is downloaded and cached.
33
This service instantiates a BidsInterface object for serving the data back to the client
44
running in the cloud. It connects to the remote projectServer.
55
Once a connection is established it waits for requests and invokes the BidsInterface

tests/test_bidsInterface.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
import time
33
import math
44
import pytest
5-
from numpy.core.numeric import isclose
65
from rtCommon.bidsArchive import BidsArchive
76
from rtCommon.bidsIncremental import BidsIncremental
87
from rtCommon.imageHandling import convertDicomImgToNifti, readDicomFromFile
98
from rtCommon.clientInterface import ClientInterface
10-
from rtCommon.bidsInterface import BidsInterface, tmpDownloadOpenNeuro
9+
from rtCommon.bidsInterface import BidsInterface
1110
from rtCommon.bidsCommon import getDicomMetadata
11+
from rtCommon.openNeuro import OpenNeuroCache
1212
import rtCommon.utils as utils
1313
from tests.backgroundTestServers import BackgroundTestServers
1414
from tests.common import rtCloudPath, tmpDir
@@ -122,11 +122,18 @@ def dicomStreamTest(bidsInterface):
122122
def openNeuroStreamTest(bidsInterface):
123123
dsAccessionNumber = 'ds002338'
124124
dsSubject = 'xp201'
125-
datasetDir = tmpDownloadOpenNeuro(dsAccessionNumber, dsSubject, 1)
126125
localEntities = {'subject': dsSubject, 'run': 1, 'suffix': 'bold', 'datatype': 'func'}
127-
remoteEntities = {'subject': dsSubject, 'run': 1}
126+
remoteEntities = {'subject': dsSubject, 'run': 1, 'suffix': 'bold'}
127+
extraKwargs = {}
128+
if bidsInterface.isRunningRemote():
129+
# Set longer timeout for potentially downloading data
130+
extraKwargs = {"rpc_timeout": 60}
131+
streamId = bidsInterface.initOpenNeuroStream(dsAccessionNumber, **remoteEntities,
132+
**extraKwargs)
133+
openNeuroCache = OpenNeuroCache()
134+
datasetDir = openNeuroCache.downloadData(dsAccessionNumber, **localEntities)
128135
localBidsArchive = BidsArchive(datasetDir)
129-
streamId = bidsInterface.initOpenNeuroStream(dsAccessionNumber, **remoteEntities)
136+
130137
for idx in range(3):
131138
streamIncremental = bidsInterface.getIncremental(streamId)
132139
localIncremental = localBidsArchive._getIncremental(idx, **localEntities)

tests/test_dataInterface.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def test_rpyclocalDataInterface(self, dicomTestFilename, bigTestFile):
6464
allowedFileTypes=allowedFileTypes,
6565
dataRemote=False,
6666
subjectRemote=False)
67-
clientInterface = ClientInterface()
67+
clientInterface = ClientInterface(rpyc_timeout=70)
6868
dataInterface = clientInterface.dataInterface
6969
assert clientInterface.isDataRemote() == False
7070
assert dataInterface.isRemote == False
@@ -80,7 +80,7 @@ def test_remoteDataInterface(self, dicomTestFilename, bigTestFile):
8080
allowedFileTypes=allowedFileTypes,
8181
dataRemote=True,
8282
subjectRemote=False)
83-
clientInterface = ClientInterface()
83+
clientInterface = ClientInterface(rpyc_timeout=70)
8484
dataInterface = clientInterface.dataInterface
8585
assert clientInterface.isDataRemote() == True
8686
assert dataInterface.isRemote == True

0 commit comments

Comments
 (0)