Skip to content

Commit 818b883

Browse files
authored
Merge pull request #1571 from AllenInstitute/GH-1546/bugfix/remove-csv-log-bpc
GH 1546: Perf: Remove csv log from BehaviorProjectCache
2 parents 1a27d3b + 60737a4 commit 818b883

File tree

3 files changed

+129
-389
lines changed

3 files changed

+129
-389
lines changed

allensdk/brain_observatory/behavior/behavior_project_cache.py

Lines changed: 18 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
11
import numpy as np
2-
import os.path
3-
import csv
42
from functools import partial
5-
from typing import Type, Optional, List, Any, Dict, Union
3+
from typing import Type, Optional, List, Union
64
from pathlib import Path
75
import pandas as pd
8-
import time
96
import logging
107

118
from allensdk.api.cache import Cache
@@ -15,23 +12,18 @@
1512
from allensdk.brain_observatory.behavior.internal.behavior_project_base\
1613
import BehaviorProjectBase
1714
from allensdk.api.caching_utilities import one_file_call_caching, call_caching
18-
from allensdk.core.exceptions import MissingDataError
1915
from allensdk.core.authentication import DbCredentials
2016

2117
BehaviorProjectApi = Type[BehaviorProjectBase]
2218

2319

2420
class BehaviorProjectCache(Cache):
2521

26-
MANIFEST_VERSION = "0.0.1-alpha"
22+
MANIFEST_VERSION = "0.0.1-alpha.1"
2723
OPHYS_SESSIONS_KEY = "ophys_sessions"
2824
BEHAVIOR_SESSIONS_KEY = "behavior_sessions"
2925
OPHYS_EXPERIMENTS_KEY = "ophys_experiments"
3026

31-
# Temporary way for scientists to keep track of analyses
32-
OPHYS_ANALYSIS_LOG_KEY = "ophys_analysis_log"
33-
BEHAVIOR_ANALYSIS_LOG_KEY = "behavior_analysis_log"
34-
3527
MANIFEST_CONFIG = {
3628
OPHYS_SESSIONS_KEY: {
3729
"spec": f"{OPHYS_SESSIONS_KEY}.csv",
@@ -47,18 +39,8 @@ class BehaviorProjectCache(Cache):
4739
"spec": f"{OPHYS_EXPERIMENTS_KEY}.csv",
4840
"parent_key": "BASEDIR",
4941
"typename": "file"
50-
},
51-
OPHYS_ANALYSIS_LOG_KEY: {
52-
"spec": f"{OPHYS_ANALYSIS_LOG_KEY}.csv",
53-
"parent_key": "BASEDIR",
54-
"typename": "file"
55-
},
56-
BEHAVIOR_ANALYSIS_LOG_KEY: {
57-
"spec": f"{BEHAVIOR_ANALYSIS_LOG_KEY}.csv",
58-
"parent_key": "BASEDIR",
59-
"typename": "file"
60-
},
6142
}
43+
}
6244

6345
def __init__(
6446
self,
@@ -283,123 +265,45 @@ def get_behavior_session_table(
283265

284266
def get_session_data(self, ophys_experiment_id: int, fixed: bool = False):
285267
"""
286-
Note -- This method mocks the behavior of a cache. No files are
287-
actually downloaded for local access. Instead, it adds the
288-
session id to a csv log. If the "fixed" parameter is true,
289-
then the API will first check to ensure that the log is present
290-
in the record before pulling the data.
268+
Note -- This method mocks the behavior of a cache. Future
269+
development will include an NWB reader to read from
270+
a true local cache (once nwb files are created).
271+
TODO: Using `fixed` will raise a NotImplementedError since there
272+
is no real cache.
291273
"""
292-
# TODO: Future development will include an NWB reader to read from
293-
# a true local cache (once nwb files are created)
294-
# For now just check the log if pass `fixed`
295-
path = self.get_cache_path(None, self.OPHYS_ANALYSIS_LOG_KEY)
296274
if fixed:
297-
self.logger.warning(
298-
"Warning! Passing `fixed=True` does not ensure that the "
299-
"underlying data has not changed, as no data are actually "
300-
"cached locally. The log will be updated each time the data "
301-
"are pulled from the database for tracking purposes.")
302-
try:
303-
record = pd.read_csv(path)
304-
except FileNotFoundError:
305-
raise MissingDataError(
306-
"No analysis log found! Add to the log by getting "
307-
"session data with fixed=False.")
308-
if ophys_experiment_id not in record["ophys_experiment_id"].values:
309-
raise MissingDataError(
310-
f"Data for ophys experiment {ophys_experiment_id} not "
311-
"found!")
312-
275+
raise NotImplementedError
313276
fetch_session = partial(self.fetch_api.get_session_data,
314277
ophys_experiment_id)
315-
write_log = partial(_write_log, path=path,
316-
key_name="ophys_experiment_id",
317-
key_value=ophys_experiment_id)
318278
return call_caching(
319279
fetch_session,
320-
write_log,
321-
lazy=False,
280+
lambda x: x, # not writing anything
281+
lazy=False, # can't actually read from file cache
322282
read=fetch_session
323283
)
324284

325285
def get_behavior_session_data(self, behavior_session_id: int,
326286
fixed: bool = False):
327287
"""
328-
Note -- This method mocks the behavior of a cache. No files are
329-
actually downloaded for local access. Instead, it adds the
330-
session id to a csv log. If the "fixed" parameter is true,
331-
then the API will first check to ensure that the log is present
332-
in the record before pulling the data.
288+
Note -- This method mocks the behavior of a cache. Future
289+
development will include an NWB reader to read from
290+
a true local cache (once nwb files are created).
291+
TODO: Using `fixed` will raise a NotImplementedError since there
292+
is no real cache.
333293
"""
334-
# TODO: Future development will include an NWB reader to read from
335-
# a true local cache (once nwb files are created)
336-
# For now just check the log if pass `fixed`
337-
path = self.get_cache_path(None, self.BEHAVIOR_ANALYSIS_LOG_KEY)
338294
if fixed:
339-
self.logger.warning(
340-
"Warning! Passing `fixed=True` does not ensure that the "
341-
"underlying data has not changed, as no data are actually "
342-
"cached locally. The log will be updated each time the data "
343-
"are pulled from the database for tracking purposes.")
344-
try:
345-
record = pd.read_csv(path)
346-
except FileNotFoundError:
347-
raise MissingDataError(
348-
"No analysis log found! Add to the log by getting "
349-
"session data with fixed=False.")
350-
if behavior_session_id not in record["behavior_session_id"].values:
351-
raise MissingDataError(
352-
f"Data for ophys experiment {behavior_session_id} not "
353-
"found!")
295+
raise NotImplementedError
354296

355297
fetch_session = partial(self.fetch_api.get_behavior_only_session_data,
356298
behavior_session_id)
357-
write_log = partial(_write_log, path=path,
358-
key_name="behavior_session_id",
359-
key_value=behavior_session_id)
360299
return call_caching(
361300
fetch_session,
362-
write_log,
301+
lambda x: x, # not writing anything
363302
lazy=False, # can't actually read from file cache
364303
read=fetch_session
365304
)
366305

367306

368-
def _write_log(data: Any, path: str, key_name: str, key_value: Any):
369-
"""
370-
Helper method to create and add to a log. Invoked any time a session
371-
object is created via BehaviorProjectCache.
372-
:param data: Unused, required because call_caching method assumes
373-
all writer functions have data as the first positional argument
374-
:param path: Path to save the log file
375-
:type path: str path
376-
:param key_name: Name of the id used to track the session object.
377-
Typically "behavior_session_id" or "ophys_session_id".
378-
:type key_name: str
379-
:param key_value: Value of the id used to track the session object.
380-
Usually an int.
381-
"""
382-
now = round(time.time())
383-
keys = [key_name, "created_at", "updated_at"]
384-
values = [key_value, now, now]
385-
if os.path.exists(path):
386-
record = (pd.read_csv(path, index_col=key_name)
387-
.to_dict(orient="index"))
388-
experiment = record.get(key_value)
389-
if experiment:
390-
experiment.update({"updated_at": now})
391-
else:
392-
record.update({key_value: dict(zip(keys[1:], values[1:]))})
393-
(pd.DataFrame.from_dict(record, orient="index")
394-
.rename_axis(index=key_name)
395-
.to_csv(path))
396-
else:
397-
with open(path, "w") as f:
398-
w = csv.DictWriter(f, fieldnames=keys)
399-
w.writeheader()
400-
w.writerow(dict(zip(keys, values)))
401-
402-
403307
def _write_csv(path, df, array_fields=None):
404308
"""Private writer that encodes array fields into pipe-delimited strings
405309
for saving a csv.

allensdk/test/brain_observatory/behavior/test_behavior_project_cache.py

Lines changed: 1 addition & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
import pandas as pd
44
import tempfile
55
import logging
6-
import time
76
from allensdk.brain_observatory.behavior.behavior_project_cache import (
87
BehaviorProjectCache)
9-
from allensdk.core.exceptions import MissingDataError
108

119

1210
@pytest.fixture
@@ -102,74 +100,10 @@ def test_behavior_table_reads_from_cache(TempdirBehaviorCache, behavior_table,
102100
assert [expected_first[0]] == caplog.record_tuples
103101

104102

105-
def test_behavior_session_fails_fixed_if_no_cache(TempdirBehaviorCache):
106-
cache = TempdirBehaviorCache
107-
with pytest.raises(MissingDataError):
108-
cache.get_behavior_session_data(1, fixed=True)
109-
cache.get_behavior_session_data(1)
110-
# Also fails if there is a cache, but the id is not contained therein
111-
with pytest.raises(MissingDataError):
112-
cache.get_behavior_session_data(2, fixed=True)
113-
114-
115-
def test_session_fails_fixed_if_no_cache(TempdirBehaviorCache):
116-
cache = TempdirBehaviorCache
117-
with pytest.raises(MissingDataError):
118-
cache.get_session_data(1, fixed=True)
119-
cache.get_session_data(1)
120-
# Also fails if there is a cache, but the id is not contained therein
121-
with pytest.raises(MissingDataError):
122-
cache.get_session_data(2, fixed=True)
123-
124-
125103
def test_get_session_table_by_experiment(TempdirBehaviorCache):
126104
expected = (pd.DataFrame({"ophys_session_id": [1, 2, 2, 3],
127105
"ophys_experiment_id": [4, 5, 6, 7]})
128106
.set_index("ophys_experiment_id"))
129107
actual = TempdirBehaviorCache.get_session_table(by="ophys_experiment_id")[
130108
["ophys_session_id"]]
131-
pd.testing.assert_frame_equal(expected, actual)
132-
133-
134-
def test_write_behavior_log(TempdirBehaviorCache):
135-
expected_cols = ["behavior_session_id", "created_at", "updated_at"]
136-
expected_ids = [1, 2]
137-
expected_times = [False, True]
138-
cache = TempdirBehaviorCache
139-
cache.get_behavior_session_data(1)
140-
cache.get_behavior_session_data(2)
141-
time.sleep(1)
142-
cache.get_behavior_session_data(1)
143-
path = cache.manifest.path_info.get("behavior_analysis_log").get("spec")
144-
# Log exists
145-
assert os.path.exists(path)
146-
actual = pd.read_csv(path)
147-
# columns exist
148-
assert list(actual) == expected_cols
149-
# ids exist
150-
assert actual["behavior_session_id"].values.tolist() == expected_ids
151-
# first one should have updated different than created since accessed 2x
152-
assert ((actual["created_at"] == actual["updated_at"]).values.tolist()
153-
== expected_times)
154-
155-
156-
def test_write_session_log(TempdirBehaviorCache):
157-
expected_cols = ["ophys_experiment_id", "created_at", "updated_at"]
158-
expected_ids = [1, 2]
159-
expected_times = [False, True]
160-
cache = TempdirBehaviorCache
161-
cache.get_session_data(1)
162-
cache.get_session_data(2)
163-
time.sleep(1)
164-
cache.get_session_data(1)
165-
path = cache.manifest.path_info.get("ophys_analysis_log").get("spec")
166-
# Log exists
167-
assert os.path.exists(path)
168-
actual = pd.read_csv(path)
169-
# columns exist
170-
assert list(actual) == expected_cols
171-
# ids exist
172-
assert actual["ophys_experiment_id"].values.tolist() == expected_ids
173-
# first one should have updated different than created since accessed 2x
174-
assert ((actual["created_at"] == actual["updated_at"]).values.tolist()
175-
== expected_times)
109+
pd.testing.assert_frame_equal(expected, actual)

0 commit comments

Comments
 (0)