from dataclasses import asdict, dataclass, field
- from typing import Callable, Optional, Dict, List, Set, Tuple
from enum import Enum
+ from functools import partial
+ from itertools import groupby, repeat, tee
+ from typing import Callable, Generator, Optional, Dict, List, Set, Tuple, Iterable, Union
from pathlib import Path
import re
+ from more_itertools import interleave_longest, peekable
import pandas as pd
import numpy as np

- from ..._params import SourceSignalPair
+ from delphi_utils.nancodes import Nans
+ from ..._params import SourceSignalPair, TimePair
+ from .smooth_diff import generate_smooth_rows, generate_row_diffs
+ from ...utils import time_value_range, shift_time_value
+
+
+ IDENTITY: Callable = lambda rows, **kwargs: rows
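+ # DIFF, SMOOTH, and DIFF_SMOOTH are referenced below (e.g. in get_pad_length) but their
+ # definitions are not shown in this hunk; the definitions here are a hedged sketch wired
+ # to the imported generate_row_diffs / generate_smooth_rows helpers.
+ DIFF: Callable = lambda rows, **kwargs: generate_row_diffs(rows, **kwargs)
+ SMOOTH: Callable = lambda rows, **kwargs: generate_smooth_rows(rows, **kwargs)
+ DIFF_SMOOTH: Callable = lambda rows, **kwargs: generate_smooth_rows(generate_row_diffs(rows, **kwargs), **kwargs)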


class HighValuesAre(str, Enum):
@@ -21,6 +30,7 @@ class SignalFormat(str, Enum):
    fraction = "fraction"
    raw_count = "raw_count"
    raw = "raw"
+     count = "count"


class SignalCategory(str, Enum):
@@ -201,7 +211,7 @@ def _load_data_sources():


data_sources, data_sources_df = _load_data_sources()
- data_source_by_id = {d.source: d for d in data_sources}
+ data_sources_by_id = {d.source: d for d in data_sources}


def _load_data_signals(sources: List[DataSource]):
@@ -230,7 +240,7 @@ def _load_data_signals(sources: List[DataSource]):
data_signals_by_key = {d.key: d for d in data_signals}
# also add the resolved signal version to the signal lookup
for d in data_signals:
-     source = data_source_by_id.get(d.source)
+     source = data_sources_by_id.get(d.source)
    if source and source.uses_db_alias:
        data_signals_by_key[(source.db_source, d.signal)] = d

@@ -265,7 +275,7 @@ def create_source_signal_alias_mapper(source_signals: List[SourceSignalPair]) ->
    alias_to_data_sources: Dict[str, List[DataSource]] = {}
    transformed_pairs: List[SourceSignalPair] = []
    for pair in source_signals:
-         source = data_source_by_id.get(pair.source)
+         source = data_sources_by_id.get(pair.source)
        if not source or not source.uses_db_alias:
            transformed_pairs.append(pair)
            continue
@@ -299,3 +309,159 @@ def map_row(source: str, signal: str) -> str:
        return signal_source.source

    return transformed_pairs, map_row
+
+
+ def _resolve_all_signals(source_signals: Union[SourceSignalPair, List[SourceSignalPair]], data_sources_by_id: Dict[str, DataSource]) -> Union[SourceSignalPair, List[SourceSignalPair]]:
+     """Expand a request for all signals to an explicit list of signal names.
+
+     Example: SourceSignalPair("jhu-csse", signal=True) would return SourceSignalPair("jhu-csse", [<list of all JHU signals>]).
+     """
+     # Hedged sketch; assumes DataSource.signals holds the source's DataSignal objects
+     # and that signal=True marks a request for all of a source's signals.
+     if isinstance(source_signals, SourceSignalPair):
+         if source_signals.signal is True:
+             source = data_sources_by_id.get(source_signals.source)
+             if source:
+                 return SourceSignalPair(source.source, [s.signal for s in source.signals])
+         return source_signals
+     return [_resolve_all_signals(pair, data_sources_by_id) for pair in source_signals]
+
+
+ def _reindex_iterable(iterable: Iterable[Dict], time_pairs: List[TimePair], fill_value: Optional[int] = None) -> Iterable:
+     """Produce an iterable that fills in gaps in the time window of another iterable.
+
+     Used to produce an iterable with a contiguous time index for time series operations.
+
+     We iterate over the contiguous range of days made from time_pairs. If `iterable`, which is assumed to be sorted by its
+     "time_value" key, is missing a time_value in that range, a dummy row entry is yielded with the correct date and the
+     value fields set appropriately.
+     """
+     # Hedged sketch of the reindexing loop; rows are assumed to be sorted by "time_value" and to
+     # fall inside the day range, and the exact fields cleared on a filler row are an assumption.
+     if time_pairs is None:
+         yield from iterable
+         return
+
+     _iterable = peekable(iterable)
+     for day in get_day_range(time_pairs).time_values:
+         try:
+             next_row = _iterable.peek()
+         except StopIteration:
+             return
+         if day == next_row.get("time_value"):
+             yield next(_iterable)
+         else:
+             # Dummy row with the missing date and the value fields filled with fill_value/None.
+             yield {**next_row, "time_value": day, "value": fill_value, "stderr": None, "sample_size": None}
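+
+ # Hypothetical usage of _reindex_iterable: given rows for 20210407 and 20210409 only,
+ # reindexing over [TimePair("day", [(20210407, 20210409)])] yields the two real rows
+ # plus one filler row for 20210408 carrying fill_value.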
+
+
+ def _get_base_signal_transform(signal: Union[DataSignal, Tuple[str, str]], data_signals_by_key: Dict[Tuple[str, str], DataSignal] = data_signals_by_key) -> Callable:
+     """Given a DataSignal, return the transformation that needs to be applied to its base signal to derive the signal."""
+     # Hedged sketch: dispatch on the signal's metadata (assumes DataSignal carries
+     # compute_from_base, is_cumulative, and is_smoothed fields).
+     if isinstance(signal, tuple):
+         signal = data_signals_by_key.get(signal)
+     if signal is None or not signal.compute_from_base:
+         return IDENTITY
+     if signal.is_smoothed:
+         return SMOOTH if signal.is_cumulative else DIFF_SMOOTH
+     return IDENTITY if signal.is_cumulative else DIFF
+
+
+ def get_transform_types(source_signal_pairs: List[SourceSignalPair], data_sources_by_id: Dict[str, DataSource] = data_sources_by_id, data_signals_by_key: Dict[Tuple[str, str], DataSignal] = data_signals_by_key) -> Set[Callable]:
+     """Return a collection of the unique transforms required for transforming a given source-signal pair list.
+
+     Example:
+         SourceSignalPair("src", ["sig", "sig_smoothed", "sig_diff"]) would return {IDENTITY, SMOOTH, DIFF}.
+
+     Used to pad the user DB query with extra days.
+     """
+     # Sketch: resolve wildcards, then collect the transform needed for every requested signal.
+     source_signal_pairs = _resolve_all_signals(source_signal_pairs, data_sources_by_id)
+     transform_types: Set[Callable] = set()
+     for pair in source_signal_pairs:
+         if isinstance(pair.signal, bool):
+             continue
+         transform_types |= {_get_base_signal_transform((pair.source, signal_name), data_signals_by_key) for signal_name in pair.signal}
+     return transform_types
+
+
+ def get_pad_length(source_signal_pairs: List[SourceSignalPair], smoother_window_length: int, data_sources_by_id: Dict[str, DataSource] = data_sources_by_id, data_signals_by_key: Dict[Tuple[str, str], DataSignal] = data_signals_by_key):
+     """Returns the size of the extra date padding needed, depending on the transformations the source-signal pair list requires.
+
+     Example:
+         If smoothing is required, we fetch an extra 6 days. If both diffing and smoothing are required on the same signal, then we fetch 7 extra days.
+
+     Used to pad the user DB query with extra days.
+     """
+     transform_types = get_transform_types(source_signal_pairs, data_sources_by_id=data_sources_by_id, data_signals_by_key=data_signals_by_key)
+     # A smoothing window of length n needs n - 1 extra days; diffing needs one more day on top of that.
+     pad_length = [0]
+     if DIFF_SMOOTH in transform_types:
+         pad_length.append(smoother_window_length)
+     if SMOOTH in transform_types:
+         pad_length.append(smoother_window_length - 1)
+     if DIFF in transform_types:
+         pad_length.append(1)
+     return max(pad_length)
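+ # For smoother_window_length=7, the sketch above gives {SMOOTH} -> 6 extra days,
+ # {DIFF_SMOOTH} -> 7, {DIFF} -> 1, and {IDENTITY} -> 0, matching the docstring example.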
+
+
+ def pad_time_pairs(time_pairs: List[TimePair], pad_length: int) -> List[TimePair]:
+     """Pads a list of TimePairs with another TimePair that extends the smallest time value by the pad_length, if needed.
+
+     Example:
+         [TimePair("day", [20210407])] with pad_length 6 would return [TimePair("day", [20210407]), TimePair("day", [(20210401, 20210407)])].
+
+     Used to pad the user DB query with extra days.
+     """
+     # Sketch: find the earliest day mentioned in any pair and append a pair covering the pad_length days before it.
+     if pad_length == 0:
+         return time_pairs
+     min_time = min(tv if isinstance(tv, int) else tv[0] for tp in time_pairs if not isinstance(tp.time_values, bool) for tv in tp.time_values)
+     return time_pairs + [TimePair("day", [(shift_time_value(min_time, -pad_length), min_time)])]
+
+
+ def pad_time_window(time_window: Tuple[int, int], pad_length: int) -> Tuple[int, int]:
+     """Extend a time window on the left by pad_length.
+
+     Example:
+         (20210407, 20210413) with pad_length 6 would return (20210401, 20210413).
+
+     Used to pad the user DB query with extra days.
+     """
+     min_time, max_time = time_window
+     return (shift_time_value(min_time, -pad_length), max_time)
+
+
+ def get_day_range(time_pairs: Union[TimePair, List[TimePair]]) -> TimePair:
+     """Combine a list of TimePairs into a single contiguous, explicit TimePair object.
+
+     Example:
+         [TimePair("day", [20210407, 20210408]), TimePair("day", [(20210408, 20210410)])] would return
+         TimePair("day", [20210407, 20210408, 20210409, 20210410]).
+
+     Used to produce a contiguous time index for time series operations.
+     """
+     # Sketch, assuming time_value_range expands a (start, end) tuple into the list of days it spans.
+     if isinstance(time_pairs, TimePair):
+         time_pairs = [time_pairs]
+     days: List[int] = []
+     for time_pair in time_pairs:
+         if time_pair.time_type != "day" or isinstance(time_pair.time_values, bool):
+             raise ValueError("get_day_range requires explicit 'day' TimePairs.")
+         for tv in time_pair.time_values:
+             days.extend([tv] if isinstance(tv, int) else time_value_range(tv))
+     return TimePair("day", time_value_range((min(days), max(days))))
+
+
+ def _generate_transformed_rows(
+     parsed_rows: Iterable[Dict], time_pairs: Optional[List[TimePair]] = None, transform_dict: Optional[Dict[Tuple[str, str], List[Tuple[str, str]]]] = None, transform_args: Optional[Dict] = None, group_keyfunc: Optional[Callable] = None, data_signals_by_key: Dict[Tuple[str, str], DataSignal] = data_signals_by_key,
+ ) -> Iterable[Dict]:
+     """Applies time-series transformations to streamed rows from a database.
+
+     Parameters:
+     parsed_rows: Iterable[Dict]
+         Streamed rows from the database.
+     time_pairs: Optional[List[TimePair]], default None
+         A list of TimePairs, which can be used to create a contiguous time index for time-series operations.
+         The min and max dates in the TimePairs list are used.
+     transform_dict: Optional[Dict[Tuple[str, str], List[Tuple[str, str]]]], default None
+         A dictionary mapping a base source-signal pair to a list of the derived source-signal pairs that the user wishes to query.
+         For example, transform_dict may be {("jhu-csse", "confirmed_cumulative_num"): [("jhu-csse", "confirmed_incidence_num"), ("jhu-csse", "confirmed_7dav_incidence_num")]}.
+     transform_args: Optional[Dict], default None
+         A dictionary of keyword arguments for the transformer functions.
+     group_keyfunc: Optional[Callable], default None
+         The key function used to group the streamed rows. Note that Python's groupby does not do any sorting, so
+         parsed_rows are assumed to be sorted in accord with this grouping.
+     data_signals_by_key: Dict[Tuple[str, str], DataSignal], default data_signals_by_key
+         The dictionary of DataSignals, used to find the base signal transforms.
+
+     Yields:
+     transformed rows: Dict
+         The transformed rows, returned in an interleaved fashion. Non-transformed rows have the IDENTITY operation applied.
+     """
+     if not transform_args:
+         transform_args = dict()
+     if not transform_dict:
+         transform_dict = dict()
+     if not group_keyfunc:
+         group_keyfunc = lambda row: (row["geo_type"], row["geo_value"], row["source"], row["signal"])
+
+     try:
+         for key, group in groupby(parsed_rows, group_keyfunc):
+             _, _, source_name, signal_name = key
+             # Extract the list of derived signals; a signal with no requested derivations maps to itself.
+             derived_signals: List[Tuple[str, str]] = transform_dict.get((source_name, signal_name), [(source_name, signal_name)])
+             # Create a list of source-signal pairs along with the transformation required for the signal.
+             signals_and_transforms = [(derived, _get_base_signal_transform(derived, data_signals_by_key)) for derived in derived_signals]
+             # Put the current time series on a contiguous time index.
+             group = _reindex_iterable(group, time_pairs)
+             # Create copies of the iterable, with smart memory usage.
+             group_copies = tee(group, len(signals_and_transforms))
+             # Create a list of transformed group iterables, remembering their derived name as needed;
+             # zip/repeat bind each derived name eagerly (the renaming bookkeeping here is a sketch).
+             transformed_groups = [
+                 ({**row, "source": src, "signal": sig} for row, src, sig in zip(transform(rows, **transform_args), repeat(derived_source), repeat(derived_signal)))
+                 for ((derived_source, derived_signal), transform), rows in zip(signals_and_transforms, group_copies)
+             ]
+             # Traverse through the transformed iterables in an interleaved fashion, which makes sure that only a small window
+             # of the original iterable (group) is stored in memory.
+             for row in interleave_longest(*transformed_groups):
+                 yield row
+     except Exception as e:
+         print(f"Transformation encountered error of type {type(e)}, with message {e}. Yielding None and stopping.")
+         yield None
+
+
+ def get_basename_signals(source_signal_pairs: List[SourceSignalPair], data_sources_by_id: Dict[str, DataSource] = data_sources_by_id, data_signals_by_key: Dict[Tuple[str, str], DataSignal] = data_signals_by_key) -> Tuple[List[SourceSignalPair], Generator]:
+     """From a list of SourceSignalPairs, return the base signals required to derive them and a transformation function to take a stream
+     of the base signals and return the transformed signals.
+
+     Example:
+         SourceSignalPair("src", signal=["sig_base", "sig_smoothed"]) would return SourceSignalPair("src", signal=["sig_base"]) and a transformation function
+         that will take the returned database query for "sig_base" and return both the base time series and the smoothed time series.
+     """
+     # Hedged sketch (assumes DataSignal carries signal_basename and compute_from_base):
+     # map each requested signal to its base signal and record which derived signals to produce.
+     source_signal_pairs = _resolve_all_signals(source_signal_pairs, data_sources_by_id)
+     transform_dict: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
+     base_signal_pairs: List[SourceSignalPair] = []
+     for pair in source_signal_pairs:
+         if isinstance(pair.signal, bool):
+             base_signal_pairs.append(pair)
+             continue
+         base_names = []
+         for signal_name in pair.signal:
+             signal = data_signals_by_key.get((pair.source, signal_name))
+             base_name = signal.signal_basename if signal and signal.compute_from_base else signal_name
+             base_names.append(base_name)
+             transform_dict.setdefault((pair.source, base_name), []).append((pair.source, signal_name))
+         base_signal_pairs.append(SourceSignalPair(pair.source, base_names))
+     row_transform_generator = partial(_generate_transformed_rows, transform_dict=transform_dict, data_signals_by_key=data_signals_by_key)
+     return base_signal_pairs, row_transform_generator
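+
+
+ # How the pieces above are meant to compose, as a hedged sketch; `query_database` and the
+ # requested_* inputs are hypothetical stand-ins, not part of this module:
+ #
+ #   base_pairs, row_transform_generator = get_basename_signals(requested_pairs)
+ #   pad_length = get_pad_length(requested_pairs, smoother_window_length=7)
+ #   padded_time_pairs = pad_time_pairs(requested_time_pairs, pad_length)
+ #   rows = query_database(base_pairs, padded_time_pairs)  # hypothetical DB fetch
+ #   transformed_rows = row_transform_generator(rows, time_pairs=requested_time_pairs)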