Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added notebooks/__init__.py
Empty file.
348 changes: 202 additions & 146 deletions notebooks/amplicon_pre_prep_file_generator.ipynb

Large diffs are not rendered by default.

322 changes: 161 additions & 161 deletions notebooks/matrix_tube_pipeline_seqcount_norm.ipynb

Large diffs are not rendered by default.

706 changes: 440 additions & 266 deletions notebooks/metatranscriptomics_matrix_pipeline_seqcount_norm.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

768 changes: 384 additions & 384 deletions notebooks/test_output/QC/YYYY_MM_DD_NPH_7_10_matrix_df.txt

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

121 changes: 121 additions & 0 deletions notebooks/tests/notebook_test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import unittest
import papermill as pm
import tempfile
from pathlib import Path
import os
import re

# Directory where unmatched expected/generated outputs are written for manual
# inspection when TestNotebook._SAVE_UNMATCHED_OUTPUTS is enabled.  The "~" is
# expanded here because neither os.path.join() nor open() expands it; left
# unexpanded, files would be written under a literal "~" directory relative to
# the current working directory (or fail outright if it does not exist).
SAVE_DIR = os.path.expanduser("~/Desktop")


class TestNotebook(unittest.TestCase):
    """Base class for tests that execute a notebook and diff its outputs.

    Subclasses set ``NOTEBOOK`` to the notebook's file name (relative to the
    notebooks directory) and call ``_run_notebook_test`` with the parameters
    to inject and a description of the output files to verify.
    """

    NOTEBOOK = "amplicon_pre_prep_file_generator.ipynb"
    _OUT_PARAM_VARIABLE_KEY = "param_variable"
    _FILE_PATH_KEY = "is_filepath"  # key for file path parameters
    _ZERO_DATES_FUNC_KEY = "zero_dates_func"  # func to replace for dates

    # TODO: turn off before committing
    _SAVE_UNMATCHED_OUTPUTS = False  # whether to save unmatched outputs

    def setUp(self):
        """Locate the notebooks directory and its test data/output folders."""
        super().setUp()
        # this file lives in <notebooks_dir>/tests/, so go up two levels
        self.notebooks_dir = os.path.dirname(os.path.dirname(__file__))
        self.test_data_dir = os.path.join(self.notebooks_dir, 'test_data')
        self.test_output_dir = os.path.join(self.notebooks_dir, 'test_output')

    def _save_unmatched_outputs(self, file_info):
        """Write each (source path, text) pair into SAVE_DIR for inspection.

        Files are named ``UNMATCHED_<n>_<basename>`` where ``n`` is the
        1-based position of the pair in `file_info`.
        """
        # expanduser is idempotent, so this is safe whether or not SAVE_DIR
        # has already been expanded; without it "~" is taken literally.
        save_dir = os.path.expanduser(SAVE_DIR)
        os.makedirs(save_dir, exist_ok=True)
        for curr_index, (curr_file, curr_text) in \
                enumerate(file_info, start=1):
            base_name = os.path.basename(curr_file)
            save_fp = os.path.join(
                save_dir, f"UNMATCHED_{curr_index}_{base_name}")
            with open(save_fp, 'w', encoding='utf-8') as sf:
                sf.write(curr_text)

    def _help_test_files_exact_text_match(self, file_1, file_2, filename=None,
                                          zero_dates_func=None):
        """Assert that two text files have exactly the same content.

        Parameters
        ----------
        file_1, file_2 : path-like
            Paths of the files to compare; both are read as UTF-8.
        filename : str, optional
            Label prepended to the failure message.
        zero_dates_func : callable, optional
            If given, applied to the text of both files before comparison
            (e.g., to blank out run-specific dates).

        Raises
        ------
        AssertionError
            If the (optionally normalized) texts differ.  When
            ``_SAVE_UNMATCHED_OUTPUTS`` is true, both compared texts are
            first written to ``SAVE_DIR``.
        """

        filename = f"{filename} " if filename else ""
        msg = f"{filename}files do not match exactly."
        self.maxDiff = None  # always show the complete diff on failure
        with open(file_1, 'r', encoding='utf-8') as f1, \
                open(file_2, 'r', encoding='utf-8') as f2:
            text1 = f1.read()
            text2 = f2.read()

        if zero_dates_func:
            text1 = zero_dates_func(text1)
            text2 = zero_dates_func(text2)

        try:
            self.assertMultiLineEqual(text1, text2, msg=msg)
        except AssertionError:
            if self._SAVE_UNMATCHED_OUTPUTS:
                # save the unmatched output files for inspection
                self._save_unmatched_outputs(
                    [(file_1, text1), (file_2, text2)])
            # bare raise keeps the original traceback untouched
            raise

    def _replace_illumina_date(self, text):
        """Return `text` with illumina ``Date`` values set to 0000-00-00."""

        date_pattern = r',\nDate,\d{4}-\d{2}-\d{2},'
        replacement = r',\nDate,0000-00-00,'
        return re.sub(date_pattern, replacement, text)

    def _run_notebook_test(self, run_params, out_param_details):
        """Verify notebook produces expected output files.

        Expects out_param_details to be a dict mapping output parameter name
        to a details dict containing at least `_OUT_PARAM_VARIABLE_KEY` and
        optionally `_FILE_PATH_KEY` and `_ZERO_DATES_FUNC_KEY`.

        The `_OUT_PARAM_VARIABLE_KEY` value is a template containing a
        ``{path}`` placeholder: it is formatted with a temporary directory to
        get the value handed to the notebook, and with ``test_output_dir`` to
        get the expected file it is compared against.  `run_params` itself is
        not modified.
        """

        # Work on a shallow copy so the caller's dict is not polluted with
        # paths into a temporary directory that no longer exists afterwards.
        run_params = dict(run_params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir)

            # Populate run_params with formatted output paths and ensure
            # directories exist for any file path outputs.
            for curr_param_name, curr_details in out_param_details.items():
                curr_param_variable = \
                    curr_details[self._OUT_PARAM_VARIABLE_KEY]
                curr_param_value = curr_param_variable.format(path=tmp_path)
                run_params[curr_param_name] = curr_param_value

                if curr_details.get(self._FILE_PATH_KEY, False):
                    # create the parent directory of the exact path the
                    # notebook will be asked to write to
                    Path(curr_param_value).parent.mkdir(
                        parents=True, exist_ok=True)
                # end if
            # next curr_param_details

            pm.execute_notebook(
                input_path=f"{self.notebooks_dir}/{self.NOTEBOOK}",
                output_path=f"{tmp_path}/{self.NOTEBOOK}",
                parameters=run_params,
                log_output=True,
            )

            # Validate that expected files were produced and contents match
            for curr_param_name, curr_details in out_param_details.items():
                if not curr_details.get(self._FILE_PATH_KEY, False):
                    continue
                curr_param_variable = \
                    curr_details[self._OUT_PARAM_VARIABLE_KEY]
                curr_param_zero_dates = \
                    curr_details.get(self._ZERO_DATES_FUNC_KEY, None)
                curr_generated_fp = curr_param_variable.format(path=tmp_path)
                self.assertTrue(
                    os.path.exists(curr_generated_fp),
                    # note, intentionally not using full generated output fp,
                    # which will have a long temp directory folder name in it
                    msg=(f"Notebook did not produce file at "
                         f"{curr_param_variable}"))

                curr_expected_fp = curr_param_variable.format(
                    path=self.test_output_dir)

                # confirm that the written file matches the original
                self._help_test_files_exact_text_match(
                    curr_expected_fp, curr_generated_fp, curr_param_name,
                    zero_dates_func=curr_param_zero_dates)
177 changes: 177 additions & 0 deletions notebooks/tests/test_amplicon_pre_prep_file_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
from notebooks.tests.notebook_test_helpers import TestNotebook


class TestAmpliconNotebook(TestNotebook):
    """End-to-end check of the amplicon pre-prep file generator notebook."""

    NOTEBOOK = "amplicon_pre_prep_file_generator.ipynb"

    # Project-level fields that are the same for every plate in the layout.
    _PROJECT_FIELDS = {
        'Project Name': 'Celeste_Adaptation_12986',
        'center_project_name': 'Celeste Adapt',
        'Project Abbreviation': 'ADAPT',
        'experiment_design_description':
            '16S sequencing of antibiotic time series',
    }

    # Wet-lab processing fields that are the same for every plate.
    _PROCESSING_FIELDS = {
        'Plating': 'SF',  # initials
        'Extraction Kit Lot': '166032128',
        'Extraction Robot': 'Carmen_HOWE_KF3',
        'TM1000 8 Tool': '109379Z',
        'Primer Date': '2021-08-17',  # yyyy-mm-dd
        'MasterMix Lot': '978215',
        'Water Lot': 'RNBJ0628',
        'TM10 8 Tool': '865HS8',
        'Processing Robot': 'Echo550',
        'TM300 8 Tool': 'not applicable',
        'TM50 8 Tool': 'not applicable',
        'instrument_model': 'Illumina MiSeq',
        'run_date': '2023-03-02',  # date of MiSeq run
        'Original Name': '',  # leave empty
    }

    def _plate_entry(self, position, plate_number, elution_first):
        """Build one compression-layout dict for the given plate.

        `position` is used for both 'Plate Position' and 'Primer Plate #'
        (they coincide for every plate in this fixture); `plate_number`
        selects the plate map file and the 'Sample Plate' name.
        `elution_first` only controls where 'Plate elution volume' sits in
        the key order, so each dict keeps the ordering of the hand-written
        fixture it replaces.
        """
        head = {
            'Plate Position': position,
            'Primer Plate #': position,
            # VisionMate output
            'Plate map file': (
                f'{self.test_data_dir}/Plate_Maps/'
                f'2022_summer_Celeste_Adaptation_{plate_number}'
                '_plate_map.tsv'
            ),
        }
        elution = {'Plate elution volume': '70'}
        project = {
            'Sample Plate': f'Plate_{plate_number}',  # Plate_#
            **self._PROJECT_FIELDS,
        }
        if elution_first:
            middle = {**elution, **project}
        else:
            middle = {**project, **elution}
        return {**head, **middle, **self._PROCESSING_FIELDS}

    def test_amplicon_main_path(self):
        """Run the notebook on the 16S fixtures and diff both output files."""

        plate_maps_dir = f"{self.test_data_dir}/Plate_Maps/"

        # (position, plate number, elution volume listed before sample plate)
        plate_specs = [
            ('1', '16', False),  # top left plate
            ('2', '17', False),  # top right plate
            ('3', '18', True),   # bottom left plate
            ('4', '21', True),   # bottom right plate
        ]
        compression_layout = [
            self._plate_entry(position, plate_number, elution_first)
            for position, plate_number, elution_first in plate_specs
        ]

        run_params = {
            'seq_type': '16S',
            'sample_accession_fp': (
                plate_maps_dir
                + "2022_summer_Celeste_Adaptation_16_17_18_21_sa_file.tsv"
            ),
            'metadata_fp': plate_maps_dir + "12986_20230314-090655.txt",
            'compression_layout': compression_layout,
            'well_col': 'Well',
            'blanks_dir': f'{self.test_data_dir}/BLANKS',
            'katharoseq_dir': None,
            'files': [
                (f'{self.test_data_dir}/amplicon/'
                 '20230201_IL515fBC_806r_ABTX_11052_174_178_182_185_'
                 'MF_notebook_updated.txt')
            ],
            'keep_these': ['ABTX_Plate_174', 'ABTX_Plate_178'],
        }

        prep_file_template = (
            '{path}/amplicon/20230302_IL515fBC_806_Celeste_'
            'Adaptation_12986_Plate_16_17_18_21.txt'
        )
        merged_file_template = (
            '{path}/amplicon/20230203_IL515fBC_806_ABTX_11052_'
            'Plates_174_178_182_185_ADAPT_12986_Plate_16_17_18_21_'
            'merged.txt'
        )
        output_params = {
            'output_filename': {
                self._OUT_PARAM_VARIABLE_KEY: prep_file_template,
                self._FILE_PATH_KEY: True,
            },
            'merged_output_filename': {
                self._OUT_PARAM_VARIABLE_KEY: merged_file_template,
                self._FILE_PATH_KEY: True,
            },
        }

        self._run_notebook_test(run_params, output_params)
Loading
Loading