From 12d75278c7ca660f54ddd03d759816b8ac1a44f1 Mon Sep 17 00:00:00 2001
From: Lucille Delisle <lucille.delisle@epfl.ch>
Date: Thu, 23 Nov 2023 17:04:03 +0100
Subject: [PATCH] add --store-read-ids and --read-ids-whitelist options

---
 cite_seq_count/__main__.py                    |  77 ++++++++++++++--
 cite_seq_count/io.py                          |  22 +++++
 cite_seq_count/processing.py                  |  82 ++++++++++++++----
 .../correct_R1_with_cell_barcode_mm.fastq.gz  | Bin 0 -> 2391 bytes
 tests/test_data/tags/pass/correct_3.csv       |   4 +
 tests/test_data/whitelist.csv                 |   2 +
 tests/test_data/whitelist_single.csv          |   1 +
 tests/test_processing.py                      |   2 +-
 8 files changed, 162 insertions(+), 28 deletions(-)
 create mode 100644 tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz
 create mode 100644 tests/test_data/tags/pass/correct_3.csv
 create mode 100644 tests/test_data/whitelist.csv
 create mode 100644 tests/test_data/whitelist_single.csv

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index bf1c3ba..06c4acf 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -203,6 +203,35 @@ def get_args():
         help=("Allow for a sliding window when aligning."),
     )
 
+    # Read ID output
+    readids = parser.add_argument_group(
+        "Read IDs", description=("parameters to get read ids per cell")
+    )
+
+    readids.add_argument(
+        "--store-read-ids",
+        dest="store_read_ids",
+        required=False,
+        default=False,
+        action="store_true",
+        help=("Write read ids per cell to file."),
+    )
+
+    readids.add_argument(
+        "--read-ids-whitelist",
+        dest="read_ids_whitelist",
+        required=False,
+        type=str,
+        help=(
+            "A csv file containning a whitelist of barcodes for which"
+            " read ids should be written to files.\n\n"
+            "\tExample:\n"
+            "\tATGCTAGTGCTA\n\tGCTAGTCAGGAT\n\tCGACTGCTAACG\n\n"
+            "Or 10X-style:\n"
+            "\tATGCTAGTGCTA-1\n\tGCTAGTCAGGAT-1\n\tCGACTGCTAACG-1\n"
+        ),
+    )
+
     # Remaining arguments.
     parser.add_argument(
         "-T",
@@ -380,6 +409,15 @@ def main():
     else:
         whitelist = False
 
+    if args.read_ids_whitelist and args.store_read_ids:
+        (read_ids_whitelist, _) = preprocessing.parse_whitelist_csv(
+            filename=args.read_ids_whitelist,
+            barcode_length=args.cb_last - args.cb_first + 1,
+            collapsing_threshold=args.bc_threshold,
+        )
+    else:
+        read_ids_whitelist = False
+
     # Load TAGs/ABs.
     ab_map = preprocessing.parse_tags_csv(args.tags)
     ab_map = preprocessing.check_tags(ab_map, args.max_error)
@@ -415,6 +453,7 @@ def main():
 
     # Initialize the counts dicts that will be generated from each input fastq pair
     final_results = defaultdict(lambda: defaultdict(Counter))
+    readsid_per_cell = {}
     umis_per_cell = Counter()
     reads_per_cell = Counter()
     merged_no_match = Counter()
@@ -437,7 +476,7 @@ def main():
         # Run with one process
         if n_threads <= 1 or n_reads < 1000001:
             print("CITE-seq-Count is running with one core.")
-            (_final_results, _merged_no_match) = processing.map_reads(
+            (_final_results, _readsid_per_cell, _merged_no_match) = processing.map_reads(
                 read1_path=read1_path,
                 read2_path=read2_path,
                 tags=ab_map,
@@ -449,6 +488,7 @@ def main():
                 start_trim=args.start_trim,
                 maximum_distance=args.max_error,
                 sliding_window=args.sliding_window,
+                store_read_ids = args.store_read_ids,
             )
             print("Mapping done")
             _umis_per_cell = Counter()
@@ -480,6 +520,7 @@ def main():
                         args.start_trim,
                         args.max_error,
                         args.sliding_window,
+                        args.store_read_ids,
                     ),
                     callback=parallel_results.append,
                     error_callback=sys.stderr,
@@ -491,6 +532,7 @@ def main():
 
             (
                 _final_results,
+                _readsid_per_cell,
                 _umis_per_cell,
                 _reads_per_cell,
                 _merged_no_match,
@@ -511,6 +553,10 @@ def main():
                 else:
                     # Explicitly save the counter to that tag
                     final_results[cell_barcode][tag] = _final_results[cell_barcode][tag]
+        for cell_barcode in _readsid_per_cell:
+            if cell_barcode not in readsid_per_cell:
+                readsid_per_cell[cell_barcode] = []
+            readsid_per_cell[cell_barcode] += _readsid_per_cell[cell_barcode]
     ordered_tags_map = OrderedDict()
     for i, tag in enumerate(ab_map.values()):
         ordered_tags_map[tag] = i
@@ -531,10 +577,12 @@ def main():
             if not whitelist:
                 (
                     final_results,
+                    final_readsid,
                     umis_per_cell,
                     bcs_corrected,
                 ) = processing.correct_cells(
                     final_results=final_results,
+                    readsid_per_cell=readsid_per_cell,
                     reads_per_cell=reads_per_cell,
                     umis_per_cell=umis_per_cell,
                     expected_cells=args.expected_cells,
@@ -544,10 +592,12 @@ def main():
             else:
                 (
                     final_results,
+                    final_readsid,
                     umis_per_cell,
                     bcs_corrected,
                 ) = processing.correct_cells_whitelist(
                     final_results=final_results,
+                    readsid_per_cell=readsid_per_cell,
                     umis_per_cell=umis_per_cell,
                     whitelist=whitelist,
                     collapsing_threshold=args.bc_threshold,
@@ -600,13 +650,14 @@ def main():
     )
 
     # Write uncorrected cells to dense output
-    io.write_dense(
-        sparse_matrix=umi_aberrant_matrix,
-        index=list(ordered_tags_map.keys()),
-        columns=aberrant_cells,
-        outfolder=os.path.join(args.outfolder, "uncorrected_cells"),
-        filename="dense_umis.tsv",
-    )
+    if (len(aberrant_cells) > 0):
+        io.write_dense(
+            sparse_matrix=umi_aberrant_matrix,
+            index=list(ordered_tags_map.keys()),
+            columns=aberrant_cells,
+            outfolder=os.path.join(args.outfolder, "uncorrected_cells"),
+            filename="dense_umis.tsv",
+        )
 
     # Create sparse matrices for results
     (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices(
@@ -665,6 +716,16 @@ def main():
             outfolder=args.outfolder,
             filename="dense_umis.tsv",
         )
+    
+    # Write read ids to disk if requested
+    if args.store_read_ids:
+        print("Writting read ids")
+        io.write_read_ids(
+            readsid_per_cell = final_readsid,
+            read_ids_whitelist = read_ids_whitelist,
+            outfolder=args.outfolder,
+            subfolder="read_ids"
+        )
 
 
 if __name__ == "__main__":
diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py
index 2dc04f0..ed8a0cc 100644
--- a/cite_seq_count/io.py
+++ b/cite_seq_count/io.py
@@ -66,3 +66,25 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename):
         unknown_file.write('tag,count\n')
         for element in top_unmapped:
             unknown_file.write('{},{}\n'.format(element[0],element[1]))
+
+
+def write_read_ids(readsid_per_cell, read_ids_whitelist, outfolder, subfolder):
+    """
+    Writes a file per cell in outfolder/subfolder
+
+    Args:
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
+        read_ids_whitelist (set): The set of barcodes for which read ids should be written
+        outfolder (string): Path of the output folder
+        subfolder (string): Name of the output subfolder
+    """
+    # Create the folder:
+    prefix = os.path.join(outfolder, subfolder)
+    os.makedirs(prefix, exist_ok=True)
+
+    # Loop for each cell_id
+    for cell_barcode in readsid_per_cell:
+        if not read_ids_whitelist or cell_barcode in read_ids_whitelist:
+            with open(os.path.join(prefix, f"{cell_barcode}.txt"), 'w') as fo:
+                fo.write("\n".join(readsid_per_cell[cell_barcode]))
+                fo.write("\n")
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index 57b35b7..66bac91 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -97,6 +97,7 @@ def map_reads(
     start_trim,
     maximum_distance,
     sliding_window,
+    store_read_ids,
 ):
     """Read through R1/R2 files and generate a islice starting at a specific index.
 
@@ -117,13 +118,16 @@ def map_reads(
         start_trim (int): Number of bases to trim at the start.
         maximum_distance (int): Maximum distance given by the user.
         sliding_window (bool): A bool enabling a sliding window search
+        store_read_ids (bool): A bool enabling storage of read ids
 
     Returns:
         results (dict): A dict of dict of Counters with the mapping results.
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
         no_match (Counter): A counter with unmapped sequences.
     """
     # Initiate values
     results = {}
+    readsid_per_cell = {}
     no_match = Counter()
     n = 1
     t = time.time()
@@ -131,11 +135,18 @@ def map_reads(
         read2_path, "rt"
     ) as textfile2:
 
-        # Read all 2nd lines from 4 line chunks. If first_n not None read only 4 times the given amount.
-        secondlines = islice(
-            zip(textfile1, textfile2), indexes[0] * 4 + 1, indexes[1] * 4 + 1, 4
+        # Read all lines. If first_n not None read only 4 times the given amount.
+        lines = islice(
+            zip(textfile1, textfile2), indexes[0] * 4, indexes[1] * 4 + 1
         )
-        for read1, read2 in secondlines:
+        for i, (read1, read2) in enumerate(lines):
+            if i % 4 == 0:
+                # This is the read_id:
+                readid = read1.strip().split()[0]
+                continue
+            elif i % 4 in [2, 3]:
+                # This is '+' or qual
+                continue
             read1 = read1.strip()
             read2 = read2.strip()
 
@@ -159,6 +170,9 @@ def map_reads(
 
             if cell_barcode not in results:
                 results[cell_barcode] = defaultdict(Counter)
+            if store_read_ids:
+                if cell_barcode not in readsid_per_cell:
+                    readsid_per_cell[cell_barcode] = []
 
             if sliding_window:
                 best_match = find_best_match_shift(TAG_seq, tags, maximum_distance)
@@ -166,6 +180,8 @@ def map_reads(
                 best_match = find_best_match(TAG_seq, tags, maximum_distance)
 
             results[cell_barcode][best_match][UMI] += 1
+            if store_read_ids:
+                readsid_per_cell[cell_barcode].append(readid)
 
             if best_match == "unmapped":
                 no_match[TAG_seq] += 1
@@ -193,7 +209,7 @@ def map_reads(
         "Mapping done for process {}. Processed {:,} reads".format(os.getpid(), n - 1)
     )
     sys.stdout.flush()
-    return (results, no_match)
+    return (results, readsid_per_cell, no_match)
 
 
 def merge_results(parallel_results):
@@ -204,17 +220,20 @@ def merge_results(parallel_results):
 
     Returns:
         merged_results (dict): Results combined as a dict of dict of Counters
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
         umis_per_cell (Counter): Total umis per cell as a Counter
         reads_per_cell (Counter): Total reads per cell as a Counter
         merged_no_match (Counter): Unmapped tags as a Counter
     """
     merged_results = {}
+    readsid_per_cell = {}
     merged_no_match = Counter()
     umis_per_cell = Counter()
     reads_per_cell = Counter()
     for chunk in parallel_results:
         mapped = chunk[0]
-        unmapped = chunk[1]
+        reads = chunk[1]
+        unmapped = chunk[2]
         for cell_barcode in mapped:
             if cell_barcode not in merged_results:
                 merged_results[cell_barcode] = defaultdict(Counter)
@@ -227,8 +246,14 @@ def merge_results(parallel_results):
                         ][UMI]
                         umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG])
                         reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI]
+        for cell_barcode in reads:
+            if cell_barcode not in readsid_per_cell:
+                readsid_per_cell[cell_barcode] = []
+            readsid_per_cell[cell_barcode] += reads[cell_barcode]
+
+
         merged_no_match.update(unmapped)
-    return (merged_results, umis_per_cell, reads_per_cell, merged_no_match)
+    return (merged_results, readsid_per_cell, umis_per_cell, reads_per_cell, merged_no_match)
 
 
 def correct_umis(final_results, collapsing_threshold, top_cells, max_umis):
@@ -292,7 +317,7 @@ def update_umi_counts(UMIclusters, cell_tag_counts):
     return (cell_tag_counts, temp_corrected_umis)
 
 
-def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map):
+def collapse_cells(true_to_false, umis_per_cell, final_results, readsid_per_cell, ab_map):
     """
     Collapses cell barcodes based on the mapping true_to_false
 
@@ -300,11 +325,13 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map):
         true_to_false (dict): Mapping between the reference and the "mutated" barcodes.
         umis_per_cell (Counter): Counter of number of umis per cell.
         final_results (dict): Dict of dict of Counters with mapping results.
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
         ab_map (dict): Dict of the TAGS.
 
     Returns:
         umis_per_cell (Counter): Counter of number of umis per cell.
         final_results (dict): Same as input but with corrected cell barcodes.
+        readsid_per_cell (dict): Same as input but with corrected cell barcodes.
         corrected_barcodes (int): How many cell barcodes have been corrected.
     """
     print("Collapsing cell barcodes")
@@ -315,6 +342,8 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map):
             final_results[real_barcode] = defaultdict()
             for TAG in ab_map:
                 final_results[real_barcode][TAG] = Counter()
+        if real_barcode not in readsid_per_cell:
+            readsid_per_cell[real_barcode] = []
         for fake_barcode in true_to_false[real_barcode]:
             temp = final_results.pop(fake_barcode)
             corrected_barcodes += 1
@@ -326,11 +355,16 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map):
             umis_per_cell[real_barcode] += temp_umi_counts
             # reads_per_cell[real_barcode] += temp_read_counts
 
-    return (umis_per_cell, final_results, corrected_barcodes)
+            if fake_barcode in readsid_per_cell:
+                temp2 = readsid_per_cell.pop(fake_barcode)
+                readsid_per_cell[real_barcode] += temp2
+
+    return (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes)
 
 
 def correct_cells(
     final_results,
+    readsid_per_cell,
     reads_per_cell,
     umis_per_cell,
     collapsing_threshold,
@@ -342,13 +376,15 @@ def correct_cells(
     
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
         umis_per_cell (Counter): Counter of number of umis per cell.
-        collapsing_threshold (int): Max distance between umis.
+        collapsing_threshold (int): Max distance between cells.
         expected_cells (int): Number of expected cells.
         ab_map (dict): Dict of the TAGS.
     
     Returns:
-        final_results (dict): Same as input but with corrected umis.
+        final_results (dict): Same as input but with corrected cell barcodes.
+        readsid_per_cell (dict): Same as input but with corrected cell barcodes.
         umis_per_cell (Counter): Counter of umis per cell after cell barcode correction
         corrected_umis (int): How many umis have been corrected.
     """
@@ -361,31 +397,39 @@ def correct_cells(
         plotfile_prefix=False,
     )
 
-    (umis_per_cell, final_results, corrected_barcodes) = collapse_cells(
+    (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes) = collapse_cells(
         true_to_false=true_to_false,
         umis_per_cell=umis_per_cell,
         final_results=final_results,
+        readsid_per_cell=readsid_per_cell,
         ab_map=ab_map,
     )
-    return (final_results, umis_per_cell, corrected_barcodes)
+    return (final_results, readsid_per_cell, umis_per_cell, corrected_barcodes)
 
 
 def correct_cells_whitelist(
-    final_results, umis_per_cell, whitelist, collapsing_threshold, ab_map
+    final_results,
+    readsid_per_cell,
+    umis_per_cell,
+    whitelist,
+    collapsing_threshold,
+    ab_map
 ):
     """
     Corrects cell barcodes.
     
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
+        readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids
         umis_per_cell (Counter): Counter of UMIs per cell.
         whitelist (set): The whitelist reference given by the user.
-        collapsing_threshold (int): Max distance between umis.
+        collapsing_threshold (int): Max distance between cell barcodes.
         ab_map (OrederedDict): Tags in an ordered dict.
 
     
     Returns:
-        final_results (dict): Same as input but with corrected umis.
+        final_results (dict): Same as input but with corrected cell barcodes.
+        readsid_per_cell (dict): Same as input but with corrected cell barcodes.
         umis_per_cell (Counter): Updated UMI counts after correction.
         corrected_barcodes (int): How many umis have been corrected.
     """
@@ -403,10 +447,10 @@ def correct_cells_whitelist(
         whitelist=whitelist,
         collapsing_threshold=collapsing_threshold,
     )
-    (umis_per_cell, final_results, corrected_barcodes) = collapse_cells(
-        true_to_false, umis_per_cell, final_results, ab_map
+    (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes) = collapse_cells(
+        true_to_false, umis_per_cell, final_results, readsid_per_cell, ab_map
     )
-    return (final_results, umis_per_cell, corrected_barcodes)
+    return (final_results, readsid_per_cell, umis_per_cell, corrected_barcodes)
 
 
 def find_true_to_false_map(
diff --git a/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz b/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..4dc0f1d5fdc35f1694d987ddff4599d6f794580e
GIT binary patch
literal 2391
zcmV-d38?lTiwFo#8D3=o17mM;a%E$5Us5qHW?^%5aR9YkOS0oO4BWp{WS>Q>zr@wB
zw9t+#ULhyQ+`odREJF4W7!!PxsibUII}kyj(E#cB@$=vRes(VU!yo(M{SkNi&+hhc
z;C0-C>;CEfN~o`Q6obD0{@TQUzJB2AJsF`d9d{#s5BM<du74Wozur*{`oPF@C3hou
zzVyty8(vp(Lovu`Bp!Z`5i1#GBN=2hLJx_54>u`BE|ASgpbtFxeD`h^jWEb&guG2&
zfGgZ08ez}}MuyVHle?2k-1;;!6gK_>eHg-TMkp`?JV6`3-9q>k#h_**-Q#<#6gK4u
zd)O&Nqjy~=?h2!9Yl+7Z=}(}ST`#VMQMSFO_aWspSsBF5XOz)nJN-G33$ByXo>9iM
zzj;i_`x=jxX_EdA^ftOqPJBigci+Pk<ajD8i@f_8Wm^l;hKI|MFRQo~Mj1(rzI=eu
zY4<mo#WKq1v54=I&%NvA+-H>0W8iaoLoa(d_Zelh0=*FM-u1m~C8Mmn?~^m&F}dP1
z%DOMP;`gJnFBK)f#QflMBm{YEl9@lFY;1Ysv3?fWvX#duawyr5*sfQqenwfVKFJa)
zQODtWt?XYYGsDn>VryM5=RTusYq>GZ@DSHh#V`dcAwp;or5|qSL@SIkdkpVMKDa^2
z{fshtj9vshBw3N`KBJ5=mR`{Z(G8;<V;N=CFgK4)vW98&*o|QZIrpm=rl|Uma>Th5
zV@lo6C?k)-84-Hk4RY=?%3Ld{Hwh$!+@R!sMp;!KzMo!>4@LD|DBHf=JT`XXzEpWk
z*Zku*Y5tOx)XHIA;Ko6=l2JA*NH}VL={emf*L_AA83q#%ddfiiBIiD%tO}NLjXFA5
zku^V~tjv$H0qNjIx$ZN{xE7DT6rZ_K*8GgJ_LWd30e$7T$!wWX&3bH->wc9UE5;ZK
zVL<Lra@}WC^HgDy>weW#p;Y(%`5vP&rS4~xy-Ny5%F0RZ_ZijPiBC$uU(<<~)`I6m
z!x-Eo=RTvFvvN{2e@#}FI2pAmH4biCWlqkhW^FmixnHF%OKU-YjgHZ~S?>24W$j+^
zkf?)hmg_#F%sn!dbGY5jO5M+><~i>y=YG|ox7fV~Ql{wV-7M!mqneq2mg{~i^CJ`R
ztk`{7^E1lGV?67d<=k(c^<iKX!GvyBW_=lDoWn5l&&r&?hWU#+8LSSHa*HzS%P1qm
z@X_8P=YG}Mo<4<Jw0^&03Rj$v__Jq-Zc*xfMj1VJv!ZDJ8Y|`b7<~XC>=wE1Gs?P_
zJX2a`Wi1smrBc3NT!cSlMeg?*Wu9GJ8fKCE{VEMpsxLgdSme52HM=M?4Ei}JrBylO
z%P1=+Ljt4Zx>cU@Gs;?DFu*<6(ku1lLYXzp)xWGtzhBe8=>7isP5pjx4s%Y}D8^QK
z&d(_8X~L-+h+$Sa_Zel){o^|2S+(3h+D>_xVOFK?*Dy@+gcp>KjB~3z>tmGlVn<fv
zn=<RGQRDiwXOnZkb=q_5k~X>SH+M-~kvBQ_TPt#^gXyfuZQA+%g=)V-qMY$H-tM?D
z|E8SpH!}Z$<_954Ru;K`$te5oUy-cz`>k_J?kqNK&R@}4l&tu40R@`Bd&tSpu#e23
zP(%;zOE&|KbI^~;7tC<>zvCG8;r@HY%)Rt`yBRp`m(_%H>P6(=J?>}x@2k0BAI906
znUD@8o_ljI{;xQO+04AGW)jV`uEwVmT!bo_nME@Uvzdvws)@|QWwk{#!>|t{?cHf;
zPCc@O%-qHQo6|DPYR1QKIc-zSTrgWer}V&WG+Isf5CfWF<`8)H_GG5hLf{FrIV~pc
z0)>&8UUVA6Y^zDaF5P$~GlRGqhP8)4H-1kD6wHt>u9samiDnpPTn*|w)GDQCH_NNZ
zFlz`r!xc_jln}UJ*1So?dxpTO<jn=MMp{f|I0iDaDUo);tYwY&rk7cueQzSB)VSi^
z^_{Xe7tFYtbl#Sx|J}=Vzqi$7SbKW*!}p|TDFjGK!YAq8vaAIdW=&5PP`%6o)jI7x
z1oouD=~>B)%mNHE)|>M>-{~}23oy(WY3V`F^z3C8V3^Hm`*Zw;vewI5fMM1Uz)Sp!
z!*ws${@z32g4vvwCSx9tOJ+LJX$&(mbn0F3;~Fv}vjD@s!HnDyw3<158jQ!ZH>1MP
z7tE@m(~psG!LS?Tj(}mt5J+kQw=c-dEQdgb8ObJ`w-nQA6fKZp?OI^?CM{6XX*Uf$
z$SlyRq0=k|kxF|rDlCv;*1TcWIm#?hr8-N6AmEsn<OGq;%N4*kGa1&dp+{K@w610+
zVT4G9P8$_1kYPrr-8gNOS)iFZ(@i3DNMuH40ft!vI-Fm}YGf8*Si7<r<+(tWvMD(2
zKIIrU(P>pvj^b#DwZQZcL!e3v6qOAM3AmBW$Q=R0jH`i|fHR2GCYc2oW>uXW(33nD
zs0!#}dOiaczsr+ZfMI53gEE5a=H1JEz+YJ42{USe{W{7ZGl~|-Fl#^`=~por+f8yu
zz%Z+T!ZYsa_n$9U0^if~g0+uSCwVSVHBv2wK>CeGy06vE%5*Tp+U4M^bObeWuo%!W
zU6TUY%vwiq!EBs%AKuTh7N{ED7xRYa0<+u^RLuo`7Xk_k)P%rq#O*B41**hti3N^;
zPCTXOtgt|aSv!J&+dnuZ?q;ncxL`)7`H<X!{$^G>f(&cd(2J}Esx)+Qn8JIr$ShE`
zHzf^?d$PD@1~al2U|4%cu>8S};Kl;WpPLT)BqYvyvnXc-8D<>~?R#F0q6IR{*b$_k
z{NN)pqjUrrW)!z+Msl2aBX<N0GgnVGBX<N<W{Mg*P0VBZyA70%Aj7PA!$o?PJA$er
z{TrvPvKDA|T1+i6E+lrV+z~L$d?1OltkmMJ+Fam*wV&dy%3PqaNS_M%Y}i-%z5v6F
zkrvW8E=@UBnFScuE(cec1*+s=Y5ahUK>vc($XbA5?Ng3To(@(`If{G3l4Da?phj{O
z8}`H_9z}Xnrh^$~Rh=<k8QByTsJSxon>K1wI)cWj`hCi=X<DFS%2ASow<5jCTA->(
zmqOq+yx)|Q!p4LA6hw*T$&4}=$T0Km2UIH$hTaqw$T0g69=eJHe>adk7hsrf=6@hC
J045Aj007>{sFVNz

literal 0
HcmV?d00001

diff --git a/tests/test_data/tags/pass/correct_3.csv b/tests/test_data/tags/pass/correct_3.csv
new file mode 100644
index 0000000..410078b
--- /dev/null
+++ b/tests/test_data/tags/pass/correct_3.csv
@@ -0,0 +1,4 @@
+CGTACGTAGCCTAGC,test1
+CGTAGCTCGAAAAAA,test2
+CGTCGAAGCTGAACG,test3
+CGTCGTAGCTGATCG,test4
diff --git a/tests/test_data/whitelist.csv b/tests/test_data/whitelist.csv
new file mode 100644
index 0000000..c7e3da4
--- /dev/null
+++ b/tests/test_data/whitelist.csv
@@ -0,0 +1,2 @@
+TACATATTCTTTACTG
+TAGAGGGAAGTCAAGC
diff --git a/tests/test_data/whitelist_single.csv b/tests/test_data/whitelist_single.csv
new file mode 100644
index 0000000..df59693
--- /dev/null
+++ b/tests/test_data/whitelist_single.csv
@@ -0,0 +1 @@
+TACATATTCTTTACTG
diff --git a/tests/test_processing.py b/tests/test_processing.py
index 63eea35..90b04bc 100644
--- a/tests/test_processing.py
+++ b/tests/test_processing.py
@@ -147,7 +147,7 @@ def test_find_best_match_with_3_distance_reverse(data):
     'test_find_best_match_with_3_distance',
     'test_find_best_match_with_3_distance_reverse',])
 def test_classify_reads_multi_process(data):
-    (results, no_match) = processing.map_reads(
+    (results, reads_cells, no_match) = processing.map_reads(
         pytest.correct_R1_path,
         pytest.correct_R2_path,
         pytest.tags,