From 12d75278c7ca660f54ddd03d759816b8ac1a44f1 Mon Sep 17 00:00:00 2001 From: Lucille Delisle Date: Thu, 23 Nov 2023 17:04:03 +0100 Subject: [PATCH] add --store-read-ids and --read-ids-whitelist options --- cite_seq_count/__main__.py | 77 ++++++++++++++-- cite_seq_count/io.py | 22 +++++ cite_seq_count/processing.py | 82 ++++++++++++++---- .../correct_R1_with_cell_barcode_mm.fastq.gz | Bin 0 -> 2391 bytes tests/test_data/tags/pass/correct_3.csv | 4 + tests/test_data/whitelist.csv | 2 + tests/test_data/whitelist_single.csv | 1 + tests/test_processing.py | 2 +- 8 files changed, 162 insertions(+), 28 deletions(-) create mode 100644 tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz create mode 100644 tests/test_data/tags/pass/correct_3.csv create mode 100644 tests/test_data/whitelist.csv create mode 100644 tests/test_data/whitelist_single.csv diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index bf1c3ba..06c4acf 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -203,6 +203,35 @@ def get_args(): help=("Allow for a sliding window when aligning."), ) + # Read ID output + readids = parser.add_argument_group( + "Read IDs", description=("parameters to get read ids per cell") + ) + + readids.add_argument( + "--store-read-ids", + dest="store_read_ids", + required=False, + default=False, + action="store_true", + help=("Write read ids per cell to file."), + ) + + readids.add_argument( + "--read-ids-whitelist", + dest="read_ids_whitelist", + required=False, + type=str, + help=( + "A csv file containning a whitelist of barcodes for which" + " read ids should be written to files.\n\n" + "\tExample:\n" + "\tATGCTAGTGCTA\n\tGCTAGTCAGGAT\n\tCGACTGCTAACG\n\n" + "Or 10X-style:\n" + "\tATGCTAGTGCTA-1\n\tGCTAGTCAGGAT-1\n\tCGACTGCTAACG-1\n" + ), + ) + # Remaining arguments. parser.add_argument( "-T", @@ -380,6 +409,15 @@ def main(): else: whitelist = False + if args.read_ids_whitelist and args.store_read_ids: + (read_ids_whitelist, _) = preprocessing.parse_whitelist_csv( + filename=args.read_ids_whitelist, + barcode_length=args.cb_last - args.cb_first + 1, + collapsing_threshold=args.bc_threshold, + ) + else: + read_ids_whitelist = False + # Load TAGs/ABs. ab_map = preprocessing.parse_tags_csv(args.tags) ab_map = preprocessing.check_tags(ab_map, args.max_error) @@ -415,6 +453,7 @@ def main(): # Initialize the counts dicts that will be generated from each input fastq pair final_results = defaultdict(lambda: defaultdict(Counter)) + readsid_per_cell = {} umis_per_cell = Counter() reads_per_cell = Counter() merged_no_match = Counter() @@ -437,7 +476,7 @@ def main(): # Run with one process if n_threads <= 1 or n_reads < 1000001: print("CITE-seq-Count is running with one core.") - (_final_results, _merged_no_match) = processing.map_reads( + (_final_results, _readsid_per_cell, _merged_no_match) = processing.map_reads( read1_path=read1_path, read2_path=read2_path, tags=ab_map, @@ -449,6 +488,7 @@ def main(): start_trim=args.start_trim, maximum_distance=args.max_error, sliding_window=args.sliding_window, + store_read_ids = args.store_read_ids, ) print("Mapping done") _umis_per_cell = Counter() @@ -480,6 +520,7 @@ def main(): args.start_trim, args.max_error, args.sliding_window, + args.store_read_ids, ), callback=parallel_results.append, error_callback=sys.stderr, @@ -491,6 +532,7 @@ def main(): ( _final_results, + _readsid_per_cell, _umis_per_cell, _reads_per_cell, _merged_no_match, @@ -511,6 +553,10 @@ def main(): else: # Explicitly save the counter to that tag final_results[cell_barcode][tag] = _final_results[cell_barcode][tag] + for cell_barcode in _readsid_per_cell: + if cell_barcode not in readsid_per_cell: + readsid_per_cell[cell_barcode] = [] + readsid_per_cell[cell_barcode] += _readsid_per_cell[cell_barcode] ordered_tags_map = OrderedDict() for i, tag in enumerate(ab_map.values()): ordered_tags_map[tag] = i @@ -531,10 +577,12 @@ def main(): if not whitelist: ( final_results, + final_readsid, umis_per_cell, bcs_corrected, ) = processing.correct_cells( final_results=final_results, + readsid_per_cell=readsid_per_cell, reads_per_cell=reads_per_cell, umis_per_cell=umis_per_cell, expected_cells=args.expected_cells, @@ -544,10 +592,12 @@ def main(): else: ( final_results, + final_readsid, umis_per_cell, bcs_corrected, ) = processing.correct_cells_whitelist( final_results=final_results, + readsid_per_cell=readsid_per_cell, umis_per_cell=umis_per_cell, whitelist=whitelist, collapsing_threshold=args.bc_threshold, @@ -600,13 +650,14 @@ def main(): ) # Write uncorrected cells to dense output - io.write_dense( - sparse_matrix=umi_aberrant_matrix, - index=list(ordered_tags_map.keys()), - columns=aberrant_cells, - outfolder=os.path.join(args.outfolder, "uncorrected_cells"), - filename="dense_umis.tsv", - ) + if (len(aberrant_cells) > 0): + io.write_dense( + sparse_matrix=umi_aberrant_matrix, + index=list(ordered_tags_map.keys()), + columns=aberrant_cells, + outfolder=os.path.join(args.outfolder, "uncorrected_cells"), + filename="dense_umis.tsv", + ) # Create sparse matrices for results (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices( @@ -665,6 +716,16 @@ def main(): outfolder=args.outfolder, filename="dense_umis.tsv", ) + + # Write read ids to disk if requested + if args.store_read_ids: + print("Writting read ids") + io.write_read_ids( + readsid_per_cell = final_readsid, + read_ids_whitelist = read_ids_whitelist, + outfolder=args.outfolder, + subfolder="read_ids" + ) if __name__ == "__main__": diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py index 2dc04f0..ed8a0cc 100644 --- a/cite_seq_count/io.py +++ b/cite_seq_count/io.py @@ -66,3 +66,25 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename): unknown_file.write('tag,count\n') for element in top_unmapped: unknown_file.write('{},{}\n'.format(element[0],element[1])) + + +def write_read_ids(readsid_per_cell, read_ids_whitelist, outfolder, subfolder): + """ + Writes a file per cell in outfolder/subfolder + + Args: + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids + read_ids_whitelist (set): The set of barcodes for which read ids should be written + outfolder (string): Path of the output folder + subfolder (string): Name of the output subfolder + """ + # Create the folder: + prefix = os.path.join(outfolder, subfolder) + os.makedirs(prefix, exist_ok=True) + + # Loop for each cell_id + for cell_barcode in readsid_per_cell: + if not read_ids_whitelist or cell_barcode in read_ids_whitelist: + with open(os.path.join(prefix, f"{cell_barcode}.txt"), 'w') as fo: + fo.write("\n".join(readsid_per_cell[cell_barcode])) + fo.write("\n") diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py index 57b35b7..66bac91 100644 --- a/cite_seq_count/processing.py +++ b/cite_seq_count/processing.py @@ -97,6 +97,7 @@ def map_reads( start_trim, maximum_distance, sliding_window, + store_read_ids, ): """Read through R1/R2 files and generate a islice starting at a specific index. @@ -117,13 +118,16 @@ def map_reads( start_trim (int): Number of bases to trim at the start. maximum_distance (int): Maximum distance given by the user. sliding_window (bool): A bool enabling a sliding window search + store_read_ids (bool): A bool enabling storage of read ids Returns: results (dict): A dict of dict of Counters with the mapping results. + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids no_match (Counter): A counter with unmapped sequences. """ # Initiate values results = {} + readsid_per_cell = {} no_match = Counter() n = 1 t = time.time() @@ -131,11 +135,18 @@ def map_reads( read2_path, "rt" ) as textfile2: - # Read all 2nd lines from 4 line chunks. If first_n not None read only 4 times the given amount. - secondlines = islice( - zip(textfile1, textfile2), indexes[0] * 4 + 1, indexes[1] * 4 + 1, 4 + # Read all lines. If first_n not None read only 4 times the given amount. + lines = islice( + zip(textfile1, textfile2), indexes[0] * 4, indexes[1] * 4 + 1 ) - for read1, read2 in secondlines: + for i, (read1, read2) in enumerate(lines): + if i % 4 == 0: + # This is the read_id: + readid = read1.strip().split()[0] + continue + elif i % 4 in [2, 3]: + # This is '+' or qual + continue read1 = read1.strip() read2 = read2.strip() @@ -159,6 +170,9 @@ def map_reads( if cell_barcode not in results: results[cell_barcode] = defaultdict(Counter) + if store_read_ids: + if cell_barcode not in readsid_per_cell: + readsid_per_cell[cell_barcode] = [] if sliding_window: best_match = find_best_match_shift(TAG_seq, tags, maximum_distance) @@ -166,6 +180,8 @@ def map_reads( best_match = find_best_match(TAG_seq, tags, maximum_distance) results[cell_barcode][best_match][UMI] += 1 + if store_read_ids: + readsid_per_cell[cell_barcode].append(readid) if best_match == "unmapped": no_match[TAG_seq] += 1 @@ -193,7 +209,7 @@ def map_reads( "Mapping done for process {}. Processed {:,} reads".format(os.getpid(), n - 1) ) sys.stdout.flush() - return (results, no_match) + return (results, readsid_per_cell, no_match) def merge_results(parallel_results): @@ -204,17 +220,20 @@ def merge_results(parallel_results): Returns: merged_results (dict): Results combined as a dict of dict of Counters + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids umis_per_cell (Counter): Total umis per cell as a Counter reads_per_cell (Counter): Total reads per cell as a Counter merged_no_match (Counter): Unmapped tags as a Counter """ merged_results = {} + readsid_per_cell = {} merged_no_match = Counter() umis_per_cell = Counter() reads_per_cell = Counter() for chunk in parallel_results: mapped = chunk[0] - unmapped = chunk[1] + reads = chunk[1] + unmapped = chunk[2] for cell_barcode in mapped: if cell_barcode not in merged_results: merged_results[cell_barcode] = defaultdict(Counter) @@ -227,8 +246,14 @@ def merge_results(parallel_results): ][UMI] umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG]) reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI] + for cell_barcode in reads: + if cell_barcode not in readsid_per_cell: + readsid_per_cell[cell_barcode] = [] + readsid_per_cell[cell_barcode] += reads[cell_barcode] + + merged_no_match.update(unmapped) - return (merged_results, umis_per_cell, reads_per_cell, merged_no_match) + return (merged_results, readsid_per_cell, umis_per_cell, reads_per_cell, merged_no_match) def correct_umis(final_results, collapsing_threshold, top_cells, max_umis): @@ -292,7 +317,7 @@ def update_umi_counts(UMIclusters, cell_tag_counts): return (cell_tag_counts, temp_corrected_umis) -def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map): +def collapse_cells(true_to_false, umis_per_cell, final_results, readsid_per_cell, ab_map): """ Collapses cell barcodes based on the mapping true_to_false @@ -300,11 +325,13 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map): true_to_false (dict): Mapping between the reference and the "mutated" barcodes. umis_per_cell (Counter): Counter of number of umis per cell. final_results (dict): Dict of dict of Counters with mapping results. + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids ab_map (dict): Dict of the TAGS. Returns: umis_per_cell (Counter): Counter of number of umis per cell. final_results (dict): Same as input but with corrected cell barcodes. + readsid_per_cell (dict): Same as input but with corrected cell barcodes. corrected_barcodes (int): How many cell barcodes have been corrected. """ print("Collapsing cell barcodes") @@ -315,6 +342,8 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map): final_results[real_barcode] = defaultdict() for TAG in ab_map: final_results[real_barcode][TAG] = Counter() + if real_barcode not in readsid_per_cell: + readsid_per_cell[real_barcode] = [] for fake_barcode in true_to_false[real_barcode]: temp = final_results.pop(fake_barcode) corrected_barcodes += 1 @@ -326,11 +355,16 @@ def collapse_cells(true_to_false, umis_per_cell, final_results, ab_map): umis_per_cell[real_barcode] += temp_umi_counts # reads_per_cell[real_barcode] += temp_read_counts - return (umis_per_cell, final_results, corrected_barcodes) + if fake_barcode in readsid_per_cell: + temp2 = readsid_per_cell.pop(fake_barcode) + readsid_per_cell[real_barcode] += temp2 + + return (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes) def correct_cells( final_results, + readsid_per_cell, reads_per_cell, umis_per_cell, collapsing_threshold, @@ -342,13 +376,15 @@ def correct_cells( Args: final_results (dict): Dict of dict of Counters with mapping results. + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids umis_per_cell (Counter): Counter of number of umis per cell. - collapsing_threshold (int): Max distance between umis. + collapsing_threshold (int): Max distance between cells. expected_cells (int): Number of expected cells. ab_map (dict): Dict of the TAGS. Returns: - final_results (dict): Same as input but with corrected umis. + final_results (dict): Same as input but with corrected cell barcodes. + readsid_per_cell (dict): Same as input but with corrected cell barcodes. umis_per_cell (Counter): Counter of umis per cell after cell barcode correction corrected_umis (int): How many umis have been corrected. """ @@ -361,31 +397,39 @@ def correct_cells( plotfile_prefix=False, ) - (umis_per_cell, final_results, corrected_barcodes) = collapse_cells( + (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes) = collapse_cells( true_to_false=true_to_false, umis_per_cell=umis_per_cell, final_results=final_results, + readsid_per_cell=readsid_per_cell, ab_map=ab_map, ) - return (final_results, umis_per_cell, corrected_barcodes) + return (final_results, readsid_per_cell, umis_per_cell, corrected_barcodes) def correct_cells_whitelist( - final_results, umis_per_cell, whitelist, collapsing_threshold, ab_map + final_results, + readsid_per_cell, + umis_per_cell, + whitelist, + collapsing_threshold, + ab_map ): """ Corrects cell barcodes. Args: final_results (dict): Dict of dict of Counters with mapping results. + readsid_per_cell (dict): A dict of array where keys are cells and array contains read ids umis_per_cell (Counter): Counter of UMIs per cell. whitelist (set): The whitelist reference given by the user. - collapsing_threshold (int): Max distance between umis. + collapsing_threshold (int): Max distance between cell barcodes. ab_map (OrederedDict): Tags in an ordered dict. Returns: - final_results (dict): Same as input but with corrected umis. + final_results (dict): Same as input but with corrected cell barcodes. + readsid_per_cell (dict): Same as input but with corrected cell barcodes. umis_per_cell (Counter): Updated UMI counts after correction. corrected_barcodes (int): How many umis have been corrected. """ @@ -403,10 +447,10 @@ def correct_cells_whitelist( whitelist=whitelist, collapsing_threshold=collapsing_threshold, ) - (umis_per_cell, final_results, corrected_barcodes) = collapse_cells( - true_to_false, umis_per_cell, final_results, ab_map + (umis_per_cell, final_results, readsid_per_cell, corrected_barcodes) = collapse_cells( + true_to_false, umis_per_cell, final_results, readsid_per_cell, ab_map ) - return (final_results, umis_per_cell, corrected_barcodes) + return (final_results, readsid_per_cell, umis_per_cell, corrected_barcodes) def find_true_to_false_map( diff --git a/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz b/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..4dc0f1d5fdc35f1694d987ddff4599d6f794580e GIT binary patch literal 2391 zcmV-d38?lTiwFo#8D3=o17mM;a%E$5Us5qHW?^%5aR9YkOS0oO4BWp{WS>Q>zr@wB zw9t+#ULhyQ+`odREJF4W7!!PxsibUII}kyj(E#cB@$=vRes(VU!yo(M{SkNi&+hhc z;C0-C>;CEfN~o`Q6obD0{@TQUzJB2AJsF`d9d{#s5BMu`BE|ASgpbtFxeD`h^jWEb&guG2& zfGgZ08ez}}MuyVHle?2k-1;;!6gK_>eHg-TMkp`?JV6`3-9q>k#h_**-Q#<#6gK4u zd)O&Nqjy~=?h2!9Yl+7Z=}(}ST`#VMQMSFO_aWspSsBF5XOz)nJN-G33$ByXo>9iM zzj;i_`x=jxX_EdA^ftOqPJBigci+Pk0W8iaoLoa(d_Zelh0=*FM-u1m~C8Mmn?~^m&F}dP1 z%DOMP;`gJnFBK)f#QflMBm{YEl9@lFY;1Ysv3?fWvX#duawyr5*sfQqenwfVKFJa) zQODtWt?XYYGsDn>VryM5=RTusYq>GZ@DSHh#V`dcAwp;or5|qSL@SIkdkpVMKDa^2 z{fshtj9vshBw3N`KBJ5=mR`{Z(G8;Th5 zV@lo6C?k)-84-Hk4RY=?%3Ld{Hwh$!+@R!sMp;!KzMo!>4@LD|DBHf=JT`XXzEpWk z*Zku*Y5tOx)XHIA;Ko6=l2JA*NH}VL={emf*L_AA83q#%ddfiiBIiD%tO}NLjXFA5 zku^V~tjv$H0qNjIx$ZN{xE7DT6rZ_K*8GgJ_LWd30e$7T$!wWX&3bH->wc9UE5;ZK zVLweW#p;Y(%`5vP&rS4~xy-Ny5%F0RZ_ZijPiBC$uU(<<~)`I6m z!x-Eo=RTvFvvN{2e@#}FI2pAmH4biCWlqkhW^FmixnHF%OKU-YjgHZ~S?>24W$j+^ zkf?)hmg_#F%sn!dbGY5jO5M+><~i>y=YG|ox7fV~Ql{wV-7M!mqneq2mg{~i^CJ`R ztk`{7^E1lGV?67d<=k(c^=wE1Gs?P_ zJX2a`Wi1smrBc3NT!cSlMeg?*Wu9GJ8fKCE{VEMpsxLgdSme52HM=M?4Ei}JrBylO z%P1=+Ljt4Zx>cU@Gs;?DFu*<6(ku1lLYXzp)xWGtzhBe8=>7isP5pjx4s%Y}D8^QK z&d(_8X~L-+h+$Sa_Zel){o^|2S+(3h+D>_xVOFK?*Dy@+gcp>KjB~3z>tmGlVnSH+M-~kvBQ_TPt#^gXyfuZQA+%g=)V-qMY$H-tM?D z|E8SpH!}Z$<_954Ru;K`$te5oUy-cz`>k_J?kqNK&R@}4l&tu40R@`Bd&tSpu#e23 zP(%;zOE&|KbI^~;7tC<>zvCG8;r@HY%)Rt`yBRp`m(_%H>P6(=J?>}x@2k0BAI906 znUD@8o_ljI{;xQO+04AGW)jV`uEwVmT!bo_nME@Uvzdvws)@|QWwk{#!>|t{?cHf; zPCc@O%-qHQo6|DPYR1QKIc-zSTrgWer}V&WG+Isf5CfWF<`8)H_GG5hLf{FrIV~pc z0)>&8UUVA6Y^zDaF5P$~GlRGqhP8)4H-1kD6wHt>u9samiDnpPTn*|w)GDQCH_NNZ zFlz`r!xc_jln}UJ*1So?dxpTOB)%mNHE)|>M>-{~}23oy(WY3V`F^z3C8V3^Hm`*Zw;vewI5fMM1Uz)Sp! z!*ws${@z32g4vvwCSx9tOJ+LJX$&(mbn0F3;~Fv}vjD@s!HnDyw3<158jQ!ZH>1MP z7tE@m(~psG!LS?Tj(}mt5J+kQw=c-dEQdgb8ObJ`w-nQA6fKZp?OI^?CM{6XX*Uf$ z$SlyRq0=k|kxF|rDlCv;*1TcWIm#?hr8-N6AmEsnpvj^b#DwZQZcL!e3v6qOAM3AmBW$Q=R0jH`i|fHR2GCYc2oW>uXW(33nD zs0!#}dOiaczsr+ZfMI53gEE5a=H1JEz+YJ42{USe{W{7ZGl~|-Fl#^`=~por+f8yu zz%Z+T!ZYsa_n$9U0^if~g0+uSCwVSVHBv2wK>CeGy06vE%5*Tp+U4M^bObeWuo%!W zU6TUY%vwiq!EBs%AKuTh7N{ED7xRYa0<+u^RLuo`7Xk_k)P%rq#O*B41**hti3N^; zPCTXOtgt|aSv!J&+dnuZ?q;ncxL`)7`Hf(&cd(2J}Esx)+Qn8JIr$ShE` zHzf^?d$PD@1~al2U|4%cu>8S};Kl;WpPLT)BqYvyvnXc-8D<>~?R#F0q6IR{*b$_k z{NN)pqjUrrW)!z+Msl2aBXvc($XbA5?Ng3To(@(`If{G3l4Da?phj{O z8}`H_9z}Xnrh^$~Rh=K1wI)cWj`hCi=X( zmqOq+yx)|Q!p4LA6hw*T$&4}=$T0Km2UIH$hTaqw$T0g69=eJHe>adk7hsrf=6@hC J045Aj007>{sFVNz literal 0 HcmV?d00001 diff --git a/tests/test_data/tags/pass/correct_3.csv b/tests/test_data/tags/pass/correct_3.csv new file mode 100644 index 0000000..410078b --- /dev/null +++ b/tests/test_data/tags/pass/correct_3.csv @@ -0,0 +1,4 @@ +CGTACGTAGCCTAGC,test1 +CGTAGCTCGAAAAAA,test2 +CGTCGAAGCTGAACG,test3 +CGTCGTAGCTGATCG,test4 diff --git a/tests/test_data/whitelist.csv b/tests/test_data/whitelist.csv new file mode 100644 index 0000000..c7e3da4 --- /dev/null +++ b/tests/test_data/whitelist.csv @@ -0,0 +1,2 @@ +TACATATTCTTTACTG +TAGAGGGAAGTCAAGC diff --git a/tests/test_data/whitelist_single.csv b/tests/test_data/whitelist_single.csv new file mode 100644 index 0000000..df59693 --- /dev/null +++ b/tests/test_data/whitelist_single.csv @@ -0,0 +1 @@ +TACATATTCTTTACTG diff --git a/tests/test_processing.py b/tests/test_processing.py index 63eea35..90b04bc 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -147,7 +147,7 @@ def test_find_best_match_with_3_distance_reverse(data): 'test_find_best_match_with_3_distance', 'test_find_best_match_with_3_distance_reverse',]) def test_classify_reads_multi_process(data): - (results, no_match) = processing.map_reads( + (results, reads_cells, no_match) = processing.map_reads( pytest.correct_R1_path, pytest.correct_R2_path, pytest.tags,