-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplisoquant.py
More file actions
executable file
·1206 lines (1038 loc) · 63.5 KB
/
splisoquant.py
File metadata and controls
executable file
·1206 lines (1038 loc) · 63.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#
# ############################################################################
# Copyright (c) 2022-2026 University of Helsinki
# Copyright (c) 2019-2022 Saint Petersburg State University
# # All Rights Reserved
# See file LICENSE for details.
############################################################################
import argparse
import glob
import json
import logging
import os.path
import pickle
import shutil
import sys
import time
import gzip
from collections import namedtuple
from io import StringIO
from traceback import print_exc
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
import pysam
import gffutils
import pyfaidx
from src.error_codes import IsoQuantExitCode
from src.modes import IsoQuantMode, ISOQUANT_MODES
from src.gtf2db import convert_gtf_to_db
from src.read_mapper import (
DATA_TYPE_ALIASES,
SUPPORTED_STRANDEDNESS,
SUPPORTED_ALIGNERS,
ASSEMBLY,
PACBIO_CCS_DATA,
NANOPORE_DATA,
DataSetReadMapper
)
from src.alignment_processor import PolyATrimmed
from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, GroupedOutputFormat, NormalizationMethod
from src.input_data_storage import InputDataStorage, InputDataType
from src.multimap_resolver import MultimapResolvingStrategy
from src.stats import combine_counts
from src.barcode_calling import process_single_thread, process_in_parallel, get_umi_length
from src.common import setup_worker_logging, _get_log_params
# Module-wide logger; level and handlers are configured by the entry point.
logger = logging.getLogger('IsoQuant')
# Large output file types for --large_output option; "none" disables them all
LARGE_OUTPUT_TYPES = ["read_assignments", "corrected_bed", "read2transcripts", "allinfo", "none"]
def bool_str(s):
    """Convert a textual flag ('true'/'false'/'1'/'0', any case) to a bool.

    Raises ValueError for any other value, which argparse reports to the
    user as an invalid argument.
    """
    normalized = s.lower()
    if normalized not in {'false', 'true', '0', '1'}:
        raise ValueError('Not a valid boolean string')
    return normalized in {'true', '1'}
def parse_args(cmd_args=None, namespace=None):
    """Build the IsoQuant command-line parser and parse the given arguments.

    Parameters:
        cmd_args: list of argument strings; defaults to sys.argv[1:].
        namespace: optional argparse.Namespace to populate.

    Returns:
        (args, parser) tuple. Also creates the output folder if missing and
        stamps args with _cmd_line, _version and output_exists.
    """
    if cmd_args is None:
        # The membership test for '--full_help' below used to crash with a
        # TypeError when cmd_args was None (the declared default). argparse
        # would fall back to sys.argv[1:] anyway, so make that explicit.
        cmd_args = sys.argv[1:]
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)

    ref_args_group = parser.add_argument_group('Reference data')
    input_args_group = parser.add_argument_group('Input data')
    output_args_group = parser.add_argument_group('Output naming')
    pipeline_args_group = parser.add_argument_group('Pipeline options')
    algo_args_group = parser.add_argument_group('Algorithm settings')
    output_setup_args_group = parser.add_argument_group('Output configuration')
    align_args_group = parser.add_argument_group('Aligner settings')
    filer_args_group = parser.add_argument_group('Read filtering options')
    sc_args_group = parser.add_argument_group('Single-cell/spatial-related options:')
    other_options = parser.add_argument_group("Additional options:")

    show_full_help = '--full_help' in cmd_args

    def add_additional_option(*args, **kwargs):  # shown in help only with --full_help
        if not show_full_help:
            kwargs['help'] = argparse.SUPPRESS
        other_options.add_argument(*args, **kwargs)

    def add_additional_option_to_group(opt_group, *args, **kwargs):  # shown in help only with --full_help
        if not show_full_help:
            kwargs['help'] = argparse.SUPPRESS
        opt_group.add_argument(*args, **kwargs)

    # NOTE: the original defined this helper twice with identical bodies;
    # the dead duplicate has been removed.
    def add_option_to_group(opt_group, *args, **kwargs):  # always shown in help
        opt_group.add_argument(*args, **kwargs)

    def add_hidden_option(*args, **kwargs):  # never shown in help
        kwargs['help'] = argparse.SUPPRESS
        parser.add_argument(*args, **kwargs)

    parser.add_argument("--full_help", action='help', help="show full list of options")
    parser.add_argument("--test", action=TestMode, nargs=0, help="run IsoQuant on toy dataset")
    add_hidden_option('--debug', action='store_true', default=False,
                      help='Debug log output.')

    # OUTPUT NAMING
    output_args_group.add_argument("--output", "-o",
                                   help="output folder, will be created automatically "
                                        "[default=isoquant_output]",
                                   type=str, default="isoquant_output")
    output_args_group.add_argument('--prefix', '-p', type=str,
                                   help='experiment name; to be used for folder and file naming; default is OUT',
                                   default="OUT")
    output_args_group.add_argument('--labels', '-l', nargs='+', type=str,
                                   help='sample/replica labels to be used as column names; input file names are used '
                                        'if not set; must be equal to the number of input files given via --fastq/--bam')

    # REFERENCE
    ref_args_group.add_argument("--reference", "-r", help="reference genome in FASTA format (can be gzipped)",
                                type=str)
    ref_args_group.add_argument("--genedb", "-g", help="gene database in gffutils DB format or GTF/GFF "
                                                       "format (optional)", type=str)
    ref_args_group.add_argument('--complete_genedb', action='store_true', default=False,
                                help="use this flag if gene annotation contains transcript and gene metafeatures, "
                                     "e.g. with official annotations, such as GENCODE; "
                                     "speeds up gene database conversion")
    add_additional_option_to_group(ref_args_group, "--discard_chr", nargs="+", help="chromosome IDs to ignore", type=str, default=[])
    add_additional_option_to_group(ref_args_group, "--process_only_chr", nargs="+", help="chromosome IDs to process",
                                   type=str, default=None)
    add_additional_option_to_group(ref_args_group, "--index", help="genome index for specified aligner (optional)",
                                   type=str)

    # INPUT READS
    input_args = input_args_group.add_mutually_exclusive_group()
    input_args.add_argument('--bam', nargs='+', type=str,
                            help='sorted and indexed BAM file(s), each file will be treated as a separate sample')
    input_args.add_argument('--fastq', nargs='+', type=str,
                            help='input FASTQ/FASTA file(s) with reads, each file will be treated as a separate sample; '
                                 'reference genome should be provided when using reads as input')
    input_args.add_argument('--unmapped_bam', nargs='+', type=str,
                            help='unmapped BAM file(s), each file will be treated as a separate sample')
    input_args.add_argument('--yaml', type=str, help='yaml file containing all input files, one entry per sample'
                                                     ', check readme for format info')
    input_args_group.add_argument('--illumina_bam', nargs='+', type=str,
                                  help='sorted and indexed file(s) with Illumina reads from the same sample')
    input_args_group.add_argument("--read_group", nargs='+', type=str,
                                  help="one or more ways to group feature counts (no grouping by default); "
                                       "multiple grouping strategies can be specified (space-separated); "
                                       "supported formats: "
                                       "tag:TAG (BAM tag), "
                                       "file:FILE:READ_COL:GROUP_COL(S):DELIM (TSV file, use comma-separated columns for multi-column grouping, e.g., file:table.tsv:0:1,2,3), "
                                       "read_id:DELIM (read ID suffix), "
                                       "file_name (original filename), "
                                       "barcode_spot (map barcodes to spots/cell types using --barcode2spot), "
                                       "barcode_barcode (map barcodes to spots using --barcode2barcode), "
                                       "barcode (group by barcode from --barcoded_reads)")
    add_additional_option_to_group(input_args_group, "--read_assignments", nargs='+', type=str,
                                   help="reuse read assignments (binary format)", default=None)

    # INPUT PROPERTIES
    input_args_group.add_argument("--data_type", "-d", type=str, choices=DATA_TYPE_ALIASES.keys(),
                                  help="type of data to process, supported types are: " + ", ".join(DATA_TYPE_ALIASES.keys()))
    input_args_group.add_argument('--stranded', type=str, help="reads strandness type, supported values are: " +
                                                               ", ".join(SUPPORTED_STRANDEDNESS), default="none")
    input_args_group.add_argument('--polya_trimmed', default=PolyATrimmed.none.name, type=str,
                                  choices=[e.name for e in PolyATrimmed],
                                  help="define reads which had polyA tail trimmed")
    input_args_group.add_argument('--fl_data', action='store_true', default=False,
                                  help="reads represent FL transcripts; both ends of the read are considered to be reliable")

    # SC ARGUMENTS
    add_option_to_group(sc_args_group, "--mode", "-m", type=str, choices=ISOQUANT_MODES,
                        help="IsoQuant modes: " + ", ".join(ISOQUANT_MODES) +
                             "; default:%s" % IsoQuantMode.bulk.name, default=IsoQuantMode.bulk.name)
    add_option_to_group(sc_args_group, '--barcode_whitelist', type=str, nargs='+',
                        help='file(s) with barcode whitelist(s) for barcode calling')
    add_option_to_group(sc_args_group, "--barcoded_reads", type=str, nargs='+',
                        help='TSV file(s) with barcoded reads; barcodes will be called automatically if not provided')
    add_option_to_group(sc_args_group, "--barcoded_bam", action='store_true', default=False,
                        help='extract barcodes and UMIs from BAM tags (CB/UB by default); bypasses barcode calling')
    add_additional_option_to_group(sc_args_group, "--barcode_tag", type=str, default="CB",
                                   help='BAM tag for cell barcode (default: CB)')
    add_additional_option_to_group(sc_args_group, "--umi_tag", type=str, default="UB",
                                   help='BAM tag for UMI (default: UB)')
    add_additional_option_to_group(sc_args_group, "--strip_barcode_suffix", action='store_true', default=False,
                                   help='remove suffix after dash from barcodes extracted from BAM tag (e.g. ACGT-1 -> ACGT)')
    add_option_to_group(sc_args_group, "--barcode2spot", type=str,
                        help='TSV file mapping barcode to cell type / spot id. '
                             'Format: file.tsv or file.tsv:barcode_col:spot_cols '
                             '(e.g., file.tsv:0:1,2,3 for multiple spot columns)')
    add_option_to_group(sc_args_group, "--barcode2barcode", type=str,
                        help='TSV file mapping barcode to spot IDs for UMI deduplication; '
                             'format: file.tsv or file.tsv:barcode_col:spot_cols')
    add_option_to_group(sc_args_group, "--molecule", type=str,
                        help='molecule definition file (MDF) for custom_sc mode; '
                             'defines molecule structure for universal barcode extraction')

    # ALGORITHM
    add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str,
                                   help="report novel monoexonic transcripts (true/false), "
                                        "default: false for ONT, true for other data types")
    add_additional_option_to_group(algo_args_group, "--report_canonical", type=str,
                                   choices=[e.name for e in StrandnessReportingLevel],
                                   help="reporting level for novel transcripts based on canonical splice sites;"
                                        " default: " + StrandnessReportingLevel.auto.name,
                                   default=StrandnessReportingLevel.auto.name)
    add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str,
                                   choices=[e.name for e in PolyAUsageStrategies],
                                   help="require polyA tails to be present when reporting transcripts; "
                                        "default: auto (requires polyA only when polyA percentage is >= 70%%)",
                                   default=PolyAUsageStrategies.auto.name)
    add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES,
                                   help="transcript quantification strategy", type=str,
                                   default=CountingStrategy.unique_only.name)
    add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES,
                                   help="gene quantification strategy", type=str,
                                   default=CountingStrategy.unique_splicing_consistent.name)
    add_additional_option_to_group(algo_args_group, "--matching_strategy",
                                   choices=["exact", "precise", "default", "loose"],
                                   help="read-to-isoform matching strategy from the most strict to least",
                                   type=str, default=None)
    add_additional_option_to_group(algo_args_group, "--splice_correction_strategy",
                                   choices=["none", "default_pacbio", "default_ont",
                                            "conservative_ont", "all", "assembly"],
                                   help="read alignment correction strategy to use", type=str, default=None)
    add_additional_option_to_group(algo_args_group, "--model_construction_strategy",
                                   choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio",
                                            "default_ont", "sensitive_ont", "all", "assembly"],
                                   help="transcript model construction strategy to use", type=str, default=None)
    add_additional_option_to_group(algo_args_group, "--delta", type=int, default=None,
                                   help="delta for inexact splice junction comparison, chosen automatically based on data type")
    add_additional_option_to_group(algo_args_group, "--use_replicas", type=bool_str, default=True,
                                   help="require novel transcripts to be confirmed by multiple files "
                                        "when file_name grouping is used (default: true)")

    # PIPELINE STEPS
    # default was the string "16"; argparse converted it through type=int at
    # parse time, so a plain int is equivalent and clearer
    pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
                                     default=16)
    resume_args = pipeline_args_group.add_mutually_exclusive_group()
    resume_args.add_argument("--resume", action="store_true", default=False,
                             help="resume failed run, specify output folder, input options are not allowed")
    resume_args.add_argument("--force", action="store_true", default=False,
                             help="force to overwrite the previous run")
    add_additional_option_to_group(pipeline_args_group, '--clean_start', action='store_true', default=False,
                                   help='Do not use previously generated index, feature db or alignments.')
    add_additional_option_to_group(pipeline_args_group, "--no_model_construction", action="store_true",
                                   default=False, help="run only read assignment and quantification")
    add_additional_option_to_group(pipeline_args_group, "--run_aligner_only", action="store_true", default=False,
                                   help="align reads to reference without running further analysis")
    add_additional_option_to_group(pipeline_args_group, "--no_gtf_check", help="do not perform GTF checks",
                                   dest="gtf_check",
                                   action='store_false', default=True)
    add_additional_option_to_group(pipeline_args_group, "--high_memory",
                                   help="increase RAM consumption (store alignment and the genome in RAM)",
                                   action='store_true', default=False)
    add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files "
                                                                           "in the end", action='store_true',
                                   default=False)

    # OUTPUT SETUP
    output_setup_args_group.add_argument('--check_canonical', action='store_true', default=False,
                                         help="report whether splice junctions are canonical")
    output_setup_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output",
                                         action='store_true', default=False)
    output_setup_args_group.add_argument("--count_exons", help="perform exon and intron counting",
                                         action='store_true', default=False)
    add_additional_option_to_group(output_setup_args_group, "--bam_tags",
                                   help="comma separated list of BAM tags to be imported to read_assignments.tsv",
                                   type=str)
    add_additional_option_to_group(output_setup_args_group, "--no_gzip", help="do not gzip large output files", dest="gzipped",
                                   action='store_false', default=True)
    add_additional_option_to_group(output_setup_args_group, "--large_output", nargs='*', type=str,
                                   default=["read_assignments", "allinfo"],
                                   help="large output files to generate: " + ", ".join(LARGE_OUTPUT_TYPES) +
                                        " (default: read_assignments allinfo)")
    add_additional_option_to_group(output_setup_args_group, "--normalization_method", type=str, choices=[e.name for e in NormalizationMethod],
                                   help="TPM normalization method: simple - conventional normalization using all counted reads;"
                                        "usable_reads - includes all assigned reads;"
                                        "none - do not convert counts to TPM.",
                                   default=NormalizationMethod.simple.name)
    add_additional_option_to_group(output_setup_args_group, "--counts_format", type=str, nargs='+',
                                   choices=[e.name for e in GroupedOutputFormat],
                                   help="output format for grouped counts",
                                   default=[GroupedOutputFormat.default.name])
    add_additional_option_to_group(output_setup_args_group, "--genedb_output", help="output folder for converted gene "
                                                                                    "database, will be created automatically "
                                                                                    " (same as output by default)", type=str)

    # ALIGNER
    add_additional_option_to_group(align_args_group, "--aligner", help="force to use this alignment method, can be " + ", ".join(SUPPORTED_ALIGNERS)
                                                                       + "; chosen based on data type if not set", type=str)
    add_additional_option_to_group(align_args_group, "--no_junc_bed", action="store_true", default=False,
                                   help="do NOT use annotation for read mapping")
    add_additional_option_to_group(align_args_group, "--junc_bed_file", type=str,
                                   help="annotation in BED format produced by minimap's paftools.js gff2bed "
                                        "(will be created automatically if not given)")
    add_additional_option_to_group(align_args_group, "--indexing_options", type=str,
                                   help="additional options that will be passed to the aligner indexer")
    add_additional_option_to_group(align_args_group, "--mapping_options", type=str,
                                   help="additional options that will be passed to the aligner")

    # READ FILTERING
    add_additional_option_to_group(filer_args_group, "--use_secondary",
                                   help="use secondary alignments (slower processing)",
                                   action='store_true', default=False)
    add_additional_option_to_group(filer_args_group, "--no_secondary",
                                   help="deprecated, secondary alignments are not used by default (kept for user convenience)",
                                   action='store_true', default=False)
    add_additional_option_to_group(filer_args_group, "--min_mapq",
                                   help="ignore alignments with MAPQ < this (also filters out secondary alignments, default: None)", type=int)
    add_additional_option_to_group(filer_args_group, "--inconsistent_mapq_cutoff",
                                   help="ignore inconsistent alignments with MAPQ < this (works only with the reference annotation, default=5)",
                                   type=int, default=5)
    add_additional_option_to_group(filer_args_group, "--simple_alignments_mapq_cutoff",
                                   help="ignore alignments with 1 or 2 exons and MAPQ < this "
                                        "(works only in annotation-free mode, default=1)", type=int, default=1)
    add_additional_option_to_group(filer_args_group, "--max_coverage_small_chr",
                                   help="process only a fraction of reads for high-coverage loci on small chromosomes, "
                                        "e.g. mitochondrial (default 1000000); significantly improves running time and RAM",
                                   type=int, default=1000000)
    add_additional_option_to_group(filer_args_group, "--max_coverage_normal_chr",
                                   help="process only a fraction of reads for high-coverage loci on usual chromosomes"
                                        " (default -1 = infinity); improves running time and RAM",
                                   type=int, default=-1)

    # REST
    add_hidden_option("--graph_clustering_distance", type=int, default=None,
                      help="intron graph clustering distance, "
                           "splice junctions less that this number of bp apart will not be differentiated")
    add_hidden_option("--cage", help="bed file with CAGE peaks", type=str, default=None)
    add_hidden_option("--cage-shift", type=int, default=50, help="interval before read start to look for CAGE peak")

    # prefer the VERSION file next to this script; fall back to a baked-in value
    isoquant_version = "3.4.0"
    try:
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "VERSION")) as version_f:
            isoquant_version = version_f.readline().strip()
    except FileNotFoundError:
        pass
    parser.add_argument('--version', '-v', action='version', version='IsoQuant ' + isoquant_version)

    args = parser.parse_args(cmd_args, namespace)

    if args.resume:
        # with --resume only a small whitelist of options may be given;
        # re-parse with a restricted parser and reject anything else
        resume_parser = argparse.ArgumentParser(add_help=False)
        resume_parser.add_argument("--resume", action="store_true", default=False,
                                   help="resume failed run, specify only output folder, "
                                        "input options are not allowed")
        resume_parser.add_argument("--output", "-o",
                                   help="output folder, will be created automatically [default=isoquant_output]",
                                   type=str, required=True)
        resume_parser.add_argument('--debug', action='store_true', default=argparse.SUPPRESS,
                                   help='Debug log output.')
        resume_parser.add_argument("--threads", "-t", help="number of threads to use",
                                   type=int, default=argparse.SUPPRESS)
        resume_parser.add_argument("--high_memory",
                                   help="increase RAM consumption (store alignment and the genome in RAM)",
                                   action='store_true', default=False)
        resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end",
                                   action='store_true', default=argparse.SUPPRESS)
        args, unknown_args = resume_parser.parse_known_args(cmd_args)
        if unknown_args:
            logger.error("You cannot specify options other than --output/--threads/--debug/--high_memory "
                         "with --resume option")
            parser.print_usage()
            sys.exit(IsoQuantExitCode.INCOMPATIBLE_OPTIONS)

    args._cmd_line = " ".join(sys.argv)
    args._version = isoquant_version

    args.output_exists = os.path.exists(args.output)
    if not args.output_exists:
        os.makedirs(args.output)
    return args, parser
def check_and_load_args(args, parser):
    """Validate parsed options, handle --resume/--force, and derive paths.

    On --resume the namespace is replaced with the one pickled by the
    previous run (current options overlaid). Exits the process on invalid
    parameters. Returns the final, validated namespace.
    """
    args.param_file = os.path.join(args.output, ".params")
    if args.resume:
        if not os.path.exists(args.output) or not os.path.exists(args.param_file):
            logger.error("Previous run config was not detected, cannot resume. "
                         "Check that output folder is correctly specified.")
            sys.exit(IsoQuantExitCode.RESUME_CONFIG_NOT_FOUND)
        args = load_previous_run(args)
    elif args.output_exists:
        if os.path.exists(args.param_file):
            if args.force:
                logger.warning("Output folder already contains a previous run, will be overwritten.")
            else:
                logger.warning("Output folder already contains a previous run, some files may be overwritten. "
                               "Use --resume to resume a failed run. Use --force to avoid this message.")
                logger.warning("Press Ctrl+C to interrupt the run now.")
                delay = 9
                for i in range(delay):
                    countdown = delay - i
                    sys.stdout.write("Resuming the run in %d second%s\r" % (countdown, "s" if countdown > 1 else ""))
                    # without an explicit flush the \r-terminated countdown is
                    # buffered and never shown on line-buffered stdout
                    sys.stdout.flush()
                    time.sleep(1)
                logger.info("Overwriting the previous run")
                time.sleep(1)
        else:
            logger.warning("Output folder already exists, some files may be overwritten.")

    # converted gene database goes next to the main output unless redirected
    if args.genedb_output is None:
        args.genedb_output = args.output
    elif not os.path.exists(args.genedb_output):
        os.makedirs(args.genedb_output)

    # .db inputs are used as-is; GTF/GFF inputs get a derived .db filename
    if not args.genedb:
        args.genedb_filename = None
    elif args.genedb.lower().endswith("db"):
        args.genedb_filename = args.genedb
    else:
        args.genedb_filename = os.path.join(args.genedb_output, os.path.splitext(os.path.basename(args.genedb))[0] + ".db")

    if not check_input_params(args):
        parser.print_usage()
        sys.exit(IsoQuantExitCode.INVALID_PARAMETER)

    # Validate --read_group none
    if args.read_group:
        if "none" in args.read_group and len(args.read_group) > 1:
            logger.error("--read_group 'none' cannot be combined with other values")
            sys.exit(IsoQuantExitCode.INVALID_PARAMETER)

    # Validate --large_output values
    if args.large_output:
        if "none" in args.large_output and len(args.large_output) > 1:
            logger.error("--large_output 'none' cannot be combined with other values")
            sys.exit(IsoQuantExitCode.INVALID_PARAMETER)
        for val in args.large_output:
            if val not in LARGE_OUTPUT_TYPES:
                logger.error("Invalid --large_output value: %s. Valid values: %s" % (val, ", ".join(LARGE_OUTPUT_TYPES)))
                sys.exit(IsoQuantExitCode.INVALID_PARAMETER)

    save_params(args)
    return args
def load_previous_run(args):
    """Restore the parameters pickled by a previous run and overlay the
    options from the current command line on top of them.

    Returns the merged namespace. Raises if the param file is unreadable.
    """
    logger.info("Loading parameters from the previous run")
    # NOTE(review): this error is logged unconditionally even on a valid
    # resume; it reads like a reminder rather than a failure — confirm intent.
    logger.error("Only --output/--threads/--debug/--high_memory are compatible with --resume option")
    # original left the file handle open; a context manager closes it
    with open(args.param_file, "rb") as param_handle:
        loaded_args = pickle.Unpickler(param_handle, fix_imports=False).load()
    # current command-line options win over the saved ones
    for option in args.__dict__:
        loaded_args.__dict__[option] = args.__dict__[option]
    if loaded_args.debug:
        logger.setLevel(logging.DEBUG)
        # assumes at least one handler is already attached — TODO confirm
        logger.handlers[0].setLevel(logging.DEBUG)
    return loaded_args
def save_params(args):
    """Normalize file-path options to absolute paths and pickle args to
    args.param_file so a later --resume run reloads the same configuration
    regardless of the working directory.
    """
    # make every file/dir option absolute (lists element-wise)
    for file_opt in ["genedb", "reference", "index", "bam", "fastq", "junc_bed_file",
                     "cage", "genedb_output", "read_assignments"]:
        if file_opt in args.__dict__ and args.__dict__[file_opt]:
            if isinstance(args.__dict__[file_opt], list):
                args.__dict__[file_opt] = list(map(os.path.abspath, args.__dict__[file_opt]))
            else:
                args.__dict__[file_opt] = os.path.abspath(args.__dict__[file_opt])

    # 'file:PATH:...' read_group specs embed a path as the second ':'-field;
    # absolutize it as well
    if "read_group" in args.__dict__ and args.__dict__["read_group"]:
        updated_specs = []
        for spec in args.read_group:
            vals = spec.split(":")
            if len(vals) > 1 and vals[0] == 'file':
                vals[1] = os.path.abspath(vals[1])
                updated_specs.append(":".join(vals))
            else:
                updated_specs.append(spec)
        args.read_group = updated_specs

    # original created a Pickler over a bare open() and never closed the
    # handle; 'with' guarantees the file is flushed and closed
    with open(args.param_file, "wb") as param_handle:
        pickle.dump(args, param_handle, -1)
# Check user's params
def check_input_params(args):
    """Validate user-provided options; return False on fatal problems.

    Also mutates args in place: resolves the data-type alias, builds
    args.input_data, deduplicates read_group specs, converts args.mode to an
    IsoQuantMode enum, and sets args.umi_length. Exits the process directly
    for single-cell misconfigurations.
    """
    if not args.reference:
        logger.error("Reference genome was not provided")
        return False
    if not args.data_type:
        logger.error("Data type is not provided, choose one of " + " ".join(DATA_TYPE_ALIASES.keys()))
        return False
    elif args.data_type not in DATA_TYPE_ALIASES.keys():
        logger.error("Unsupported data type " + args.data_type + ", choose one of: " + " ".join(DATA_TYPE_ALIASES.keys()))
        return False
    # replace the user-facing alias with the canonical data type
    args.data_type = DATA_TYPE_ALIASES[args.data_type]
    if not args.fastq and not args.bam and not args.unmapped_bam and not args.read_assignments and not args.yaml:
        logger.error("No input data was provided")
        return False
    if args.yaml and args.illumina_bam:
        logger.error("When providing a yaml file it should include all input files, including the illumina bam file.")
        return False
    args.input_data = InputDataStorage(args)
    if args.aligner is not None and args.aligner not in SUPPORTED_ALIGNERS:
        logger.error(" Unsupported aligner " + args.aligner + ", choose one of: " + " ".join(SUPPORTED_ALIGNERS))
        return False
    if args.run_aligner_only and not args.input_data.input_type.needs_mapping():
        logger.error("Data type %s cannot be mapped and thus incompatible with --run_aligner_only option." % args.input_data.input_type.name)
        return False
    if args.stranded not in SUPPORTED_STRANDEDNESS:
        logger.error("Unsupported strandness " + args.stranded + ", choose one of: " + " ".join(SUPPORTED_STRANDEDNESS))
        return False
    # options below are meaningless without a gene annotation
    if not args.genedb:
        if args.count_exons:
            logger.warning("--count_exons option has no effect without gene annotation")
        if args.sqanti_output:
            args.sqanti_output = False
            logger.warning("--sqanti_output option has no effect without gene annotation")
        if args.no_model_construction:
            logger.warning("Setting --no_model_construction without providing a gene "
                           "annotation will not produce any meaningful results")
    if args.no_model_construction and args.sqanti_output:
        args.sqanti_output = False
        logger.warning("--sqanti_output option has no effect without model construction")
    if args.no_secondary:
        logger.info("--no_secondary option has no effect and will be deprecated, secondary alignments are not used by default")
    if args.process_only_chr and args.discard_chr:
        args.discard_chr = []
        logger.warning("--discard_chr has not effect when --process_only_chr is set and will be ignored")
    # drop duplicated read_group specs, keeping first-occurrence order
    if "read_group" in args.__dict__ and args.__dict__["read_group"]:
        updated_specs = []
        spec_set = set()
        for spec in args.read_group:
            if spec in spec_set:
                logger.warning("Read group %s is set twice, which has no effect, duplicate will be ignored" % spec)
                continue
            updated_specs.append(spec)
            spec_set.add(spec)
        args.read_group = updated_specs
    # args.mode may already be an enum when called on a resumed namespace
    if not isinstance(args.mode, IsoQuantMode):
        args.mode = IsoQuantMode[args.mode]
    args.umi_length = 0
    if args.mode.needs_barcode_calling():
        # at most one barcode source may be provided
        barcode_sources = sum([bool(args.barcode_whitelist), bool(args.barcoded_reads), bool(args.barcoded_bam)])
        if barcode_sources > 1:
            logger.critical("Options --barcode_whitelist, --barcoded_reads, and --barcoded_bam are mutually exclusive")
            sys.exit(IsoQuantExitCode.INVALID_PARAMETER)
        if args.mode == IsoQuantMode.custom_sc:
            if not args.molecule and not args.barcoded_reads and not args.barcoded_bam:
                logger.critical("custom_sc mode requires --molecule, --barcoded_reads, or --barcoded_bam")
                sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING)
        elif not args.barcode_whitelist and not args.barcoded_reads and not args.barcoded_bam:
            logger.critical("You have chosen single-cell/spatial mode %s, please specify barcode whitelist, "
                            "file with barcoded reads, or --barcoded_bam" % args.mode.name)
            sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING)
        if args.barcoded_bam:
            # UMI length is probed from the first BAM of the first sample
            args.umi_length = _detect_umi_length_from_bam(args.input_data.samples[0].file_list[0][0], args.umi_tag)
        else:
            args.umi_length = get_umi_length(args.mode)
    check_input_files(args)
    return True
def _detect_umi_length_from_bam(bam_path: str, umi_tag: str) -> int:
    """Return the length of the first UMI tag value found in the BAM file.

    Scans alignments in order and stops at the first read carrying umi_tag;
    returns 0 when no read has the tag.
    """
    with pysam.AlignmentFile(bam_path, "rb") as alignments:
        for alignment in alignments:
            if not alignment.has_tag(umi_tag):
                continue
            return len(alignment.get_tag(umi_tag))
    return 0
def check_bam_file(bam_path: str, check_index: bool = True):
    """Check that a BAM file exists and (optionally) has an index.

    Exits with INPUT_FILE_NOT_FOUND / BAM_NOT_INDEXED on failure.
    """
    if not os.path.isfile(bam_path):
        logger.critical("BAM file " + bam_path + " does not exist")
        sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
    if check_index:
        # context manager closes the handle even when sys.exit raises
        # SystemExit (original skipped close() on the error path)
        with pysam.AlignmentFile(bam_path, "rb") as bamfile_in:
            if not bamfile_in.has_index():
                logger.critical("BAM file " + bam_path + " is not indexed, run samtools sort and samtools index")
                sys.exit(IsoQuantExitCode.BAM_NOT_INDEXED)
def check_file_exists(file_path: str, description: str):
    """Abort with INPUT_FILE_NOT_FOUND unless *file_path* is an existing file."""
    if os.path.isfile(file_path):
        return
    logger.critical(f"{description} {file_path} does not exist")
    sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
def extract_read_group_file_path(spec: str):
    """
    Extract the file path from a read_group spec when it is file-based.

    File-based specs have the form 'file:path[:...]'; for those the path
    component is returned. For any other spec the result is None.
    """
    prefix, separator, remainder = spec.partition(":")
    if separator and prefix == 'file':
        return remainder.split(":")[0]
    return None
def check_input_files(args):
    """Validate that every user-supplied input file exists.

    Exits via sys.exit with a dedicated IsoQuantExitCode on the first
    missing file. Side effect: sets args.no_junc_bed = True when no gene
    database is provided.
    """
    # Check reference genome
    if args.reference and not os.path.isfile(args.reference):
        logger.critical("Reference genome " + args.reference + " does not exist")
        sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
    # Check input reads (BAM/FASTQ/save files)
    for sample in args.input_data.samples:
        for lib in sample.file_list:
            for in_file in lib:
                if args.input_data.input_type == InputDataType.save:
                    # save inputs are prefixes; several files may match
                    saves = glob.glob(in_file + "*")
                    if not saves:
                        logger.critical("Input files " + in_file + "* do not exist")
                        # abort like every other missing-input check; the
                        # original logged critical but kept running
                        sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
                    continue
                if not os.path.isfile(in_file):
                    logger.critical("Input file " + in_file + " does not exist")
                    sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
                if args.input_data.input_type == InputDataType.bam:
                    check_bam_file(in_file, check_index=True)
        # Check Illumina BAM files
        if sample.illumina_bam is not None:
            for illumina in sample.illumina_bam:
                check_bam_file(illumina, check_index=True)
    # Check barcoded reads files (from args, not sample - sample.barcoded_reads is set later)
    if hasattr(args, 'barcoded_reads') and args.barcoded_reads:
        if isinstance(args.barcoded_reads, list):
            for bc_file in args.barcoded_reads:
                check_file_exists(bc_file, "Barcoded reads file")
        else:
            check_file_exists(args.barcoded_reads, "Barcoded reads file")
    # Check molecule definition file
    if hasattr(args, 'molecule') and args.molecule:
        check_file_exists(args.molecule, "Molecule definition file")
    # Check barcode whitelist files
    if hasattr(args, 'barcode_whitelist') and args.barcode_whitelist:
        for wl_file in args.barcode_whitelist:
            check_file_exists(wl_file, "Barcode whitelist file")
    # Check barcode2spot file (parse spec to extract filename)
    if hasattr(args, 'barcode2spot') and args.barcode2spot:
        from src.read_groups import parse_barcode2spot_spec
        bc2spot_file, _, _ = parse_barcode2spot_spec(args.barcode2spot)
        check_file_exists(bc2spot_file, "Barcode to spot mapping file")
    # Check barcode2barcode file (parse spec to extract filename)
    if hasattr(args, 'barcode2barcode') and args.barcode2barcode:
        from src.read_groups import parse_barcode2spot_spec
        bc2bc_file, _, _ = parse_barcode2spot_spec(args.barcode2barcode)
        check_file_exists(bc2bc_file, "Barcode to barcode mapping file")
    # Check read_group file specs
    if hasattr(args, 'read_group') and args.read_group:
        for spec in args.read_group:
            file_path = extract_read_group_file_path(spec)
            if file_path:
                check_file_exists(file_path, "Read group file")
    # Check junction BED file
    if hasattr(args, 'junc_bed_file') and args.junc_bed_file:
        check_file_exists(args.junc_bed_file, "Junction BED file")
    # Check CAGE file (currently not supported)
    if args.cage is not None:
        logger.critical("CAGE data is not supported yet")
        sys.exit(IsoQuantExitCode.INVALID_PARAMETER)
        # NOTE(review): unreachable while CAGE is rejected above;
        # kept for when CAGE support is restored
        if not os.path.isfile(args.cage):
            logger.critical("Bed file with CAGE peaks " + args.cage + " does not exist")
            sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
    # Check gene database
    if args.genedb is not None:
        if not os.path.isfile(args.genedb):
            logger.critical("Gene database " + args.genedb + " does not exist")
            sys.exit(IsoQuantExitCode.GENE_DB_NOT_FOUND)
    else:
        # without an annotation the junction BED cannot be produced
        args.no_junc_bed = True
    # Check read assignments
    if args.read_assignments is not None:
        for r in args.read_assignments:
            if not glob.glob(r + "*"):
                logger.critical("No files found with prefix " + str(r))
                sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND)
def create_output_dirs(args):
    """Create per-sample output and auxiliary directories.

    Existing directories are left untouched; a warning about possible
    overwrites is emitted unless the run is a resume.
    """
    for sample in args.input_data.samples:
        for directory in (sample.out_dir, sample.aux_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)
            elif not args.resume:
                logger.warning(directory + " folder already exists, some files may be overwritten")
def set_logger(args):
    """Initialize run logging.

    Archives a previous isoquant.log (appended to isoquant.log.old),
    starts a fresh log with the command line, and configures worker
    logging at DEBUG or INFO depending on args.debug.
    """
    output_level = logging.DEBUG if args.__dict__.get('debug') else logging.INFO
    log_file = os.path.join(args.output, "isoquant.log")
    if os.path.exists(log_file):
        # keep the previous run's log instead of overwriting it
        old_log_file = os.path.join(args.output, "isoquant.log.old")
        with open(old_log_file, "a") as olf:
            olf.write("\n")
            # close the source handle deterministically (the original
            # passed an unmanaged open() and leaked the descriptor)
            with open(log_file, "r") as previous_log:
                shutil.copyfileobj(previous_log, olf)
    with open(log_file, "w") as f:
        f.write("Command line: " + args._cmd_line + '\n')
    setup_worker_logging(log_file, output_level)
    logger.info("Running Spl-IsoQuant version " + args._version)
def set_data_dependent_options(args):
    """Choose strategy defaults from the data type and set up read groupings.

    Fills in matching / model construction / splice correction strategies
    when the user did not specify them, then augments args.read_group with
    automatic groupings (file_name, barcode_spot, barcode_barcode, barcode).
    Passing '--read_group none' disables all automatic groupings.
    """
    if args.matching_strategy is None:
        args.matching_strategy = {ASSEMBLY: "precise",
                                  PACBIO_CCS_DATA: "precise",
                                  NANOPORE_DATA: "default"}[args.data_type]
    if args.model_construction_strategy is None:
        args.model_construction_strategy = {ASSEMBLY: "assembly",
                                            PACBIO_CCS_DATA: "default_pacbio",
                                            NANOPORE_DATA: "default_ont"}[args.data_type]
    if args.fl_data and args.model_construction_strategy == "default_pacbio":
        args.model_construction_strategy = "fl_pacbio"
    if args.splice_correction_strategy is None:
        args.splice_correction_strategy = {ASSEMBLY: "assembly",
                                           PACBIO_CCS_DATA: "default_pacbio",
                                           NANOPORE_DATA: "default_ont"}[args.data_type]
    args.resolve_ambiguous = 'monoexon_and_fsm' if args.fl_data else 'default'
    args.requires_polya_for_construction = False

    # Handle --read_group none: disable all auto-added groupings
    if args.read_group is not None and "none" in args.read_group:
        args.read_group = None
        return

    def _ensure_grouping(group):
        # add a grouping unless the user already requested it
        if args.read_group is None:
            args.read_group = [group]
        elif group not in args.read_group:
            args.read_group.append(group)

    # Multiple input files: group counts by file name automatically
    if args.input_data.has_replicas():
        _ensure_grouping("file_name")
    # Spatial/barcode mapping files imply their respective groupings
    if hasattr(args, 'barcode2spot') and args.barcode2spot:
        _ensure_grouping("barcode_spot")
    if hasattr(args, 'barcode2barcode') and args.barcode2barcode:
        _ensure_grouping("barcode_barcode")
    # Single-cell/spatial modes default to per-barcode grouping
    if args.mode.needs_barcode_calling():
        barcode_groupings = {"barcode", "barcode_spot", "barcode_barcode"}
        if args.read_group is None or not barcode_groupings.intersection(args.read_group):
            _ensure_grouping("barcode")
            logger.info("Single-cell/spatial mode: automatically adding '--read_group barcode'. "
                        "Use '--read_group none' to disable.")
def set_matching_options(args):
    """Configure intron/exon matching thresholds for the chosen strategy.

    Presets ('exact', 'precise', 'default', 'loose') supply defaults; user
    options (--delta, resolve_ambiguous) override them where given. Exits
    with INVALID_PARAMETER on a negative --delta.
    """
    MatchingStrategy = namedtuple('MatchingStrategy',
                                  ('delta', 'max_intron_shift', 'max_missed_exon_len', 'max_fake_terminal_exon_len',
                                   'max_suspicious_intron_abs_len', 'max_suspicious_intron_rel_len',
                                   'resolve_ambiguous', 'correct_minor_errors'))
    strategies = {
        'exact': MatchingStrategy(0, 0, 0, 0, 0, 0.0, 'monoexon_only', False),
        'precise': MatchingStrategy(4, 30, 50, 20, 0, 0.0, 'monoexon_and_fsm', True),
        'default': MatchingStrategy(6, 60, 100, 40, 60, 1.0, 'monoexon_and_fsm', True),
        'loose': MatchingStrategy(12, 60, 100, 40, 60, 1.0, 'all', True),
    }
    strategy = strategies[args.matching_strategy]
    if args.delta is None:
        args.delta = strategy.delta
    elif args.delta < 0:
        logger.error("--delta can not be negative")
        sys.exit(IsoQuantExitCode.INVALID_PARAMETER)
    args.minor_exon_extension = 50
    args.major_exon_extension = 300
    args.max_intron_shift = strategy.max_intron_shift
    args.max_missed_exon_len = strategy.max_missed_exon_len
    args.max_fake_terminal_exon_len = strategy.max_fake_terminal_exon_len
    # short introns that are actually long deletions, fix minimaps logic
    args.max_suspicious_intron_abs_len = strategy.max_suspicious_intron_abs_len
    args.max_suspicious_intron_rel_len = strategy.max_suspicious_intron_rel_len
    args.min_abs_exon_overlap = 10
    args.min_rel_exon_overlap = 0.2
    args.micro_intron_length = 50
    args.max_intron_abs_diff = min(30, args.max_intron_shift)
    args.max_intron_rel_diff = 0.2
    args.apa_delta = args.minor_exon_extension
    args.minimal_exon_overlap = 5
    args.minimal_intron_absence_overlap = 20
    args.polya_window = 16
    args.polya_fraction = 0.75
    if args.resolve_ambiguous == 'default':
        args.resolve_ambiguous = strategy.resolve_ambiguous
    # __members__ lists only real enum member names; the original checked
    # __dict__, which also contains dunders/methods, so e.g. '__init__'
    # would pass the check and crash in the enum lookup below
    if args.resolve_ambiguous not in AmbiguityResolvingMethod.__members__:
        logger.error("Incorrect resolving ambiguity method: " + args.resolve_ambiguous + ", default will be used")
        args.resolve_ambiguous = strategy.resolve_ambiguous
    args.resolve_ambiguous = AmbiguityResolvingMethod[args.resolve_ambiguous]
    args.correct_minor_errors = strategy.correct_minor_errors
    updated_strategy = MatchingStrategy(args.delta, args.max_intron_shift, args.max_missed_exon_len,
                                        args.max_fake_terminal_exon_len,
                                        args.max_suspicious_intron_abs_len, args.max_suspicious_intron_rel_len,
                                        args.resolve_ambiguous, args.correct_minor_errors)
    logger.debug('Using %s strategy. Updated strategy: %s.' % (args.matching_strategy, updated_strategy))
def set_splice_correction_options(args):
    """Enable or disable individual splice-site correction steps per strategy."""
    SplicSiteCorrectionStrategy = namedtuple('SplicSiteCorrectionStrategy',
                                             ('fuzzy_junctions', 'intron_shifts', 'skipped_exons',
                                              'terminal_exons', 'fake_terminal_exons', 'microintron_retention'))
    strategies = {
        'none': SplicSiteCorrectionStrategy(False, False, False, False, False, False),
        'default_pacbio': SplicSiteCorrectionStrategy(True, False, True, False, False, True),
        'conservative_ont': SplicSiteCorrectionStrategy(True, False, True, False, False, False),
        'default_ont': SplicSiteCorrectionStrategy(True, False, True, False, True, True),
        'all': SplicSiteCorrectionStrategy(True, True, True, True, True, True),
        'assembly': SplicSiteCorrectionStrategy(False, False, True, False, False, False)
    }
    strategy = strategies[args.splice_correction_strategy]
    # mirror every strategy flag onto args as a 'correct_<flag>' attribute
    for flag_name, enabled in strategy._asdict().items():
        setattr(args, "correct_" + flag_name, enabled)
def set_model_construction_options(args):
    """Configure transcript model construction thresholds for the selected strategy."""
    # Each strategy bundles the count/ratio cutoffs used during transcript discovery.
    ModelConstructionStrategy = namedtuple('ModelConstructionStrategy',
                                           ('min_novel_intron_count',
                                            'graph_clustering_ratio', 'graph_clustering_distance',
                                            'min_novel_isolated_intron_abs', 'min_novel_isolated_intron_rel',
                                            'terminal_position_abs', 'terminal_position_rel',
                                            'terminal_internal_position_rel',
                                            'min_known_count', 'min_nonfl_count',
                                            'min_novel_count', 'min_novel_count_rel',
                                            'min_mono_count_rel', 'singleton_adjacent_cov',
                                            'fl_only', 'novel_monoexonic',
                                            'require_monointronic_polya', 'require_monoexonic_polya',
                                            'report_canonical'))
    # Presets range from conservative ('reliable') to permissive ('all');
    # per-technology defaults are selected earlier in set_data_dependent_options.
    strategies = {
        'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50,
                                              True, False, True, True, StrandnessReportingLevel.only_canonical),
        'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100,
                                                    False, True, False, True, StrandnessReportingLevel.only_canonical),
        'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100,
                                                     False, True, False, False, StrandnessReportingLevel.only_stranded),
        'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10,
                                                 False, False, True, True, StrandnessReportingLevel.only_canonical),
        'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10,
                                                   False, True, False, False, StrandnessReportingLevel.only_stranded),
        'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100,
                                               True, True, False, False, StrandnessReportingLevel.only_canonical),
        'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500,
                                         False, True, False, False, StrandnessReportingLevel.all),
        'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50,
                                              False, True, False, False, StrandnessReportingLevel.only_stranded)
    }
    strategy = strategies[args.model_construction_strategy]
    # Copy strategy fields onto args; user-supplied options override where present.
    args.min_novel_intron_count = strategy.min_novel_intron_count
    args.graph_clustering_ratio = strategy.graph_clustering_ratio
    if args.graph_clustering_distance is None:
        args.graph_clustering_distance = strategy.graph_clustering_distance
    elif args.graph_clustering_distance < 0:
        logger.error("--graph_clustering_distance can not be negative")
        sys.exit(IsoQuantExitCode.INVALID_PARAMETER)
    args.min_novel_isolated_intron_abs = strategy.min_novel_isolated_intron_abs
    args.min_novel_isolated_intron_rel = strategy.min_novel_isolated_intron_rel
    args.terminal_position_abs = strategy.terminal_position_abs
    args.terminal_position_rel = strategy.terminal_position_rel
    args.terminal_internal_position_rel = strategy.terminal_internal_position_rel
    args.min_known_count = strategy.min_known_count
    args.min_nonfl_count = strategy.min_nonfl_count
    args.min_novel_count = strategy.min_novel_count
    args.min_mono_count_rel = strategy.min_mono_count_rel
    args.min_novel_count_rel = strategy.min_novel_count_rel
    args.singleton_adjacent_cov = strategy.singleton_adjacent_cov
    args.fl_only = strategy.fl_only
    args.min_mono_exon_coverage = 0.75
    # Novel unspliced (mono-exonic) transcript reporting defaults to the preset;
    # inform the user when it ends up disabled and models are being constructed.
    if args.report_novel_unspliced is None:
        args.report_novel_unspliced = strategy.novel_monoexonic
        if not args.report_novel_unspliced and not args.no_model_construction:
            logger.info("Novel unspliced transcripts will not be reported, "
                        "set --report_novel_unspliced true to discover them")
    args.require_monointronic_polya = strategy.require_monointronic_polya
    args.require_monoexonic_polya = strategy.require_monoexonic_polya
    # Convert user-facing string options into their enum counterparts.
    args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement]
    args.polya_trimmed = PolyATrimmed[args.polya_trimmed]
    args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical]
    # 'auto' resolves to whatever the chosen construction strategy recommends.
    if args.report_canonical_strategy == StrandnessReportingLevel.auto:
        args.report_canonical_strategy = strategy.report_canonical
def set_configs_directory(args):
    """Create ~/.config/IsoQuant and ensure its JSON config files exist.

    Sets the four *_config_path attributes on args and initializes each
    config file as an empty JSON object on first use.
    """
    # expanduser works even when $HOME is unset (e.g. on Windows or bare CI
    # environments), unlike a direct os.environ['HOME'] lookup which raises
    config_dir = os.path.join(os.path.expanduser("~"), '.config', 'IsoQuant')
    os.makedirs(config_dir, exist_ok=True)
    args.db_config_path = os.path.join(config_dir, 'db_config.json')
    args.index_config_path = os.path.join(config_dir, 'index_config.json')
    args.bed_config_path = os.path.join(config_dir, 'bed_config.json')
    args.alignment_config_path = os.path.join(config_dir, 'alignment_config.json')
    for config_path in (args.db_config_path, args.index_config_path, args.bed_config_path, args.alignment_config_path):
        if not os.path.exists(config_path):
            with open(config_path, 'w') as f_out:
                json.dump({}, f_out, indent=2)
def set_additional_params(args):
    """Finalize derived parameters after all strategy options are resolved."""
    set_configs_directory(args)
    set_data_dependent_options(args)
    set_matching_options(args)
    set_model_construction_options(args)
    set_splice_correction_options(args)
    args.print_additional_info = True
    args.indel_near_splice_site_dist = 10
    args.upstream_region_len = 20
    # resolve the multimapper handling strategy by enum member name
    strategy_values = {e.name: e.value for e in MultimapResolvingStrategy}
    args.multimap_strategy = MultimapResolvingStrategy(strategy_values["take_best"])
    args.needs_reference = True
    if args.needs_reference and not args.reference:
        logger.warning("Reference genome is not provided! This may affect quality of the results!")
        args.needs_reference = False
    args.simple_models_mapq_cutoff = 30
    args.polya_percentage_threshold = 0.7
    args.low_polya_percentage_threshold = 0.1
    # comma-separated list of BAM tags to propagate, empty when unset
    args.bam_tags = args.bam_tags.split(",") if args.bam_tags else []
    args.original_annotation = None
def prepare_reference_genome(args):
if not args.needs_reference:
return
logger.info("Reading reference genome from %s" % args.reference)
ref_dir = os.path.dirname(args.reference)
ref_file_name = os.path.basename(args.reference)
ref_name, outer_ext = os.path.splitext(ref_file_name)