glarue · glarue · May 29, 2026 · May 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,72 @@ All notable changes to intronIC will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.7.1] - 2026-05-29
+
+Backport of the isoform-flag-parity fix from the v2.4.x maintenance line (v2.4.3).
+
+### Fixed — `-i` (`--allow-multiple-isoforms`) and `-d` (`--include-duplicates`) flag parity
+
+Both flags are now respected end-to-end in both `--in-memory` and
+`--streaming` classify modes, with bit-identical bed.iic content
+across modes for every combination of the two flags.
+
+Three bugs fixed in `src/intronIC/cli/main.py`:
+
+- The in-memory post-extraction `IntronFilter` construction
+  hardcoded `longest_only=True` and `include_duplicates=False`,
+  silently overriding the CLI flags. Now reads
+  `config.extraction.allow_multiple_isoforms` and
+  `config.extraction.include_duplicates`.
+- The streaming per-contig worker filter
+  (`_streaming_extract_and_filter_contig`) hardcoded
+  `longest_only=True` and had no `allow_multiple_isoforms` plumbed
+  into the worker config_dict. The three streaming worker
+  config_dicts (BG accumulator, classify worker, fit worker) now
+  carry `allow_multiple_isoforms` next to the existing
+  `include_duplicates` entry, and the per-contig filter consumes it.
+- The streaming summary print hardcoded `include_isoforms=False`
+  regardless of the actual setting; now reflects
+  `config.extraction.allow_multiple_isoforms`.
+
+One additional fix in `src/intronIC/extraction/filters.py`:
+
+- `should_extract_sequences_for` always skipped coord-duplicates
+  from sequence extraction regardless of `include_duplicates`.
+  This made the `-d` flag a no-op for the in-memory pipeline: later
+  copies of a duplicate ended up in skip_list with no sequences, were
+  un-scoreable, and got dropped before the writer's
+  `include_duplicates` check ever ran. Now skips only when
+  `include_duplicates=False`; with `-d`, duplicates route to
+  extract_list where `extract_sequences_with_deduplication` reuses
+  the first occurrence's sequence buffer (no extra extraction cost).
+
+The truth table for all four flag combinations now holds, per the design
+in `IntronFilter`:
+
+| `-i` | `-d` | Behavior |
+|------|------|----------|
+| no   | no   | longest-isoform introns only, coord-dups collapsed (default) |
+| yes  | no   | every isoform's introns, coord-dups collapsed |
+| no   | yes  | longest-isoform introns only, every isoform's copy of a dup emitted |
+| yes  | yes  | every isoform's introns, every copy of every dup emitted |
+
+Tests added:
+- `tests/data/isoforms/synthetic.fa` + `synthetic.gff3` — minimal
+  alt-isoform fixture (1 gene, 2 isoforms, 1 shared intron, 1
+  alt-spliced intron).
+- `tests/integration/test_isoform_flag.py` — 16 behavior
+  assertions across (`in-memory`, `streaming`) x 4 flag combos
+  covering scored counts, presence of the alt-isoform-only intron,
+  duplicate emission, and streaming/in-memory parity on the fixture.
+- `tests/integration/test_streaming_equivalence.py` extended with
+  `test_streaming_matches_in_memory_with_flags` — 4 parametrized
+  cases asserting streaming == in-memory bed.iic content under all
+  four (`-i`, `-d`) combos using the worktree's code.
+- `tests/unit/test_filters.py` — two existing tests updated to
+  reflect the new (correct) behavior of
+  `should_extract_sequences_for` when `include_duplicates=True`.
+
 ## [2.7.0] - 2026-05-20
 
 ### Added — Continuous per-intron discount

diff --git a/pixi.toml b/pixi.toml
@@ -1,6 +1,6 @@
 [workspace]
 name = "intronic-refactored"
-version = "2.7.0"
+version = "2.7.1"
 description = "Intron classification tool for identifying U2-type and U12-type introns using SVM (refactored version)"
 authors = ["Graham E. Larue <egrahamlarue@gmail.com>"]
 channels = ["conda-forge"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "intronIC"
-version = "2.7.0"
+version = "2.7.1"
 description = "Intron classification tool for identifying U2-type and U12-type introns using SVM"
 readme = "README.md"
 license = {text = "GPL-3.0"}

diff --git a/src/intronIC/cli/main.py b/src/intronIC/cli/main.py
@@ -3553,7 +3553,7 @@ def _streaming_extract_and_filter_contig(
         scoring_regions=["five", "three"],
         allow_noncanonical=not config["exclude_noncanonical"],
         allow_overlap=not config["no_intron_overlap"],
-        longest_only=True,
+        longest_only=not config["allow_multiple_isoforms"],
         include_duplicates=config["include_duplicates"],
     )
     filtered_introns = intron_filter.filter_introns(contig_with_seqs)
@@ -3996,6 +3996,7 @@ def classify_streaming_per_contig(
             'clean_names': config.output.clean_names,
             'debug': config.output.debug,
             'include_duplicates': config.extraction.include_duplicates,
+            'allow_multiple_isoforms': config.extraction.allow_multiple_isoforms,
             'min_intron_len': config.extraction.min_intron_len,
             # Required by _streaming_extract_and_filter_contig:
             'feature_type': config.extraction.feature_type,
@@ -4088,6 +4089,7 @@ def classify_streaming_per_contig(
             "exclude_noncanonical": config.scoring.exclude_noncanonical,
             "no_intron_overlap": config.extraction.no_intron_overlap,
             "include_duplicates": config.extraction.include_duplicates,
+            "allow_multiple_isoforms": config.extraction.allow_multiple_isoforms,
             "ignore_nc_dnts": config.scoring.ignore_nc_dnts,
             "five_start": config.scoring.scoring_regions.five_start,
             "five_end": config.scoring.scoring_regions.five_end,
@@ -4281,6 +4283,7 @@ def classify_streaming_per_contig(
         "exclude_noncanonical": config.scoring.exclude_noncanonical,
         "no_intron_overlap": config.extraction.no_intron_overlap,
         "include_duplicates": config.extraction.include_duplicates,
+        "allow_multiple_isoforms": config.extraction.allow_multiple_isoforms,
         "threshold": config.scoring.threshold,
         "ignore_nc_dnts": config.scoring.ignore_nc_dnts,
         "five_start": config.scoring.scoring_regions.five_start,
@@ -4676,7 +4679,7 @@ def classify_streaming_per_contig(
         duplicates=accumulated_filter_stats.duplicates,
         kept=accumulated_filter_stats.kept_introns,
         include_duplicates=config.extraction.include_duplicates,
-        include_isoforms=False,  # Streaming always uses longest_only=True
+        include_isoforms=config.extraction.allow_multiple_isoforms,
         exclude_noncanonical=config.scoring.exclude_noncanonical,
         exclude_overlap=config.extraction.no_intron_overlap,
     )
@@ -6166,8 +6169,9 @@ def main_classify(config: IntronICConfig):
         messenger.log_only("Filtering introns for scoring")
 
         # Create filter with scoring-appropriate settings:
-        # - longest_only=True: Only score longest isoform per gene (filters ~8k introns)
-        # - include_duplicates=False: Don't score duplicates (filters ~38k introns)
+        # - longest_only: Inverse of allow_multiple_isoforms; default True drops
+        #   non-longest-isoform introns (filters ~8k introns on human)
+        # - include_duplicates: Default False drops coord-duplicates (filters ~38k introns)
         # - min_length: Filter short introns
         # - allow_noncanonical: Based on exclude_noncanonical flag
         # - allow_overlap: Based on no_intron_overlap flag
@@ -6180,8 +6184,8 @@ def main_classify(config: IntronICConfig):
             scoring_regions=["five", "three"],  # Check these for ambiguous bases
             allow_noncanonical=not config.scoring.exclude_noncanonical,
             allow_overlap=not config.extraction.no_intron_overlap,
-            longest_only=True,  # For sequences: no-op (no grandparent info)
-            include_duplicates=False,  # For sequences: no-op (unique coords)
+            longest_only=not config.extraction.allow_multiple_isoforms,
+            include_duplicates=config.extraction.include_duplicates,
         )
 
         filtered_introns = intron_filter.filter_introns(introns)

diff --git a/src/intronIC/extraction/filters.py b/src/intronIC/extraction/filters.py
@@ -516,12 +516,17 @@ def should_extract_sequences_for(
             if not is_longest:
                 return False
 
-    # Check duplicates - only extract for first occurrence
-    # We reuse sequences for all duplicates regardless of include_duplicates flag
-    # The include_duplicates flag affects filtering/output, not extraction
+    # Check duplicates - only extract for first occurrence when caller plans
+    # to drop duplicates downstream. When include_duplicates is True the
+    # downstream IntronFilter / writers keep coord-duplicate introns, so they
+    # need sequences attached (extract_sequences_with_deduplication groups by
+    # coords and reuses the first occurrence's sequence buffer, so this is
+    # cheap). Skipping them here would put them in skip_list with no sequences,
+    # making them un-scoreable and dropping them from .bed.iic/.score_info.iic
+    # output — the bug that broke the `-d` flag in the in-memory pipeline.
     coord_key = (intron.coordinates.start, intron.coordinates.stop)
-    if coord_key in seen_coordinates:
-        # This is a duplicate - skip extraction (will reuse from first occurrence)
+    if coord_key in seen_coordinates and not include_duplicates:
+        # This is a duplicate and the caller doesn't want them — skip extraction.
         return False
 
     # Extract by default

diff --git a/tests/data/isoforms/synthetic.fa b/tests/data/isoforms/synthetic.fa
@@ -0,0 +1,64 @@
+>chr1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGGTCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAAGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGG
+TCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAAGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTAAGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
diff --git a/tests/data/isoforms/synthetic.fa.fxi b/tests/data/isoforms/synthetic.fa.fxi
diff --git a/tests/data/isoforms/synthetic.gff3 b/tests/data/isoforms/synthetic.gff3
@@ -0,0 +1,11 @@
+##gff-version 3
+##sequence-region chr1 1 5000
+chr1	test	gene	100	3500	.	+	.	ID=gene:geneA;Name=geneA
+chr1	test	mRNA	100	3500	.	+	.	ID=transcript:isoA;Parent=gene:geneA;Name=isoA
+chr1	test	exon	100	499	.	+	.	Parent=transcript:isoA;ID=exon:isoA.1
+chr1	test	exon	800	1199	.	+	.	Parent=transcript:isoA;ID=exon:isoA.2
+chr1	test	exon	1500	3500	.	+	.	Parent=transcript:isoA;ID=exon:isoA.3
+chr1	test	mRNA	100	3000	.	+	.	ID=transcript:isoB;Parent=gene:geneA;Name=isoB
+chr1	test	exon	100	499	.	+	.	Parent=transcript:isoB;ID=exon:isoB.1
+chr1	test	exon	800	1199	.	+	.	Parent=transcript:isoB;ID=exon:isoB.2
+chr1	test	exon	1800	3000	.	+	.	Parent=transcript:isoB;ID=exon:isoB.3