#!/usr/bin/env python3
"""Wrapper around hapli.tools.vcf_to_gfa.

This file is kept for backward compatibility but will be removed in a future
version. Please use the module directly: python -m hapli.tools.vcf_to_gfa
"""
import os
import sys
import warnings
import logging
import time
import argparse
import traceback
from collections import defaultdict
import multiprocessing as mp

import hgvs.parser  # required by parse_vcf() below
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import pairwise2

# Add the script's own directory to the path so the hapli package can be imported
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))

# Show deprecation warning
warnings.warn(
    "This script is now a wrapper around hapli.tools.vcf_to_gfa. "
    "Please use the module directly: python -m hapli.tools.vcf_to_gfa",
    DeprecationWarning,
    stacklevel=2
)

# Import and run the main function from the module
from hapli.tools.vcf_to_gfa import main

if __name__ == "__main__":
    sys.exit(main())
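
# ---------------------------------------------------------------------------
# Illustrative invocation of the replacement module (the exact flags are
# defined by hapli.tools.vcf_to_gfa's own CLI; --help is shown only as a
# generic example):
#
#     python -m hapli.tools.vcf_to_gfa --help
#
# The legacy implementation below is retained so that existing imports of this
# module keep working; it is never reached when the file is run as a script.
# ---------------------------------------------------------------------------
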
def setup_logging(debug=False, log_file=None, verbose=False):
    """Configure logging based on debug flag and optional log file.

    Args:
        debug (bool): Enable debug mode logging
        log_file (str): Optional file path for logging
        verbose (bool): Enable verbose output without full debug
    """
    if debug:
        log_level = logging.DEBUG
        log_format = '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
    elif verbose:
        log_level = logging.INFO
        log_format = '%(asctime)s - %(levelname)s - %(message)s'
    else:
        log_level = logging.INFO
        log_format = '%(message)s'

    # Configure root logger
    logging.basicConfig(level=log_level, format=log_format)
    logger = logging.getLogger()

    # Clear any existing handlers
    logger.handlers = []

    # Add console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_formatter = logging.Formatter(log_format)
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Add file handler if specified
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_level)
        file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s')
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)

    return logger
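
# Example usage (illustrative):
#
#     logger = setup_logging(debug=True, log_file="vcf_to_gfa.log")
#     logger.info("starting conversion")
#
# Debug mode adds file/line context to every record; the default mode prints
# bare messages only.
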
def parse_gfa(gfa_file, validate=True):
    """Parse a GFA file into segments, links, and paths with optional validation.

    Args:
        gfa_file (str): Path to GFA file
        validate (bool): Whether to validate GFA structure

    Returns:
        tuple: (segments, links, paths) dictionaries and lists
    """
    segments = {}
    links = []
    paths = {}
    segment_counts = {}  # For validation
    start_time = time.time()
    logging.info(f"Parsing GFA file: {gfa_file}")

    with open(gfa_file, 'r') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            record_type = fields[0]

            if record_type == 'S':  # Segment
                if len(fields) < 3:
                    logging.warning(f"Line {line_num}: Invalid segment record, missing fields")
                    continue
                seg_id = fields[1]
                sequence = fields[2]
                # Store segment sequence and track counts for validation
                segments[seg_id] = sequence
                segment_counts[seg_id] = segment_counts.get(seg_id, 0) + 1
                # Extract optional tags if present
                tags = {}
                for field in fields[3:]:
                    if ':' in field:
                        tag_parts = field.split(':', 2)
                        if len(tag_parts) >= 3:
                            tag_name, tag_type, tag_value = tag_parts
                            tags[tag_name] = (tag_type, tag_value)
                logging.debug(f"Parsed segment: {seg_id} (length: {len(sequence)}, tags: {len(tags)})")

            elif record_type == 'L':  # Link
                if len(fields) < 6:
                    logging.warning(f"Line {line_num}: Invalid link record, missing fields")
                    continue
                from_id = fields[1]
                from_dir = fields[2]
                to_id = fields[3]
                to_dir = fields[4]
                overlap = fields[5]
                # Extract optional tags
                tags = {}
                for field in fields[6:]:
                    if ':' in field:
                        tag_parts = field.split(':', 2)
                        if len(tag_parts) >= 3:
                            tag_name, tag_type, tag_value = tag_parts
                            tags[tag_name] = (tag_type, tag_value)
                links.append((from_id, from_dir, to_id, to_dir, overlap, tags))
                logging.debug(f"Parsed link: {from_id}{from_dir} -> {to_id}{to_dir}")

            elif record_type == 'P':  # Path
                if len(fields) < 3:
                    logging.warning(f"Line {line_num}: Invalid path record, missing fields")
                    continue
                path_name = fields[1]
                path_segments = []
                for seg in fields[2].split(','):
                    # Split into segment ID and orientation
                    if not seg or len(seg) < 2:
                        logging.warning(f"Line {line_num}: Invalid segment in path: {seg}")
                        continue
                    seg_id = seg[:-1]
                    orientation = seg[-1]
                    # Validate segment exists
                    if validate and seg_id not in segments:
                        logging.warning(f"Line {line_num}: Path {path_name} references unknown segment: {seg_id}")
                    path_segments.append((seg_id, orientation))
                # Extract path overlaps if present
                overlaps = fields[3] if len(fields) > 3 else "*"
                # Store with overlaps
                paths[path_name] = {'segments': path_segments, 'overlaps': overlaps}
                logging.debug(f"Parsed path: {path_name} with {len(path_segments)} segments")

    # Validate GFA structure if requested
    if validate:
        validation_errors = []
        # Check for duplicate segments
        for seg_id, count in segment_counts.items():
            if count > 1:
                validation_errors.append(f"Duplicate segment ID: {seg_id} (appears {count} times)")
        # Check for links referencing non-existent segments
        for from_id, from_dir, to_id, to_dir, overlap, tags in links:
            if from_id not in segments:
                validation_errors.append(f"Link references non-existent segment: {from_id}")
            if to_id not in segments:
                validation_errors.append(f"Link references non-existent segment: {to_id}")
        if validation_errors:
            for error in validation_errors:
                logging.warning(f"GFA validation error: {error}")

    elapsed = time.time() - start_time
    logging.info(f"Finished parsing GFA in {elapsed:.2f}s: {len(segments)} segments, {len(links)} links, {len(paths)} paths")

    # Simplify paths structure for compatibility with rest of code
    simplified_paths = {}
    for path_name, path_data in paths.items():
        simplified_paths[path_name] = path_data['segments']

    return segments, links, simplified_paths
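
# Example (illustrative): given a minimal, tab-separated GFA file
#
#     S   s1  ACGT
#     S   s2  TTGG
#     L   s1  +   s2  +   0M
#     P   REF s1+,s2+ *
#
# parse_gfa returns roughly:
#
#     segments -> {'s1': 'ACGT', 's2': 'TTGG'}
#     links    -> [('s1', '+', 's2', '+', '0M', {})]
#     paths    -> {'REF': [('s1', '+'), ('s2', '+')]}
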
def build_reference_sequence(segments, path_segments):
    """Build a reference sequence from segments and calculate offsets.

    Args:
        segments (dict): Dictionary of segment ID to sequence
        path_segments (list): List of (segment_id, orientation) tuples

    Returns:
        tuple: (reference_seq, segment_offsets)
    """
    reference_seq = ""
    segment_offsets = {}
    current_offset = 0

    for seg_id, orientation in path_segments:
        if seg_id not in segments:
            logging.error(f"Missing segment: {seg_id}")
            continue
        segment_seq = segments[seg_id]
        # Handle reverse complement if needed
        if orientation == '-':
            segment_seq = reverse_complement(segment_seq)
        segment_offsets[seg_id] = (current_offset, current_offset + len(segment_seq) - 1, orientation)
        reference_seq += segment_seq
        current_offset += len(segment_seq)

    logging.info(f"Built reference sequence of length {len(reference_seq)}bp from {len(path_segments)} segments")
    return reference_seq, segment_offsets
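
# Worked example: a '-' oriented segment contributes its reverse complement,
# and offsets record 0-based inclusive (start, end, orientation) spans.
#
#     >>> segs = {"s1": "ACGT", "s2": "TTGG"}
#     >>> ref, offsets = build_reference_sequence(segs, [("s1", "+"), ("s2", "-")])
#     >>> ref
#     'ACGTCCAA'
#     >>> offsets["s2"]
#     (4, 7, '-')
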
def reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence using BioPython.

    Args:
        sequence (str): DNA sequence

    Returns:
        str: Reverse complemented sequence
    """
    # Use BioPython for reverse complement
    seq_obj = Seq(sequence)
    return str(seq_obj.reverse_complement())
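
# Quick check:
#
#     >>> reverse_complement("ATGC")
#     'GCAT'
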
def parse_vcf(vcf_file, strict_hgvs=False, max_variants=None, chrom_filter=None):
    """Parse VCF file and extract variants using the hgvs library.

    Args:
        vcf_file (str): Path to the VCF file
        strict_hgvs (bool): If True, fail on HGVS parse errors
        max_variants (int): Maximum number of variants to process
        chrom_filter (str): Only process variants from this chromosome

    Returns:
        tuple: (variants, samples, format_fields)
            - variants: List of variant dictionaries
            - samples: List of sample names
            - format_fields: List of format fields
    """
    variants = []
    samples = []
    format_fields = []
    parser = hgvs.parser.Parser()
    start_time = time.time()
    logging.info(f"Parsing VCF file: {vcf_file}")

    vcf_stats = {
        'total': 0,
        'passed': 0,
        'filtered': 0,
        'by_type': defaultdict(int),
        'by_chrom': defaultdict(int),
        'invalid': 0,
        'phased': 0,
        'unphased': 0,
        'multiallelic': 0
    }

    with open(vcf_file, 'r') as f:
        for line_num, line in enumerate(f, 1):
            # Process header
            if line.startswith('#'):
                # Extract column headers to identify samples
                if line.startswith('#CHROM'):
                    header_fields = line.strip().split('\t')
                    if len(header_fields) > 9:  # VCF has samples
                        samples = header_fields[9:]
                        logging.info(f"Found {len(samples)} samples in VCF: {', '.join(samples)}")
                continue

            vcf_stats['total'] += 1

            # Stop if we've reached max variants
            if max_variants and vcf_stats['passed'] >= max_variants:
                logging.info(f"Reached maximum of {max_variants} variants, stopping")
                break

            fields = line.strip().split('\t')
            if len(fields) < 8:
                logging.warning(f"Line {line_num}: Invalid VCF record, missing fields")
                vcf_stats['invalid'] += 1
                continue

            chrom = fields[0]
            # Apply chromosome filter if provided
            if chrom_filter and chrom != chrom_filter:
                vcf_stats['filtered'] += 1
                continue

            try:
                pos = int(fields[1])
            except ValueError:
                logging.warning(f"Line {line_num}: Invalid position: {fields[1]}")
                vcf_stats['invalid'] += 1
                continue

            var_id = fields[2] if fields[2] != '.' else f"var_{line_num}"
            ref = fields[3]
            alt = fields[4]
            if not ref or not alt or alt == '.':
                logging.warning(f"Line {line_num}: Missing or invalid REF or ALT")
                vcf_stats['invalid'] += 1
                continue

            # Handle multiple ALT alleles
            alt_alleles = alt.split(',')
            if len(alt_alleles) > 1:
                logging.debug(f"Line {line_num}: Processing {len(alt_alleles)} alternate alleles")
                vcf_stats['multiallelic'] += 1

            # Parse INFO field
            info_dict = {}
            for item in fields[7].split(';'):
                if '=' in item:
                    key, value = item.split('=', 1)
                    info_dict[key] = value
                else:
                    info_dict[item] = True

            # Process FORMAT field if present (for genotype data)
            format_data = {}
            sample_data = []
            is_phased = False
            if len(fields) > 8 and samples:
                format_keys = fields[8].split(':')
                if not format_fields:  # Store format fields once
                    format_fields = format_keys
                # Process each sample's genotype
                for i, sample_name in enumerate(samples):
                    if len(fields) <= 9 + i:
                        continue
                    sample_values = fields[9 + i].split(':')
                    sample_dict = {}
                    # Map FORMAT fields to values
                    for j, key in enumerate(format_keys):
                        if j < len(sample_values):
                            sample_dict[key] = sample_values[j]
                    # Check for phasing in genotype (GT) field
                    if 'GT' in sample_dict:
                        gt = sample_dict['GT']
                        if '|' in gt:  # Phased genotype
                            is_phased = True
                            vcf_stats['phased'] += 1
                        elif '/' in gt:  # Unphased genotype
                            vcf_stats['unphased'] += 1
                    sample_data.append(sample_dict)

            # Process each ALT allele
            for idx, alt_allele in enumerate(alt_alleles):
                current_id = var_id if len(alt_alleles) == 1 else f"{var_id}_{idx+1}"

                # Determine variant type based on length and SVTYPE field
                if 'SVTYPE' in info_dict:
                    variant_type = info_dict['SVTYPE']
                else:
                    if len(ref) == 1 and len(alt_allele) == 1:
                        variant_type = 'SNP'
                    elif len(ref) > len(alt_allele):
                        variant_type = 'DEL'
                    elif len(ref) < len(alt_allele):
                        variant_type = 'INS'
                    elif len(ref) == len(alt_allele) and len(ref) > 1:
                        variant_type = 'MNP'
                    else:
                        variant_type = 'OTHER'

                try:
                    end_pos = int(info_dict.get('END', pos + len(ref) - 1))
                except ValueError:
                    logging.warning(f"Line {line_num}: Invalid END position: {info_dict.get('END')}")
                    end_pos = pos + len(ref) - 1

                hgvs_notation = info_dict.get('HGVS', '')
                hgvs_obj = None
                # Parse the HGVS notation using the hgvs library
                if hgvs_notation:
                    try:
                        hgvs_obj = parser.parse_hgvs_variant(hgvs_notation)
                        logging.debug(f"Parsed HGVS: {hgvs_notation} -> {hgvs_obj}")
                    except Exception as e:
                        error_msg = f"Could not parse HGVS notation for {current_id}: {e}"
                        if strict_hgvs:
                            logging.error(error_msg)
                            raise ValueError(error_msg)
                        else:
                            logging.warning(error_msg)

                # Process genotype data for this allele
                genotypes = []
                if sample_data:
                    for i, sample in enumerate(sample_data):
                        if 'GT' in sample:
                            gt = sample['GT']
                            # Parse allele indices from GT field
                            if '|' in gt:  # Phased
                                allele_indices = [int(a) if a.isdigit() else None for a in gt.split('|')]
                                phase_type = 'phased'
                            elif '/' in gt:  # Unphased
                                allele_indices = [int(a) if a.isdigit() else None for a in gt.split('/')]
                                phase_type = 'unphased'
                            else:
                                allele_indices = [int(gt) if gt.isdigit() else None]
                                phase_type = 'unknown'
                            # Check if this specific ALT allele is present in the genotype
                            # (ALT index is idx+1 because 0 is REF)
                            alt_idx = idx + 1
                            has_alt = alt_idx in allele_indices
                            genotypes.append({
                                'sample': samples[i],
                                'gt': gt,
                                'has_alt': has_alt,
                                'allele_indices': allele_indices,
                                'phase_type': phase_type,
                                'format_data': sample
                            })

                variant = {
                    'id': current_id,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt_allele,
                    'type': variant_type,
                    'end': end_pos,
                    'hgvs': hgvs_notation,
                    'hgvs_obj': hgvs_obj,
                    'info': info_dict,
                    'genotypes': genotypes,
                    'is_phased': is_phased,
                    'allele_index': idx + 1,
                    'line_num': line_num
                }
                variants.append(variant)
                vcf_stats['passed'] += 1
                vcf_stats['by_type'][variant_type] += 1
                vcf_stats['by_chrom'][chrom] += 1
                logging.debug(f"Parsed variant: {current_id} {variant_type} at position {pos}")

    # Sort variants by position (descending) to avoid position shifts
    variants = sorted(variants, key=lambda v: v['pos'], reverse=True)

    elapsed = time.time() - start_time
    logging.info(f"Finished parsing VCF in {elapsed:.2f}s: {len(variants)} variants")

    # Log statistics
    logging.info("VCF Statistics:")
    logging.info(f"  Total records: {vcf_stats['total']}")
    logging.info(f"  Passed variants: {vcf_stats['passed']}")
    logging.info(f"  Filtered variants: {vcf_stats['filtered']}")
    logging.info(f"  Invalid records: {vcf_stats['invalid']}")
    logging.info(f"  Multiallelic sites: {vcf_stats['multiallelic']}")
    logging.info(f"  Phased genotypes: {vcf_stats['phased']}")
    logging.info(f"  Unphased genotypes: {vcf_stats['unphased']}")
    logging.info("  Variant types:")
    for var_type, count in sorted(vcf_stats['by_type'].items()):
        logging.info(f"    {var_type}: {count}")

    return variants, samples, format_fields
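
# Example (illustrative): a data line such as
#
#     chr1    5   rs1 A   G   .   PASS    .   GT  0|1
#
# produces one variant dict containing, among other keys:
#
#     {'id': 'rs1', 'chrom': 'chr1', 'pos': 5, 'ref': 'A', 'alt': 'G',
#      'type': 'SNP', 'end': 5, 'is_phased': True, 'allele_index': 1}
#
# with the sample names taken from the #CHROM header line.
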
def find_segment_for_position(position, segment_offsets):
    """Find which segment contains a given position and calculate the relative position.

    Args:
        position (int): Genome position (1-based)
        segment_offsets (dict): Dictionary of segment offsets

    Returns:
        tuple: (segment_id, relative_position, orientation)
    """
    for seg_id, (start, end, orientation) in segment_offsets.items():
        if start <= position - 1 <= end:  # Convert to 0-based for internal calculations
            rel_pos = position - 1 - start
            # For reverse orientation, recalculate the relative position
            if orientation == '-':
                # Length of the segment
                seg_len = end - start + 1
                # Calculate position from the end of the segment
                rel_pos = seg_len - rel_pos - 1
            return seg_id, rel_pos, orientation
    return None, None, None
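
# Worked example (matching the offsets built above; s2 spans 0-based 4..7 in
# reverse orientation):
#
#     >>> find_segment_for_position(6, {"s1": (0, 3, '+'), "s2": (4, 7, '-')})
#     ('s2', 2, '-')
#
# 1-based position 6 is 0-based 5; rel_pos = 5 - 4 = 1, then flipped to
# 4 - 1 - 1 = 2 because the segment is reverse oriented.
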
def phase_variants(variants, samples):
    """Group variants by their phase blocks.

    Args:
        variants (list): List of variant dictionaries
        samples (list): List of sample names

    Returns:
        tuple: (phased_variants, unphased_variants) where phased_variants maps
            sample name -> phase block ID -> list of variants, and
            unphased_variants maps sample name -> list of variants
    """
    phased_variants = {sample: defaultdict(list) for sample in samples}
    unphased_variants = {sample: [] for sample in samples}

    for variant in variants:
        # Process each sample's genotype
        for genotype in variant.get('genotypes', []):
            sample = genotype['sample']
            phase_type = genotype.get('phase_type', 'unknown')
            # Only process variants that are present in this sample
            if not genotype.get('has_alt', False):
                continue
            if phase_type == 'phased':
                # Use the chromosome as the block ID for phased variants (simplified
                # model); a full implementation would use the PS (phase set) FORMAT
                # field when available
                block_id = variant['chrom']
                phased_variants[sample][block_id].append(variant)
            else:
                # Keep unphased variants separate
                unphased_variants[sample].append(variant)

    return phased_variants, unphased_variants
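
# Shape of the return value (illustrative): for samples ['S1'] and a single
# phased variant on chr1,
#
#     phased   -> {'S1': {'chr1': [<variant dict>]}}
#     unphased -> {'S1': []}
#
# i.e. phased variants are grouped per sample under a chromosome-level block ID.
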
def apply_variants_to_segments(segments, variants, segment_offsets, path_segments,
                               allow_mismatches=False, match_threshold=0.8, parallel=False):
    """Apply variants to segments and create modified segments.

    Args:
        segments (dict): Dictionary of segment ID to sequence
        variants (list): List of variant dictionaries
        segment_offsets (dict): Dictionary of segment ID to (start, end, orientation) tuple
        path_segments (list): List of (segment_id, orientation) tuples
        allow_mismatches (bool): Whether to allow reference mismatches
        match_threshold (float): Minimum sequence similarity for fuzzy matching
        parallel (bool): Whether to use parallel processing

    Returns:
        tuple: (modified_segments, new_path_segments, variant_application_log)
    """
    start_time = time.time()
    logging.info("Applying variants to segments...")

    # Group variants by segment
    variants_by_segment = defaultdict(list)
    unmapped_variants = []
    for variant in variants:
        seg_id, rel_pos, orientation = find_segment_for_position(variant['pos'], segment_offsets)
        if seg_id:
            variants_by_segment[seg_id].append((variant, rel_pos, orientation))
            logging.debug(f"Mapped variant {variant['id']} to segment {seg_id} at relative position {rel_pos}")
        else:
            unmapped_variants.append(variant)
            logging.warning(f"Could not map variant {variant['id']} (pos {variant['pos']}) to any segment")
    if unmapped_variants:
        logging.warning(f"Failed to map {len(unmapped_variants)} variants to segments")

    # Create modified segments
    modified_segments = {}
    variant_application_log = []

    # Function to process a single segment
    def process_segment(seg_id, var_list):
        # Sort by relative position (descending)
        var_list.sort(key=lambda x: x[1], reverse=True)
        # Get the original sequence
        segment_seq = segments[seg_id]
        segment_seq_original = segment_seq  # Keep original for comparison
        logging.info(f"Modifying segment {seg_id} with {len(var_list)} variants")
        segment_log = []

        # Apply variants
        for variant, rel_pos, orientation in var_list:
            expected_ref = segment_seq[rel_pos:rel_pos + len(variant['ref'])]
            # Check for reference match
            if expected_ref != variant['ref']:
                # Try fuzzy matching for mismatches
                alignment_score = 0
                if len(expected_ref) > 0 and len(variant['ref']) > 0:
                    # Use BioPython for sequence alignment to find the best match
                    alignments = pairwise2.align.localms(
                        expected_ref, variant['ref'],
                        2, -1, -2, -0.5,  # Match, mismatch, gap-open, gap-extend scores
                        one_alignment_only=True
                    )
                    if alignments:
                        alignment_score = alignments[0].score / (2 * max(len(expected_ref), len(variant['ref'])))
                if alignment_score < match_threshold:
                    msg = f"Reference mismatch for {variant['id']} in segment {seg_id} at position {rel_pos}."
                    msg += f"\n  Expected: {variant['ref']}, Found: {expected_ref}"
                    msg += f"\n  Similarity score: {alignment_score:.2f}"
                    if not allow_mismatches:
                        logging.error(msg)
                        raise ValueError(f"Reference mismatch for {variant['id']}. Use --allow-mismatches to force application.")
                    else:
                        logging.warning(msg)
                else:
                    logging.info(f"Fuzzy match for {variant['id']}: Score {alignment_score:.2f}")

            # Apply the variant based on type
            old_seq = segment_seq
            if variant['type'] == 'SNP':
                logging.debug(f"Applying SNP {variant['id']}: {variant['ref']} -> {variant['alt']} at position {rel_pos}")
                segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
            elif variant['type'] in ['INV', 'MNP']:
                logging.debug(f"Applying {variant['type']} {variant['id']}: {variant['ref']} -> {variant['alt']} at position {rel_pos}")
                segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
            elif variant['type'] == 'DEL':
                logging.debug(f"Applying DEL {variant['id']}: Removing {len(variant['ref'])} bp at position {rel_pos}")
                segment_seq = segment_seq[:rel_pos] + (variant['alt'] if variant['alt'] != '.' else '') + segment_seq[rel_pos + len(variant['ref']):]
            elif variant['type'] == 'INS':
                logging.debug(f"Applying INS {variant['id']}: Inserting {len(variant['alt'])-1} bp at position {rel_pos}")
                # Handle INS where ref is the base before the insertion
                if len(variant['ref']) == 1:
                    segment_seq = segment_seq[:rel_pos + 1] + variant['alt'][1:] + segment_seq[rel_pos + 1:]
                else:
                    segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
            else:
                logging.debug(f"Applying generic variant {variant['id']} ({variant['type']}): {variant['ref']} -> {variant['alt']} at position {rel_pos}")
                segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]

            # Log the change
            segment_log.append({
                'variant_id': variant['id'],
                'segment_id': seg_id,
                'position': variant['pos'],
                'rel_position': rel_pos,
                'orientation': orientation,
                'type': variant['type'],
                'ref': variant['ref'],
                'alt': variant['alt'],
                'seq_before': old_seq[max(0, rel_pos-5):min(len(old_seq), rel_pos+len(variant['ref'])+5)],
                'seq_after': segment_seq[max(0, rel_pos-5):min(len(segment_seq), rel_pos+len(variant['alt'])+5)]
            })

        # Create a new segment ID
        var_ids = '_'.join(v[0]['id'] for v in var_list)
        new_seg_id = f"{seg_id}_{var_ids}"

        # Store the modified segment
        modified_segment = {
            'new_id': new_seg_id,
            'sequence': segment_seq,
            'original_sequence': segment_seq_original,
            'variants': [v[0] for v in var_list],
            'length_diff': len(segment_seq) - len(segment_seq_original)
        }
        logging.info(f"Created modified segment {new_seg_id}, length diff: {modified_segment['length_diff']} bp")
        return seg_id, modified_segment, segment_log

    # Process segments (either in parallel or serially).
    # Caveat: process_segment is a closure, and multiprocessing must pickle the
    # callable it dispatches; local functions are not picklable, so parallel
    # mode will fail unless the worker is moved to module level.
    if parallel and len(variants_by_segment) > 1:
        logging.info(f"Processing {len(variants_by_segment)} segments in parallel")
        with mp.Pool(processes=min(mp.cpu_count(), len(variants_by_segment))) as pool:
            results = pool.starmap(process_segment, [(seg_id, var_list) for seg_id, var_list in variants_by_segment.items()])
        for seg_id, mod_seg, seg_log in results:
            modified_segments[seg_id] = mod_seg
            variant_application_log.extend(seg_log)
    else:
        for seg_id, var_list in variants_by_segment.items():
            seg_id, mod_seg, seg_log = process_segment(seg_id, var_list)
            modified_segments[seg_id] = mod_seg
            variant_application_log.extend(seg_log)

    # Create a new path with modified segments
    new_path_segments = []
    for seg_id, orientation in path_segments:
        if seg_id in modified_segments:
            new_path_segments.append((modified_segments[seg_id]['new_id'], orientation))
        else:
            new_path_segments.append((seg_id, orientation))

    elapsed = time.time() - start_time
    logging.info(f"Created new path with {len(modified_segments)} modified segments in {elapsed:.2f}s")
    return modified_segments, new_path_segments, variant_application_log
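
# Example usage (illustrative, serial mode; `paths["REF"]` assumes the GFA
# declared a path named REF):
#
#     mods, new_path, log = apply_variants_to_segments(
#         segments, variants, segment_offsets, paths["REF"],
#         allow_mismatches=False, match_threshold=0.8, parallel=False)
#
# `mods` maps each touched original segment ID to its replacement, and
# `new_path` is the reference path with those IDs swapped in.
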
def apply_single_variant(segments, variant, segment_offsets, path_segments, allow_mismatches=False, match_threshold=0.8):
    """Apply a single variant to create a modified path.

    This function is used for unphased variants to create individual paths for
    each variant.

    Args:
        segments (dict): Dictionary of segment ID to sequence
        variant (dict): Single variant dictionary
        segment_offsets (dict): Dictionary of segment offsets
        path_segments (list): Reference path segments
        allow_mismatches (bool): Whether to allow reference mismatches
        match_threshold (float): Threshold for sequence similarity

    Returns:
        tuple: (modified_segment, new_path_segments, log_entry)
    """
    # Find which segment contains this variant
    seg_id, rel_pos, orientation = find_segment_for_position(variant['pos'], segment_offsets)
    if not seg_id:
        logging.warning(f"Could not map variant {variant['id']} to any segment")
        return None, None, None

    # Get the original sequence
    segment_seq = segments[seg_id]
    segment_seq_original = segment_seq

    # Check reference sequence
    expected_ref = segment_seq[rel_pos:rel_pos + len(variant['ref'])]
    if expected_ref != variant['ref']:
        # Try fuzzy matching
        alignment_score = 0
        if len(expected_ref) > 0 and len(variant['ref']) > 0:
            alignments = pairwise2.align.localms(
                expected_ref, variant['ref'],
                2, -1, -2, -0.5,
                one_alignment_only=True
            )
            if alignments:
                alignment_score = alignments[0].score / (2 * max(len(expected_ref), len(variant['ref'])))
        if alignment_score < match_threshold:
            msg = f"Reference mismatch for {variant['id']} in segment {seg_id}."
            msg += f"\n  Expected: {variant['ref']}, Found: {expected_ref}"
            if not allow_mismatches:
                logging.error(msg)
                return None, None, None
            else:
                logging.warning(msg)

    # Apply the variant based on type
    if variant['type'] == 'SNP':
        segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
    elif variant['type'] in ['INV', 'MNP']:
        segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
    elif variant['type'] == 'DEL':
        segment_seq = segment_seq[:rel_pos] + (variant['alt'] if variant['alt'] != '.' else '') + segment_seq[rel_pos + len(variant['ref']):]
    elif variant['type'] == 'INS':
        if len(variant['ref']) == 1:
            segment_seq = segment_seq[:rel_pos + 1] + variant['alt'][1:] + segment_seq[rel_pos + 1:]
        else:
            segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]
    else:
        segment_seq = segment_seq[:rel_pos] + variant['alt'] + segment_seq[rel_pos + len(variant['ref']):]

    # Create a new segment ID
    new_seg_id = f"{seg_id}_{variant['id']}"

    # Store the modified segment
    modified_segment = {
        'seg_id': seg_id,
        'new_id': new_seg_id,
        'sequence': segment_seq,
        'original_sequence': segment_seq_original,
        'variants': [variant],
        'length_diff': len(segment_seq) - len(segment_seq_original)
    }

    # Create a new path
    new_path_segments = []
    for curr_seg_id, orient in path_segments:
        if curr_seg_id == seg_id:
            new_path_segments.append((new_seg_id, orient))
        else:
            new_path_segments.append((curr_seg_id, orient))

    # Create log entry
    log_entry = {
        'variant_id': variant['id'],
        'segment_id': seg_id,
        'position': variant['pos'],
        'rel_position': rel_pos,
        'orientation': orientation,
        'type': variant['type'],
        'ref': variant['ref'],
        'alt': variant['alt'],
        'seq_before': segment_seq_original[max(0, rel_pos-5):min(len(segment_seq_original), rel_pos+len(variant['ref'])+5)],
        'seq_after': segment_seq[max(0, rel_pos-5):min(len(segment_seq), rel_pos+len(variant['alt'])+5)]
    }

    return modified_segment, new_path_segments, log_entry
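
# Example usage (illustrative): build one alternate path per unphased variant,
# e.g. after phase_variants() has split them out per sample.
#
#     for variant in unphased["S1"]:
#         mod_seg, var_path, entry = apply_single_variant(
#             segments, variant, segment_offsets, paths["REF"])
#         if mod_seg is not None:
#             ...  # collect mod_seg/var_path for writing with generate_new_gfa
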
def generate_new_gfa(original_segments, modified_segments, links, paths, new_path_segments,
                     output_file, alt_path_name="ALT", keep_original=True, add_metadata=True,
                     sample_paths=None):
    """Generate a new GFA file with the modified segments and paths.

    Args:
        original_segments (dict): Dictionary of original segment ID to sequence
        modified_segments (dict): Dictionary of modified segment information
        links (list): List of link tuples
        paths (dict): Dictionary of path name to segments
        new_path_segments (list): List of (segment_id, orientation) tuples for new path
        output_file (str): Path to output GFA file
        alt_path_name (str): Name for the alternate path
        keep_original (bool): Whether to keep original segments and paths
        add_metadata (bool): Whether to add metadata tags to GFA
        sample_paths (dict): Optional dictionary of sample -> path name -> path segments
            for multi-sample VCF support
    """
    start_time = time.time()
    logging.info(f"Writing new GFA to {output_file}")

    with open(output_file, 'w') as f:
        # Write header with metadata
        if add_metadata:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            f.write(f"H\tVN:Z:1.0\tTS:Z:{timestamp}\tPG:Z:vcf_to_gfa.py\n")
        else:
            f.write("H\tVN:Z:1.0\n")

        # Collect all segments to write (original + modified)
        all_segments = {}
        if keep_original:
            all_segments.update(original_segments)

        # Collect segments from sample paths if provided
        if sample_paths:
            for sample, path_dict in sample_paths.items():
                for path_name, path_info in path_dict.items():
                    if 'segments' in path_info:
                        for seg in path_info['segments']:
                            if isinstance(seg, dict) and 'new_id' in seg:
                                all_segments[seg['new_id']] = seg['sequence']

        # Add segments from the main modified segments dict
        for seg_id, mod_seg in modified_segments.items():
            new_id = mod_seg['new_id']
            all_segments[new_id] = mod_seg['sequence']

        # Write all segments
        segments_written = 0
        for seg_id, sequence in all_segments.items():
            # Check if this is a modified segment
            is_modified = False
            mod_data = None
            # Look in main modified segments
            for orig_id, mod_seg in modified_segments.items():
                if mod_seg['new_id'] == seg_id:
                    is_modified = True
                    mod_data = mod_seg
                    break
            # Look in sample paths if not found
            if not is_modified and sample_paths:
                for sample, path_dict in sample_paths.items():
                    for path_name, path_info in path_dict.items():
                        if 'segments' in path_info:
                            for seg in path_info['segments']:
                                if isinstance(seg, dict) and seg.get('new_id') == seg_id:
                                    is_modified = True
                                    mod_data = seg
                                    break

            # Write segment with appropriate tags (optional GFA tags are
            # tab-separated per the GFA spec)
            if is_modified and mod_data:
                # Add segment metadata
                tags = [f"LN:i:{len(sequence)}"]
                if add_metadata:
                    # Add variant metadata
                    variant_ids = ';'.join(v['id'] for v in mod_data['variants'])
                    tags.append(f"VA:Z:{variant_ids}")
                    # Add length difference
                    if mod_data.get('length_diff', 0) != 0:
                        tags.append(f"LD:i:{mod_data['length_diff']}")
                    # Add original segment reference if available
                    if 'seg_id' in mod_data:
                        tags.append(f"OR:Z:{mod_data['seg_id']}")
                tag_str = '\t'.join(tags)
                f.write(f"S\t{seg_id}\t{sequence}\t{tag_str}\n")
            else:
                # Write regular segment
                f.write(f"S\t{seg_id}\t{sequence}\tLN:i:{len(sequence)}\n")
            segments_written += 1
            logging.debug(f"Wrote segment {seg_id}")

        # Write original links if requested
        links_written = 0
        if keep_original:
            for link_data in links:
                # Handle both old and new link formats
                if len(link_data) == 5:
                    from_id, from_dir, to_id, to_dir, overlap = link_data
                    tags = {}
                else:
                    from_id, from_dir, to_id, to_dir, overlap, tags = link_data
                # Convert tags to string format (tab-separated per the GFA spec)
                tag_str = ""
                if tags:
                    tag_str = "\t" + "\t".join(f"{name}:{type_val[0]}:{type_val[1]}" for name, type_val in tags.items())
                f.write(f"L\t{from_id}\t{from_dir}\t{to_id}\t{to_dir}\t{overlap}{tag_str}\n")
                links_written += 1
                logging.debug(f"Wrote original link {from_id}{from_dir} -> {to_id}{to_dir}")

        # Helper function to create links for a path
        def create_links_for_path(path):
            links_created = 0
            for i in range(len(path) - 1):
                # Handle both tuple and dict formats for segments
                if isinstance(path[i], tuple):
                    from_seg, from_dir = path[i]
                else:
                    from_seg, from_dir = path[i]['new_id'], path[i]['orientation']
                if isinstance(path[i+1], tuple):
                    to_seg, to_dir = path[i+1]
                else:
                    to_seg, to_dir = path[i+1]['new_id'], path[i+1]['orientation']
                # Create link
                tag_str = "\tSV:Z:variant" if add_metadata else ""
                f.write(f"L\t{from_seg}\t{from_dir}\t{to_seg}\t{to_dir}\t0M{tag_str}\n")
                links_created += 1
                logging.debug(f"Created link {from_seg}{from_dir} -> {to_seg}{to_dir}")
            return links_created

        # Identify the reference path (fall back to the first path if REF is absent)
        ref_path_name = "REF" if "REF" in paths else next(iter(paths.keys()))

        # Create links for the main new path
        new_links_added = 0
        if new_path_segments:
            new_links_added = create_links_for_path(new_path_segments)
            links_written += new_links_added
            logging.info(f"Added {new_links_added} links for main path")

        # Create links for sample paths
        if sample_paths:
            for sample, path_dict in sample_paths.items():
                for path_name, path_info in path_dict.items():
                    if 'segments' in path_info:
                        sample_links = create_links_for_path(path_info['segments'])
                        links_written += sample_links
                        logging.info(f"Added {sample_links} links for {sample} path {path_name}")

        # Write original paths if requested
        paths_written = 0
        if keep_original:
            for path_name, path_segs in paths.items():
                segments_str = ','.join(f"{seg}{orient}" for seg, orient in path_segs)
                f.write(f"P\t{path_name}\t{segments_str}\t*\n")