@@ -50,62 +50,59 @@ def __init__(
50
50
include_non_sample_nodes ,
51
51
):
52
52
self .tree_sequence = tree_sequence
53
- self .contig_id = contig_id
54
- self .isolated_as_missing = isolated_as_missing
55
53
56
54
vcf_model = tree_sequence .map_to_vcf_model (
57
55
individuals = individuals ,
58
56
ploidy = ploidy ,
59
57
individual_names = individual_names ,
60
58
include_non_sample_nodes = include_non_sample_nodes ,
61
59
position_transform = position_transform ,
60
+ contig_id = contig_id ,
61
+ isolated_as_missing = isolated_as_missing ,
62
62
)
63
+
64
+ # We now make some tweaks to the VCF model required for
65
+ # writing the VCF in text format
66
+
63
67
# Remove individuals with zero ploidy as these cannot be
64
68
# represented in VCF.
65
- individuals_nodes = vcf_model .individuals_nodes
66
- to_keep = (individuals_nodes != - 1 ).any (axis = 1 )
67
- individuals_nodes = individuals_nodes [to_keep ]
68
- self .individual_names = vcf_model .individuals_name [to_keep ]
69
+ to_keep = (vcf_model .individuals_nodes != - 1 ).any (axis = 1 )
70
+ vcf_model .individuals_nodes = vcf_model .individuals_nodes [to_keep ]
71
+ vcf_model .individual_names = vcf_model .individuals_name [to_keep ]
69
72
self .individual_ploidies = [
70
- len (nodes [nodes >= 0 ]) for nodes in individuals_nodes
73
+ len (nodes [nodes >= 0 ]) for nodes in vcf_model . individuals_nodes
71
74
]
72
- self .num_individuals = len (self .individual_names )
75
+ self .num_individuals = len (vcf_model .individual_names )
73
76
74
- if len (individuals_nodes ) == 0 :
77
+ if len (vcf_model . individuals_nodes ) == 0 :
75
78
raise ValueError ("No samples in resulting VCF model" )
76
79
80
+ if len (vcf_model .transformed_positions ) > 0 :
81
+ # Arguably this should be last_pos + 1, but if we hit this
82
+ # condition the coordinate systems are all muddled up anyway
83
+ # so it's simpler to stay with this rule that was inherited
84
+ # from the legacy VCF output code.
85
+ vcf_model .contig_length = max (
86
+ vcf_model .transformed_positions [- 1 ], vcf_model .contig_length
87
+ )
88
+
77
89
# Flatten the array of node IDs, filtering out the -1 padding values
78
90
self .samples = []
79
- for row in individuals_nodes :
91
+ for row in vcf_model . individuals_nodes :
80
92
for node_id in row :
81
93
if node_id != - 1 :
82
94
self .samples .append (node_id )
83
95
84
- self .transformed_positions = vcf_model .transformed_positions
85
- self .contig_length = vcf_model .contig_length
86
- if len (self .transformed_positions ) > 0 :
87
- # Arguably this should be last_pos + 1, but if we hit this
88
- # condition the coordinate systems are all muddled up anyway
89
- # so it's simpler to stay with this rule that was inherited
90
- # from the legacy VCF output code.
91
- self .contig_length = max (self .transformed_positions [- 1 ], self .contig_length )
92
-
93
96
if site_mask is None :
94
97
site_mask = np .zeros (tree_sequence .num_sites , dtype = bool )
95
98
self .site_mask = np .array (site_mask , dtype = bool )
96
99
if self .site_mask .shape != (tree_sequence .num_sites ,):
97
100
raise ValueError ("Site mask must be 1D a boolean array of length num_sites" )
98
101
99
- self .sample_mask = sample_mask
100
- if sample_mask is not None :
101
- if not callable (sample_mask ):
102
- sample_mask = np .array (sample_mask , dtype = bool )
103
- self .sample_mask = lambda _ : sample_mask
104
-
105
102
# The VCF spec does not allow for positions to be 0, so we error if one of the
106
103
# transformed positions is 0 and allow_position_zero is False.
107
104
if not allow_position_zero and np .any (
108
- self .transformed_positions [~ site_mask ] == 0
105
+ vcf_model .transformed_positions [~ site_mask ] == 0
109
106
):
110
107
raise ValueError (
111
108
"A variant position of 0 was found in the VCF output, this is not "
@@ -116,17 +113,26 @@ def __init__(
116
113
'"position_transform = lambda x: np.fmax(1, x)"'
117
114
)
118
115
116
+ self .sample_mask = sample_mask
117
+ if sample_mask is not None :
118
+ if not callable (sample_mask ):
119
+ sample_mask = np .array (sample_mask , dtype = bool )
120
+ self .sample_mask = lambda _ : sample_mask
121
+
122
+ self .vcf_model = vcf_model
123
+
119
124
def __write_header (self , output ):
120
125
print ("##fileformat=VCFv4.2" , file = output )
121
126
print (f"##source=tskit { provenance .__version__ } " , file = output )
122
127
print ('##FILTER=<ID=PASS,Description="All filters passed">' , file = output )
123
128
print (
124
- f"##contig=<ID={ self .contig_id } ,length={ self .contig_length } >" , file = output
129
+ f"##contig=<ID={ self .vcf_model .contig_id } ,length={ self .vcf_model .contig_length } >" ,
130
+ file = output ,
125
131
)
126
132
print (
127
133
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' , file = output
128
134
)
129
- vcf_samples = "\t " .join (self .individual_names )
135
+ vcf_samples = "\t " .join (self .vcf_model . individual_names )
130
136
print (
131
137
"#CHROM" ,
132
138
"POS" ,
@@ -163,7 +169,7 @@ def write(self, output):
163
169
indexes = np .array (indexes , dtype = int )
164
170
165
171
for variant in self .tree_sequence .variants (
166
- samples = self .samples , isolated_as_missing = self .isolated_as_missing
172
+ samples = self .samples , isolated_as_missing = self .vcf_model . isolated_as_missing
167
173
):
168
174
site_id = variant .site .id
169
175
# We check the mask before we do any checks so we can use this as a
@@ -176,13 +182,13 @@ def write(self, output):
176
182
"More than 9 alleles not currently supported. Please open an issue "
177
183
"on GitHub if this limitation affects you."
178
184
)
179
- pos = self .transformed_positions [variant .index ]
185
+ pos = self .vcf_model . transformed_positions [variant .index ]
180
186
ref = variant .alleles [0 ]
181
187
alt = "."
182
188
if variant .num_alleles > 1 :
183
189
alt = "," .join (variant .alleles [1 : variant .num_alleles ])
184
190
print (
185
- self .contig_id ,
191
+ self .vcf_model . contig_id ,
186
192
pos ,
187
193
site_id ,
188
194
ref ,
0 commit comments