Skip to content

Commit 5f5d692

Browse files
authored
VCF-53: Fix read_variant_stats() segfault when region is empty (#871)
* Added edge cases to Python Arrow bindings These edge cases are to handle when the buffers returned by a dataset read are empty. Specifically, an array's offsets must still contain the initial 0 value even when there's no data, and a string array must always have 3 buffers, even when there's no data. * The allele count and variant stats Python bindings now return empty arrow tables Previously the allele count binding would return None and the variant stats binding would try to parse the empty buffers returned by the reader, causing a segmentation fault. * Added empty region tests for read_variant_stats() and read_allele_count() These tests ensure that both methods return an empty DataFrame when the regions are empty or don't match the constraints of the query.
1 parent 7be3e54 commit 5f5d692

File tree

3 files changed

+63
-48
lines changed

3 files changed

+63
-48
lines changed

apis/python/src/tiledbvcf/binding/reader.cc

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -516,16 +516,18 @@ py::object Reader::get_variant_stats_results() {
516516
auto an = BufferInfo::create("an", TILEDB_VCF_INT32, num_rows);
517517
auto af = BufferInfo::create("af", TILEDB_VCF_FLOAT32, num_rows);
518518

519-
check_error(
520-
reader,
521-
tiledb_vcf_reader_read_from_variant_stats(
522-
reader,
523-
reinterpret_cast<uint32_t*>(pos->data().data()),
524-
reinterpret_cast<char*>(allele->data().data()),
525-
reinterpret_cast<int32_t*>(allele->offsets().data()),
526-
reinterpret_cast<int*>(ac->data().data()),
527-
reinterpret_cast<int*>(an->data().data()),
528-
reinterpret_cast<float*>(af->data().data())));
519+
if (num_rows > 0) {
520+
check_error(
521+
reader,
522+
tiledb_vcf_reader_read_from_variant_stats(
523+
reader,
524+
reinterpret_cast<uint32_t*>(pos->data().data()),
525+
reinterpret_cast<char*>(allele->data().data()),
526+
reinterpret_cast<int32_t*>(allele->offsets().data()),
527+
reinterpret_cast<int*>(ac->data().data()),
528+
reinterpret_cast<int*>(an->data().data()),
529+
reinterpret_cast<float*>(af->data().data())));
530+
}
529531

530532
build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
531533
build_arrow_array_from_buffer(allele, num_rows, 0, alleles_size);
@@ -546,15 +548,16 @@ py::object Reader::get_allele_count_results() {
546548
reader,
547549
tiledb_vcf_reader_get_allele_count_buffer_sizes(
548550
reader, &num_rows, &refs_size, &alts_size, &filters_size, &gts_size));
549-
if (num_rows > 0) {
550-
auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows);
551-
auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size);
552-
auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size);
553-
auto filter =
554-
BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size);
555-
auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size);
556-
auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows);
557551

552+
auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows);
553+
auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size);
554+
auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size);
555+
auto filter =
556+
BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size);
557+
auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size);
558+
auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows);
559+
560+
if (num_rows > 0) {
558561
check_error(
559562
reader,
560563
tiledb_vcf_reader_read_from_allele_count(
@@ -569,19 +572,18 @@ py::object Reader::get_allele_count_results() {
569572
reinterpret_cast<char*>(gt->data().data()),
570573
reinterpret_cast<uint32_t*>(gt->offsets().data()),
571574
reinterpret_cast<int32_t*>(count->data().data())));
572-
573-
build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
574-
build_arrow_array_from_buffer(ref, num_rows, 0, num_rows);
575-
build_arrow_array_from_buffer(alt, num_rows, 0, num_rows);
576-
build_arrow_array_from_buffer(filter, num_rows, 0, num_rows);
577-
build_arrow_array_from_buffer(gt, num_rows, 0, num_rows);
578-
build_arrow_array_from_buffer(count, num_rows, 0, num_rows);
579-
580-
std::vector<std::shared_ptr<BufferInfo>> buffers = {
581-
pos, ref, alt, filter, gt, count};
582-
return buffers_to_table(buffers);
583575
}
584-
return py::cast<py::none> Py_None;
576+
577+
build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
578+
build_arrow_array_from_buffer(ref, num_rows, 0, num_rows);
579+
build_arrow_array_from_buffer(alt, num_rows, 0, num_rows);
580+
build_arrow_array_from_buffer(filter, num_rows, 0, num_rows);
581+
build_arrow_array_from_buffer(gt, num_rows, 0, num_rows);
582+
build_arrow_array_from_buffer(count, num_rows, 0, num_rows);
583+
584+
std::vector<std::shared_ptr<BufferInfo>> buffers = {
585+
pos, ref, alt, filter, gt, count};
586+
return buffers_to_table(buffers);
585587
}
586588

587589
void Reader::deleter(tiledb_vcf_reader_t* r) {

apis/python/src/tiledbvcf/binding/vcf_arrow.cc

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -141,32 +141,32 @@ static std::pair<ArrowSchema*, ArrowArray*> arrow_string_array(
141141
schema->release = &release_schema;
142142
schema->private_data = nullptr;
143143

144-
int n_buffers = offsets.capacity() ? 3 : 2;
145-
146144
auto array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray));
147145
array->length = length;
148146
array->null_count = 0;
149147
array->offset = 0;
150-
array->n_buffers = n_buffers;
148+
array->n_buffers = 3;
151149
array->n_children = 0;
152150
array->buffers = nullptr;
153151
array->children = nullptr;
154152
array->dictionary = nullptr;
155153
array->release = &release_array;
156154
array->private_data = (void*)new ArrowBuffer(buffer);
157155

158-
array->buffers = (const void**)malloc(n_buffers * sizeof(void*));
159-
array->buffers[0] = nullptr; // validity
160-
array->buffers[n_buffers - 1] = data.data(); // data
161-
if (n_buffers == 3) {
162-
array->buffers[1] = offsets.data(); // offsets
163-
}
164-
156+
array->buffers = (const void**)malloc(3 * sizeof(void*));
165157
if (bitmap.capacity()) {
166158
schema->flags |= ARROW_FLAG_NULLABLE;
167159
array->null_count = -1;
168160
array->buffers[0] = bitmap.data();
161+
} else {
162+
array->buffers[0] = nullptr; // validity
163+
}
164+
// Edge case: empty results should still have a single 0 offset
165+
if (!length && !offsets.size()) {
166+
offsets.push_back(0);
169167
}
168+
array->buffers[1] = offsets.data(); // offsets
169+
array->buffers[2] = data.data(); // data
170170

171171
return std::make_pair(schema, array);
172172
}
@@ -205,7 +205,11 @@ static std::pair<ArrowSchema*, ArrowArray*> arrow_list_array(
205205
array->private_data = nullptr; // Buffers will be deleted by the child array
206206

207207
array->buffers = (const void**)malloc(array->n_buffers * sizeof(void*));
208-
array->buffers[0] = nullptr; // validity
208+
array->buffers[0] = nullptr; // validity
209+
// Edge case: empty results should still have a single 0 offset
210+
if (!length && !value_offsets.size()) {
211+
value_offsets.push_back(0);
212+
}
209213
array->buffers[1] = value_offsets.data(); // data
210214

211215
if (bitmap.capacity()) {
@@ -234,12 +238,12 @@ void build_arrow_array(
234238
auto [values_schema, values_array] =
235239
arrow_data_array(buffer, num_data_elements, buffer->data());
236240

241+
// Edge case: only subtract if there's offsets to avoid underflow
242+
if (num_offsets) {
243+
num_offsets -= 1;
244+
}
237245
std::tie(array_schema, array_array) = arrow_list_array(
238-
buffer,
239-
num_offsets - 1,
240-
buffer->offsets(),
241-
values_schema,
242-
values_array);
246+
buffer, num_offsets, buffer->offsets(), values_schema, values_array);
243247
} else {
244248
std::tie(array_schema, array_array) =
245249
arrow_data_array(buffer, num_data_elements, buffer->data());
@@ -278,9 +282,13 @@ void build_arrow_array_from_buffer(
278282
uint64_t num_data_elements) {
279283
if (buffer->datatype() == TILEDB_VCF_CHAR) {
280284
if (buffer->list_offsets().capacity() > 0) {
285+
// Edge case: only subtract if there's offsets to avoid underflow
286+
if (num_offsets) {
287+
num_offsets -= 1;
288+
}
281289
// Array of strings
282290
auto [values_schema, values_array] = arrow_string_array(
283-
buffer, num_offsets - 1, buffer->offsets(), buffer->data());
291+
buffer, num_offsets, buffer->offsets(), buffer->data());
284292

285293
// Array of lists of strings
286294
auto [arrow_schema, arrow_array] = arrow_list_array(

apis/python/tests/test_tiledbvcf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,9 @@ def test_ingest_with_stats_v3(
13441344
with pytest.raises(Exception, match=interval_error):
13451345
test_stats_v3_ingestion.read_variant_stats_arrow(regions=["chr1:100-1"])
13461346

1347+
# test empty region
1348+
assert test_stats_v3_ingestion.read_variant_stats(regions=["chr3:1-10000"]).empty
1349+
13471350
# test types and deprecated region parameter
13481351
region1 = "chr1:1-10000"
13491352
df = test_stats_v3_ingestion.read_variant_stats(region1)
@@ -1551,7 +1554,8 @@ def test_ingest_with_stats_v3(
15511554
with pytest.raises(Exception, match=interval_error):
15521555
test_stats_v3_ingestion.read_allele_count_arrow(regions=["chr1:100-1"])
15531556

1554-
# test allele count
1557+
# test empty region
1558+
assert test_stats_v3_ingestion.read_allele_count(regions=["chr3:1-10000"]).empty
15551559

15561560
# test types and deprecated region parameter
15571561
region1 = "chr1:1-10000"
@@ -2458,6 +2462,7 @@ def test_delete_dataset(tmp_path):
24582462
# Check that the dataset does not exist
24592463
assert not os.path.exists(uri)
24602464

2465+
24612466
def test_equality_old_new_format():
24622467
old_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/old_format"))
24632468
new_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/new_format"))

0 commit comments

Comments
 (0)