Skip to content

Commit

Permalink
fix: Add describe_vcf and register_view methods (#105)
Browse files Browse the repository at this point in the history
* Add describe_vcf method

* fix: Add register_view operation
  • Loading branch information
mwiewior authored Mar 5, 2025
1 parent b4972f4 commit ab9bfa7
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 12 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars_bio"
version = "0.7.1"
version = "0.7.2"
edition = "2021"

[lib]
Expand Down Expand Up @@ -36,9 +36,10 @@ polars-python = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca

#exon ="0.32.4"
exon = { git = "https://github.com/mwiewior/exon.git", rev="d134d923e6c592a9972d93215a12c759c70a7ed5"}
datafusion-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git/", rev = "290df122121b90c02bd07260f1f4539e254a82d7"}
datafusion-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git/", rev = "4d7e72b4588025d3a020b1a55a16f6484ad846d9"}
async-trait = "0.1.86"
futures = "0.3.31"
coitrees = "0.4.0"
fnv = "1.0.7"
async-stream = "0.3.6"
rand = "0.8.5"
57 changes: 51 additions & 6 deletions docs/notebooks/cookbook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
"id": "62a7b57c30bf54e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-01T18:19:10.918443Z",
"start_time": "2025-03-01T18:19:08.913734Z"
"end_time": "2025-03-05T16:41:54.268168Z",
"start_time": "2025-03-05T16:41:53.664194Z"
}
},
"source": [
Expand All @@ -38,15 +38,15 @@
]
}
],
"execution_count": 1
"execution_count": 2
},
{
"cell_type": "code",
"id": "18e876b10c939ec",
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:47:06.608479Z",
"start_time": "2025-02-28T11:47:06.606197Z"
"end_time": "2025-03-05T16:41:56.359878Z",
"start_time": "2025-03-05T16:41:56.357616Z"
}
},
"source": [
Expand All @@ -55,7 +55,7 @@
")"
],
"outputs": [],
"execution_count": 2
"execution_count": 3
},
{
"cell_type": "code",
Expand Down Expand Up @@ -112,6 +112,51 @@
],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-05T16:42:08.359227Z",
"start_time": "2025-03-05T16:41:58.979371Z"
}
},
"cell_type": "code",
"source": "pb.describe_vcf(gcs_vcf_path).sort(\"name\").limit(5)",
"id": "26feaf5bfad670b3",
"outputs": [
{
"data": {
"text/plain": [
"shape: (5, 3)\n",
"┌───────────┬─────────┬─────────────────────────────────┐\n",
"│ name ┆ type ┆ description │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str │\n",
"╞═══════════╪═════════╪═════════════════════════════════╡\n",
"│ ac ┆ Integer ┆ Number of non-reference allele… │\n",
"│ ac_afr ┆ Integer ┆ Number of non-reference Africa… │\n",
"│ ac_afr_xx ┆ Integer ┆ Number of non-reference Africa… │\n",
"│ ac_afr_xy ┆ Integer ┆ Number of non-reference Africa… │\n",
"│ ac_ami ┆ Integer ┆ Number of non-reference Amish … │\n",
"└───────────┴─────────┴─────────────────────────────────┘"
],
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (5, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>name</th><th>type</th><th>description</th></tr><tr><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;ac&quot;</td><td>&quot;Integer&quot;</td><td>&quot;Number of non-reference allele…</td></tr><tr><td>&quot;ac_afr&quot;</td><td>&quot;Integer&quot;</td><td>&quot;Number of non-reference Africa…</td></tr><tr><td>&quot;ac_afr_xx&quot;</td><td>&quot;Integer&quot;</td><td>&quot;Number of non-reference Africa…</td></tr><tr><td>&quot;ac_afr_xy&quot;</td><td>&quot;Integer&quot;</td><td>&quot;Number of non-reference Africa…</td></tr><tr><td>&quot;ac_ami&quot;</td><td>&quot;Integer&quot;</td><td>&quot;Number of non-reference Amish …</td></tr></tbody></table></div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"cell_type": "markdown",
"id": "34e7074ab7737e5f",
Expand Down
4 changes: 4 additions & 0 deletions polars_bio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from .context import ctx, set_option
from .io import (
describe_vcf,
read_bam,
read_fasta,
read_fastq,
read_table,
read_vcf,
register_vcf,
register_view,
sql,
)
from .polars_ext import PolarsRangesOperations as LazyFrame
Expand All @@ -32,6 +34,8 @@
"read_fastq",
"read_table",
"register_vcf",
"describe_vcf",
"register_view",
"sql",
"InputFormat",
"LazyFrame",
Expand Down
70 changes: 70 additions & 0 deletions polars_bio/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
InputFormat,
ReadOptions,
VcfReadOptions,
py_describe_vcf,
py_read_sql,
py_read_table,
py_register_table,
py_register_view,
py_scan_sql,
py_scan_table,
)
Expand Down Expand Up @@ -204,6 +206,39 @@ def read_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
return df


def describe_vcf(path: str) -> pl.DataFrame:
    """
    Return the INFO-field schema of a VCF file as an eager Polars DataFrame.

    The header is read by the native `py_describe_vcf` binding using the shared
    session context, and the result is converted to Polars. As the example below
    shows, the frame has one row per INFO field with `name`, `type` and
    `description` columns.

    Parameters:
        path: The path to the VCF file.

    !!! Example
        ```python
        import polars_bio as pb
        vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
        pb.describe_vcf(vcf_1).sort("name").limit(5)
        ```
        ```shell
        shape: (5, 3)
        ┌───────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────┐
        │ name ┆ type ┆ description │
        │ --- ┆ --- ┆ --- │
        │ str ┆ str ┆ str │
        ╞═══════════╪═════════╪══════════════════════════════════════════════════════════════════════════════════════╡
        │ ac ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only). │
        │ ac_afr ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only). │
        │ ac_afr_xx ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). │
        │ ac_afr_xy ┆ Integer ┆ Number of non-reference African-American XY alleles observed (biallelic sites only). │
        │ ac_ami ┆ Integer ┆ Number of non-reference Amish alleles observed (biallelic sites only). │
        └───────────┴─────────┴──────────────────────────────────────────────────────────────────────────────────────┘
        ```
    """
    # The native call hands back a DataFusion dataframe wrapper; convert it last.
    info_schema = py_describe_vcf(ctx, path)
    return info_schema.to_polars()


def register_vcf(
path: str,
name: Union[str, None] = None,
Expand Down Expand Up @@ -242,6 +277,41 @@ def register_vcf(
py_register_table(ctx, path, name, InputFormat.Vcf, read_options)


def register_view(name: str, query: str) -> None:
    """
    Register a SQL query as a DataFusion view in the shared session context.

    A view can be used as an input to genomic ranges operations such as overlap,
    nearest, and count_overlaps. It is useful for filtering, transforming, and
    aggregating data before the range operation runs. Combined with a range
    operation, it allows complex processing to be performed end-to-end in a
    streaming fashion.

    Parameters:
        name: The name of the view.
        query: The SQL query that defines the view.

    !!! Example
        ```python
        import polars_bio as pb
        pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
        pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
        pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
        ```
        ```shell
        shape: (5, 3)
        ┌───────┬─────────┬─────────┐
        │ chrom ┆ start ┆ end │
        │ --- ┆ --- ┆ --- │
        │ str ┆ u32 ┆ u32 │
        ╞═══════╪═════════╪═════════╡
        │ 21 ┆ 5031905 ┆ 5031905 │
        │ 21 ┆ 5031905 ┆ 5031905 │
        │ 21 ┆ 5031909 ┆ 5031909 │
        │ 21 ┆ 5031911 ┆ 5031911 │
        │ 21 ┆ 5031911 ┆ 5031911 │
        └───────┴─────────┴─────────┘
        ```
    """
    # Registration is delegated entirely to the native binding; nothing is returned.
    py_register_view(ctx, name, query)
    return None


def sql(query: str, streaming: bool = False) -> pl.LazyFrame:
"""
Execute a SQL query on the registered tables.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "polars-bio"
version = "0.7.1"
version = "0.7.2"
description = "Blazing fast genomic operations on large Python dataframes"
authors = []
requires-python = ">=3.9"
Expand Down
47 changes: 46 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ use std::sync::{Arc, Mutex};

use datafusion::arrow::ffi_stream::ArrowArrayStreamReader;
use datafusion::arrow::pyarrow::PyArrowType;
use datafusion::datasource::MemTable;
use datafusion_python::dataframe::PyDataFrame;
use datafusion_vcf::storage::VcfReader;
use log::{debug, error, info};
use polars_lazy::prelude::{LazyFrame, ScanArgsAnonymous};
use polars_python::error::PyPolarsErr;
Expand Down Expand Up @@ -342,18 +344,61 @@ fn py_read_table(
})
}

#[pyfunction]
#[pyo3(signature = (py_ctx, path))]
fn py_describe_vcf(
py: Python<'_>,
py_ctx: &PyBioSessionContext,
path: String,
) -> PyResult<PyDataFrame> {
py.allow_threads(|| {
let rt = Runtime::new().unwrap();
let ctx = &py_ctx.ctx.session;

let df = rt.block_on(async {
let mut reader = VcfReader::new(path, None, Some(64), Some(8)).await;
let rb = reader.describe().await.unwrap();
let mem_table = MemTable::try_new(rb.schema().clone(), vec![vec![rb]]).unwrap();
let random_table_name = format!("vcf_schema_{}", rand::random::<u32>());
ctx.register_table(random_table_name.clone(), Arc::new(mem_table))
.unwrap();
let df = ctx.table(random_table_name).await.unwrap();
df
});
Ok(PyDataFrame::new(df))
})
}

#[pyfunction]
#[pyo3(signature = (py_ctx, name, query))]
fn py_register_view(
py: Python<'_>,
py_ctx: &PyBioSessionContext,
name: String,
query: String,
) -> PyResult<()> {
py.allow_threads(|| {
let rt = Runtime::new().unwrap();
let ctx = &py_ctx.ctx;
rt.block_on(ctx.sql(&format!("CREATE OR REPLACE VIEW {} AS {}", name, query)))
.unwrap();
Ok(())
})
}

#[pymodule]
fn polars_bio(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
pyo3_log::init();
m.add_function(wrap_pyfunction!(range_operation_frame, m)?)?;
m.add_function(wrap_pyfunction!(range_operation_scan, m)?)?;
m.add_function(wrap_pyfunction!(stream_range_operation_scan, m)?)?;
m.add_function(wrap_pyfunction!(py_read_table, m)?)?;
m.add_function(wrap_pyfunction!(py_register_table, m)?)?;
m.add_function(wrap_pyfunction!(py_read_table, m)?)?;
m.add_function(wrap_pyfunction!(py_read_sql, m)?)?;
m.add_function(wrap_pyfunction!(py_scan_sql, m)?)?;
m.add_function(wrap_pyfunction!(py_scan_table, m)?)?;
m.add_function(wrap_pyfunction!(py_describe_vcf, m)?)?;
m.add_function(wrap_pyfunction!(py_register_view, m)?)?;
// m.add_function(wrap_pyfunction!(unary_operation_scan, m)?)?;
m.add_class::<PyBioSessionContext>()?;
m.add_class::<FilterOp>()?;
Expand Down

0 comments on commit ab9bfa7

Please sign in to comment.