Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/releasenotes.md
83 changes: 59 additions & 24 deletions src/cooler/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from __future__ import annotations

import io
import os
import re
from collections import OrderedDict, defaultdict
from collections.abc import Generator, Iterable, Iterator
from contextlib import contextmanager
from typing import IO, Any
from typing import Any

import h5py
import numpy as np
Expand Down Expand Up @@ -204,56 +205,90 @@ def argnatsort(array: Iterable[str]) -> np.ndarray:
cols = tuple(zip(*(natsort_key(x) for x in array)))
return np.lexsort(cols[::-1])


def read_chromsizes(
filepath_or: str | IO[str],
filepath_or: str | io.StringIO,
name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
all_names: bool = False,
**kwargs,
) -> pd.Series:
"""
Parse a ``<db>.chrom.sizes`` or ``<db>.chromInfo.txt`` file from the UCSC
database, where ``db`` is a genome assembly name.
Parse a `<db>.chrom.sizes` or `<db>.chromInfo.txt` file from the UCSC
database, where `db` is a genome assembly name.

Parameters
----------
filepath_or : str or file-like
Path or url to text file, or buffer.
name_patterns : sequence, optional
Sequence of regular expressions to capture desired sequence names.
Each corresponding set of records will be sorted in natural order.
all_names : bool, optional
Whether to return all contigs listed in the file. Default is
``False``.

Returns
-------
:py:class:`pandas.Series`
Series of integer bp lengths indexed by sequence name.
Whether to return all contigs listed in the file.
verbose : bool, optional
Whether to enable verbose logging for diagnostics.

References
----------
* `UCSC assembly terminology <http://genome.ucsc.edu/FAQ/FAQdownloads.html#download9>`_
* `GRC assembly terminology <https://www.ncbi.nlm.nih.gov/grc/help/definitions>`_

"""
if isinstance(filepath_or, str) and filepath_or.endswith(".gz"):
kwargs.setdefault("compression", "gzip")
chromtable = pd.read_csv(
filepath_or,
sep="\t",
usecols=[0, 1],
names=["name", "length"],
dtype={"name": str},
**kwargs,
)
# Handle URL case separately
if isinstance(filepath_or, str) and filepath_or.startswith(('http://', 'https://')):
try:
# Use pandas' built-in URL handling
chromtable = pd.read_csv(
filepath_or,
sep="\t",
usecols=[0, 1],
names=["name", "length"],
dtype={"name": str},
on_bad_lines="error",
**kwargs,
)
except Exception as e:
raise ValueError(f"Failed to fetch URL {filepath_or}: {e!s}") from e
else:
# Original validation for local files/StringIO
if isinstance(filepath_or, (str, io.StringIO)):
first_line = None
if isinstance(filepath_or, io.StringIO):
first_line = filepath_or.getvalue().splitlines()[0]
elif isinstance(filepath_or, str):
with open(filepath_or) as file:
first_line = file.readline()

if first_line and ' ' in first_line:
raise ValueError(
f"Chromsizes file '{filepath_or}' uses spaces instead of tabs "
"as delimiters. Please use tabs.")

# Read the file
chromtable = pd.read_csv(
filepath_or,
sep="\t",
usecols=[0, 1],
names=["name", "length"],
dtype={"name": str},
on_bad_lines="error",
**kwargs,
)

# Common validation for both URL and local files
chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce")
if chromtable["length"].isnull().any():
raise ValueError(
f"Chromsizes file contains missing/invalid length values. "
f"Invalid rows: \n{chromtable[chromtable['length'].isnull()]}"
)

# Filter by patterns if needed
if not all_names:
parts = []
for pattern in name_patterns:
part = chromtable[chromtable["name"].str.contains(pattern)]
part = part.iloc[argnatsort(part["name"])]
parts.append(part)
chromtable = pd.concat(parts, axis=0)

chromtable.index = chromtable["name"].values
return chromtable["length"]

Expand Down
93 changes: 93 additions & 0 deletions tests/data/hg19.chrom.sizes
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
chr1 249250621
chr2 243199373
chr3 198022430
chr4 191154276
chr5 180915260
chr6 171115067
chr7 159138663
chrX 155270560
chr8 146364022
chr9 141213431
chr10 135534747
chr11 135006516
chr12 133851895
chr13 115169878
chr14 107349540
chr15 102531392
chr16 90354753
chr17 81195210
chr18 78077248
chr20 63025520
chrY 59373566
chr19 59128983
chr22 51304566
chr21 48129895
chr6_ssto_hap7 4928567
chr6_mcf_hap5 4833398
chr6_cox_hap2 4795371
chr6_mann_hap4 4683263
chr6_apd_hap1 4622290
chr6_qbl_hap6 4611984
chr6_dbb_hap3 4610396
chr17_ctg5_hap1 1680828
chr4_ctg9_hap1 590426
chr1_gl000192_random 547496
chrUn_gl000225 211173
chr4_gl000194_random 191469
chr4_gl000193_random 189789
chr9_gl000200_random 187035
chrUn_gl000222 186861
chrUn_gl000212 186858
chr7_gl000195_random 182896
chrUn_gl000223 180455
chrUn_gl000224 179693
chrUn_gl000219 179198
chr17_gl000205_random 174588
chrUn_gl000215 172545
chrUn_gl000216 172294
chrUn_gl000217 172149
chr9_gl000199_random 169874
chrUn_gl000211 166566
chrUn_gl000213 164239
chrUn_gl000220 161802
chrUn_gl000218 161147
chr19_gl000209_random 159169
chrUn_gl000221 155397
chrUn_gl000214 137718
chrUn_gl000228 129120
chrUn_gl000227 128374
chr1_gl000191_random 106433
chr19_gl000208_random 92689
chr9_gl000198_random 90085
chr17_gl000204_random 81310
chrUn_gl000233 45941
chrUn_gl000237 45867
chrUn_gl000230 43691
chrUn_gl000242 43523
chrUn_gl000243 43341
chrUn_gl000241 42152
chrUn_gl000236 41934
chrUn_gl000240 41933
chr17_gl000206_random 41001
chrUn_gl000232 40652
chrUn_gl000234 40531
chr11_gl000202_random 40103
chrUn_gl000238 39939
chrUn_gl000244 39929
chrUn_gl000248 39786
chr8_gl000196_random 38914
chrUn_gl000249 38502
chrUn_gl000246 38154
chr17_gl000203_random 37498
chr8_gl000197_random 37175
chrUn_gl000245 36651
chrUn_gl000247 36422
chr9_gl000201_random 36148
chrUn_gl000235 34474
chrUn_gl000239 33824
chr21_gl000210_random 27682
chrUn_gl000231 27386
chrUn_gl000229 19913
chrM 16571
chrUn_gl000226 15008
chr18_gl000207_random 4262
16 changes: 15 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os.path as op
from io import BytesIO
from io import BytesIO, StringIO

import h5py
import numpy as np
Expand Down Expand Up @@ -166,6 +166,20 @@ def test_read_chromsizes():
util.read_chromsizes(op.join(datadir, "toy.chrom.sizes"))


def test_read_chromsizes_bad_input():
    """A non-numeric length value must be rejected with ``ValueError``."""
    # Tab-delimited content whose second record carries a non-numeric length.
    contents = "chr1\t1000\nchr2\tbad_value\nchr3\t2000\n"
    with pytest.raises(ValueError):
        util.read_chromsizes(StringIO(contents))


def test_read_chromsizes_bad_delimiter():
    """Space-delimited input must be rejected by the delimiter check itself."""
    # Every length is valid, so only the delimiter can trigger the error;
    # ``match`` pins the failure to the tab/space diagnostic rather than to
    # any other ValueError raised further down the parsing path.
    space_delimited = "chr1 1000\nchr2 2000\nchr3 3000\n"
    with pytest.raises(ValueError, match="spaces instead of tabs"):
        util.read_chromsizes(StringIO(space_delimited))


# def test_fetch_chromsizes():
# util.fetch_chromsizes("hg19")

Expand Down