open2c · ShigrafS · Feb 26, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
diff --git a/docs/releasenotes.md b/docs/releasenotes.md
@@ -1 +1 @@
-../CHANGES.md
+../CHANGES.md
diff --git a/src/cooler/util.py b/src/cooler/util.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+import io
 import os
 import re
 from collections import OrderedDict, defaultdict
 from collections.abc import Generator, Iterable, Iterator
 from contextlib import contextmanager
 from typing import IO, Any
 
 import h5py
 import numpy as np
@@ -204,56 +205,103 @@ def argnatsort(array: Iterable[str]) -> np.ndarray:
     cols = tuple(zip(*(natsort_key(x) for x in array)))
     return np.lexsort(cols[::-1])
 
 
 def read_chromsizes(
     filepath_or: str | IO[str],
     name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
     all_names: bool = False,
     **kwargs,
 ) -> pd.Series:
     """
     Parse a ``<db>.chrom.sizes`` or ``<db>.chromInfo.txt`` file from the UCSC
     database, where ``db`` is a genome assembly name.
 
     Parameters
     ----------
     filepath_or : str or file-like
         Path or url to text file, or buffer.
     name_patterns : sequence, optional
         Sequence of regular expressions to capture desired sequence names.
         Each corresponding set of records will be sorted in natural order.
     all_names : bool, optional
         Whether to return all contigs listed in the file. Default is
         ``False``.
+    **kwargs
+        Additional keyword arguments passed to :func:`pandas.read_csv`.
 
     Returns
     -------
     :py:class:`pandas.Series`
         Series of integer bp lengths indexed by sequence name.
 
+    Raises
+    ------
+    ValueError
+        If the first line of an uncompressed local file or seekable text
+        buffer is delimited by spaces instead of tabs, or if any length
+        value is missing or not numeric.
+
     References
     ----------
     * `UCSC assembly terminology <http://genome.ucsc.edu/FAQ/FAQdownloads.html#download9>`_
     * `GRC assembly terminology <https://www.ncbi.nlm.nih.gov/grc/help/definitions>`_
 
     """
     if isinstance(filepath_or, str) and filepath_or.endswith(".gz"):
         kwargs.setdefault("compression", "gzip")
+
+    # Peek at the first line so space-delimited input fails with a clear
+    # message. Only uncompressed local files and seekable text buffers are
+    # inspected; URLs, compressed files and other streams are left to pandas.
+    first_line = ""
+    if isinstance(filepath_or, str):
+        compression = kwargs.get("compression", "infer")
+        is_plain_text = compression is None or (
+            compression == "infer"
+            and not filepath_or.endswith((".bz2", ".zip", ".xz", ".zst", ".tar"))
+        )
+        if is_plain_text and os.path.isfile(filepath_or):
+            encoding = kwargs.get("encoding") or "utf-8"
+            with open(filepath_or, encoding=encoding, errors="replace") as f:
+                first_line = f.readline()
+    elif isinstance(filepath_or, io.TextIOBase) and filepath_or.seekable():
+        # Rewind afterwards so pandas still parses from the caller's position.
+        position = filepath_or.tell()
+        first_line = filepath_or.readline()
+        filepath_or.seek(position)
+    if "\t" not in first_line and " " in first_line.strip():
+        raise ValueError(
+            f"Chromsizes file {filepath_or!r} uses spaces instead of tabs "
+            "as delimiters. Please use tabs."
+        )
+
     chromtable = pd.read_csv(
         filepath_or,
         sep="\t",
         usecols=[0, 1],
         names=["name", "length"],
         dtype={"name": str},
         **kwargs,
     )
+
+    # Coerce into a separate Series so the error message can show the raw,
+    # unparsed values of the offending rows.
+    lengths = pd.to_numeric(chromtable["length"], errors="coerce")
+    invalid = lengths.isna()
+    if invalid.any():
+        raise ValueError(
+            "Chromsizes file contains missing/invalid length values. "
+            f"Invalid rows: \n{chromtable[invalid]}"
+        )
+    chromtable["length"] = lengths
+
     if not all_names:
         parts = []
         for pattern in name_patterns:
             part = chromtable[chromtable["name"].str.contains(pattern)]
             part = part.iloc[argnatsort(part["name"])]
             parts.append(part)
         chromtable = pd.concat(parts, axis=0)
     chromtable.index = chromtable["name"].values
     return chromtable["length"]
 

diff --git a/tests/data/hg19.chrom.sizes b/tests/data/hg19.chrom.sizes
@@ -0,0 +1,93 @@
+chr1	249250621
+chr2	243199373
+chr3	198022430
+chr4	191154276
+chr5	180915260
+chr6	171115067
+chr7	159138663
+chrX	155270560
+chr8	146364022
+chr9	141213431
+chr10	135534747
+chr11	135006516
+chr12	133851895
+chr13	115169878
+chr14	107349540
+chr15	102531392
+chr16	90354753
+chr17	81195210
+chr18	78077248
+chr20	63025520
+chrY	59373566
+chr19	59128983
+chr22	51304566
+chr21	48129895
+chr6_ssto_hap7	4928567
+chr6_mcf_hap5	4833398
+chr6_cox_hap2	4795371
+chr6_mann_hap4	4683263
+chr6_apd_hap1	4622290
+chr6_qbl_hap6	4611984
+chr6_dbb_hap3	4610396
+chr17_ctg5_hap1	1680828
+chr4_ctg9_hap1	590426
+chr1_gl000192_random	547496
+chrUn_gl000225	211173
+chr4_gl000194_random	191469
+chr4_gl000193_random	189789
+chr9_gl000200_random	187035
+chrUn_gl000222	186861
+chrUn_gl000212	186858
+chr7_gl000195_random	182896
+chrUn_gl000223	180455
+chrUn_gl000224	179693
+chrUn_gl000219	179198
+chr17_gl000205_random	174588
+chrUn_gl000215	172545
+chrUn_gl000216	172294
+chrUn_gl000217	172149
+chr9_gl000199_random	169874
+chrUn_gl000211	166566
+chrUn_gl000213	164239
+chrUn_gl000220	161802
+chrUn_gl000218	161147
+chr19_gl000209_random	159169
+chrUn_gl000221	155397
+chrUn_gl000214	137718
+chrUn_gl000228	129120
+chrUn_gl000227	128374
+chr1_gl000191_random	106433
+chr19_gl000208_random	92689
+chr9_gl000198_random	90085
+chr17_gl000204_random	81310
+chrUn_gl000233	45941
+chrUn_gl000237	45867
+chrUn_gl000230	43691
+chrUn_gl000242	43523
+chrUn_gl000243	43341
+chrUn_gl000241	42152
+chrUn_gl000236	41934
+chrUn_gl000240	41933
+chr17_gl000206_random	41001
+chrUn_gl000232	40652
+chrUn_gl000234	40531
+chr11_gl000202_random	40103
+chrUn_gl000238	39939
+chrUn_gl000244	39929
+chrUn_gl000248	39786
+chr8_gl000196_random	38914
+chrUn_gl000249	38502
+chrUn_gl000246	38154
+chr17_gl000203_random	37498
+chr8_gl000197_random	37175
+chrUn_gl000245	36651
+chrUn_gl000247	36422
+chr9_gl000201_random	36148
+chrUn_gl000235	34474
+chrUn_gl000239	33824
+chr21_gl000210_random	27682
+chrUn_gl000231	27386
+chrUn_gl000229	19913
+chrM	16571
+chrUn_gl000226	15008
+chr18_gl000207_random	4262
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -1,5 +1,5 @@
 import os.path as op
-from io import BytesIO
+from io import BytesIO, StringIO
 
 import h5py
 import numpy as np
@@ -166,6 +166,20 @@ def test_read_chromsizes():
     util.read_chromsizes(op.join(datadir, "toy.chrom.sizes"))
 
 
+def test_read_chromsizes_bad_input():
+    """A non-numeric length must be rejected by the length validation."""
+    broken_file = StringIO("chr1\t1000\nchr2\tbad_value\nchr3\t2000\n")
+    with pytest.raises(ValueError, match="missing/invalid length values"):
+        util.read_chromsizes(broken_file)
+
+
+def test_read_chromsizes_bad_delimiter():
+    """Space-delimited input must be rejected by the delimiter check."""
+    broken_file = StringIO("chr1 1000\nchr2 2000\nchr3 3000\n")
+    with pytest.raises(ValueError, match="uses spaces instead of tabs"):
+        util.read_chromsizes(broken_file)
+
+
 # def test_fetch_chromsizes():
 #     util.fetch_chromsizes("hg19")