Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions src/easydiffraction/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,25 @@

pooch.get_logger().setLevel('WARNING') # Suppress pooch info messages

_DATA_REPO = 'easyscience/diffraction'
_DATA_ROOT = 'data'
# commit SHA preferred
_DATA_INDEX_REF = '010c69546fa9ec1bd998bdcaa902e1df4f5d10af'
_DATA_INDEX_REF = '0176b5d2d7cb4464835680be7aece999956482e5'
# macOS: sha256sum index.json
_DATA_INDEX_HASH = 'sha256:9449dbba0475158bbce9dea1fbb1e5e596c1f63d41fc136a3e3f5d677c5c6779'
_DATA_INDEX_HASH = 'sha256:301d6aafdc1ccf5f97d2edb491a6b350f6195f05106f8f38c9bf5530e592c8ec'


def _build_data_url(path: str) -> str:
    """
    Return the raw GitHub URL of a file under the pinned data root.

    The URL is assembled from the module-level ``_DATA_REPO``,
    ``_DATA_INDEX_REF`` and ``_DATA_ROOT`` values followed by ``path``.

    Args:
        path: Location of the file relative to the data root. Leading
            slashes are stripped before the URL is assembled.

    Returns:
        The ``https://raw.githubusercontent.com/...`` URL for ``path``.
    """
    # Drop leading slashes so the joined URL never contains '//' between
    # the data root and the file path.
    path = path.lstrip('/')
    return f'https://raw.githubusercontent.com/{_DATA_REPO}/{_DATA_INDEX_REF}/{_DATA_ROOT}/{path}'

Comment thread
AndrewSazonov marked this conversation as resolved.

def _record_path(record: dict) -> str:
if 'path' in record:
return record['path']

msg = "Index record must contain 'path' key."
raise KeyError(msg)
Comment thread
AndrewSazonov marked this conversation as resolved.


def _validate_url(url: str) -> None:
Expand All @@ -51,9 +66,13 @@ def _validate_url(url: str) -> None:
raise ValueError(msg)


def _filename_for_id_from_url(data_id: int | str, url: str) -> str:
"""Return local filename using the extension from the URL."""
suffix = pathlib.Path(urlparse(url).path).suffix # includes leading dot ('.cif', '.xye', ...)
def _filename_for_id_from_path(data_id: int | str, record_path: str) -> str:
"""
Return local filename using the extension from the record path.
"""
suffix = pathlib.PurePosixPath(
record_path
).suffix # includes leading dot ('.cif', '.xye', ...)
# If URL has no suffix, fall back to no extension.
return f'ed-{data_id}{suffix}'

Expand All @@ -74,10 +93,7 @@ def _normalize_known_hash(value: str | None) -> str | None:

def _fetch_data_index() -> dict:
"""Fetch and cache the diffraction data index.json."""
index_url = (
'https://raw.githubusercontent.com/easyscience/diffraction/'
f'{_DATA_INDEX_REF}/data/index.json'
)
index_url = _build_data_url('index.json')
_validate_url(index_url)

destination_dirname = 'easydiffraction'
Expand Down Expand Up @@ -170,11 +186,10 @@ def download_data(
raise KeyError(msg)

record = index[key]
url = record['url']
record_path = _record_path(record)
url = _build_data_url(record_path)
_validate_url(url)

known_hash = _normalize_known_hash(record.get('hash'))
fname = _filename_for_id_from_url(id, url)
fname = _filename_for_id_from_path(id, record_path)

dest_path = pathlib.Path(destination)
dest_path.mkdir(parents=True, exist_ok=True)
Expand All @@ -197,6 +212,8 @@ def download_data(
log.debug(f"Data #{id} already present at '{file_path}', but will be overwritten.")
file_path.unlink()

known_hash = _normalize_known_hash(record.get('hash'))

# Pooch downloads to destination with our controlled filename.
pooch.retrieve(
url=url,
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/easydiffraction/test___init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_lazy_functions_execute_with_monkeypatch(monkeypatch, capsys, tmp_path):

fake_index = {
'12': {
'url': 'https://example.com/data.xye',
'path': 'data.xye',
'hash': 'sha256:...',
'description': 'Demo dataset',
}
Expand All @@ -72,4 +72,4 @@ def fake_retrieve(**kwargs):

result = utils.download_data(id=12, destination=str(tmp_path), overwrite=True)
assert Path(result).exists()
assert calls['kwargs']['url'] == 'https://example.com/data.xye'
assert calls['kwargs']['url'] == utils._build_data_url('data.xye')
31 changes: 19 additions & 12 deletions tests/unit/easydiffraction/utils/test_utils_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,37 @@ def test_validate_url_accepts_https():
MUT._validate_url('https://example.com/file.cif')


# --- _filename_for_id_from_url ------------------------------------------------
# --- _filename_for_id_from_path -----------------------------------------------


def test_filename_for_id_from_url_with_extension():
def test_filename_for_id_from_path_with_extension():
import easydiffraction.utils.utils as MUT

result = MUT._filename_for_id_from_url(12, 'https://example.com/data/file.xye')
result = MUT._filename_for_id_from_path(12, 'file.xye')
assert result == 'ed-12.xye'


def test_filename_for_id_from_url_cif_extension():
def test_filename_for_id_from_path_cif_extension():
import easydiffraction.utils.utils as MUT

result = MUT._filename_for_id_from_url('3', 'https://example.com/path/model.cif')
result = MUT._filename_for_id_from_path('3', 'path/model.cif')
assert result == 'ed-3.cif'


def test_filename_for_id_from_url_no_extension():
def test_filename_for_id_from_path_no_extension():
import easydiffraction.utils.utils as MUT

result = MUT._filename_for_id_from_url(7, 'https://example.com/path/noext')
result = MUT._filename_for_id_from_path(7, 'path/noext')
assert result == 'ed-7'


def test_record_path_raises_for_missing_path_key():
    """A record without a 'path' key is rejected with a KeyError."""
    import easydiffraction.utils.utils as MUT

    # The record below carries only a 'url' key, so the lookup must fail.
    with pytest.raises(KeyError, match="Index record must contain 'path' key"):
        MUT._record_path({'url': 'https://example.com/data.xye'})


# --- _normalize_known_hash ----------------------------------------------------


Expand Down Expand Up @@ -322,7 +329,7 @@ def test_tof_to_d_linear_negative_tof_minus_offset_gives_nan():
def test_download_data_unknown_id(monkeypatch):
import easydiffraction.utils.utils as MUT

fake_index = {'1': {'url': 'https://example.com/data.xye', 'hash': None}}
fake_index = {'1': {'path': 'data.xye', 'hash': None}}
monkeypatch.setattr(MUT, '_fetch_data_index', lambda: fake_index)
with pytest.raises(KeyError, match='Unknown dataset id=999'):
MUT.download_data(id=999)
Expand All @@ -333,7 +340,7 @@ def test_download_data_already_exists_no_overwrite(monkeypatch, tmp_path, capsys

fake_index = {
'1': {
'url': 'https://example.com/data.xye',
'path': 'data.xye',
'hash': None,
'description': 'Test data',
}
Expand All @@ -355,7 +362,7 @@ def test_download_data_success(monkeypatch, tmp_path, capsys):

fake_index = {
'1': {
'url': 'https://example.com/data.xye',
'path': 'data.xye',
'hash': None,
'description': 'Test data',
}
Expand Down Expand Up @@ -383,7 +390,7 @@ def test_download_data_overwrite_existing(monkeypatch, tmp_path, capsys):

fake_index = {
'1': {
'url': 'https://example.com/data.xye',
'path': 'data.xye',
'hash': None,
'description': 'Test data',
}
Expand Down Expand Up @@ -411,7 +418,7 @@ def test_download_data_no_description(monkeypatch, tmp_path, capsys):

fake_index = {
'1': {
'url': 'https://example.com/data.xye',
'path': 'data.xye',
'hash': 'sha256:...',
}
}
Expand Down
Loading