Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Added:

* Resource Manager Server as `ocrd_network` analogue of `ocrd.cli.resmgr`, #1309
* `ocrd network resmgr-server` for triggering Resource Manager Server (RMS) in the background
* Processing Server also deploys RMS on each processing host
* ...

Fixed:

* `ocrd.resource_manager`: ensure necessary and reduce unnecessary updates of user database
* `ocrd.resource_manager`: deduplicate entries (newer wins) before updating user database
* `ocrd resmgr download`: extract archives independent of whether they are URLs or local paths
* `ocrd resmgr download`: if `--overwrite`, ensure the old resource gets removed
* `ocrd resmgr download`: default to `data` location instead of the first in the list of allowed locations
* `ocrd_utils.list_all_resources`: filter out non-resource module files with more anti-patterns
* `ocrd_utils.list_all_resources`: no subpaths except for `cwd` location, OCR-D/spec#263, #1315
* `ocrd_utils.list_all_resources`: filter resources via media (MIME) type, if specified, #1315

Changed:

* `ocrd network client workflow run`: Allow passing workflow as `ocrd process` tasks as alternative to a workflow file, #1264, #1335
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ beanie~=1.7
click >=7
cryptography < 43.0.0
Deprecated == 1.2.0
docker
docker>=7.1.0
elementpath
fastapi>=0.78.0
filetype
Flask
frozendict>=2.4.0
gitpython
gdown
httpx>=0.22.0
importlib_metadata ; python_version < '3.8'
Expand Down
2 changes: 2 additions & 0 deletions src/ocrd/cli/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
processing_server_cli,
processing_worker_cli,
processor_server_cli,
resource_manager_server_cli
)


Expand All @@ -28,3 +29,4 @@ def network_cli():
network_cli.add_command(processing_server_cli)
network_cli.add_command(processing_worker_cli)
network_cli.add_command(processor_server_cli)
network_cli.add_command(resource_manager_server_cli)
94 changes: 29 additions & 65 deletions src/ocrd/cli/resmgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
get_ocrd_tool_json,
initLogging,
RESOURCE_LOCATIONS,
RESOURCE_TYPES
)
from ocrd.constants import RESOURCE_USER_LIST_COMMENT

Expand Down Expand Up @@ -70,16 +71,16 @@ def list_installed(executable=None):
@resmgr_cli.command('download')
@click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
@click.option('-D', '--no-dynamic', default=False, is_flag=True,
help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
@click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
help='Type of resource',)
@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
help="Skip looking into each processor's --dump-{json,module-dir} module-registered resources")
@click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
help='Type of resource (when unregistered or incomplete)',)
@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type (when unregistered or incomplete)')
@click.option('-a', '--allow-uninstalled', is_flag=True,
help="Allow installing resources for uninstalled processors",)
help="Allow installing resources for not installed processors",)
@click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
help="Where to store resources - defaults to first location in processor's 'resource_locations' "
"list or finally 'data'")
"list, i.e. usually 'data'")
@click.argument('executable', required=True)
@click.argument('name', required=False)
def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable,
Expand All @@ -106,8 +107,6 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
executable = None
if name == '*':
name = None
is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
is_filename = Path(any_url).exists() if any_url else False
if executable and not which(executable):
if not allow_uninstalled:
log.error(f"Executable '{executable}' is not installed. "
Expand All @@ -126,65 +125,30 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
'path_in_archive': path_in_archive}]
)]
for this_executable, this_reslist in reslist:
for resdict in this_reslist:
if 'size' in resdict:
registered = "registered"
else:
registered = "unregistered"
if any_url:
resdict['url'] = any_url
if resdict['url'] == '???':
log.warning(f"Cannot download user resource {resdict['name']}")
continue
if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
if 'size' not in resdict:
with requests.head(resdict['url']) as r:
resdict['size'] = int(r.headers.get('content-length', 0))
else:
log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
urlpath = Path(resdict['url'])
resdict['url'] = str(urlpath.resolve())
if Path(urlpath).is_dir():
resdict['size'] = directory_size(urlpath)
else:
resdict['size'] = urlpath.stat().st_size
if not location:
location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
f"refusing to install to invalid location")
sys.exit(1)
if location != 'module':
basedir = resmgr.location_to_resource_dir(location)
else:
basedir = get_moduledir(this_executable)
if not basedir:
basedir = resmgr.location_to_resource_dir('data')

resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
if not location:
location = resource_locations[0]
elif location not in resource_locations:
log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
for res_dict in this_reslist:
try:
with click.progressbar(length=resdict['size']) as bar:
fpath = resmgr.download(
this_executable,
resdict['url'],
basedir,
name=resdict['name'],
resource_type=resdict.get('type', resource_type),
path_in_archive=resdict.get('path_in_archive', path_in_archive),
overwrite=overwrite,
no_subdir=location in ['cwd', 'module'],
progress_cb=lambda delta: bar.update(delta)
)
if registered == 'unregistered':
log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
f"in {resmgr.user_list}'")
resmgr.add_to_user_database(this_executable, fpath, url=any_url)
resmgr.save_user_list()
log.info(f"Installed resource {resdict['url']} under {fpath}")
fpath = resmgr.handle_resource(
res_dict=res_dict,
executable=this_executable,
dest_dir=res_dest_dir,
any_url=any_url,
overwrite=overwrite,
resource_type=resource_type,
path_in_archive=path_in_archive
)
if not fpath:
continue
except FileExistsError as exc:
log.info(str(exc))
log.info(f"Use in parameters as "
f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
usage = res_dict.get('parameter_usage', 'as-is')
log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")


@resmgr_cli.command('migrate')
Expand Down
2 changes: 0 additions & 2 deletions src/ocrd/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
'DOWNLOAD_DIR',
'DEFAULT_REPOSITORY_URL',
'BASHLIB_FILENAME',
'RESOURCE_LIST_FILENAME',
'BACKUP_DIR',
'RESOURCE_USER_LIST_COMMENT',
]
Expand All @@ -19,6 +18,5 @@
DOWNLOAD_DIR = '/tmp/ocrd-core-downloads'
DEFAULT_REPOSITORY_URL = 'http://localhost:5000/'
BASHLIB_FILENAME = resource_filename(__package__, 'lib.bash')
RESOURCE_LIST_FILENAME = resource_filename(__package__, 'resource_list.yml')
RESOURCE_USER_LIST_COMMENT = "# OCR-D private resource list (consider sending a PR with your own resources to OCR-D/core)"
BACKUP_DIR = '.backup'
20 changes: 5 additions & 15 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,14 @@
from ocrd_utils import (
VERSION as OCRD_VERSION,
MIMETYPE_PAGE,
MIME_TO_EXT,
config,
getLogger,
list_resource_candidates,
pushd_popd,
list_all_resources,
get_processor_resource_types,
resource_filename,
parse_json_file_with_comments,
pushd_popd,
make_file_id,
deprecation_warning
)
Expand Down Expand Up @@ -935,9 +934,8 @@ def resolve_resource(self, val):
cwd = self.old_pwd
else:
cwd = getcwd()
ret = [cand for cand in list_resource_candidates(executable, val,
cwd=cwd, moduled=self.moduledir)
if exists(cand)]
ret = list(filter(exists, list_resource_candidates(executable, val,
cwd=cwd, moduled=self.moduledir)))
if ret:
self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
return ret[0]
Expand Down Expand Up @@ -968,17 +966,9 @@ def list_all_resources(self):
"""
List all resources found in the filesystem and matching content-type by filename suffix
"""
mimetypes = get_processor_resource_types(None, self.ocrd_tool)
for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
res = Path(res)
if '*/*' not in mimetypes:
if res.is_dir() and 'text/directory' not in mimetypes:
continue
# if we do not know all MIME types, then keep the file, otherwise require suffix match
if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
for mime in mimetypes):
continue
yield res
yield res.name

@property
def module(self):
Expand Down
61 changes: 0 additions & 61 deletions src/ocrd/resource_list.yml

This file was deleted.

Loading