From 58ff1bf5014c4ae8f58c9a0654dd16fe7b5dcad0 Mon Sep 17 00:00:00 2001 From: Oleg Tarasov Date: Sat, 26 Jun 2021 20:15:30 +0300 Subject: [PATCH] Using dependency graph analysis to get leaf conda packages --- conda-export.py | 85 ++++++++++++++++++++++++++++++++++++++----------- readme.md | 57 ++++++++++++++++++--------------- setup.py | 2 +- 3 files changed, 99 insertions(+), 45 deletions(-) diff --git a/conda-export.py b/conda-export.py index 4d9fec5..2c10e65 100644 --- a/conda-export.py +++ b/conda-export.py @@ -5,12 +5,42 @@ from pathlib import Path from typing import Iterable, List, Set +import conda.exports from conda.base.context import locate_prefix_by_name from conda.cli.main import init_loggers from conda.common.serialize import yaml_safe_dump +from conda.models.enums import PackageType from conda_env.env import from_environment -__version__ = "0.0.3" +import networkx + +__version__ = "0.0.4" + + +def get_conda_leaves(prefix: str) -> Set[str]: + cache = dict( + filter( + lambda pair: pair[1].package_type + not in [ + PackageType.VIRTUAL_PYTHON_WHEEL, + PackageType.VIRTUAL_PYTHON_EGG_MANAGEABLE, + PackageType.VIRTUAL_PYTHON_EGG_UNMANAGEABLE, + ], + conda.exports.linked_data(prefix=prefix).items(), + ) + ) + graph = networkx.DiGraph() + for k in cache.keys(): + n = cache[k]["name"] + v = cache[k]["version"] + graph.add_node(n, version=v) + for j in cache[k]["depends"]: + n2 = j.split(" ")[0] + v2 = j.split(" ")[1:] + graph.add_edge(n, n2, version=v2) + return set( + map(lambda i: i[0].lower(), (filter(lambda i: i[1] == 0, graph.in_degree))) + ) def get_pip_leaves(prefix: str) -> Set[str]: @@ -31,7 +61,7 @@ def get_pip_leaves(prefix: str) -> Set[str]: except: raise Exception(f"Failed to parse packages list: {output}") - return {package["name"] for package in packages} + return {package["name"].lower() for package in packages} def main() -> None: @@ -52,34 +82,51 @@ def main() -> None: prefix = locate_prefix_by_name(args.name) - # Get packages with `pip list 
--not-required`. - pip_leaves = get_pip_leaves(prefix) - # All the packages in the environment: conda and pip (with versions) env_all = from_environment(args.name, prefix, no_builds=True) # Conda packages that were explicitly installed, but not pip packages (--from-history mode). env_hist = from_environment(args.name, prefix, no_builds=True, from_history=True) - # Strip version info from full conda packages. - conda_packages = { - pkg.split("=")[0] for pkg in env_all.dependencies.get("conda", []) - } + # Conda packages in the environment that no other packages depend on + conda_leaves = get_conda_leaves(prefix) + + # Get packages with `pip list --not-required`. + pip_leaves = get_pip_leaves(prefix) - # Leave just those pip packages that were not installed through conda. - pip_leaves = pip_leaves.difference(conda_packages) + # Conda packages from history with explicit version specified, but not full package spec + # from explicit environment file. + versioned_hist = set( + map( + lambda pkg: pkg.lower(), + filter( + lambda pkg: "=" in pkg and "md5=" not in pkg, + env_hist.dependencies.get("conda", []), + ), + ) + ) + + # Exclude conda packages with explicitly specified versions from conda leaves + conda_leaves = conda_leaves.difference( + {pkg.split("=")[0] for pkg in versioned_hist} + ) - # Additionally filter pip packages with conda's version of things. + # Intersect conda's list of pip packages with packages that pip itself considers leaves. 
+ pip_final = [] if "pip" in env_all.dependencies: - conda_pip = {pkg.split("==")[0] for pkg in env_all.dependencies["pip"]} - pip_leaves = pip_leaves.intersection(conda_pip) + conda_pip = {pkg.split("=")[0].lower() for pkg in env_all.dependencies["pip"]} + pip_final = list(sorted(conda_pip.intersection(pip_leaves))) - final_dict = env_hist.to_dict() - final_dict["channels"] = env_all.channels - del final_dict["prefix"] + final_dict = { + "name": env_all.name, + "channels": env_all.channels, + "dependencies": list(sorted(conda_leaves.union(versioned_hist))), + } - if len(pip_leaves) > 0: - final_dict["dependencies"].append({"pip": list(sorted(pip_leaves))}) + if len(pip_final) > 0: + if "pip" in final_dict["dependencies"]: + final_dict["dependencies"].remove("pip") + final_dict["dependencies"].append({"pip": pip_final}) result = yaml_safe_dump(final_dict) diff --git a/readme.md b/readme.md index 6347256..8d04292 100644 --- a/readme.md +++ b/readme.md @@ -1,9 +1,15 @@ # conda-export + An alternative to `conda env export` that helps create portable environment -specifications with minimal number of packages. +specifications with a minimal number of packages. -Resulting specification is similar to `conda env export --from-history`, but also -includes packages that were installed using `pip`. +Conda-export creates environment specifications which contain only top-level +(non-transitive) dependencies. It tries to minimize specific version information and +total number of packages, so that the resulting spec maximizes [upgradability](https://pythonspeed.com/articles/conda-dependency-management/#three-kinds-of-dependency-specification). +At the same time, it respects specific package versions that were used while creating +the environment. If, at some point, you installed a package with an explicit version (e.g. +`conda install pytorch=1.9.0`), `conda-export` will include this specific version in +the resulting spec file. 
## Installation @@ -22,25 +28,26 @@ conda-export -n [env name] -f [optional output file] If `-f` is not specified, dumps the spec to the console. -## Rationale -There are several options when you want to share conda environment specification: - -1. You can use `conda env export -n [name] -f [file]`. This command will give you a - full yaml specification with build versions, which is ideal for reproducibility on - **the same machine**. Unfortunately, this specification will most likely fail on a - different machine or different OS. -2. `conda env export --no-builds` is a little better, but it still contains specific - versions for all the packages, and such specification can still fail on a different - OS. You can postprocess the spec with a simple regex, removing version info, but - the spec will still contain all the packages that are installed in the environment. - Such a spec proves hard to maintain and reason about, and can still fail on - different OS. -3. Finally, you can use `conda env export --from-history`, which will give you only - those packages that you explicitly installed with `conda`. Versions will be - included only if you explicitly requested them upon package installation. This - would be the ideal solution, but unfortunately this command will not include - packages that were installed with `pip`. - -To circumvent all the above restrictions, I've created `conda-export` which generates -a spec with `--from-history` and adds `pip` packages, trying to minimize the number of -packages by including only leaves that no other packages depend on. \ No newline at end of file +## How it works + +This is the exact algorithm that is used to export environment specifications: + +1. `conda_leaves` ← make a dependency graph of all conda packages and select top-level + ones. Exclude packages that were installed with `pip`. +2. 
`versioned_hist` ← execute `conda env export --from-history` to get only those + packages that were explicitly installed by the user with `conda create` or `conda + install`. Filter packages to leave only those with an explicit version specified. +3. `conda_pip` ← execute `conda env export` and get packages that were installed with + `pip` and not `conda`. +4. `pip_leaves` ← execute `pip list --not-required` to get top-level packages from + pip's perspective. +5. Compile the final list as follows: + * conda dependencies: `conda_leaves.union(versioned_hist)` + * pip dependencies: `conda_pip.intersection(pip_leaves)` + +## What about exactly reproducible environments? + +`conda-export` is not suited for creating [reproducible](https://pythonspeed.com/articles/conda-dependency-management/#three-kinds-of-dependency-specification) +environments. Please use `conda-lock` with environment specs generated from +`conda-export` in order to create multi-platform lock files that contain exact package +versions. \ No newline at end of file diff --git a/setup.py b/setup.py index 90a88ef..91ee4cf 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name="conda-export", - version="0.0.3", + version="0.0.4", description="Platform agnostic conda environment export", author="Oleg Tarasov", url="https://github.com/olegtarasov/conda-export",