Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(BA-606): Remove private label before commit #3641

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3641.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove the `private` value from the kernel-features label before committing images, so that committed images are listed on the session launcher
13 changes: 10 additions & 3 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,14 @@
from ai.backend.common.bgtask import BackgroundTaskManager
from ai.backend.common.config import model_definition_iv
from ai.backend.common.defs import REDIS_STAT_DB, REDIS_STREAM_DB
from ai.backend.common.docker import MAX_KERNELSPEC, MIN_KERNELSPEC, ImageRef
from ai.backend.common.docker import (
DEFAULT_KERNEL_FEATURE,
MAX_KERNELSPEC,
MIN_KERNELSPEC,
ImageRef,
KernelFeatures,
LabelName,
)
from ai.backend.common.events import (
AbstractEvent,
AgentErrorEvent,
Expand Down Expand Up @@ -142,7 +149,7 @@
from . import alloc_map as alloc_map_mod
from .affinity_map import AffinityMap
from .exception import AgentError, ContainerCreationError, ResourceError
from .kernel import AbstractKernel, KernelFeatures, match_distro_data
from .kernel import AbstractKernel, match_distro_data
from .resources import (
AbstractAllocMap,
AbstractComputeDevice,
Expand Down Expand Up @@ -233,7 +240,7 @@ def __init__(
self.image_labels = kernel_config["image"]["labels"]
self.kspec_version = int(self.image_labels.get("ai.backend.kernelspec", "1"))
self.kernel_features = frozenset(
self.image_labels.get("ai.backend.features", "uid-match").split()
self.image_labels.get(LabelName.FEATURES.value, DEFAULT_KERNEL_FEATURE.value).split()
)
self.kernel_id = kernel_id
self.session_id = session_id
Expand Down
4 changes: 2 additions & 2 deletions src/ai/backend/agent/docker/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

from ai.backend.common import redis_helper
from ai.backend.common.cgroup import get_cgroup_mount_point
from ai.backend.common.docker import MAX_KERNELSPEC, MIN_KERNELSPEC, ImageRef
from ai.backend.common.docker import MAX_KERNELSPEC, MIN_KERNELSPEC, ImageRef, KernelFeatures
from ai.backend.common.events import EventProducer, KernelLifecycleEventReason
from ai.backend.common.exception import ImageNotAvailable, InvalidImageName, InvalidImageTag
from ai.backend.common.plugin.monitor import ErrorPluginContext, StatsPluginContext
Expand Down Expand Up @@ -81,7 +81,7 @@
from ..agent import ACTIVE_STATUS_SET, AbstractAgent, AbstractKernelCreationContext, ComputerContext
from ..exception import ContainerCreationError, UnsupportedResource
from ..fs import create_scratch_filesystem, destroy_scratch_filesystem
from ..kernel import AbstractKernel, KernelFeatures
from ..kernel import AbstractKernel
from ..plugin.network import ContainerNetworkCapability, ContainerNetworkInfo, NetworkPluginContext
from ..proxy import DomainSocketProxy, proxy_connection
from ..resources import AbstractComputePlugin, KernelResourceSpec, Mount, known_slot_types
Expand Down
8 changes: 0 additions & 8 deletions src/ai/backend/agent/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,6 @@
]


class KernelFeatures(StringSetFlag):
UID_MATCH = "uid-match"
USER_INPUT = "user-input"
BATCH_MODE = "batch"
QUERY_MODE = "query"
TTY_MODE = "tty"


class ClientFeatures(StringSetFlag):
INPUT = "input"
CONTINUATION = "continuation"
Expand Down
4 changes: 2 additions & 2 deletions src/ai/backend/agent/kubernetes/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from kubernetes_asyncio import config as kube_config

from ai.backend.common.asyncio import current_loop
from ai.backend.common.docker import ImageRef
from ai.backend.common.docker import ImageRef, KernelFeatures
from ai.backend.common.etcd import AsyncEtcd
from ai.backend.common.events import EventProducer
from ai.backend.common.plugin.monitor import ErrorPluginContext, StatsPluginContext
Expand Down Expand Up @@ -60,7 +60,7 @@

from ..agent import ACTIVE_STATUS_SET, AbstractAgent, AbstractKernelCreationContext, ComputerContext
from ..exception import K8sError, UnsupportedResource
from ..kernel import AbstractKernel, KernelFeatures
from ..kernel import AbstractKernel
from ..resources import AbstractComputePlugin, KernelResourceSpec, Mount, known_slot_types
from ..types import Container, ContainerStatus, MountInfo, Port
from .kernel import KubernetesKernel
Expand Down
74 changes: 58 additions & 16 deletions src/ai/backend/common/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

from . import validators as tx
from .arch import arch_name_aliases
from .enum_extension import StringSetFlag
from .exception import InvalidImageName, InvalidImageTag, ProjectMismatchWithCanonical
from .service_ports import parse_service_ports
from .utils import is_ip_address_format, join_non_empty
Expand Down Expand Up @@ -75,25 +76,66 @@

rx_slug = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-._]*[A-Za-z0-9])?$")


# Names of the `ai.backend.*` image labels, grouped by purpose.
# Each member's value is the literal label key string.
class LabelName(enum.StrEnum):
# Common image labels
KERNEL_SPEC = "ai.backend.kernelspec"
FEATURES = "ai.backend.features"
BASE_DISTRO = "ai.backend.base-distro"
RUNTIME_TYPE = "ai.backend.runtime-type"
RUNTIME_PATH = "ai.backend.runtime-path"

# Optional common image labels
ROLE = "ai.backend.role"
ENVS_CORECOUNT = "ai.backend.envs.corecount"
ACCELERATORS = "ai.backend.accelerators"
SERVICE_PORTS = "ai.backend.service-ports"

# Inference image labels
ENDPOINT_PORTS = "ai.backend.endpoint-ports"
MODEL_PATH = "ai.backend.model-path"
MODEL_FORMAT = "ai.backend.model-format"

# Customized-image ownership labels
CUSTOMIZED_OWNER = "ai.backend.customized-image.owner"
CUSTOMIZED_NAME = "ai.backend.customized-image.name"
CUSTOMIZED_ID = "ai.backend.customized-image.id"
CUSTOMIZED_USER_EMAIL = "ai.backend.customized-image.user.email"


# Feature flags that an image declares through its `ai.backend.features` label.
# The label holds these string values separated by spaces.
class KernelFeatures(StringSetFlag):
UID_MATCH = "uid-match"
USER_INPUT = "user-input"
BATCH_MODE = "batch"
QUERY_MODE = "query"
TTY_MODE = "tty"

OPERATION = "operation"
# Images with the `private` feature are not shown in the image list of the session launcher.
# TODO: Replace the `private` feature with the RBAC API
PRIVATE = "private"


DEFAULT_KERNEL_FEATURE: Final[KernelFeatures] = KernelFeatures.UID_MATCH

common_image_label_schema = t.Dict({
# Required labels
t.Key("ai.backend.kernelspec", default=1): t.ToInt(lte=MAX_KERNELSPEC, gte=MIN_KERNELSPEC),
t.Key("ai.backend.features", default=["uid-match"]): tx.StringList(delimiter=" "),
t.Key(LabelName.KERNEL_SPEC.value, default=1): t.ToInt(lte=MAX_KERNELSPEC, gte=MIN_KERNELSPEC),
t.Key(LabelName.FEATURES.value, default=["uid-match"]): tx.StringList(delimiter=" "),
# ai.backend.resource.min.*
t.Key("ai.backend.base-distro", default=None): t.Null | t.String(),
t.Key("ai.backend.runtime-type", default="app"): t.String(),
t.Key("ai.backend.runtime-path", default=PurePath("/bin/true")): tx.PurePath(),
t.Key(LabelName.BASE_DISTRO.value, default=None): t.Null | t.String(),
t.Key(LabelName.RUNTIME_TYPE.value, default="app"): t.String(),
t.Key(LabelName.RUNTIME_PATH.value, default=PurePath("/bin/true")): tx.PurePath(),
# Optional labels
t.Key("ai.backend.role", default="COMPUTE"): t.Enum("COMPUTE", "INFERENCE", "SYSTEM"),
t.Key("ai.backend.envs.corecount", optional=True): tx.StringList(allow_blank=True),
t.Key("ai.backend.accelerators", optional=True): tx.StringList(allow_blank=True),
t.Key("ai.backend.service-ports", optional=True): tx.StringList(allow_blank=True),
t.Key(LabelName.ROLE.value, default="COMPUTE"): t.Enum("COMPUTE", "INFERENCE", "SYSTEM"),
t.Key(LabelName.ENVS_CORECOUNT.value, optional=True): tx.StringList(allow_blank=True),
t.Key(LabelName.ACCELERATORS.value, optional=True): tx.StringList(allow_blank=True),
t.Key(LabelName.SERVICE_PORTS.value, optional=True): tx.StringList(allow_blank=True),
}).allow_extra("*")

inference_image_label_schema = t.Dict({
t.Key("ai.backend.endpoint-ports"): tx.StringList(min_length=1),
t.Key("ai.backend.model-path"): tx.PurePath(),
t.Key("ai.backend.model-format"): t.String(),
t.Key(LabelName.ENDPOINT_PORTS.value): tx.StringList(min_length=1),
t.Key(LabelName.MODEL_PATH.value): tx.PurePath(),
t.Key(LabelName.MODEL_FORMAT.value): t.String(),
}).ignore_extra("*")


Expand Down Expand Up @@ -287,14 +329,14 @@ def validate_image_labels(labels: dict[str, str]) -> dict[str, str]:
service_ports = {
item["name"]: item
for item in parse_service_ports(
common_labels.get("ai.backend.service-ports", ""),
common_labels.get("ai.backend.endpoint-ports", ""),
common_labels.get(LabelName.SERVICE_PORTS.value, ""),
common_labels.get(LabelName.ENDPOINT_PORTS.value, ""),
)
}
match common_labels["ai.backend.role"]:
match common_labels[LabelName.ROLE.value]:
case "INFERENCE":
inference_labels = inference_image_label_schema.check(labels)
for name in inference_labels["ai.backend.endpoint-ports"]:
for name in inference_labels[LabelName.ENDPOINT_PORTS.value]:
if name not in service_ports:
raise ValueError(
f"ai.backend.endpoint-ports contains an undefined service port: {name}"
Expand Down
28 changes: 17 additions & 11 deletions src/ai/backend/manager/api/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from sqlalchemy.sql.expression import null, true

from ai.backend.common.bgtask import ProgressReporter
from ai.backend.common.docker import ImageRef
from ai.backend.common.docker import DEFAULT_KERNEL_FEATURE, ImageRef, KernelFeatures, LabelName
from ai.backend.manager.models.container_registry import ContainerRegistryRow
from ai.backend.manager.models.group import GroupRow
from ai.backend.manager.models.image import ImageIdentifier, rescan_images
Expand Down Expand Up @@ -1255,7 +1255,7 @@ async def _commit_and_upload(reporter: ProgressReporter) -> None:
.select_from(ImageRow)
.where(
(
ImageRow.labels["ai.backend.customized-image.owner"].as_string()
ImageRow.labels[LabelName.CUSTOMIZED_OWNER.value].as_string()
== f"{params.image_visibility.value}:{image_owner_id}"
)
)
Expand All @@ -1278,21 +1278,26 @@ async def _commit_and_upload(reporter: ProgressReporter) -> None:
query = sa.select(ImageRow).where(
ImageRow.name.like(f"{new_canonical}%")
& (
ImageRow.labels["ai.backend.customized-image.owner"].as_string()
ImageRow.labels[LabelName.CUSTOMIZED_OWNER.value].as_string()
== f"{params.image_visibility.value}:{image_owner_id}"
)
& (
ImageRow.labels["ai.backend.customized-image.name"].as_string()
ImageRow.labels[LabelName.CUSTOMIZED_NAME.value].as_string()
== params.image_name
)
)
existing_row = await sess.scalar(query)

customized_image_id: str
kern_features: list[str]
if existing_row:
customized_image_id = existing_row.labels["ai.backend.customized-image.id"]
kern_features = existing_row.labels.get(
LabelName.FEATURES.value, DEFAULT_KERNEL_FEATURE.value
).split()
customized_image_id = existing_row.labels[LabelName.CUSTOMIZED_ID.value]
log.debug("reusing existing customized image ID {}", customized_image_id)
else:
kern_features = [DEFAULT_KERNEL_FEATURE.value]
customized_image_id = str(uuid.uuid4())

new_canonical += f"-customized_{customized_image_id.replace('-', '')}"
Expand All @@ -1305,15 +1310,16 @@ async def _commit_and_upload(reporter: ProgressReporter) -> None:
)

image_labels = {
"ai.backend.customized-image.owner": f"{params.image_visibility.value}:{image_owner_id}",
"ai.backend.customized-image.name": params.image_name,
"ai.backend.customized-image.id": customized_image_id,
LabelName.CUSTOMIZED_OWNER.value: f"{params.image_visibility.value}:{image_owner_id}",
LabelName.CUSTOMIZED_NAME.value: params.image_name,
LabelName.CUSTOMIZED_ID.value: customized_image_id,
LabelName.FEATURES.value: " ".join([
feat for feat in kern_features if feat != KernelFeatures.PRIVATE.value
]),
}
match params.image_visibility:
case CustomizedImageVisibilityScope.USER:
image_labels["ai.backend.customized-image.user.email"] = request["user"][
"email"
]
image_labels[LabelName.CUSTOMIZED_USER_EMAIL.value] = request["user"]["email"]

# commit image with new tag set
resp = await root_ctx.registry.commit_session(
Expand Down
6 changes: 3 additions & 3 deletions src/ai/backend/manager/models/gql_models/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from sqlalchemy.orm import load_only, selectinload

from ai.backend.common import redis_helper
from ai.backend.common.docker import ImageRef
from ai.backend.common.docker import ImageRef, KernelFeatures, LabelName
from ai.backend.common.exception import UnknownImageReference
from ai.backend.common.types import (
ImageAlias,
Expand Down Expand Up @@ -295,12 +295,12 @@ def matches_filter(
is_valid = ImageLoadFilter.GENERAL in load_filters
for label in self.labels:
match label.key:
case "ai.backend.features" if "operation" in label.value:
case LabelName.FEATURES.value if KernelFeatures.OPERATION.value in label.value:
if ImageLoadFilter.OPERATIONAL in load_filters:
is_valid = True
else:
return False
case "ai.backend.customized-image.owner":
case LabelName.CUSTOMIZED_OWNER.value:
if (
ImageLoadFilter.CUSTOMIZED not in load_filters
and ImageLoadFilter.CUSTOMIZED_GLOBAL not in load_filters
Expand Down
Loading