diff --git a/docs/manager/graphql-reference/schema.graphql b/docs/manager/graphql-reference/schema.graphql index 939cdc38de..77ba5e04d8 100644 --- a/docs/manager/graphql-reference/schema.graphql +++ b/docs/manager/graphql-reference/schema.graphql @@ -83,6 +83,9 @@ type Queries { is_installed: Boolean is_operation: Boolean @deprecated(reason: "Deprecated since 24.03.4. This field is ignored if `load_filters` is specified and is not null.") + """Added in 25.3.0.""" + load_only_active: Boolean = true + """ Added in 24.03.8. Allowed values are: [general, operational, customized]. When superuser queries with `customized` option set the resolver will return every customized images (including those not owned by callee). To resolve images owned by user only call `customized_images`. """ diff --git a/src/ai/backend/manager/api/session.py b/src/ai/backend/manager/api/session.py index 583d71cf23..09a9784200 100644 --- a/src/ai/backend/manager/api/session.py +++ b/src/ai/backend/manager/api/session.py @@ -52,7 +52,7 @@ from ai.backend.common.docker import ImageRef from ai.backend.manager.models.container_registry import ContainerRegistryRow from ai.backend.manager.models.group import GroupRow -from ai.backend.manager.models.image import ImageIdentifier, rescan_images +from ai.backend.manager.models.image import ImageIdentifier, ImageStatus, rescan_images if TYPE_CHECKING: from sqlalchemy.ext.asyncio import AsyncConnection as SAConnection @@ -1259,6 +1259,7 @@ async def _commit_and_upload(reporter: ProgressReporter) -> None: == f"{params.image_visibility.value}:{image_owner_id}" ) ) + .where(ImageRow.status == ImageStatus.ALIVE) ) existing_image_count = await sess.scalar(query) @@ -1275,16 +1276,20 @@ async def _commit_and_upload(reporter: ProgressReporter) -> None: ) # check if image with same name exists and reuse ID it if is - query = sa.select(ImageRow).where( - ImageRow.name.like(f"{new_canonical}%") - & ( - ImageRow.labels["ai.backend.customized-image.owner"].as_string() - == f"{params.image_visibility.value}:{image_owner_id}" - ) - & ( - ImageRow.labels["ai.backend.customized-image.name"].as_string() - == params.image_name + query = ( + sa.select(ImageRow) + .where( + ImageRow.name.like(f"{new_canonical}%") + & ( + ImageRow.labels["ai.backend.customized-image.owner"].as_string() + == f"{params.image_visibility.value}:{image_owner_id}" + ) + & ( + ImageRow.labels["ai.backend.customized-image.name"].as_string() + == params.image_name + ) ) + .where(ImageRow.status == ImageStatus.ALIVE) ) existing_row = await sess.scalar(query) diff --git a/src/ai/backend/manager/cli/image_impl.py b/src/ai/backend/manager/cli/image_impl.py index c24f59ea22..f9f6e7003a 100644 --- a/src/ai/backend/manager/cli/image_impl.py +++ b/src/ai/backend/manager/cli/image_impl.py @@ -33,6 +33,7 @@ async def list_images(cli_ctx, short, installed_only): ): displayed_items = [] try: + # Idea: Add `deleted` option to include deleted images. items = await ImageRow.list(session) # NOTE: installed/installed_agents fields are no longer provided in CLI, # until we finish the epic refactoring of image metadata db. @@ -250,20 +251,23 @@ async def validate_image_canonical( if current or architecture is not None: if current: architecture = architecture or CURRENT_ARCH - image_row = await session.scalar( - sa.select(ImageRow).where( - (ImageRow.name == canonical) & (ImageRow.architecture == architecture) - ) + + assert architecture is not None + image_row = await ImageRow.resolve( + session, [ImageIdentifier(canonical, architecture)] ) - if image_row is None: - raise UnknownImageReference(f"{canonical}/{architecture}") + for key, value in validate_image_labels(image_row.labels).items(): print(f"{key:<40}: ", end="") if isinstance(value, list): value = f"{', '.join(value)}" print(value) else: - rows = await session.scalars(sa.select(ImageRow).where(ImageRow.name == canonical)) + rows = await session.scalars( + sa.select(ImageRow) + .where(ImageRow.name == canonical) + .where(ImageRow.status == ImageStatus.ALIVE) + ) image_rows = rows.fetchall() if not image_rows: raise UnknownImageReference(f"{canonical}") diff --git a/src/ai/backend/manager/container_registry/base.py b/src/ai/backend/manager/container_registry/base.py index d05cab020d..85a289891a 100644 --- a/src/ai/backend/manager/container_registry/base.py +++ b/src/ai/backend/manager/container_registry/base.py @@ -32,7 +32,7 @@ from ai.backend.manager.models.container_registry import ContainerRegistryRow from ..defs import INTRINSIC_SLOTS_MIN -from ..models.image import ImageIdentifier, ImageRow, ImageType +from ..models.image import ImageIdentifier, ImageRow, ImageStatus, ImageType from ..models.utils import ExtendedAsyncSAEngine log = BraceStyleAdapter(logging.getLogger(__spec__.name)) @@ -131,7 +131,7 @@ async def commit_rescan_result(self) -> None: existing_images = await session.scalars( sa.select(ImageRow).where( sa.func.ROW(ImageRow.name, ImageRow.architecture).in_(image_identifiers), - ), + ) ) is_local = self.registry_name == "local" @@ -146,6 +146,15 @@ async def commit_rescan_result(self) -> None: image_row.resources = update["resources"] image_row.is_local = is_local + if image_row.status == ImageStatus.DELETED: + image_row.status = ImageStatus.ALIVE + + progress_msg = f"Restored deleted image - {image_ref.canonical}/{image_ref.architecture} ({update['config_digest']})" + log.info(progress_msg) + + if (reporter := progress_reporter.get()) is not None: + await reporter.update(1, message=progress_msg) + for image_identifier, update in _all_updates.items(): try: parsed_img = ImageRef.from_image_str( @@ -178,6 +187,7 @@ async def commit_rescan_result(self) -> None: accelerators=update.get("accels"), labels=update["labels"], resources=update["resources"], + status=ImageStatus.ALIVE, ) ) progress_msg = f"Updated image - {parsed_img.canonical}/{image_identifier.architecture} ({update['config_digest']})" diff --git a/src/ai/backend/manager/container_registry/local.py b/src/ai/backend/manager/container_registry/local.py index 0507013af9..85c3500fe5 100644 --- a/src/ai/backend/manager/container_registry/local.py +++ b/src/ai/backend/manager/container_registry/local.py @@ -12,7 +12,7 @@ from ai.backend.common.docker import arch_name_aliases, get_docker_connector from ai.backend.logging import BraceStyleAdapter -from ..models.image import ImageRow +from ..models.image import ImageRow, ImageStatus from .base import ( BaseContainerRegistry, concurrency_sema, @@ -82,10 +82,12 @@ async def _read_image_info( config_digest = data["Id"] async with self.db.begin_readonly_session() as db_session: already_exists = await db_session.scalar( - sa.select([sa.func.count(ImageRow.id)]).where( + sa.select([sa.func.count(ImageRow.id)]) + .where( ImageRow.config_digest == config_digest, ImageRow.is_local == sa.false(), ) + .where(ImageRow.status == ImageStatus.ALIVE), ) if already_exists > 0: return {}, "already synchronized from a remote registry" diff --git a/src/ai/backend/manager/models/gql.py b/src/ai/backend/manager/models/gql.py index eb1e34cd3c..9b2c63349c 100644 --- a/src/ai/backend/manager/models/gql.py +++ b/src/ai/backend/manager/models/gql.py @@ -518,6 +518,10 @@ class Queries(graphene.ObjectType): is_operation=graphene.Boolean( deprecation_reason="Deprecated since 24.03.4. This field is ignored if `load_filters` is specified and is not null." ), + load_only_active=graphene.Boolean( + default_value=True, + description="Added in 25.3.0.", + ), load_filters=graphene.List( graphene.String, default_value=None, @@ -1373,13 +1377,15 @@ async def resolve_image( client_role = ctx.user["role"] client_domain = ctx.user["domain_name"] if id: - item = await Image.load_item_by_id(info.context, uuid.UUID(id)) + item = await Image.load_item_by_id(info.context, uuid.UUID(id), load_only_active=False) else: if not (reference and architecture): raise InvalidAPIParameters( "reference/architecture and id can't be omitted at the same time!" ) - item = await Image.load_item(info.context, reference, architecture) + item = await Image.load_item( + info.context, reference, architecture, load_only_active=False + ) if client_role == UserRole.SUPERADMIN: pass elif client_role in (UserRole.ADMIN, UserRole.USER): @@ -1428,6 +1434,7 @@ async def resolve_images( *, is_installed: bool | None = None, is_operation=False, + load_only_active: bool = True, load_filters: list[str] | None = None, image_filters: list[str] | None = None, ) -> Sequence[Image]: @@ -1459,7 +1466,7 @@ async def resolve_images( # but to conform with previous implementation... image_load_types.add(ImageLoadFilter.OPERATIONAL) - items = await Image.load_all(ctx, types=image_load_types) + items = await Image.load_all(ctx, types=image_load_types, load_only_active=load_only_active) if client_role == UserRole.SUPERADMIN: pass elif client_role in (UserRole.ADMIN, UserRole.USER): diff --git a/src/ai/backend/manager/models/gql_models/image.py b/src/ai/backend/manager/models/gql_models/image.py index 9a29f072f9..98119997a5 100644 --- a/src/ai/backend/manager/models/gql_models/image.py +++ b/src/ai/backend/manager/models/gql_models/image.py @@ -195,12 +195,15 @@ async def batch_load_by_canonical( cls, graph_ctx: GraphQueryContext, image_names: Sequence[str], + load_only_active: bool = True, ) -> Sequence[Optional[Image]]: query = ( sa.select(ImageRow) .where(ImageRow.name.in_(image_names)) .options(selectinload(ImageRow.aliases)) ) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) async with graph_ctx.db.begin_readonly_session() as session: result = await session.execute(query) return [await Image.from_row(graph_ctx, row) for row in result.scalars().all()] @@ -210,18 +213,22 @@ async def batch_load_by_image_ref( cls, graph_ctx: GraphQueryContext, image_refs: Sequence[ImageRef], + load_only_active: bool = True, ) -> Sequence[Optional[Image]]: image_names = [x.canonical for x in image_refs] - return await cls.batch_load_by_canonical(graph_ctx, image_names) + return await cls.batch_load_by_canonical(graph_ctx, image_names, load_only_active) @classmethod async def load_item_by_id( cls, ctx: GraphQueryContext, id: UUID, + load_only_active: bool = True, ) -> Image: async with ctx.db.begin_readonly_session() as session: - row = await ImageRow.get(session, id, load_aliases=True) + row = await ImageRow.get( + session, id, load_aliases=True, load_only_active=load_only_active + ) if not row: raise ImageNotFound @@ -233,6 +240,7 @@ async def load_item( ctx: GraphQueryContext, reference: str, architecture: str, + load_only_active: bool = True, ) -> Image: try: async with ctx.db.begin_readonly_session() as session: @@ -242,6 +250,7 @@ async def load_item( ImageIdentifier(reference, architecture), ImageAlias(reference), ], + load_only_active=load_only_active, ) except UnknownImageReference: raise ImageNotFound @@ -253,9 +262,12 @@ async def load_all( ctx: GraphQueryContext, *, types: set[ImageLoadFilter] = set(), + load_only_active: bool = True, ) -> Sequence[Image]: async with ctx.db.begin_readonly_session() as session: - rows = await ImageRow.list(session, load_aliases=True) + rows = await ImageRow.list( + session, load_aliases=True, load_only_active=load_only_active + ) items: list[Image] = [ item async for item in cls.bulk_load(ctx, rows) if item.matches_filter(ctx, types) ] @@ -355,12 +367,16 @@ async def batch_load_by_name_and_arch( cls, graph_ctx: GraphQueryContext, name_and_arch: Sequence[tuple[str, str]], + load_only_active: bool = True, ) -> Sequence[Sequence[ImageNode]]: query = ( sa.select(ImageRow) .where(sa.tuple_(ImageRow.name, ImageRow.architecture).in_(name_and_arch)) .options(selectinload(ImageRow.aliases)) ) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) + async with graph_ctx.db.begin_readonly_session() as db_session: return await batch_multiresult_in_scalar_stream( graph_ctx, @@ -376,9 +392,12 @@ async def batch_load_by_image_identifier( cls, graph_ctx: GraphQueryContext, image_ids: Sequence[ImageIdentifier], + load_only_active: bool = True, ) -> Sequence[Sequence[ImageNode]]: name_and_arch_tuples = [(img.canonical, img.architecture) for img in image_ids] - return await cls.batch_load_by_name_and_arch(graph_ctx, name_and_arch_tuples) + return await cls.batch_load_by_name_and_arch( + graph_ctx, name_and_arch_tuples, load_only_active + ) @overload @classmethod @@ -421,6 +440,7 @@ def from_row(cls, row: ImageRow | None) -> ImageNode | None: ], supported_accelerators=(row.accelerators or "").split(","), aliases=[alias_row.alias for alias_row in row.aliases], + status=row.status, ) @classmethod @@ -445,6 +465,7 @@ def from_legacy_image(cls, row: Image) -> ImageNode: resource_limits=row.resource_limits, supported_accelerators=row.supported_accelerators, aliases=row.aliases, + status=row.status, ) @classmethod @@ -500,7 +521,9 @@ async def mutate( client_role = ctx.user["role"] async with ctx.db.begin_session() as session: - image_row = await ImageRow.get(session, _image_id, load_aliases=True) + image_row = await ImageRow.get( + session, _image_id, load_only_active=True, load_aliases=True + ) if not image_row: raise ObjectNotFound("image") if client_role != UserRole.SUPERADMIN: @@ -648,7 +671,9 @@ async def mutate( client_role = ctx.user["role"] async with ctx.db.begin_session() as session: - image_row = await ImageRow.get(session, _image_id, load_aliases=True) + image_row = await ImageRow.get( + session, _image_id, load_only_active=True, load_aliases=True + ) if not image_row: raise ObjectNotFound("image") if client_role != UserRole.SUPERADMIN: @@ -702,7 +727,9 @@ async def mutate( client_role = ctx.user["role"] async with ctx.db.begin_readonly_session() as session: - image_row = await ImageRow.get(session, _image_id, load_aliases=True) + image_row = await ImageRow.get( + session, _image_id, load_only_active=True, load_aliases=True + ) if not image_row: raise ImageNotFound if client_role != UserRole.SUPERADMIN: diff --git a/src/ai/backend/manager/models/image.py b/src/ai/backend/manager/models/image.py index b9abec8f2c..96ce5bab81 100644 --- a/src/ai/backend/manager/models/image.py +++ b/src/ai/backend/manager/models/image.py @@ -379,6 +379,7 @@ def __init__( accelerators=None, labels=None, resources=None, + status=ImageStatus.ALIVE, ) -> None: self.name = name self.project = project @@ -394,6 +395,7 @@ def __init__( self.accelerators = accelerators self.labels = labels self.resources = resources + self.status = status @property def trimmed_digest(self) -> str: @@ -420,6 +422,7 @@ async def from_alias( session: AsyncSession, alias: str, load_aliases: bool = False, + load_only_active: bool = True, *, loading_options: Iterable[RelationLoadingOption] = tuple(), ) -> ImageRow: @@ -430,6 +433,8 @@ async def from_alias( ) if load_aliases: query = query.options(selectinload(ImageRow.aliases)) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) query = _apply_loading_option(query, loading_options) result = await session.scalar(query) if result is not None: @@ -443,6 +448,7 @@ async def from_image_identifier( session: AsyncSession, identifier: ImageIdentifier, load_aliases: bool = True, + load_only_active: bool = True, *, loading_options: Iterable[RelationLoadingOption] = tuple(), ) -> ImageRow: @@ -453,6 +459,8 @@ async def from_image_identifier( if load_aliases: query = query.options(selectinload(ImageRow.aliases)) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) query = _apply_loading_option(query, loading_options) result = await session.execute(query) @@ -471,6 +479,7 @@ async def from_image_ref( *, strict_arch: bool = False, load_aliases: bool = False, + load_only_active: bool = True, loading_options: Iterable[RelationLoadingOption] = tuple(), ) -> ImageRow: """ @@ -483,6 +492,9 @@ async def from_image_ref( query = sa.select(ImageRow).where(ImageRow.name == ref.canonical) if load_aliases: query = query.options(selectinload(ImageRow.aliases)) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) + query = _apply_loading_option(query, loading_options) result = await session.execute(query) @@ -504,6 +516,7 @@ async def resolve( reference_candidates: list[ImageAlias | ImageRef | ImageIdentifier], *, strict_arch: bool = False, + load_only_active: bool = True, load_aliases: bool = True, loading_options: Iterable[RelationLoadingOption] = tuple(), ) -> ImageRow: @@ -554,7 +567,11 @@ async def resolve( searched_refs.append(f"identifier:{reference!r}") try: if row := await resolver_func( - session, reference, load_aliases=load_aliases, loading_options=loading_options + session, + reference, + load_aliases=load_aliases, + load_only_active=load_only_active, + loading_options=loading_options, ): return row except UnknownImageReference: @@ -563,19 +580,31 @@ async def resolve( @classmethod async def get( - cls, session: AsyncSession, image_id: UUID, load_aliases=False + cls, + session: AsyncSession, + image_id: UUID, + load_only_active: bool = True, + load_aliases: bool = False, ) -> ImageRow | None: query = sa.select(ImageRow).where(ImageRow.id == image_id) if load_aliases: query = query.options(selectinload(ImageRow.aliases)) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) + result = await session.execute(query) return result.scalar() @classmethod - async def list(cls, session: AsyncSession, load_aliases=False) -> List[ImageRow]: + async def list( + cls, session: AsyncSession, load_only_active: bool = True, load_aliases: bool = False + ) -> List[ImageRow]: query = sa.select(ImageRow) if load_aliases: query = query.options(selectinload(ImageRow.aliases)) + if load_only_active: + query = query.where(ImageRow.status == ImageStatus.ALIVE) + result = await session.execute(query) return result.scalars().all() @@ -873,7 +902,9 @@ async def build_ctx_in_system_scope( permissions = await self.calculate_permission(ctx, SystemScope()) image_id_permission_map: dict[UUID, frozenset[ImagePermission]] = {} - for image_row in await self.db_session.scalars(sa.select(ImageRow)): + for image_row in await self.db_session.scalars( + sa.select(ImageRow).where(ImageRow.status == ImageStatus.ALIVE) + ): image_id_permission_map[image_row.id] = permissions perm_ctx = ImagePermissionContext( object_id_to_additional_permission_map=image_id_permission_map @@ -909,7 +940,11 @@ async def _in_domain_scope( raise InvalidScope(f"Domain not found (n:{scope.domain_name})") allowed_registries: set[str] = set(domain_row.allowed_docker_registries) - _img_query_stmt = sa.select(ImageRow).options(load_only(ImageRow.id, ImageRow.registry)) + _img_query_stmt = ( + sa.select(ImageRow) + .where(ImageRow.status == ImageStatus.ALIVE) + .options(load_only(ImageRow.id, ImageRow.registry)) + ) for row in await self.db_session.scalars(_img_query_stmt): _row = cast(ImageRow, row) if _row.registry in allowed_registries: @@ -952,8 +987,10 @@ async def _in_user_scope( permissions = await self.calculate_permission(ctx, scope) image_id_permission_map: dict[UUID, frozenset[ImagePermission]] = {} allowed_registries: set[str] = set(user_row.domain.allowed_docker_registries) - _img_query_stmt = sa.select(ImageRow).options( - load_only(ImageRow.id, ImageRow.labels, ImageRow.registry) + _img_query_stmt = ( + sa.select(ImageRow) + .where(ImageRow.status == ImageStatus.ALIVE) + .options(load_only(ImageRow.id, ImageRow.labels, ImageRow.registry)) ) for row in await self.db_session.scalars(_img_query_stmt): _row = cast(ImageRow, row)