From 3d23bf81794a370b7da3a695d94d76f94e2c4a0b Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 12:15:48 -0400 Subject: [PATCH 01/77] feat: ROOT-1: Add offset to storage link model to support storage link to task within file of tasks on cloud storage --- label_studio/io_storages/base_models.py | 15 +++ ...lobimportstoragelink_row_group_and_more.py | 93 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 1a312c84a0c7..cb2e58972643 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -688,12 +688,27 @@ class Meta: class ImportStorageLink(models.Model): task = models.OneToOneField('tasks.Task', on_delete=models.CASCADE, related_name='%(app_label)s_%(class)s') + + # --- + # NOTE: ImportStorageLink is task-level, not key-level. These fields could be split out into a new, key-level table to optimize DB access patterns. + key = models.TextField(_('key'), null=False, help_text='External link key') + + # NOTE: unused object_exists = models.BooleanField( _('object exists'), help_text='Whether object under external link still exists', default=True ) + # --- + created_at = models.DateTimeField(_('created at'), auto_now_add=True, help_text='Creation time') + # NOTE: we could support other file formats in future with an equivalent of row_group - ORC, Feather, Avro. + # In that case, we would need a file_type enum or similar to distinguish them in filters. + row_group = models.PositiveIntegerField(null=True, blank=True, help_text='Parquet row group') + row_idx = models.PositiveIntegerField( + null=True, blank=True, help_text='Parquet row index, or JSON[L] object index' + ) + @classmethod def exists(cls, key, storage): return cls.objects.filter(key=key, storage=storage.id).exists() diff --git a/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py b/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py new file mode 100644 index 000000000000..d864f060a5c8 --- /dev/null +++ b/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py @@ -0,0 +1,93 @@ +# Generated by Django 5.1.9 on 2025-05-14 15:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("io_storages", "0018_alter_azureblobexportstorage_project_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="azureblobimportstoragelink", + name="row_group", + field=models.PositiveIntegerField( + blank=True, help_text="Parquet row group", null=True + ), + ), + migrations.AddField( + model_name="azureblobimportstoragelink", + name="row_idx", + field=models.PositiveIntegerField( + blank=True, + help_text="Parquet row index, or JSON[L] object index", + null=True, + ), + ), + migrations.AddField( + model_name="gcsimportstoragelink", + name="row_group", + field=models.PositiveIntegerField( + blank=True, help_text="Parquet row group", null=True + ), + ), + migrations.AddField( + model_name="gcsimportstoragelink", + name="row_idx", + field=models.PositiveIntegerField( + blank=True, + help_text="Parquet row index, or JSON[L] object index", + null=True, + ), + ), + migrations.AddField( + model_name="localfilesimportstoragelink", + name="row_group", + field=models.PositiveIntegerField( + blank=True, 
help_text="Parquet row group", null=True + ), + ), + migrations.AddField( + model_name="localfilesimportstoragelink", + name="row_idx", + field=models.PositiveIntegerField( + blank=True, + help_text="Parquet row index, or JSON[L] object index", + null=True, + ), + ), + migrations.AddField( + model_name="redisimportstoragelink", + name="row_group", + field=models.PositiveIntegerField( + blank=True, help_text="Parquet row group", null=True + ), + ), + migrations.AddField( + model_name="redisimportstoragelink", + name="row_idx", + field=models.PositiveIntegerField( + blank=True, + help_text="Parquet row index, or JSON[L] object index", + null=True, + ), + ), + migrations.AddField( + model_name="s3importstoragelink", + name="row_group", + field=models.PositiveIntegerField( + blank=True, help_text="Parquet row group", null=True + ), + ), + migrations.AddField( + model_name="s3importstoragelink", + name="row_idx", + field=models.PositiveIntegerField( + blank=True, + help_text="Parquet row index, or JSON[L] object index", + null=True, + ), + ), + ] From 9e2a7bb64304448162dc030106a215457b896bc8 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 13:23:59 -0400 Subject: [PATCH 02/77] review comments --- label_studio/io_storages/base_models.py | 12 ++----- ...lobimportstoragelink_row_group_and_more.py | 32 +++++++++---------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index cb2e58972643..7030022efded 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -689,25 +689,17 @@ class ImportStorageLink(models.Model): task = models.OneToOneField('tasks.Task', on_delete=models.CASCADE, related_name='%(app_label)s_%(class)s') - # --- - # NOTE: ImportStorageLink is task-level, not key-level. These fields could be split out into a new, key-level table to optimize DB access patterns. - key = models.TextField(_('key'), null=False, help_text='External link key') # NOTE: unused object_exists = models.BooleanField( _('object exists'), help_text='Whether object under external link still exists', default=True ) - # --- created_at = models.DateTimeField(_('created at'), auto_now_add=True, help_text='Creation time') - # NOTE: we could support other file formats in future with an equivalent of row_group - ORC, Feather, Avro. - # In that case, we would need a file_type enum or similar to distinguish them in filters. 
- row_group = models.PositiveIntegerField(null=True, blank=True, help_text='Parquet row group') - row_idx = models.PositiveIntegerField( - null=True, blank=True, help_text='Parquet row index, or JSON[L] object index' - ) + row_group = models.IntegerField(null=True, blank=True, help_text='Parquet row group') + row_index = models.IntegerField(null=True, blank=True, help_text='Parquet row index, or JSON[L] object index') @classmethod def exists(cls, key, storage): diff --git a/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py b/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py index d864f060a5c8..dd11aebb6fc6 100644 --- a/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py +++ b/label_studio/io_storages/migrations/0019_azureblobimportstoragelink_row_group_and_more.py @@ -1,4 +1,4 @@ -# Generated by Django 5.1.9 on 2025-05-14 15:52 +# Generated by Django 5.1.9 on 2025-05-14 17:23 from django.db import migrations, models @@ -13,14 +13,14 @@ class Migration(migrations.Migration): migrations.AddField( model_name="azureblobimportstoragelink", name="row_group", - field=models.PositiveIntegerField( + field=models.IntegerField( blank=True, help_text="Parquet row group", null=True ), ), migrations.AddField( model_name="azureblobimportstoragelink", - name="row_idx", - field=models.PositiveIntegerField( + name="row_index", + field=models.IntegerField( blank=True, help_text="Parquet row index, or JSON[L] object index", null=True, @@ -29,14 +29,14 @@ class Migration(migrations.Migration): migrations.AddField( model_name="gcsimportstoragelink", name="row_group", - field=models.PositiveIntegerField( + field=models.IntegerField( blank=True, help_text="Parquet row group", null=True ), ), migrations.AddField( model_name="gcsimportstoragelink", - name="row_idx", - field=models.PositiveIntegerField( + name="row_index", + field=models.IntegerField( blank=True, help_text="Parquet row index, or JSON[L] object index", null=True, @@ -45,14 +45,14 @@ class Migration(migrations.Migration): migrations.AddField( model_name="localfilesimportstoragelink", name="row_group", - field=models.PositiveIntegerField( + field=models.IntegerField( blank=True, help_text="Parquet row group", null=True ), ), migrations.AddField( model_name="localfilesimportstoragelink", - name="row_idx", - field=models.PositiveIntegerField( + name="row_index", + field=models.IntegerField( blank=True, help_text="Parquet row index, or JSON[L] object index", null=True, @@ -61,14 +61,14 @@ class Migration(migrations.Migration): migrations.AddField( model_name="redisimportstoragelink", name="row_group", - field=models.PositiveIntegerField( + field=models.IntegerField( blank=True, help_text="Parquet row group", null=True ), ), migrations.AddField( model_name="redisimportstoragelink", - name="row_idx", - field=models.PositiveIntegerField( + name="row_index", + field=models.IntegerField( blank=True, help_text="Parquet row index, or JSON[L] object index", null=True, @@ -77,14 +77,14 @@ class Migration(migrations.Migration): migrations.AddField( model_name="s3importstoragelink", name="row_group", - field=models.PositiveIntegerField( + field=models.IntegerField( blank=True, help_text="Parquet row group", null=True ), ), migrations.AddField( model_name="s3importstoragelink", - name="row_idx", - field=models.PositiveIntegerField( + name="row_index", + field=models.IntegerField( blank=True, help_text="Parquet row index, or JSON[L] object index", 
null=True, From 2cbcb4158dcd6b1959b1740725cc7c7cc11497d8 Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Wed, 14 May 2025 13:52:29 -0400 Subject: [PATCH 03/77] remove whitespace Co-authored-by: Jo Booth --- label_studio/io_storages/base_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 7030022efded..a534055e4ce8 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -688,7 +688,6 @@ class Meta: class ImportStorageLink(models.Model): task = models.OneToOneField('tasks.Task', on_delete=models.CASCADE, related_name='%(app_label)s_%(class)s') - key = models.TextField(_('key'), null=False, help_text='External link key') # NOTE: unused From 3de9ffb0fc64ad00edd8fb53489a5a1b473f3ec5 Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Wed, 14 May 2025 13:54:05 -0400 Subject: [PATCH 04/77] update comment Co-authored-by: Jo Booth --- label_studio/io_storages/base_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index a534055e4ce8..8f3e602bee94 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -690,7 +690,7 @@ class ImportStorageLink(models.Model): task = models.OneToOneField('tasks.Task', on_delete=models.CASCADE, related_name='%(app_label)s_%(class)s') key = models.TextField(_('key'), null=False, help_text='External link key') - # NOTE: unused + # This field is set to True on creation and never updated; it should not be relied upon. object_exists = models.BooleanField( _('object exists'), help_text='Whether object under external link still exists', default=True ) From dc423d0eae2baf221dda701adee390152826022a Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Wed, 14 May 2025 17:58:18 +0000 Subject: [PATCH 05/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15027678115 --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index a4d687005133..7b27409fb23e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2141,7 +2141,7 @@ optional = false python-versions = ">=3.9,<4" groups = ["main"] files = [ - {file = "856f396d0ad76c201630f086e8cad16ea0d8b568.zip", hash = "sha256:9c8f19fe23536ecc1790c6742ec9e27f2d2e65a0d52601d3bd423d69a5c298f8"}, + {file = "9a2300458d391a6168139a3d6e7ab5fedfccd178.zip", hash = "sha256:6c8b8c0e1da526690db549ce5da126130da515938b4647b21a66e44a1e5f55e4"}, ] [package.dependencies] @@ -2168,7 +2168,7 @@ xmljson = "0.2.1" [package.source] type = "url" -url = "https://github.com/HumanSignal/label-studio-sdk/archive/856f396d0ad76c201630f086e8cad16ea0d8b568.zip" +url = "https://github.com/HumanSignal/label-studio-sdk/archive/9a2300458d391a6168139a3d6e7ab5fedfccd178.zip" [[package]] name = "launchdarkly-server-sdk" @@ -4958,4 +4958,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "87f678be573807802b370d5f00cf48250540f622d409899c4ef3ce761b3c6227" +content-hash = "9f7509b33e9a1c5e25557ecaa94062f7a29d043cd492fd6f20fce64ab21fd914" diff --git a/pyproject.toml b/pyproject.toml index 596a6ae3467c..7369269ded27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ dependencies = [ 
"djangorestframework-simplejwt[crypto] (>=5.4.0,<6.0.0)", "tldextract (>=5.1.3)", ## HumanSignal repo dependencies :start - "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/856f396d0ad76c201630f086e8cad16ea0d8b568.zip", + "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/9a2300458d391a6168139a3d6e7ab5fedfccd178.zip", ## HumanSignal repo dependencies :end ] From 411940c3211e5fd29c259ec06d5da38996fa0c32 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 14:32:57 -0400 Subject: [PATCH 06/77] feat: ROOT-9: Allow reading multiple tasks from a JSON file in source cloud storage From f37b8af5748779c5982abe5d7a5c52430c0587e3 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 14:53:23 -0400 Subject: [PATCH 07/77] update comment --- label_studio/io_storages/base_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 8f3e602bee94..5408a06cd089 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -445,7 +445,7 @@ def _scan_and_create_links(self, link_class): for task_data in tasks_data: # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. - # Also have to handle any mismatch between len(tasks_data) and settings.WEBHOOK_BATCH_SIZE + # See DIA-2062 for prerequisites task = self.add_task(task_data, self.project, maximum_annotations, max_inner_id, self, key, link_class) max_inner_id += 1 From cfbb07687f7e002c132478d86f83dab1b73c5a1e Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 15:11:47 -0400 Subject: [PATCH 08/77] update storagelink creation --- label_studio/io_storages/base_models.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 5408a06cd089..8f2509b4573f 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -341,7 +341,7 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key, link_class): + def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key, row_index, link_class): # predictions predictions = data.get('predictions', []) if predictions: @@ -375,8 +375,8 @@ def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key inner_id=max_inner_id, ) - link_class.create(task, key, storage) - logger.debug(f'Create {storage.__class__.__name__} link with key={key} for task={task}') + link_class.create(task, key, row_index, storage) + logger.debug(f'Create {storage.__class__.__name__} link with {key=} and {row_index=} for {task=}') raise_exception = not flag_set( 'ff_fix_back_dev_3342_storage_scan_with_invalid_annotations', user=AnonymousUser() @@ -424,9 +424,9 @@ def _scan_and_create_links(self, link_class): self.info_update_progress(last_sync_count=tasks_created, tasks_existed=tasks_existed) # skip if task already exists - if link_class.exists(key, self): - logger.debug(f'{self.__class__.__name__} link {key} already exists') - tasks_existed += 1 # update progress counter + if n_tasks_linked := link_class.n_tasks_linked(key, self): + logger.debug(f'{self.__class__.__name__} already has {n_tasks_linked} tasks linked to {key=}') + tasks_existed += n_tasks_linked # update progress counter 
continue logger.debug(f'{self}: found new key {key}') @@ -443,10 +443,11 @@ def _scan_and_create_links(self, link_class): if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): tasks_data = tasks_data[:1] - for task_data in tasks_data: + # TODO: for keys that contain exactly one task not in a list (blob url, top-level JSON dict) should row_index=None? + for row_index, task_data in enumerate(tasks_data): # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. # See DIA-2062 for prerequisites - task = self.add_task(task_data, self.project, maximum_annotations, max_inner_id, self, key, link_class) + task = self.add_task(task_data, self.project, maximum_annotations, max_inner_id, self, key, row_index, link_class) max_inner_id += 1 # update progress counters for storage info @@ -701,12 +702,12 @@ class ImportStorageLink(models.Model): row_index = models.IntegerField(null=True, blank=True, help_text='Parquet row index, or JSON[L] object index') @classmethod - def exists(cls, key, storage): - return cls.objects.filter(key=key, storage=storage.id).exists() + def n_tasks_linked(cls, key, storage): + return cls.objects.filter(key=key, storage=storage.id).count() @classmethod - def create(cls, task, key, storage): - link, created = cls.objects.get_or_create(task_id=task.id, key=key, storage=storage, object_exists=True) + def create(cls, task, key, row_index, storage): + link, created = cls.objects.get_or_create(task_id=task.id, key=key, row_index=row_index, storage=storage, object_exists=True) return link class Meta: From d3c02d92f2014038e35c5e8136d55aa77da1fb7f Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 15:14:30 -0400 Subject: [PATCH 09/77] fix DM action --- label_studio/data_manager/actions/remove_duplicates.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/label_studio/data_manager/actions/remove_duplicates.py b/label_studio/data_manager/actions/remove_duplicates.py index fdcfe992908e..6abee19967da 100644 --- a/label_studio/data_manager/actions/remove_duplicates.py +++ b/label_studio/data_manager/actions/remove_duplicates.py @@ -172,6 +172,8 @@ def restore_storage_links_for_duplicated_tasks(duplicates) -> None: link = storage_link_class( task_id=task['id'], key=link_instance.key, + row_index=link_instance.row_index, + row_group=link_instance.row_group, storage=link_instance.storage, ) link.save() From 1033d26551907232f82b3401b6e96ba6839e077e Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 15:14:55 -0400 Subject: [PATCH 10/77] blue --- label_studio/io_storages/base_models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 8f2509b4573f..cb321012e4f5 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -447,7 +447,9 @@ def _scan_and_create_links(self, link_class): for row_index, task_data in enumerate(tasks_data): # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. 
# See DIA-2062 for prerequisites - task = self.add_task(task_data, self.project, maximum_annotations, max_inner_id, self, key, row_index, link_class) + task = self.add_task( + task_data, self.project, maximum_annotations, max_inner_id, self, key, row_index, link_class + ) max_inner_id += 1 # update progress counters for storage info @@ -707,7 +709,9 @@ def n_tasks_linked(cls, key, storage): @classmethod def create(cls, task, key, row_index, storage): - link, created = cls.objects.get_or_create(task_id=task.id, key=key, row_index=row_index, storage=storage, object_exists=True) + link, created = cls.objects.get_or_create( + task_id=task.id, key=key, row_index=row_index, storage=storage, object_exists=True + ) return link class Meta: From 4e75268e56aa064ff922ed3159bfaebee47f7f9a Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 15:43:32 -0400 Subject: [PATCH 11/77] udpate test --- .../tests/data_manager/test_api_actions.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/label_studio/tests/data_manager/test_api_actions.py b/label_studio/tests/data_manager/test_api_actions.py index cd2fa778028c..0175d5810581 100644 --- a/label_studio/tests/data_manager/test_api_actions.py +++ b/label_studio/tests/data_manager/test_api_actions.py @@ -137,24 +137,31 @@ def test_action_remove_duplicates(business_client, project_id, storage_model, li # task 4: add duplicated task, with storage link and one annotation task4 = make_task(task_data, project) make_annotation({'result': []}, task4.id) + # this task would have row_index=0 instead of None if it was created after multitask support was added link_model.objects.create(task=task4, key='duplicated.jpg', storage=storage) + # task 5: add not a duplicated task using the same key, ensuring multiple tasks in the same key don't interfere + task_data = {'data': {'image': 'normal2.jpg'}} + task5 = make_task(task_data, project) + link_model.objects.create(task=task5, key='duplicated.jpg', row_index=1, storage=storage) + # call the "remove duplicated tasks" action status = business_client.post( f'/api/dm/actions?project={project_id}&id=remove_duplicates', json={'selectedItems': {'all': True, 'excluded': []}}, ) - # As the result, we should have only 2 tasks left: - # task 1 and task 3 with storage link copied from task 4 + # As the result, we should have only 3 tasks left: + # task 1, task 5, and task 3 with storage link copied from task 4 assert list(project.tasks.order_by('id').values_list('id', flat=True)) == [ task1.id, task3.id, + task5.id, ] assert status.status_code == 200 - assert link_model.objects.count() == 1 + assert link_model.objects.count() == 2 assert project.annotations.count() == 4 - assert project.tasks.count() == 2 + assert project.tasks.count() == 3 @pytest.mark.django_db From b629817f6e9e33de0ca20a3a683b2173e9c2b1b8 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 14 May 2025 16:52:31 -0400 Subject: [PATCH 12/77] futureproof create method --- label_studio/io_storages/base_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index cb321012e4f5..4878389ea3f1 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -375,7 +375,7 @@ def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key inner_id=max_inner_id, ) - link_class.create(task, key, row_index, storage) + link_class.create(task, key, storage, 
row_index=row_index) logger.debug(f'Create {storage.__class__.__name__} link with {key=} and {row_index=} for {task=}') raise_exception = not flag_set( @@ -708,9 +708,9 @@ def n_tasks_linked(cls, key, storage): return cls.objects.filter(key=key, storage=storage.id).count() @classmethod - def create(cls, task, key, row_index, storage): + def create(cls, task, key, storage, row_index=0, row_group=None): link, created = cls.objects.get_or_create( - task_id=task.id, key=key, row_index=row_index, storage=storage, object_exists=True + task_id=task.id, key=key, row_index=row_index, row_group=row_group, storage=storage, object_exists=True ) return link From 8f9d6e8be10c54633bf1fdff44032134491c22d0 Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Wed, 14 May 2025 20:55:39 +0000 Subject: [PATCH 13/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15030860401 From 5714e9890e289cdeaaf10a9baf29808fd22d25fb Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Thu, 15 May 2025 13:01:40 +0000 Subject: [PATCH 14/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15045586529 From 85e7b0ae0602c58995edb1803c2ce22d17e14be1 Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Thu, 15 May 2025 18:27:49 +0000 Subject: [PATCH 15/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15052428789 From f6841e37b4da72a8745a8bb62fefaccf42690daa Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 15 May 2025 15:55:07 -0400 Subject: [PATCH 16/77] make row_index=none default --- label_studio/io_storages/base_models.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 4878389ea3f1..1e9ff448ece8 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -423,7 +423,7 @@ def _scan_and_create_links(self, link_class): logger.debug(f'Scanning key {key}') self.info_update_progress(last_sync_count=tasks_created, tasks_existed=tasks_existed) - # skip if task already exists + # skip if key has already been synced if n_tasks_linked := link_class.n_tasks_linked(key, self): logger.debug(f'{self.__class__.__name__} already has {n_tasks_linked} tasks linked to {key=}') tasks_existed += n_tasks_linked # update progress counter @@ -439,12 +439,16 @@ def _scan_and_create_links(self, link_class): f'(images, audio, text, etc.), edit storage settings and enable ' f'"Treat every bucket object as a source file"' ) - - if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): - tasks_data = tasks_data[:1] - - # TODO: for keys that contain exactly one task not in a list (blob url, top-level JSON dict) should row_index=None? - for row_index, task_data in enumerate(tasks_data): + + if isinstance(tasks_data, dict): + tasks_data = [tasks_data] + row_indices = [None] + else: + if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): + tasks_data = tasks_data[:1] + row_indices = range(len(tasks_data)) + + for row_index, task_data in zip(row_indices, tasks_data): # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. 
# See DIA-2062 for prerequisites task = self.add_task( @@ -708,7 +712,7 @@ def n_tasks_linked(cls, key, storage): return cls.objects.filter(key=key, storage=storage.id).count() @classmethod - def create(cls, task, key, storage, row_index=0, row_group=None): + def create(cls, task, key, storage, row_index=None, row_group=None): link, created = cls.objects.get_or_create( task_id=task.id, key=key, row_index=row_index, row_group=row_group, storage=storage, object_exists=True ) From 810ba624a93e04a45e2e1c55ccd53a04bb6ffaf5 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 15 May 2025 16:00:10 -0400 Subject: [PATCH 17/77] get_data return type --- label_studio/io_storages/azure_blob/models.py | 6 +++--- label_studio/io_storages/gcs/models.py | 6 +++--- label_studio/io_storages/redis/models.py | 5 +++-- label_studio/io_storages/s3/models.py | 6 +++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index 7d2002970743..01798acc562d 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -209,17 +209,17 @@ def iterkeys(self): continue yield file.name - def get_data(self, key) -> list[dict]: + def get_data(self, key) -> Union[dict, list[dict]]: if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME - return [{data_key: f'{self.url_scheme}://{self.container}/{key}'}] + return {data_key: f'{self.url_scheme}://{self.container}/{key}'} container = self.get_container() blob = container.download_blob(key) blob_str = blob.content_as_text() value = json.loads(blob_str) if isinstance(value, dict): - return [value] + return value elif isinstance(value, list): for idx, item in enumerate(value): if not isinstance(item, dict): diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 29c28b021752..feac991c02b1 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -180,9 +180,9 @@ def iterkeys(self): return_key=True, ) - def get_data(self, key) -> list[dict]: + def get_data(self, key) -> Union[dict, list[dict]]: if self.use_blob_urls: - return [{settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)}] + return {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} data = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, @@ -190,7 +190,7 @@ def get_data(self, key) -> list[dict]: convert_to=GCS.ConvertBlobTo.JSON, ) if isinstance(data, dict): - return [data] + return data elif isinstance(data, list): for idx, item in enumerate(data): if not isinstance(item, dict): diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 200a02b6dd76..1f6c49c128cb 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -3,6 +3,7 @@ import json import logging +from typing import Union import redis from django.db import models @@ -89,7 +90,7 @@ def iterkeys(self): for key in client.keys(path + '*'): yield key - def get_data(self, key) -> list[dict]: + def get_data(self, key) -> Union[dict, list[dict]]: client = self.get_client() value_str = client.get(key) if not value_str: @@ -98,7 +99,7 @@ def get_data(self, key) -> list[dict]: value = json.loads(value_str) # NOTE: this validation did not previously exist, we were accepting any JSON values if isinstance(value, dict): - return [value] + return value elif isinstance(value, list): for idx, item in 
enumerate(value): if not isinstance(item, dict): diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index b2b5865ff48d..79c1d8a95050 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -218,11 +218,11 @@ def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @catch_and_reraise_from_none - def get_data(self, key) -> list[dict]: + def get_data(self, key) -> Union[dict, list[dict]]: uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME - return [{data_key: uri}] + return {data_key: uri} # read task json from bucket and validate it _, s3 = self.get_client_and_resource() @@ -231,7 +231,7 @@ def get_data(self, key) -> list[dict]: try: value = json.loads(obj) if isinstance(value, dict): - return [value] + return value elif isinstance(value, list): for idx, item in enumerate(value): if not isinstance(item, dict): From 8b41a3d386b764a3dbb36788996efad99c5584eb Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 15 May 2025 16:02:57 -0400 Subject: [PATCH 18/77] blue --- label_studio/io_storages/base_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 1e9ff448ece8..5c6adc329325 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -439,7 +439,7 @@ def _scan_and_create_links(self, link_class): f'(images, audio, text, etc.), edit storage settings and enable ' f'"Treat every bucket object as a source file"' ) - + if isinstance(tasks_data, dict): tasks_data = [tasks_data] row_indices = [None] From 3ae1230067a66350d7a4645e37ef24be7391b199 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 15 May 2025 17:36:40 -0400 Subject: [PATCH 19/77] test new model fields --- .../tests/test_multitask_import.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index d54b140dcfbb..46c72dbcb694 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -3,6 +3,8 @@ import boto3 import pytest from django.test import TestCase +from io_storages.models import S3ImportStorage +from io_storages.s3.models import S3ImportStorageLink from io_storages.tests.factories import ( AzureBlobImportStorageFactory, GCSImportStorageFactory, @@ -12,6 +14,7 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient + from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock @@ -37,7 +40,6 @@ def _test_storage_import(self, storage_class, task_data, **storage_kwargs): client.force_authenticate(user=self.project.created_by) # Setup storage with required credentials - storage = storage_class(project=self.project, **storage_kwargs) # Validate connection before sync @@ -112,3 +114,32 @@ def test_import_multiple_tasks_redis(self): path='', use_blob_urls=False, ) + + def test_storagelink_fields(self): + # use an actual storage and storagelink to test this, since factories aren't connected properly + with mock_s3(): + # Setup S3 bucket and test data + s3 = boto3.client('s3', region_name='us-east-1') + bucket_name = 'pytest-s3-jsons' + s3.create_bucket(Bucket=bucket_name) + + # Put test 
data into S3 + s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps(self.common_task_data)) + + # create a real storage and sync it + storage = S3ImportStorage( + project=self.project, + bucket=bucket_name, + aws_access_key_id='example', + aws_secret_access_key='example', + use_blob_urls=False, + ) + storage.save() + storage.sync() + + # check that the storage link fields are set correctly + storage_links = S3ImportStorageLink.objects.filter(storage=storage).order_by('task_id') + self.assertEqual(storage_links[0].row_index, 0) + self.assertEqual(storage_links[0].row_group, None) + self.assertEqual(storage_links[1].row_index, 1) + self.assertEqual(storage_links[1].row_group, None) From 7cb8e9c367945a1ca30f73e523fc9e82aa26b9bb Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 13:09:29 -0400 Subject: [PATCH 20/77] feat: ROOT-11: Support reading JSONL from source cloud storages From 36e181e77c26b18d07e7ca9652d8a61b5876df8e Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 13:15:01 -0400 Subject: [PATCH 21/77] split out repeated json parsing logic from import storages --- label_studio/io_storages/azure_blob/models.py | 17 +------- label_studio/io_storages/gcs/models.py | 20 ++-------- label_studio/io_storages/redis/models.py | 23 +---------- label_studio/io_storages/s3/models.py | 22 +--------- label_studio/io_storages/utils.py | 40 +++++++++++++++++++ label_studio/tests/jwt_auth/test_views.py | 1 + label_studio/tests/sdk/legacy/test_tasks.py | 1 + label_studio/tests/test_cli.py | 2 +- label_studio/tests/test_has_lock.py | 1 + label_studio/tests/test_io_storages.py | 1 + label_studio/tests/test_organizations.py | 3 +- label_studio/tests/test_project.py | 3 +- .../tests/test_project_reset_summary.py | 1 + 13 files changed, 60 insertions(+), 75 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index 01798acc562d..f73b2f0a5e39 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -25,7 +25,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import parse_range, storage_can_resolve_bucket_url +from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.azure_blob.utils import AZURE @@ -217,20 +217,7 @@ def get_data(self, key) -> Union[dict, list[dict]]: container = self.get_container() blob = container.download_blob(key) blob_str = blob.content_as_text() - value = json.loads(blob_str) - if isinstance(value, dict): - return value - elif isinstance(value, list): - for idx, item in enumerate(value): - if not isinstance(item, dict): - raise ValueError( - f'Error on key {key} item {idx}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - return value - else: - raise ValueError( - f'Error on key {key}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) + return load_tasks_json(blob_str, key, self.__class__.__name__) def scan_and_create_links(self): return self._scan_and_create_links(AzureBlobImportStorageLink) diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index feac991c02b1..62c8dc9faec8 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -23,7 +23,7 @@ 
ProjectStorageMixin, ) from io_storages.gcs.utils import GCS -from io_storages.utils import parse_range, storage_can_resolve_bucket_url +from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -183,25 +183,13 @@ def iterkeys(self): def get_data(self, key) -> Union[dict, list[dict]]: if self.use_blob_urls: return {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} - data = GCS.read_file( + blob_str = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, key=key, - convert_to=GCS.ConvertBlobTo.JSON, + convert_to=GCS.ConvertBlobTo.NOTHING, ) - if isinstance(data, dict): - return data - elif isinstance(data, list): - for idx, item in enumerate(data): - if not isinstance(item, dict): - raise ValueError( - f'Error on key {key} item {idx}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - return data - else: - raise ValueError( - f'Error on key {key}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) + return load_tasks_json(blob_str, key, self.__class__.__name__) def generate_http_url(self, url): return GCS.generate_http_url( diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 1f6c49c128cb..ed87ac2ef0ee 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -17,6 +17,7 @@ ImportStorageLink, ProjectStorageMixin, ) +from io_storages.utils import load_tasks_json from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -95,27 +96,7 @@ def get_data(self, key) -> Union[dict, list[dict]]: value_str = client.get(key) if not value_str: return [] - try: - value = json.loads(value_str) - # NOTE: this validation did not previously exist, we were accepting any JSON values - if isinstance(value, dict): - return value - elif isinstance(value, list): - for idx, item in enumerate(value): - if not isinstance(item, dict): - raise ValueError( - f'Error on key {key} item {idx}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - return value - else: - raise ValueError( - f'Error on key {key}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - except json.decoder.JSONDecodeError: - raise ValueError( - f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " - f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' 
- ) + return load_tasks_json(value_str, key, self.__class__.__name__) def scan_and_create_links(self): return self._scan_and_create_links(RedisImportStorageLink) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 79c1d8a95050..b559df7190ee 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -228,27 +228,9 @@ def get_data(self, key) -> Union[dict, list[dict]]: _, s3 = self.get_client_and_resource() bucket = s3.Bucket(self.bucket) obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') - try: - value = json.loads(obj) - if isinstance(value, dict): - return value - elif isinstance(value, list): - for idx, item in enumerate(value): - if not isinstance(item, dict): - raise TaskValidationError( - f'Error on key {key} item {idx}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - return value + from io_storages.utils import load_tasks_json - else: - raise TaskValidationError( - f'Error on key {key}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - except json.decoder.JSONDecodeError: - raise ValueError( - f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " - f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' - ) + return load_tasks_json(obj, key, self.__class__.__name__, TaskValidationError) @catch_and_reraise_from_none def generate_http_url(self, url): diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 1798704e8091..0af8816090be 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -1,5 +1,6 @@ """This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. """ +import json import logging import re from dataclasses import dataclass @@ -109,3 +110,42 @@ def parse_range(range_header): end = '' return start, end + + +def load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): + """ + Parse blob_str containing task JSON(s) and return the validated result or raise an error. + + Other args are used for error messages. + """ + + try: + value = json.loads(blob_str) + except json.decoder.JSONDecodeError: + raise ValueError( + ( + f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " + f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' 
+ ) + ) + + if isinstance(value, dict): + return value + if isinstance(value, list): + for idx, item in enumerate(value): + if not isinstance(item, dict): + raise error_cls( + ( + f'Error on key {key} item {idx}: For {storage_class_name} ' + 'your JSON file must be a dictionary with one task, or a list of ' + 'dictionaries with one task each' + ) + ) + return value + + raise error_cls( + ( + f'Error on key {key}: For {storage_class_name} your JSON file must be a ' + 'dictionary with one task, or a list of dictionaries with one task each' + ) + ) diff --git a/label_studio/tests/jwt_auth/test_views.py b/label_studio/tests/jwt_auth/test_views.py index 634798820fd5..6479e8aea583 100644 --- a/label_studio/tests/jwt_auth/test_views.py +++ b/label_studio/tests/jwt_auth/test_views.py @@ -3,6 +3,7 @@ from rest_framework import status from rest_framework.test import APIClient from rest_framework_simplejwt.exceptions import TokenError + from tests.jwt_auth.utils import create_user_with_token_settings from tests.utils import mock_feature_flag diff --git a/label_studio/tests/sdk/legacy/test_tasks.py b/label_studio/tests/sdk/legacy/test_tasks.py index 036a45b556d5..e6a414b052df 100644 --- a/label_studio/tests/sdk/legacy/test_tasks.py +++ b/label_studio/tests/sdk/legacy/test_tasks.py @@ -6,6 +6,7 @@ pytestmark = pytest.mark.django_db from label_studio_sdk import Client + from tests.sdk.utils import sdk_logs diff --git a/label_studio/tests/test_cli.py b/label_studio/tests/test_cli.py index 0413bdd9b1c1..0a9b9ef3a02f 100644 --- a/label_studio/tests/test_cli.py +++ b/label_studio/tests/test_cli.py @@ -2,9 +2,9 @@ """ import pytest from server import _create_user -from tests.utils import make_annotation, make_project, make_task from label_studio.core.argparser import parse_input_args +from tests.utils import make_annotation, make_project, make_task @pytest.mark.django_db diff --git a/label_studio/tests/test_has_lock.py b/label_studio/tests/test_has_lock.py index dc9dd15eb64c..0e12c53aa5ec 100644 --- a/label_studio/tests/test_has_lock.py +++ b/label_studio/tests/test_has_lock.py @@ -1,6 +1,7 @@ import json import pytest + from tests.utils import make_project diff --git a/label_studio/tests/test_io_storages.py b/label_studio/tests/test_io_storages.py index 19be1504b4c6..dc7ee9802a93 100644 --- a/label_studio/tests/test_io_storages.py +++ b/label_studio/tests/test_io_storages.py @@ -1,6 +1,7 @@ import json import pytest + from tests.utils import make_project diff --git a/label_studio/tests/test_organizations.py b/label_studio/tests/test_organizations.py index 87e4a9ab0825..dbcce8423aaa 100644 --- a/label_studio/tests/test_organizations.py +++ b/label_studio/tests/test_organizations.py @@ -3,9 +3,10 @@ import pytest from organizations.models import Organization, OrganizationMember from tasks.models import Task -from tests.utils import make_annotation from users.models import User +from tests.utils import make_annotation + @pytest.mark.django_db def test_active_organization_filled(business_client): diff --git a/label_studio/tests/test_project.py b/label_studio/tests/test_project.py index 03e448165b1f..a3a5b2ac1c6a 100644 --- a/label_studio/tests/test_project.py +++ b/label_studio/tests/test_project.py @@ -2,9 +2,10 @@ import pytest from django.db.models.query import QuerySet -from tests.utils import make_project from users.models import User +from tests.utils import make_project + @pytest.mark.django_db def test_update_tasks_counters_and_task_states(business_client): diff --git 
a/label_studio/tests/test_project_reset_summary.py b/label_studio/tests/test_project_reset_summary.py index 97dab48eb627..404d55a5d44f 100644 --- a/label_studio/tests/test_project_reset_summary.py +++ b/label_studio/tests/test_project_reset_summary.py @@ -2,6 +2,7 @@ import pytest from tasks.models import Task + from tests.conftest import project_choices from tests.utils import make_project From 015a2f726c20695cd2d50cd0f1248d8aa9191be4 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Fri, 16 May 2025 17:54:24 +0000 Subject: [PATCH 22/77] Apply pre-commit linters --- label_studio/io_storages/tests/test_multitask_import.py | 1 - label_studio/tests/jwt_auth/test_views.py | 1 - label_studio/tests/sdk/legacy/test_tasks.py | 1 - label_studio/tests/test_cli.py | 2 +- label_studio/tests/test_has_lock.py | 1 - label_studio/tests/test_io_storages.py | 1 - label_studio/tests/test_organizations.py | 3 +-- label_studio/tests/test_project.py | 3 +-- label_studio/tests/test_project_reset_summary.py | 1 - 9 files changed, 3 insertions(+), 11 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 46c72dbcb694..e73ceaf6ea2a 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -14,7 +14,6 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient - from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock diff --git a/label_studio/tests/jwt_auth/test_views.py b/label_studio/tests/jwt_auth/test_views.py index 6479e8aea583..634798820fd5 100644 --- a/label_studio/tests/jwt_auth/test_views.py +++ b/label_studio/tests/jwt_auth/test_views.py @@ -3,7 +3,6 @@ from rest_framework import status from rest_framework.test import APIClient from rest_framework_simplejwt.exceptions import TokenError - from tests.jwt_auth.utils import create_user_with_token_settings from tests.utils import mock_feature_flag diff --git a/label_studio/tests/sdk/legacy/test_tasks.py b/label_studio/tests/sdk/legacy/test_tasks.py index e6a414b052df..036a45b556d5 100644 --- a/label_studio/tests/sdk/legacy/test_tasks.py +++ b/label_studio/tests/sdk/legacy/test_tasks.py @@ -6,7 +6,6 @@ pytestmark = pytest.mark.django_db from label_studio_sdk import Client - from tests.sdk.utils import sdk_logs diff --git a/label_studio/tests/test_cli.py b/label_studio/tests/test_cli.py index 0a9b9ef3a02f..0413bdd9b1c1 100644 --- a/label_studio/tests/test_cli.py +++ b/label_studio/tests/test_cli.py @@ -2,9 +2,9 @@ """ import pytest from server import _create_user +from tests.utils import make_annotation, make_project, make_task from label_studio.core.argparser import parse_input_args -from tests.utils import make_annotation, make_project, make_task @pytest.mark.django_db diff --git a/label_studio/tests/test_has_lock.py b/label_studio/tests/test_has_lock.py index 0e12c53aa5ec..dc9dd15eb64c 100644 --- a/label_studio/tests/test_has_lock.py +++ b/label_studio/tests/test_has_lock.py @@ -1,7 +1,6 @@ import json import pytest - from tests.utils import make_project diff --git a/label_studio/tests/test_io_storages.py b/label_studio/tests/test_io_storages.py index dc7ee9802a93..19be1504b4c6 100644 --- a/label_studio/tests/test_io_storages.py +++ b/label_studio/tests/test_io_storages.py @@ -1,7 +1,6 @@ import json import pytest - from tests.utils import make_project diff --git 
a/label_studio/tests/test_organizations.py b/label_studio/tests/test_organizations.py index dbcce8423aaa..87e4a9ab0825 100644 --- a/label_studio/tests/test_organizations.py +++ b/label_studio/tests/test_organizations.py @@ -3,9 +3,8 @@ import pytest from organizations.models import Organization, OrganizationMember from tasks.models import Task -from users.models import User - from tests.utils import make_annotation +from users.models import User @pytest.mark.django_db diff --git a/label_studio/tests/test_project.py b/label_studio/tests/test_project.py index a3a5b2ac1c6a..03e448165b1f 100644 --- a/label_studio/tests/test_project.py +++ b/label_studio/tests/test_project.py @@ -2,9 +2,8 @@ import pytest from django.db.models.query import QuerySet -from users.models import User - from tests.utils import make_project +from users.models import User @pytest.mark.django_db diff --git a/label_studio/tests/test_project_reset_summary.py b/label_studio/tests/test_project_reset_summary.py index 404d55a5d44f..97dab48eb627 100644 --- a/label_studio/tests/test_project_reset_summary.py +++ b/label_studio/tests/test_project_reset_summary.py @@ -2,7 +2,6 @@ import pytest from tasks.models import Task - from tests.conftest import project_choices from tests.utils import make_project From aa5113149822d6de2de198b4671ede7bd6fd7f56 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 13:57:43 -0400 Subject: [PATCH 23/77] add to settings --- label_studio/core/settings/base.py | 1 + label_studio/io_storages/utils.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py index 1dec35b2f451..a92e76c6d24d 100644 --- a/label_studio/core/settings/base.py +++ b/label_studio/core/settings/base.py @@ -597,6 +597,7 @@ MEMBER_PERM = 'core.api_permissions.MemberHasOwnerPermission' RECALCULATE_ALL_STATS = None GET_STORAGE_LIST = 'io_storages.functions.get_storage_list' +STORAGE_LOAD_TASKS_JSON = 'io_storages.utils.load_tasks_json' STORAGE_ANNOTATION_SERIALIZER = 'io_storages.serializers.StorageAnnotationSerializer' TASK_SERIALIZER_BULK = 'tasks.serializers.BaseTaskSerializerBulk' PREPROCESS_FIELD_NAME = 'data_manager.functions.preprocess_field_name' diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 0af8816090be..e045823e1eb1 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -6,6 +6,10 @@ from dataclasses import dataclass from typing import Union +from django.conf import settings + +from label_studio.core.utils.common import load_func + logger = logging.getLogger(__name__) # Put storage prefixes here @@ -112,7 +116,7 @@ def parse_range(range_header): return start, end -def load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): +def _load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. 
@@ -149,3 +153,9 @@ def load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls= 'dictionary with one task, or a list of dictionaries with one task each' ) ) + + +def load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): + # uses _load_tasks_json here and an LSE-specific implementation in LSE + load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) + return load_tasks_json_func(blob_str, key, storage_class_name, error_cls) From 8c45515a4f5b4be303b2e7e542da539637d4e30c Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 14:00:51 -0400 Subject: [PATCH 24/77] comment --- label_studio/io_storages/s3/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index b559df7190ee..8317a5cd0c01 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -230,6 +230,7 @@ def get_data(self, key) -> Union[dict, list[dict]]: obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') from io_storages.utils import load_tasks_json + # TODO: Why do only S3 storages use TaskValidationError here? If the mystery is resolved, can remove this argument from load_tasks_json and use ValueError everywhere return load_tasks_json(obj, key, self.__class__.__name__, TaskValidationError) @catch_and_reraise_from_none From fbe800f865c77b11a59ca9499097eec8a3a7dc62 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 14:09:59 -0400 Subject: [PATCH 25/77] add pyarrow lib --- poetry.lock | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7b27409fb23e..8ad3d9b25e50 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3178,6 +3178,61 @@ files = [ {file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"}, ] +[[package]] +name = "pyarrow" +version = "18.1.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c"}, + {file = "pyarrow-18.1.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b516dad76f258a702f7ca0250885fc93d1fa5ac13ad51258e39d402bd9e2e1e4"}, + {file = "pyarrow-18.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f443122c8e31f4c9199cb23dca29ab9427cef990f283f80fe15b8e124bcc49b"}, + {file = "pyarrow-18.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0a03da7f2758645d17b7b4f83c8bffeae5bbb7f974523fe901f36288d2eab71"}, + {file = "pyarrow-18.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ba17845efe3aa358ec266cf9cc2800fa73038211fb27968bfa88acd09261a470"}, + {file = "pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3c35813c11a059056a22a3bef520461310f2f7eea5c8a11ef9de7062a23f8d56"}, + {file = "pyarrow-18.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9736ba3c85129d72aefa21b4f3bd715bc4190fe4426715abfff90481e7d00812"}, + {file = "pyarrow-18.1.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:eaeabf638408de2772ce3d7793b2668d4bb93807deed1725413b70e3156a7854"}, + {file = "pyarrow-18.1.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:3b2e2239339c538f3464308fd345113f886ad031ef8266c6f004d49769bb074c"}, 
+ {file = "pyarrow-18.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f39a2e0ed32a0970e4e46c262753417a60c43a3246972cfc2d3eb85aedd01b21"}, + {file = "pyarrow-18.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e31e9417ba9c42627574bdbfeada7217ad8a4cbbe45b9d6bdd4b62abbca4c6f6"}, + {file = "pyarrow-18.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01c034b576ce0eef554f7c3d8c341714954be9b3f5d5bc7117006b85fcf302fe"}, + {file = "pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f266a2c0fc31995a06ebd30bcfdb7f615d7278035ec5b1cd71c48d56daaf30b0"}, + {file = "pyarrow-18.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:d4f13eee18433f99adefaeb7e01d83b59f73360c231d4782d9ddfaf1c3fbde0a"}, + {file = "pyarrow-18.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d"}, + {file = "pyarrow-18.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee"}, + {file = "pyarrow-18.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992"}, + {file = "pyarrow-18.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54"}, + {file = "pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33"}, + {file = "pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30"}, + {file = "pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99"}, + {file = "pyarrow-18.1.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b"}, + {file = "pyarrow-18.1.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2"}, + {file = "pyarrow-18.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191"}, + {file = "pyarrow-18.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa"}, + {file = "pyarrow-18.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c"}, + {file = "pyarrow-18.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c"}, + {file = "pyarrow-18.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181"}, + {file = "pyarrow-18.1.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc"}, + {file = "pyarrow-18.1.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386"}, + {file = "pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324"}, + {file = "pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8"}, + 
{file = "pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9"}, + {file = "pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba"}, + {file = "pyarrow-18.1.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0b331e477e40f07238adc7ba7469c36b908f07c89b95dd4bd3a0ec84a3d1e21e"}, + {file = "pyarrow-18.1.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c4dd0c9010a25ba03e198fe743b1cc03cd33c08190afff371749c52ccbbaf76"}, + {file = "pyarrow-18.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f97b31b4c4e21ff58c6f330235ff893cc81e23da081b1a4b1c982075e0ed4e9"}, + {file = "pyarrow-18.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a4813cb8ecf1809871fd2d64a8eff740a1bd3691bbe55f01a3cf6c5ec869754"}, + {file = "pyarrow-18.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:05a5636ec3eb5cc2a36c6edb534a38ef57b2ab127292a716d00eabb887835f1e"}, + {file = "pyarrow-18.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:73eeed32e724ea3568bb06161cad5fa7751e45bc2228e33dcb10c614044165c7"}, + {file = "pyarrow-18.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052"}, + {file = "pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73"}, +] + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyasn1" version = "0.5.0" @@ -4958,4 +5013,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "9f7509b33e9a1c5e25557ecaa94062f7a29d043cd492fd6f20fce64ab21fd914" +content-hash = "7684e8020651378bda172bf7b018e0251aad720a8e68980ac61811ea98074ab7" diff --git a/pyproject.toml b/pyproject.toml index 6d27c90f181c..548646c449b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "ordered-set (==4.0.2)", "pandas (>=2.2.3)", "psycopg2-binary (==2.9.10)", + "pyarrow (>=18.0.0,<19.0.0)", "pydantic (>=2.9.2)", "python-dateutil (>=2.8.1)", "pytz (>=2022.1,<2023.0)", From e89639dbed3615af21d23ab489a3fdabdbd7438d Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 14:22:12 -0400 Subject: [PATCH 26/77] cleanup GCS utils --- label_studio/io_storages/gcs/models.py | 1 - label_studio/io_storages/gcs/utils.py | 23 ----------------------- 2 files changed, 24 deletions(-) diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 62c8dc9faec8..134fcbb3a4da 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -187,7 +187,6 @@ def get_data(self, key) -> Union[dict, list[dict]]: client=self.get_client(), bucket_name=self.bucket, key=key, - convert_to=GCS.ConvertBlobTo.NOTHING, ) return load_tasks_json(blob_str, key, self.__class__.__name__) diff --git a/label_studio/io_storages/gcs/utils.py b/label_studio/io_storages/gcs/utils.py index 60340457eaa1..ba92d1ef7551 100644 --- a/label_studio/io_storages/gcs/utils.py +++ b/label_studio/io_storages/gcs/utils.py @@ -250,15 +250,6 @@ def iter_images_filename(cls, client, bucket_name, max_files): def get_uri(cls, bucket_name, key): return f'gs://{bucket_name}/{key}' - @classmethod - def _try_read_json(cls, blob_str): - try: - data = json.loads(blob_str) - except ValueError: - logger.error(f"Can't parse JSON from {blob_str}") - 
return - return data - @classmethod def read_file( cls, client: gcs.Client, bucket_name: str, key: str, convert_to: ConvertBlobTo = ConvertBlobTo.NOTHING @@ -266,20 +257,6 @@ def read_file( bucket = client.get_bucket(bucket_name) blob = bucket.blob(key) blob_str = blob.download_as_bytes() - if convert_to == cls.ConvertBlobTo.NOTHING: - return blob_str - elif convert_to == cls.ConvertBlobTo.JSON: - return cls._try_read_json(blob_str) - elif convert_to == cls.ConvertBlobTo.JSON_DICT: - json_data = cls._try_read_json(blob_str) - if not isinstance(json_data, dict): - raise ValueError( - f'Error on key {key}: For {cls.__name__} your JSON file must be a dictionary with one task.' - ) - return json_data - elif convert_to == cls.ConvertBlobTo.BASE64: - return base64.b64encode(blob_str) - return blob_str @classmethod From 364ccfbc1189a072581345d3e4103595f313acea Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Fri, 16 May 2025 18:26:51 +0000 Subject: [PATCH 27/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15074933586 From 2dcbca39ba82efd5fd5cef545ccb655d9a35a738 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 16 May 2025 16:54:15 -0400 Subject: [PATCH 28/77] fix gcs --- label_studio/io_storages/gcs/models.py | 1 + label_studio/io_storages/gcs/utils.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 134fcbb3a4da..62c8dc9faec8 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -187,6 +187,7 @@ def get_data(self, key) -> Union[dict, list[dict]]: client=self.get_client(), bucket_name=self.bucket, key=key, + convert_to=GCS.ConvertBlobTo.NOTHING, ) return load_tasks_json(blob_str, key, self.__class__.__name__) diff --git a/label_studio/io_storages/gcs/utils.py b/label_studio/io_storages/gcs/utils.py index ba92d1ef7551..b1163308fd94 100644 --- a/label_studio/io_storages/gcs/utils.py +++ b/label_studio/io_storages/gcs/utils.py @@ -257,6 +257,10 @@ def read_file( bucket = client.get_bucket(bucket_name) blob = bucket.blob(key) blob_str = blob.download_as_bytes() + + if convert_to == cls.ConvertBlobTo.BASE64: + return base64.b64encode(blob_str) + return blob_str @classmethod From c45fbfd865e3d9c06a28885b56b8932d2f84a947 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 12:26:25 -0400 Subject: [PATCH 29/77] organize import --- label_studio/io_storages/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index e045823e1eb1..4f2e02907bd5 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -8,7 +8,8 @@ from django.conf import settings -from label_studio.core.utils.common import load_func +from core.feature_flags import flag_set +from core.utils.common import load_func logger = logging.getLogger(__name__) From 64ac978a02018dcb838422989e20a860c37be324 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 13:53:36 -0400 Subject: [PATCH 30/77] handle localfiles --- label_studio/io_storages/localfiles/models.py | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 48f8c6f8e7da..0fc74fd25f35 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -20,6 +20,7 
@@ ImportStorageLink, ProjectStorageMixin, ) +from io_storages.utils import load_tasks_json from rest_framework.exceptions import ValidationError from tasks.models import Annotation @@ -90,27 +91,13 @@ def get_data(self, key) -> dict | list[dict]: } try: - with open(path, encoding='utf8') as f: - value = json.load(f) - except (UnicodeDecodeError, json.decoder.JSONDecodeError): - raise ValueError( - f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " - f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' - ) - - if isinstance(value, dict): - return value - elif isinstance(value, list): - for idx, item in enumerate(value): - if not isinstance(item, dict): - raise ValueError( - f'Error on key {key} item {idx}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) - return value - else: - raise ValueError( - f'Error on key {key}: For {self.__class__.__name__} your JSON file must be a dictionary with one task, or a list of dictionaries with one task each' - ) + with open(path, 'rb') as f: + blob_str = f.read().decode('utf-8') + return load_tasks_json(blob_str, key, self.__class__.__name__) + except UnicodeDecodeError as e: + raise ValueError(f"Failed to decode file {path} as UTF-8: {str(e)}") + except OSError as e: + raise ValueError(f"Failed to read file {path}: {str(e)}") def scan_and_create_links(self): return self._scan_and_create_links(LocalFilesImportStorageLink) From fecf18e37c64f20c14acc5c96b29c7d67335fbc9 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 13:54:53 -0400 Subject: [PATCH 31/77] remove unused TaskValidationError --- label_studio/io_storages/s3/models.py | 4 +--- label_studio/io_storages/utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index ea92f776d593..a383ab62238b 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -29,7 +29,6 @@ ) from io_storages.utils import storage_can_resolve_bucket_url, load_tasks_json from tasks.models import Annotation -from tasks.validation import ValidationError as TaskValidationError from label_studio.io_storages.s3.utils import AWS @@ -229,8 +228,7 @@ def get_data(self, key) -> Union[dict, list[dict]]: bucket = s3.Bucket(self.bucket) obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') - # TODO: Why do only S3 storages use TaskValidationError here? If the mystery is resolved, can remove this argument from load_tasks_json and use ValueError everywhere - return load_tasks_json(obj, key, self.__class__.__name__, TaskValidationError) + return load_tasks_json(obj, key, self.__class__.__name__) @catch_and_reraise_from_none def generate_http_url(self, url): diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 4f2e02907bd5..e96520e5e1fe 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -117,7 +117,7 @@ def parse_range(range_header): return start, end -def _load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): +def _load_tasks_json(blob_str: str, key: str, storage_class_name: str): """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. 
@@ -139,7 +139,7 @@ def _load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls if isinstance(value, list): for idx, item in enumerate(value): if not isinstance(item, dict): - raise error_cls( + raise ValueError( ( f'Error on key {key} item {idx}: For {storage_class_name} ' 'your JSON file must be a dictionary with one task, or a list of ' @@ -148,7 +148,7 @@ def _load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls ) return value - raise error_cls( + raise ValueError( ( f'Error on key {key}: For {storage_class_name} your JSON file must be a ' 'dictionary with one task, or a list of dictionaries with one task each' @@ -156,7 +156,7 @@ def _load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls ) -def load_tasks_json(blob_str: str, key: str, storage_class_name: str, error_cls=ValueError): +def load_tasks_json(blob_str: str, key: str, storage_class_name: str): # uses _load_tasks_json here and an LSE-specific implementation in LSE load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) - return load_tasks_json_func(blob_str, key, storage_class_name, error_cls) + return load_tasks_json_func(blob_str, key, storage_class_name) From f219f21e1dff931f0fbbb7747923f03ca3ec2d30 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 15:41:11 -0400 Subject: [PATCH 32/77] add jsonl; compute row_idx and row_group in get_data instead of afterwards --- label_studio/io_storages/azure_blob/models.py | 5 +- label_studio/io_storages/base_models.py | 31 +++++---- label_studio/io_storages/gcs/models.py | 5 +- label_studio/io_storages/localfiles/models.py | 9 +-- label_studio/io_storages/redis/models.py | 5 +- label_studio/io_storages/s3/models.py | 7 +- label_studio/io_storages/utils.py | 66 +++++++++++-------- 7 files changed, 74 insertions(+), 54 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index f73b2f0a5e39..c0819984ad53 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -209,10 +209,11 @@ def iterkeys(self): continue yield file.name - def get_data(self, key) -> Union[dict, list[dict]]: + def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME - return {data_key: f'{self.url_scheme}://{self.container}/{key}'} + task = {data_key: f'{self.url_scheme}://{self.container}/{key}'} + return [task], [None], [None] container = self.get_container() blob = container.download_blob(key) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 5c6adc329325..f6b45567548e 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -341,7 +341,9 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key, row_index, link_class): + def add_task( + cls, data, project, maximum_annotations, max_inner_id, storage, key, row_index, row_group, link_class + ): # predictions predictions = data.get('predictions', []) if predictions: @@ -375,8 +377,8 @@ def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, key inner_id=max_inner_id, ) - link_class.create(task, key, storage, row_index=row_index) - logger.debug(f'Create {storage.__class__.__name__} link with {key=} and {row_index=} for {task=}') + link_class.create(task, 
key, storage, row_index=row_index, row_group=row_group) + logger.debug(f'Create {storage.__class__.__name__} link with {key=} {row_index=} {row_group=} for {task=}') raise_exception = not flag_set( 'ff_fix_back_dev_3342_storage_scan_with_invalid_annotations', user=AnonymousUser() @@ -431,7 +433,7 @@ def _scan_and_create_links(self, link_class): logger.debug(f'{self}: found new key {key}') try: - tasks_data = self.get_data(key) + tasks_data, row_indices, row_groups = self.get_data(key) except (UnicodeDecodeError, json.decoder.JSONDecodeError) as exc: logger.debug(exc, exc_info=True) raise ValueError( @@ -440,19 +442,22 @@ def _scan_and_create_links(self, link_class): f'"Treat every bucket object as a source file"' ) - if isinstance(tasks_data, dict): - tasks_data = [tasks_data] - row_indices = [None] - else: - if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): - tasks_data = tasks_data[:1] - row_indices = range(len(tasks_data)) + if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): + tasks_data = tasks_data[:1] - for row_index, task_data in zip(row_indices, tasks_data): + for task_data, row_index, row_group in zip(tasks_data, row_indices, row_groups): # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. # See DIA-2062 for prerequisites task = self.add_task( - task_data, self.project, maximum_annotations, max_inner_id, self, key, row_index, link_class + task_data, + self.project, + maximum_annotations, + max_inner_id, + self, + key, + row_index, + row_group, + link_class, ) max_inner_id += 1 diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 62c8dc9faec8..846170075917 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -180,9 +180,10 @@ def iterkeys(self): return_key=True, ) - def get_data(self, key) -> Union[dict, list[dict]]: + def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: if self.use_blob_urls: - return {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} + task = {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} + return [task], [None], [None] blob_str = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 0fc74fd25f35..798670fec373 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -79,25 +79,26 @@ def iterkeys(self): continue yield str(file) - def get_data(self, key) -> dict | list[dict]: + def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: path = Path(key) if self.use_blob_urls: # include self-hosted links pointed to local resources via # {settings.HOSTNAME}/data/local-files?d= document_root = Path(settings.LOCAL_FILES_DOCUMENT_ROOT) relative_path = str(path.relative_to(document_root)) - return { + task = { settings.DATA_UNDEFINED_NAME: f'{settings.HOSTNAME}/data/local-files/?d={quote(str(relative_path))}' } + return [task], [None], [None] try: with open(path, 'rb') as f: blob_str = f.read().decode('utf-8') return load_tasks_json(blob_str, key, self.__class__.__name__) except UnicodeDecodeError as e: - raise ValueError(f"Failed to decode file {path} as UTF-8: {str(e)}") + raise ValueError(f'Failed to decode file {path} as UTF-8: {str(e)}') except OSError as e: - raise ValueError(f"Failed to read file {path}: {str(e)}") + raise ValueError(f'Failed to read file 
{path}: {str(e)}') def scan_and_create_links(self): return self._scan_and_create_links(LocalFilesImportStorageLink) diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index ed87ac2ef0ee..c8e5f3efcfd9 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -3,7 +3,6 @@ import json import logging -from typing import Union import redis from django.db import models @@ -91,11 +90,11 @@ def iterkeys(self): for key in client.keys(path + '*'): yield key - def get_data(self, key) -> Union[dict, list[dict]]: + def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: client = self.get_client() value_str = client.get(key) if not value_str: - return [] + return [], [], [] return load_tasks_json(value_str, key, self.__class__.__name__) def scan_and_create_links(self): diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index a383ab62238b..431162a59745 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -27,7 +27,7 @@ get_client_and_resource, resolve_s3_url, ) -from io_storages.utils import storage_can_resolve_bucket_url, load_tasks_json +from io_storages.utils import load_tasks_json, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.s3.utils import AWS @@ -217,11 +217,12 @@ def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @catch_and_reraise_from_none - def get_data(self, key) -> Union[dict, list[dict]]: + def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME - return {data_key: uri} + task = {data_key: uri} + return [task], [None], [None] # read task json from bucket and validate it _, s3 = self.get_client_and_resource() diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index e96520e5e1fe..04f50ffbf41e 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -4,12 +4,13 @@ import logging import re from dataclasses import dataclass -from typing import Union - -from django.conf import settings +from typing import Optional, Union +import pyarrow as pa +import pyarrow.json from core.feature_flags import flag_set from core.utils.common import load_func +from django.conf import settings logger = logging.getLogger(__name__) @@ -117,43 +118,54 @@ def parse_range(range_header): return start, end -def _load_tasks_json(blob_str: str, key: str, storage_class_name: str): +def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[int | None], list[int | None]]: """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. - Other args are used for error messages. + Args: + blob_str (str): The blob string to parse. + key (str): The key of the blob. Used for error messages. + + Returns: + list[dict]: parsed tasks. + list[int|None]: row_index for each task. + list[int|None]: row_group for each task. """ - try: - value = json.loads(blob_str) - except json.decoder.JSONDecodeError: + def _error_wrapper(exc: Optional[Exception] = None): raise ValueError( ( f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' 
) - ) + ) from exc + + # TODO: rely on file extensions here instead of pure exception-chaining? Would be more readable, potentially less reliable, would work for all storage types except redis. + try: + value = json.loads(blob_str) + except json.decoder.JSONDecodeError as e: + if flag_set('fflag_feat_root_11_support_jsonl_cloud_storage'): + try: + table = pyarrow.json.read_json(pa.py_buffer(blob_str)) + return table.to_pylist() + except Exception as e: + _error_wrapper(e) + else: + _error_wrapper(e) if isinstance(value, dict): - return value + return [value], [None], [None] if isinstance(value, list): - for idx, item in enumerate(value): - if not isinstance(item, dict): - raise ValueError( - ( - f'Error on key {key} item {idx}: For {storage_class_name} ' - 'your JSON file must be a dictionary with one task, or a list of ' - 'dictionaries with one task each' - ) - ) - return value - - raise ValueError( - ( - f'Error on key {key}: For {storage_class_name} your JSON file must be a ' - 'dictionary with one task, or a list of dictionaries with one task each' - ) - ) + # validate tasks by briefly converting to table + try: + table = pa.Table.from_pylist(value) + values = table.to_pylist() + except Exception as e: + _error_wrapper(e) + n_tasks = len(values) + return values, list(range(n_tasks)), [None] * n_tasks + + _error_wrapper() def load_tasks_json(blob_str: str, key: str, storage_class_name: str): From 606fa1f179f820077d14fe03e43d2d70c00cf0c0 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 15:46:28 -0400 Subject: [PATCH 33/77] don't pass storage class name --- label_studio/io_storages/azure_blob/models.py | 2 +- label_studio/io_storages/gcs/models.py | 2 +- label_studio/io_storages/localfiles/models.py | 2 +- label_studio/io_storages/redis/models.py | 2 +- label_studio/io_storages/s3/models.py | 2 +- label_studio/io_storages/utils.py | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index c0819984ad53..c5416dbb4638 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -218,7 +218,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] container = self.get_container() blob = container.download_blob(key) blob_str = blob.content_as_text() - return load_tasks_json(blob_str, key, self.__class__.__name__) + return load_tasks_json(blob_str, key) def scan_and_create_links(self): return self._scan_and_create_links(AzureBlobImportStorageLink) diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 846170075917..ff7a5fa6c26e 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -190,7 +190,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] key=key, convert_to=GCS.ConvertBlobTo.NOTHING, ) - return load_tasks_json(blob_str, key, self.__class__.__name__) + return load_tasks_json(blob_str, key) def generate_http_url(self, url): return GCS.generate_http_url( diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 798670fec373..79303b5dcb2d 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -94,7 +94,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] try: with open(path, 'rb') as f: blob_str = 
f.read().decode('utf-8') - return load_tasks_json(blob_str, key, self.__class__.__name__) + return load_tasks_json(blob_str, key) except UnicodeDecodeError as e: raise ValueError(f'Failed to decode file {path} as UTF-8: {str(e)}') except OSError as e: diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index c8e5f3efcfd9..843b27db34af 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -95,7 +95,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] value_str = client.get(key) if not value_str: return [], [], [] - return load_tasks_json(value_str, key, self.__class__.__name__) + return load_tasks_json(value_str, key) def scan_and_create_links(self): return self._scan_and_create_links(RedisImportStorageLink) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 431162a59745..96dc5f83f38f 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -229,7 +229,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] bucket = s3.Bucket(self.bucket) obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') - return load_tasks_json(obj, key, self.__class__.__name__) + return load_tasks_json(obj, key) @catch_and_reraise_from_none def generate_http_url(self, url): diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 04f50ffbf41e..a1f05aedec43 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -168,7 +168,7 @@ def _error_wrapper(exc: Optional[Exception] = None): _error_wrapper() -def load_tasks_json(blob_str: str, key: str, storage_class_name: str): +def load_tasks_json(blob_str: str, key: str): # uses _load_tasks_json here and an LSE-specific implementation in LSE load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) - return load_tasks_json_func(blob_str, key, storage_class_name) + return load_tasks_json_func(blob_str, key) From e62cbc63aad43c4ed21f40058bb145c700e82a11 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 15:52:02 -0400 Subject: [PATCH 34/77] more permissive parsing --- label_studio/io_storages/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index a1f05aedec43..ce6bfb7d8d16 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -146,7 +146,9 @@ def _error_wrapper(exc: Optional[Exception] = None): except json.decoder.JSONDecodeError as e: if flag_set('fflag_feat_root_11_support_jsonl_cloud_storage'): try: - table = pyarrow.json.read_json(pa.py_buffer(blob_str)) + table = pyarrow.json.read_json( + pa.py_buffer(blob_str), parse_options=pa.json.ParseOptions(newlines_in_values=True) + ) return table.to_pylist() except Exception as e: _error_wrapper(e) From 1d4cc495b605c21e05b87e891149967d6741f943 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 16:11:32 -0400 Subject: [PATCH 35/77] fix recursion --- label_studio/core/settings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py index c30c21aaa314..1d30361b0679 100644 --- a/label_studio/core/settings/base.py +++ b/label_studio/core/settings/base.py @@ -598,7 +598,7 @@ MEMBER_PERM = 'core.api_permissions.MemberHasOwnerPermission' 
RECALCULATE_ALL_STATS = None GET_STORAGE_LIST = 'io_storages.functions.get_storage_list' -STORAGE_LOAD_TASKS_JSON = 'io_storages.utils.load_tasks_json' +STORAGE_LOAD_TASKS_JSON = 'io_storages.utils._load_tasks_json' STORAGE_ANNOTATION_SERIALIZER = 'io_storages.serializers.StorageAnnotationSerializer' TASK_SERIALIZER_BULK = 'tasks.serializers.BaseTaskSerializerBulk' PREPROCESS_FIELD_NAME = 'data_manager.functions.preprocess_field_name' From bb42c08b29ada979ab646fe1c966477cd9061c3d Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 17:55:12 -0400 Subject: [PATCH 36/77] structured return type for get_data --- label_studio/io_storages/azure_blob/models.py | 6 ++-- label_studio/io_storages/base_models.py | 10 +++--- label_studio/io_storages/gcs/models.py | 6 ++-- label_studio/io_storages/localfiles/models.py | 6 ++-- label_studio/io_storages/redis/models.py | 6 ++-- label_studio/io_storages/s3/models.py | 7 ++-- label_studio/io_storages/utils.py | 35 ++++++++++++++----- 7 files changed, 46 insertions(+), 30 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index c5416dbb4638..ed175b501061 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -25,7 +25,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url +from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url, StorageLinkParams from tasks.models import Annotation from label_studio.io_storages.azure_blob.utils import AZURE @@ -209,11 +209,11 @@ def iterkeys(self): continue yield file.name - def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: + def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: f'{self.url_scheme}://{self.container}/{key}'} - return [task], [None], [None] + return [task], [StorageLinkParams(key=key)] container = self.get_container() blob = container.download_blob(key) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index f6b45567548e..beb7394bb847 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -433,7 +433,7 @@ def _scan_and_create_links(self, link_class): logger.debug(f'{self}: found new key {key}') try: - tasks_data, row_indices, row_groups = self.get_data(key) + tasks_data, links_params = self.get_data(key) except (UnicodeDecodeError, json.decoder.JSONDecodeError) as exc: logger.debug(exc, exc_info=True) raise ValueError( @@ -445,7 +445,7 @@ def _scan_and_create_links(self, link_class): if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): tasks_data = tasks_data[:1] - for task_data, row_index, row_group in zip(tasks_data, row_indices, row_groups): + for task_data, link_params in zip(tasks_data, links_params): # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. 
# See DIA-2062 for prerequisites task = self.add_task( @@ -454,10 +454,8 @@ def _scan_and_create_links(self, link_class): maximum_annotations, max_inner_id, self, - key, - row_index, - row_group, - link_class, + **link_params, + link_class=link_class, ) max_inner_id += 1 diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index ff7a5fa6c26e..090586ff2d43 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -23,7 +23,7 @@ ProjectStorageMixin, ) from io_storages.gcs.utils import GCS -from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url +from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url, StorageLinkParams from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -180,10 +180,10 @@ def iterkeys(self): return_key=True, ) - def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: + def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: if self.use_blob_urls: task = {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} - return [task], [None], [None] + return [task], [StorageLinkParams(key=key)] blob_str = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 79303b5dcb2d..1f223d2a7267 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -20,7 +20,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json +from io_storages.utils import load_tasks_json, StorageLinkParams from rest_framework.exceptions import ValidationError from tasks.models import Annotation @@ -79,7 +79,7 @@ def iterkeys(self): continue yield str(file) - def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: + def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: path = Path(key) if self.use_blob_urls: # include self-hosted links pointed to local resources via @@ -89,7 +89,7 @@ def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]] task = { settings.DATA_UNDEFINED_NAME: f'{settings.HOSTNAME}/data/local-files/?d={quote(str(relative_path))}' } - return [task], [None], [None] + return [task], [StorageLinkParams(key=key)] try: with open(path, 'rb') as f: diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 843b27db34af..755f6e0be76c 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -16,7 +16,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json +from io_storages.utils import load_tasks_json, StorageLinkParams from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -90,11 +90,11 @@ def iterkeys(self): for key in client.keys(path + '*'): yield key - def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: + def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: client = self.get_client() value_str = client.get(key) if not value_str: - return [], [], [] + return [], [] return load_tasks_json(value_str, key) def scan_and_create_links(self): diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 96dc5f83f38f..75e4f96f7ebe 100644 --- a/label_studio/io_storages/s3/models.py +++ 
b/label_studio/io_storages/s3/models.py @@ -27,7 +27,7 @@ get_client_and_resource, resolve_s3_url, ) -from io_storages.utils import load_tasks_json, storage_can_resolve_bucket_url +from io_storages.utils import load_tasks_json, storage_can_resolve_bucket_url, StorageLinkParams from tasks.models import Annotation from label_studio.io_storages.s3.utils import AWS @@ -217,18 +217,17 @@ def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @catch_and_reraise_from_none - def get_data(self, key) -> tuple[list[dict], list[int | None], list[int | None]]: + def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: uri} - return [task], [None], [None] + return [task], [StorageLinkParams(key=key)] # read task json from bucket and validate it _, s3 = self.get_client_and_resource() bucket = s3.Bucket(self.bucket) obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') - return load_tasks_json(obj, key) @catch_and_reraise_from_none diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index ce6bfb7d8d16..b2147736d0a9 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -118,7 +118,28 @@ def parse_range(range_header): return start, end -def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[int | None], list[int | None]]: +@dataclass +class StorageLinkParams: + key: str + row_index: int | None + row_group: int | None + + @classmethod + def bulk_create( + cls, key, row_idxs: list[int] | None = None, row_groups: list[int] | None = None + ) -> list['StorageLinkParams']: + if row_idxs is None and row_groups is None: + row_idxs, row_groups = [None], [None] + if row_idxs is None: + row_idxs = [None] * len(row_groups) + if row_groups is None: + row_groups = [None] * len(row_idxs) + return [ + cls(key=key, row_index=row_idx, row_group=row_group) for row_idx, row_group in zip(row_idxs, row_groups) + ] + + +def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageLinkParams]]: """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. @@ -128,8 +149,7 @@ def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[int | No Returns: list[dict]: parsed tasks. - list[int|None]: row_index for each task. - list[int|None]: row_group for each task. + list[StorageLinkParams]: link params for each task. 
""" def _error_wrapper(exc: Optional[Exception] = None): @@ -149,14 +169,14 @@ def _error_wrapper(exc: Optional[Exception] = None): table = pyarrow.json.read_json( pa.py_buffer(blob_str), parse_options=pa.json.ParseOptions(newlines_in_values=True) ) - return table.to_pylist() + return table.to_pylist(), StorageLinkParams.bulk_create(key, range(table.num_rows)) except Exception as e: _error_wrapper(e) else: _error_wrapper(e) if isinstance(value, dict): - return [value], [None], [None] + return [value], [StorageLinkParams(key)] if isinstance(value, list): # validate tasks by briefly converting to table try: @@ -164,13 +184,12 @@ def _error_wrapper(exc: Optional[Exception] = None): values = table.to_pylist() except Exception as e: _error_wrapper(e) - n_tasks = len(values) - return values, list(range(n_tasks)), [None] * n_tasks + return values, StorageLinkParams.bulk_create(key, range(len(values))) _error_wrapper() -def load_tasks_json(blob_str: str, key: str): +def load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageLinkParams]]: # uses _load_tasks_json here and an LSE-specific implementation in LSE load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) return load_tasks_json_func(blob_str, key) From 2d9bafe703179ec44fa176644cc7c466ff19efae Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 17:57:36 -0400 Subject: [PATCH 37/77] ruff --- label_studio/io_storages/azure_blob/models.py | 2 +- label_studio/io_storages/gcs/models.py | 2 +- label_studio/io_storages/localfiles/models.py | 2 +- label_studio/io_storages/redis/models.py | 2 +- label_studio/io_storages/s3/models.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index ed175b501061..ff02c3154cce 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -25,7 +25,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url, StorageLinkParams +from io_storages.utils import StorageLinkParams, load_tasks_json, parse_range, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.azure_blob.utils import AZURE diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 090586ff2d43..9b1b9b8f4391 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -23,7 +23,7 @@ ProjectStorageMixin, ) from io_storages.gcs.utils import GCS -from io_storages.utils import load_tasks_json, parse_range, storage_can_resolve_bucket_url, StorageLinkParams +from io_storages.utils import StorageLinkParams, load_tasks_json, parse_range, storage_can_resolve_bucket_url from tasks.models import Annotation logger = logging.getLogger(__name__) diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 1f223d2a7267..b2b00d030336 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -20,7 +20,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json, StorageLinkParams +from io_storages.utils import StorageLinkParams, load_tasks_json from rest_framework.exceptions import ValidationError from tasks.models import Annotation diff --git a/label_studio/io_storages/redis/models.py 
b/label_studio/io_storages/redis/models.py index 755f6e0be76c..37c7fe20508d 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -16,7 +16,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import load_tasks_json, StorageLinkParams +from io_storages.utils import StorageLinkParams, load_tasks_json from tasks.models import Annotation logger = logging.getLogger(__name__) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 75e4f96f7ebe..f58b3e1902ee 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -27,7 +27,7 @@ get_client_and_resource, resolve_s3_url, ) -from io_storages.utils import load_tasks_json, storage_can_resolve_bucket_url, StorageLinkParams +from io_storages.utils import StorageLinkParams, load_tasks_json, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.s3.utils import AWS From b79b88631928c1388a795e49c713b3ffc51168f5 Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Mon, 19 May 2025 17:58:43 -0400 Subject: [PATCH 38/77] string fmt Co-authored-by: Jo Booth --- label_studio/io_storages/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index b2147736d0a9..e39f2d0f8737 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -156,7 +156,7 @@ def _error_wrapper(exc: Optional[Exception] = None): raise ValueError( ( f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " - f'perhaps you\'ve forgot to enable "Treat every bucket object as a source file" option?' + f'perhaps you forgot to enable "Treat every bucket object as a source file" option?' ) ) from exc From 5cb81f654c524133e55058bd13189ccb743bc58f Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Mon, 19 May 2025 17:59:07 -0400 Subject: [PATCH 39/77] string fmt Co-authored-by: Jo Booth --- label_studio/io_storages/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index e39f2d0f8737..6c04c96be830 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -155,7 +155,7 @@ def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageL def _error_wrapper(exc: Optional[Exception] = None): raise ValueError( ( - f"Can't import JSON-formatted tasks from {key}. If you're trying to import binary objects, " + f"Can’t import JSON-formatted tasks from {key}. If you’re trying to import binary objects, " f'perhaps you forgot to enable "Treat every bucket object as a source file" option?' 
) ) from exc From 93104db8b4508051d228433a2f4eb9f7aba56b4e Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 17:59:49 -0400 Subject: [PATCH 40/77] remove comment --- label_studio/io_storages/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 6c04c96be830..4d482e4b9217 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -155,12 +155,11 @@ def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageL def _error_wrapper(exc: Optional[Exception] = None): raise ValueError( ( - f"Can’t import JSON-formatted tasks from {key}. If you’re trying to import binary objects, " + f'Can’t import JSON-formatted tasks from {key}. If you’re trying to import binary objects, ' f'perhaps you forgot to enable "Treat every bucket object as a source file" option?' ) ) from exc - # TODO: rely on file extensions here instead of pure exception-chaining? Would be more readable, potentially less reliable, would work for all storage types except redis. try: value = json.loads(blob_str) except json.decoder.JSONDecodeError as e: From 773413d5ce78df3f7243d19476057da2c5f2f8fc Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Mon, 19 May 2025 18:01:14 -0400 Subject: [PATCH 41/77] remove option --- label_studio/io_storages/gcs/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 9b1b9b8f4391..b8028614146c 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -188,7 +188,6 @@ def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: client=self.get_client(), bucket_name=self.bucket, key=key, - convert_to=GCS.ConvertBlobTo.NOTHING, ) return load_tasks_json(blob_str, key) From f604b81d710a1a98ce9db55c48f4e1dfd4444fa3 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 11:16:56 -0400 Subject: [PATCH 42/77] bugfix --- label_studio/io_storages/base_models.py | 11 +++++------ label_studio/io_storages/utils.py | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index beb7394bb847..2c5d3c7a852f 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -8,6 +8,7 @@ import os import traceback as tb from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict from datetime import datetime from typing import Union from urllib.parse import urljoin @@ -341,9 +342,7 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task( - cls, data, project, maximum_annotations, max_inner_id, storage, key, row_index, row_group, link_class - ): + def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, link_params, link_class): # predictions predictions = data.get('predictions', []) if predictions: @@ -377,8 +376,8 @@ def add_task( inner_id=max_inner_id, ) - link_class.create(task, key, storage, row_index=row_index, row_group=row_group) - logger.debug(f'Create {storage.__class__.__name__} link with {key=} {row_index=} {row_group=} for {task=}') + link_class.create(task, storage=storage, **asdict(link_params)) + logger.debug(f'Create {storage.__class__.__name__} link with {asdict(link_params)} for {task=}') raise_exception = not flag_set( 
'ff_fix_back_dev_3342_storage_scan_with_invalid_annotations', user=AnonymousUser() @@ -454,7 +453,7 @@ def _scan_and_create_links(self, link_class): maximum_annotations, max_inner_id, self, - **link_params, + link_params, link_class=link_class, ) max_inner_id += 1 diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 4d482e4b9217..db41625e83de 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -121,8 +121,8 @@ def parse_range(range_header): @dataclass class StorageLinkParams: key: str - row_index: int | None - row_group: int | None + row_index: int | None = None + row_group: int | None = None @classmethod def bulk_create( From f4c7d389e07a2d0f4336a7f7f965c26fba3af98e Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 15:44:10 -0400 Subject: [PATCH 43/77] change structured type, add tests --- label_studio/io_storages/azure_blob/models.py | 11 +- label_studio/io_storages/base_models.py | 21 ++- label_studio/io_storages/gcs/models.py | 11 +- label_studio/io_storages/localfiles/models.py | 6 +- label_studio/io_storages/redis/models.py | 6 +- label_studio/io_storages/s3/models.py | 6 +- .../tests/test_multitask_import.py | 172 ++++++++++++++++++ label_studio/io_storages/utils.py | 26 +-- 8 files changed, 222 insertions(+), 37 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index ff02c3154cce..84fc57914a2e 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -25,7 +25,12 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import StorageLinkParams, load_tasks_json, parse_range, storage_can_resolve_bucket_url +from io_storages.utils import ( + StorageObjectParams, + load_tasks_json, + parse_range, + storage_can_resolve_bucket_url, +) from tasks.models import Annotation from label_studio.io_storages.azure_blob.utils import AZURE @@ -209,11 +214,11 @@ def iterkeys(self): continue yield file.name - def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: + def get_data(self, key) -> list[StorageObjectParams]: if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: f'{self.url_scheme}://{self.container}/{key}'} - return [task], [StorageLinkParams(key=key)] + return [StorageObjectParams(key=key, task_data=task)] container = self.get_container() blob = container.download_blob(key) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 2c5d3c7a852f..3ae9642b398c 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -28,7 +28,7 @@ from django.utils import timezone from django.utils.translation import gettext_lazy as _ from django_rq import job -from io_storages.utils import get_uri_via_regex, parse_bucket_uri +from io_storages.utils import StorageObjectParams, get_uri_via_regex, parse_bucket_uri from rq.job import Job from tasks.models import Annotation, Task from tasks.serializers import AnnotationSerializer, PredictionSerializer @@ -231,7 +231,7 @@ class ImportStorage(Storage): def iterkeys(self): return iter(()) - def get_data(self, key) -> list[dict]: + def get_data(self, key) -> list[StorageObjectParams]: raise NotImplementedError def generate_http_url(self, url): @@ -342,7 +342,10 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task(cls, data, project, maximum_annotations, max_inner_id, 
storage, link_params, link_class): + def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_params, link_class): + link_kwargs = asdict(link_params) + data = link_kwargs.pop('task_data') + # predictions predictions = data.get('predictions', []) if predictions: @@ -376,8 +379,8 @@ def add_task(cls, data, project, maximum_annotations, max_inner_id, storage, lin inner_id=max_inner_id, ) - link_class.create(task, storage=storage, **asdict(link_params)) - logger.debug(f'Create {storage.__class__.__name__} link with {asdict(link_params)} for {task=}') + link_class.create(task, storage=storage, **link_kwargs) + logger.debug(f'Create {storage.__class__.__name__} link with {link_kwargs} for {task=}') raise_exception = not flag_set( 'ff_fix_back_dev_3342_storage_scan_with_invalid_annotations', user=AnonymousUser() @@ -432,7 +435,8 @@ def _scan_and_create_links(self, link_class): logger.debug(f'{self}: found new key {key}') try: - tasks_data, links_params = self.get_data(key) + # list of (task data + ImportStorageLink details) + links_params = self.get_data(key) except (UnicodeDecodeError, json.decoder.JSONDecodeError) as exc: logger.debug(exc, exc_info=True) raise ValueError( @@ -442,13 +446,12 @@ def _scan_and_create_links(self, link_class): ) if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): - tasks_data = tasks_data[:1] + links_params = links_params[:1] - for task_data, link_params in zip(tasks_data, links_params): + for link_params in links_params: # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. # See DIA-2062 for prerequisites task = self.add_task( - task_data, self.project, maximum_annotations, max_inner_id, diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index b8028614146c..39cf81085132 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -23,7 +23,12 @@ ProjectStorageMixin, ) from io_storages.gcs.utils import GCS -from io_storages.utils import StorageLinkParams, load_tasks_json, parse_range, storage_can_resolve_bucket_url +from io_storages.utils import ( + StorageObjectParams, + load_tasks_json, + parse_range, + storage_can_resolve_bucket_url, +) from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -180,10 +185,10 @@ def iterkeys(self): return_key=True, ) - def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: + def get_data(self, key) -> list[StorageObjectParams]: if self.use_blob_urls: task = {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} - return [task], [StorageLinkParams(key=key)] + return [StorageObjectParams(key=key, task_data=task)] blob_str = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index b2b00d030336..244d8a6880cf 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -20,7 +20,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import StorageLinkParams, load_tasks_json +from io_storages.utils import StorageObjectParams, load_tasks_json from rest_framework.exceptions import ValidationError from tasks.models import Annotation @@ -79,7 +79,7 @@ def iterkeys(self): continue yield str(file) - def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: + def get_data(self, key) -> list[StorageObjectParams]: path = Path(key) if self.use_blob_urls: # include 
self-hosted links pointed to local resources via @@ -89,7 +89,7 @@ def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: task = { settings.DATA_UNDEFINED_NAME: f'{settings.HOSTNAME}/data/local-files/?d={quote(str(relative_path))}' } - return [task], [StorageLinkParams(key=key)] + return [StorageObjectParams(key=key, task_data=task)] try: with open(path, 'rb') as f: diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 37c7fe20508d..c002ab609f1d 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -16,7 +16,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import StorageLinkParams, load_tasks_json +from io_storages.utils import StorageObjectParams, load_tasks_json from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -90,11 +90,11 @@ def iterkeys(self): for key in client.keys(path + '*'): yield key - def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: + def get_data(self, key) -> list[StorageObjectParams]: client = self.get_client() value_str = client.get(key) if not value_str: - return [], [] + return [] return load_tasks_json(value_str, key) def scan_and_create_links(self): diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index f58b3e1902ee..3ab85dda8df1 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -27,7 +27,7 @@ get_client_and_resource, resolve_s3_url, ) -from io_storages.utils import StorageLinkParams, load_tasks_json, storage_can_resolve_bucket_url +from io_storages.utils import StorageObjectParams, load_tasks_json, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.s3.utils import AWS @@ -217,12 +217,12 @@ def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @catch_and_reraise_from_none - def get_data(self, key) -> tuple[list[dict], list[StorageLinkParams]]: + def get_data(self, key) -> list[StorageObjectParams]: uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: uri} - return [task], [StorageLinkParams(key=key)] + return [StorageObjectParams(key=key, task_data=task)] # read task json from bucket and validate it _, s3 = self.get_client_and_resource() diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index e73ceaf6ea2a..77488f7f6c86 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -11,9 +11,11 @@ RedisImportStorageFactory, S3ImportStorageFactory, ) +from io_storages.utils import StorageObjectParams, load_tasks_json from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient + from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock @@ -142,3 +144,173 @@ def test_storagelink_fields(self): self.assertEqual(storage_links[0].row_group, None) self.assertEqual(storage_links[1].row_index, 1) self.assertEqual(storage_links[1].row_group, None) + + +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) +class TestTaskFormats(TestCase): + + bare_task_list = [ + { + 'text': 'Test task 1', + }, + { + 'text': 'Test task 2', + }, + ] + + annots_preds_task_list = [ + { + 'data': {'text': 'Machine 
learning models require high-quality labeled data.'}, + 'annotations': [ + { + 'result': [ + { + 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + }, + { + 'value': {'start': 44, 'end': 56, 'text': 'labeled data', 'labels': ['ACTION']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + }, + ] + } + ], + 'predictions': [ + { + 'result': [ + { + 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + } + ] + } + ], + }, + {'data': {'text': 'Prosper annotation helps improve model accuracy.'}, 'predictions': [{'result': []}]}, + ] + + def setUp(self): + self.project = ProjectFactory() + self.storage = S3ImportStorage( + project=self.project, + bucket='example', + aws_access_key_id='example', + aws_secret_access_key='example', + use_blob_urls=False, + ) + self.storage.save() + + def _create_tasks(self, params_list: list[StorageObjectParams]): + # check that no errors are raised during task creation; not checking the task itself + for params in params_list: + _ = S3ImportStorage.add_task(self.project, 1, 0, self.storage, params, S3ImportStorageLink) + + def test_bare_task(self): + + task_data = self.bare_task_list[0] + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_data_key(self): + + task_data = {'data': self.bare_task_list[0]} + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_1elem_list(self): + + task_data = self.bare_task_list[:1] + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + ] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_2elem_list(self): + + task_data = self.bare_task_list + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + ] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_preds_and_annots_list(self): + task_data = self.annots_preds_task_list + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + ] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_list_jsonl(self): + task_data = self.bare_task_list + + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + output = load_tasks_json(blob_str, 'test.jsonl') + expected_output = [ + StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + ] + self.assertEqual(output, expected_output) + + 
self._create_tasks(output) + + def test_list_jsonl_with_preds_and_annots(self): + task_data = self.annots_preds_task_list + + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + output = load_tasks_json(blob_str, 'test.jsonl') + expected_output = [ + StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + ] + self.assertEqual(output, expected_output) + + self._create_tasks(output) + + def test_ff_blocks_jsonl(self): + with mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False): + with self.assertRaises(ValueError): + load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') + + def test_mixed_formats_invalid(self): + task_data = [self.bare_task_list[0], self.annots_preds_task_list[0]] + + with self.assertRaises(ValueError): + blob_str = json.dumps(task_data).encode() + load_tasks_json(blob_str, 'test.json') + + with self.assertRaises(ValueError): + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + load_tasks_json(blob_str, 'test.jsonl') diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index db41625e83de..0cd40bf6a881 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -119,27 +119,27 @@ def parse_range(range_header): @dataclass -class StorageLinkParams: +class StorageObjectParams: + task_data: dict key: str row_index: int | None = None row_group: int | None = None @classmethod def bulk_create( - cls, key, row_idxs: list[int] | None = None, row_groups: list[int] | None = None - ) -> list['StorageLinkParams']: - if row_idxs is None and row_groups is None: - row_idxs, row_groups = [None], [None] + cls, task_datas: list[dict], key, row_idxs: list[int] | None = None, row_groups: list[int] | None = None + ) -> list['StorageObjectParams']: if row_idxs is None: - row_idxs = [None] * len(row_groups) + row_idxs = [None] * len(task_datas) if row_groups is None: - row_groups = [None] * len(row_idxs) + row_groups = [None] * len(task_datas) return [ - cls(key=key, row_index=row_idx, row_group=row_group) for row_idx, row_group in zip(row_idxs, row_groups) + cls(key=key, row_index=row_idx, row_group=row_group, task_data=task_data) + for row_idx, row_group, task_data in zip(row_idxs, row_groups, task_datas) ] -def _load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageLinkParams]]: +def _load_tasks_json(blob_str: str, key: str) -> list[StorageObjectParams]: """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. 
@@ -168,14 +168,14 @@ def _error_wrapper(exc: Optional[Exception] = None): table = pyarrow.json.read_json( pa.py_buffer(blob_str), parse_options=pa.json.ParseOptions(newlines_in_values=True) ) - return table.to_pylist(), StorageLinkParams.bulk_create(key, range(table.num_rows)) + return StorageObjectParams.bulk_create(table.to_pylist(), key, range(table.num_rows)) except Exception as e: _error_wrapper(e) else: _error_wrapper(e) if isinstance(value, dict): - return [value], [StorageLinkParams(key)] + return [StorageObjectParams(key=key, task_data=value)] if isinstance(value, list): # validate tasks by briefly converting to table try: @@ -183,12 +183,12 @@ def _error_wrapper(exc: Optional[Exception] = None): values = table.to_pylist() except Exception as e: _error_wrapper(e) - return values, StorageLinkParams.bulk_create(key, range(len(values))) + return StorageObjectParams.bulk_create(values, key, range(len(values))) _error_wrapper() -def load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageLinkParams]]: +def load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageObjectParams]]: # uses _load_tasks_json here and an LSE-specific implementation in LSE load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) return load_tasks_json_func(blob_str, key) From 92e14fc97743388bcc18c62da309ebb0b5fe29fb Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 16:02:00 -0400 Subject: [PATCH 44/77] wip fix ff mocking --- .../io_storages/tests/test_multitask_import.py | 11 +++++------ label_studio/tests/utils.py | 7 ++++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 77488f7f6c86..42e6d4002427 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -1,7 +1,6 @@ import json import boto3 -import pytest from django.test import TestCase from io_storages.models import S3ImportStorage from io_storages.s3.models import S3ImportStorageLink @@ -19,7 +18,6 @@ from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock -@pytest.mark.skip(reason='FF mocking is broken here, letting these tests run in LSE instead') class TestMultiTaskImport(TestCase): @classmethod def setUpTestData(cls): @@ -146,7 +144,6 @@ def test_storagelink_fields(self): self.assertEqual(storage_links[1].row_group, None) -@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) class TestTaskFormats(TestCase): bare_task_list = [ @@ -273,6 +270,7 @@ def test_preds_and_annots_list(self): self._create_tasks(output) + @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) def test_list_jsonl(self): task_data = self.bare_task_list @@ -286,6 +284,7 @@ def test_list_jsonl(self): self._create_tasks(output) + @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) def test_list_jsonl_with_preds_and_annots(self): task_data = self.annots_preds_task_list @@ -299,10 +298,10 @@ def test_list_jsonl_with_preds_and_annots(self): self._create_tasks(output) + @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) def test_ff_blocks_jsonl(self): - with mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False): - with self.assertRaises(ValueError): - load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') + with 
self.assertRaises(ValueError): + load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') def test_mixed_formats_invalid(self): task_data = [self.bare_task_list[0], self.annots_preds_task_list[0]] diff --git a/label_studio/tests/utils.py b/label_studio/tests/utils.py index a15cdfde7b51..051fc9a21d0c 100644 --- a/label_studio/tests/utils.py +++ b/label_studio/tests/utils.py @@ -417,7 +417,7 @@ def file_exists_in_storage(response, exists=True, file_path=None): def mock_feature_flag(flag_name: str, value: bool, parent_module: str = 'core.feature_flags'): - """Decorator to mock a feature flag state for a test function. + """Decorator to mock a feature flag state for a test function or method. Args: flag_name: Name of the feature flag to mock @@ -436,6 +436,11 @@ def fake_flag_set(feature_flag, *flag_args, **flag_kwargs): with mock.patch(f'{parent_module}.flag_set', wraps=fake_flag_set): return func(*args, **kwargs) + # Handle both functions and methods + # For instance methods, we need to preserve the instance + if hasattr(func, '__self__'): + # This is a bound method, preserve the binding + return wrapper.__get__(func.__self__, type(func.__self__)) return wrapper return decorator From 065b6a30e31ec283b3e71734f0cbc19c1ccfd6d3 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 16:31:03 -0400 Subject: [PATCH 45/77] wip convert to pytest --- .../tests/test_multitask_import.py | 545 +++++++++--------- label_studio/tests/utils.py | 7 +- 2 files changed, 287 insertions(+), 265 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 42e6d4002427..c09628ab8b41 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -1,7 +1,7 @@ import json import boto3 -from django.test import TestCase +import pytest from io_storages.models import S3ImportStorage from io_storages.s3.models import S3ImportStorageLink from io_storages.tests.factories import ( @@ -13,303 +13,330 @@ from io_storages.utils import StorageObjectParams, load_tasks_json from moto import mock_s3 from projects.tests.factories import ProjectFactory -from rest_framework.test import APIClient from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock +# +# Integration tests for storage.sync() +# -class TestMultiTaskImport(TestCase): - @classmethod - def setUpTestData(cls): - # Setup project with simple config - cls.project = ProjectFactory() - # Common test data - cls.common_task_data = [ - {'data': {'image_url': 'http://ggg.com/image.jpg', 'text': 'Task 1 text'}}, - {'data': {'image_url': 'http://ggg.com/image2.jpg', 'text': 'Task 2 text'}}, - ] +@pytest.fixture +@pytest.mark.django_db +def project(): + return ProjectFactory() - @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) - def _test_storage_import(self, storage_class, task_data, **storage_kwargs): - """Helper to test import for a specific storage type""" - # can't do this in the classmethod for some reason, or self.client != cls.client - client = APIClient() - client.force_authenticate(user=self.project.created_by) +@pytest.fixture(scope='module') +def common_task_data(): + return [ + {'data': {'image_url': 'http://ggg.com/image.jpg', 'text': 'Task 1 text'}}, + {'data': {'image_url': 'http://ggg.com/image2.jpg', 'text': 'Task 2 text'}}, + ] - # Setup storage with required credentials - storage = 
storage_class(project=self.project, **storage_kwargs) - # Validate connection before sync - try: - storage.validate_connection() - except Exception as e: - self.fail(f'Storage connection validation failed: {str(e)}') +@pytest.mark.django_db +def _test_storage_import(project, business_client, storage_class, task_data, **storage_kwargs): + """Helper to test import for a specific storage type""" + # client = APIClient() + # client.force_authenticate(user=project.created_by) + + # Setup storage with required credentials + storage = storage_class(project=project, **storage_kwargs) + + # Validate connection before sync + try: + storage.validate_connection() + except Exception as e: + pytest.fail(f'Storage connection validation failed: {str(e)}') + + # Sync storage + # Don't have to wait for sync to complete because it's blocking without rq + storage.sync() + + # Validate tasks were imported correctly + tasks_response = business_client.get(f'/api/tasks?project={project.id}') + assert tasks_response.status_code == 200 + tasks = tasks_response.json()['tasks'] + assert len(tasks) == len(task_data) + + # Validate task content + for task, expected_data in zip(tasks, task_data): + assert task['data'] == expected_data['data'] + + +@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +def test_import_multiple_tasks_s3(project, common_task_data): + with mock_s3(): + # Setup S3 bucket and test data + s3 = boto3.client('s3', region_name='us-east-1') + bucket_name = 'pytest-s3-jsons' + s3.create_bucket(Bucket=bucket_name) + + # Put test data into S3 + s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps(common_task_data)) + + _test_storage_import( + project, + S3ImportStorageFactory, + common_task_data, + bucket='pytest-s3-jsons', + aws_access_key_id='example', + aws_secret_access_key='example', + use_blob_urls=False, + ) - # Sync storage - # Don't have to wait for sync to complete because it's blocking without rq - storage.sync() - # Validate tasks were imported correctly - tasks_response = client.get(f'/api/tasks?project={self.project.id}') - self.assertEqual(tasks_response.status_code, 200) - tasks = tasks_response.json()['tasks'] - self.assertEqual(len(tasks), len(task_data)) - - # Validate task content - for task, expected_data in zip(tasks, task_data): - self.assertEqual(task['data'], expected_data['data']) - - def test_import_multiple_tasks_s3(self): - with mock_s3(): - # Setup S3 bucket and test data - s3 = boto3.client('s3', region_name='us-east-1') - bucket_name = 'pytest-s3-jsons' - s3.create_bucket(Bucket=bucket_name) - - # Put test data into S3 - s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps(self.common_task_data)) - - self._test_storage_import( - S3ImportStorageFactory, - self.common_task_data, - bucket='pytest-s3-jsons', - aws_access_key_id='example', - aws_secret_access_key='example', - use_blob_urls=False, - ) - - def test_import_multiple_tasks_gcs(self): - # initialize mock with sample data - with gcs_client_mock(): - - self._test_storage_import( - GCSImportStorageFactory, - self.common_task_data, - # magic bucket name to set correct data in gcs_client_mock - bucket='multitask_JSON', - use_blob_urls=False, - ) - - def test_import_multiple_tasks_azure(self): - # initialize mock with sample data - with azure_client_mock(sample_json_contents=self.common_task_data, sample_blob_names=['test.json']): - - self._test_storage_import( - AzureBlobImportStorageFactory, - self.common_task_data, - use_blob_urls=False, - ) - - def 
test_import_multiple_tasks_redis(self): - with redis_client_mock() as redis: - - redis.set('test.json', json.dumps(self.common_task_data)) - - self._test_storage_import( - RedisImportStorageFactory, - self.common_task_data, - path='', - use_blob_urls=False, - ) - - def test_storagelink_fields(self): - # use an actual storage and storagelink to test this, since factories aren't connected properly - with mock_s3(): - # Setup S3 bucket and test data - s3 = boto3.client('s3', region_name='us-east-1') - bucket_name = 'pytest-s3-jsons' - s3.create_bucket(Bucket=bucket_name) - - # Put test data into S3 - s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps(self.common_task_data)) - - # create a real storage and sync it - storage = S3ImportStorage( - project=self.project, - bucket=bucket_name, - aws_access_key_id='example', - aws_secret_access_key='example', - use_blob_urls=False, - ) - storage.save() - storage.sync() - - # check that the storage link fields are set correctly - storage_links = S3ImportStorageLink.objects.filter(storage=storage).order_by('task_id') - self.assertEqual(storage_links[0].row_index, 0) - self.assertEqual(storage_links[0].row_group, None) - self.assertEqual(storage_links[1].row_index, 1) - self.assertEqual(storage_links[1].row_group, None) - - -class TestTaskFormats(TestCase): - - bare_task_list = [ - { - 'text': 'Test task 1', - }, - { - 'text': 'Test task 2', - }, - ] +@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +def test_import_multiple_tasks_gcs(project, common_task_data): + # initialize mock with sample data + with gcs_client_mock(): + _test_storage_import( + project, + GCSImportStorageFactory, + common_task_data, + # magic bucket name to set correct data in gcs_client_mock + bucket='multitask_JSON', + use_blob_urls=False, + ) - annots_preds_task_list = [ - { - 'data': {'text': 'Machine learning models require high-quality labeled data.'}, - 'annotations': [ - { - 'result': [ - { - 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, - 'from_name': 'label', - 'to_name': 'text', - 'type': 'labels', - }, - { - 'value': {'start': 44, 'end': 56, 'text': 'labeled data', 'labels': ['ACTION']}, - 'from_name': 'label', - 'to_name': 'text', - 'type': 'labels', - }, - ] - } - ], - 'predictions': [ - { - 'result': [ - { - 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, - 'from_name': 'label', - 'to_name': 'text', - 'type': 'labels', - } - ] - } - ], - }, - {'data': {'text': 'Prosper annotation helps improve model accuracy.'}, 'predictions': [{'result': []}]}, - ] - def setUp(self): - self.project = ProjectFactory() - self.storage = S3ImportStorage( - project=self.project, - bucket='example', +@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +def test_import_multiple_tasks_azure(project, common_task_data): + # initialize mock with sample data + with azure_client_mock(sample_json_contents=common_task_data, sample_blob_names=['test.json']): + _test_storage_import( + project, + AzureBlobImportStorageFactory, + common_task_data, + use_blob_urls=False, + ) + + +@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +def test_import_multiple_tasks_redis(project, common_task_data): + with redis_client_mock() as redis: + redis.set('test.json', json.dumps(common_task_data)) + + _test_storage_import( + project, + RedisImportStorageFactory, + common_task_data, + path='', + use_blob_urls=False, + ) + + 
+@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.django_db +def test_storagelink_fields(project, common_task_data): + # use an actual storage and storagelink to test this, since factories aren't connected properly + with mock_s3(): + # Setup S3 bucket and test data + s3 = boto3.client('s3', region_name='us-east-1') + bucket_name = 'pytest-s3-jsons' + s3.create_bucket(Bucket=bucket_name) + + # Put test data into S3 + s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps(common_task_data)) + + # create a real storage and sync it + storage = S3ImportStorage( + project=project, + bucket=bucket_name, aws_access_key_id='example', aws_secret_access_key='example', use_blob_urls=False, ) - self.storage.save() - - def _create_tasks(self, params_list: list[StorageObjectParams]): - # check that no errors are raised during task creation; not checking the task itself - for params in params_list: - _ = S3ImportStorage.add_task(self.project, 1, 0, self.storage, params, S3ImportStorageLink) + storage.save() + storage.sync() - def test_bare_task(self): + # check that the storage link fields are set correctly + storage_links = S3ImportStorageLink.objects.filter(storage=storage).order_by('task_id') + assert storage_links[0].row_index == 0 + assert storage_links[0].row_group is None + assert storage_links[1].row_index == 1 + assert storage_links[1].row_group is None + + +# +# Unit tests for load_tasks_json() +# + + +@pytest.fixture(scope='module') +def storage(project): + storage = S3ImportStorage( + project=project, + bucket='example', + aws_access_key_id='example', + aws_secret_access_key='example', + use_blob_urls=False, + ) + storage.save() + return storage + + +def create_tasks(project, storage, params_list: list[StorageObjectParams]): + # check that no errors are raised during task creation; not checking the task itself + for params in params_list: + _ = S3ImportStorage.add_task(project, 1, 0, storage, params, S3ImportStorageLink) + + +# Test data +bare_task_list = [ + { + 'text': 'Test task 1', + }, + { + 'text': 'Test task 2', + }, +] + +annots_preds_task_list = [ + { + 'data': {'text': 'Machine learning models require high-quality labeled data.'}, + 'annotations': [ + { + 'result': [ + { + 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + }, + { + 'value': {'start': 44, 'end': 56, 'text': 'labeled data', 'labels': ['ACTION']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + }, + ] + } + ], + 'predictions': [ + { + 'result': [ + { + 'value': {'start': 0, 'end': 22, 'text': 'Machine learning models', 'labels': ['FIELD']}, + 'from_name': 'label', + 'to_name': 'text', + 'type': 'labels', + } + ] + } + ], + }, + {'data': {'text': 'Prosper annotation helps improve model accuracy.'}, 'predictions': [{'result': []}]}, +] + + +def test_bare_task(project, storage): + task_data = bare_task_list[0] + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + assert output == expected_output + + create_tasks(project, storage, output) + + +def test_data_key(project, storage): + task_data = {'data': bare_task_list[0]} + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + assert output == 
expected_output + + create_tasks(project, storage, output) + + +def test_1elem_list(project, storage): + task_data = bare_task_list[:1] + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + ] + assert output == expected_output - task_data = self.bare_task_list[0] + create_tasks(project, storage, output) - blob_str = json.dumps(task_data).encode() - output = load_tasks_json(blob_str, 'test.json') - expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] - self.assertEqual(output, expected_output) - self._create_tasks(output) +def test_2elem_list(project, storage): + task_data = bare_task_list - def test_data_key(self): + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + ] + assert output == expected_output - task_data = {'data': self.bare_task_list[0]} + create_tasks(project, storage, output) - blob_str = json.dumps(task_data).encode() - output = load_tasks_json(blob_str, 'test.json') - expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] - self.assertEqual(output, expected_output) - self._create_tasks(output) +def test_preds_and_annots_list(project, storage): + task_data = annots_preds_task_list - def test_1elem_list(self): + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + ] + assert output == expected_output - task_data = self.bare_task_list[:1] + create_tasks(project, storage, output) - blob_str = json.dumps(task_data).encode() - output = load_tasks_json(blob_str, 'test.json') - expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - ] - self.assertEqual(output, expected_output) - self._create_tasks(output) +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) +def test_list_jsonl(project, storage): + task_data = bare_task_list - def test_2elem_list(self): + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + output = load_tasks_json(blob_str, 'test.jsonl') + expected_output = [ + StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + ] + assert output == expected_output - task_data = self.bare_task_list + create_tasks(project, storage, output) - blob_str = json.dumps(task_data).encode() - output = load_tasks_json(blob_str, 'test.json') - expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), - ] - self.assertEqual(output, expected_output) - self._create_tasks(output) +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) +def test_list_jsonl_with_preds_and_annots(project, storage): + task_data = annots_preds_task_list - def test_preds_and_annots_list(self): - task_data = self.annots_preds_task_list + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + output = load_tasks_json(blob_str, 'test.jsonl') + expected_output = [ + 
StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + ] + assert output == expected_output - blob_str = json.dumps(task_data).encode() - output = load_tasks_json(blob_str, 'test.json') - expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), - ] - self.assertEqual(output, expected_output) + create_tasks(project, storage, output) - self._create_tasks(output) - @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) - def test_list_jsonl(self): - task_data = self.bare_task_list +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) +def test_ff_blocks_jsonl(): + with pytest.raises(ValueError): + load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') - blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() - output = load_tasks_json(blob_str, 'test.jsonl') - expected_output = [ - StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), - ] - self.assertEqual(output, expected_output) - self._create_tasks(output) +def test_mixed_formats_invalid(): + task_data = [bare_task_list[0], annots_preds_task_list[0]] - @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) - def test_list_jsonl_with_preds_and_annots(self): - task_data = self.annots_preds_task_list + with pytest.raises(ValueError): + blob_str = json.dumps(task_data).encode() + load_tasks_json(blob_str, 'test.json') + with pytest.raises(ValueError): blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() - output = load_tasks_json(blob_str, 'test.jsonl') - expected_output = [ - StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), - ] - self.assertEqual(output, expected_output) - - self._create_tasks(output) - - @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) - def test_ff_blocks_jsonl(self): - with self.assertRaises(ValueError): - load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') - - def test_mixed_formats_invalid(self): - task_data = [self.bare_task_list[0], self.annots_preds_task_list[0]] - - with self.assertRaises(ValueError): - blob_str = json.dumps(task_data).encode() - load_tasks_json(blob_str, 'test.json') - - with self.assertRaises(ValueError): - blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() - load_tasks_json(blob_str, 'test.jsonl') + load_tasks_json(blob_str, 'test.jsonl') diff --git a/label_studio/tests/utils.py b/label_studio/tests/utils.py index 051fc9a21d0c..a15cdfde7b51 100644 --- a/label_studio/tests/utils.py +++ b/label_studio/tests/utils.py @@ -417,7 +417,7 @@ def file_exists_in_storage(response, exists=True, file_path=None): def mock_feature_flag(flag_name: str, value: bool, parent_module: str = 'core.feature_flags'): - """Decorator to mock a feature flag state for a test function or method. + """Decorator to mock a feature flag state for a test function. 
Args: flag_name: Name of the feature flag to mock @@ -436,11 +436,6 @@ def fake_flag_set(feature_flag, *flag_args, **flag_kwargs): with mock.patch(f'{parent_module}.flag_set', wraps=fake_flag_set): return func(*args, **kwargs) - # Handle both functions and methods - # For instance methods, we need to preserve the instance - if hasattr(func, '__self__'): - # This is a bound method, preserve the binding - return wrapper.__get__(func.__self__, type(func.__self__)) return wrapper return decorator From 561c446a402b33eae8b10285bee819867340f240 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Tue, 20 May 2025 16:47:16 -0400 Subject: [PATCH 46/77] regen lockfile --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index c232d61941ed..60a6be507662 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -5013,4 +5013,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "7684e8020651378bda172bf7b018e0251aad720a8e68980ac61811ea98074ab7" +content-hash = "610d20d68612bf7c3a6c6d9b8b383d5740008f8e0217e2789fbb332a334b8bdc" From 200062906934c295c7198bce268aef17f5347a79 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 16:50:20 -0400 Subject: [PATCH 47/77] fix tests --- .../tests/test_multitask_import.py | 73 ++++++++++++------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index c09628ab8b41..67fafe00d4db 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -13,6 +13,7 @@ from io_storages.utils import StorageObjectParams, load_tasks_json from moto import mock_s3 from projects.tests.factories import ProjectFactory +from rest_framework.test import APIClient from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock @@ -22,7 +23,6 @@ @pytest.fixture -@pytest.mark.django_db def project(): return ProjectFactory() @@ -35,11 +35,10 @@ def common_task_data(): ] -@pytest.mark.django_db -def _test_storage_import(project, business_client, storage_class, task_data, **storage_kwargs): +def _test_storage_import(project, storage_class, task_data, **storage_kwargs): """Helper to test import for a specific storage type""" - # client = APIClient() - # client.force_authenticate(user=project.created_by) + client = APIClient() + client.force_authenticate(user=project.created_by) # Setup storage with required credentials storage = storage_class(project=project, **storage_kwargs) @@ -55,7 +54,7 @@ def _test_storage_import(project, business_client, storage_class, task_data, **s storage.sync() # Validate tasks were imported correctly - tasks_response = business_client.get(f'/api/tasks?project={project.id}') + tasks_response = client.get(f'/api/tasks?project={project.id}') assert tasks_response.status_code == 200 tasks = tasks_response.json()['tasks'] assert len(tasks) == len(task_data) @@ -66,6 +65,7 @@ def _test_storage_import(project, business_client, storage_class, task_data, **s @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.django_db def test_import_multiple_tasks_s3(project, 
common_task_data): with mock_s3(): # Setup S3 bucket and test data @@ -88,6 +88,7 @@ def test_import_multiple_tasks_s3(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.django_db def test_import_multiple_tasks_gcs(project, common_task_data): # initialize mock with sample data with gcs_client_mock(): @@ -102,6 +103,7 @@ def test_import_multiple_tasks_gcs(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.django_db def test_import_multiple_tasks_azure(project, common_task_data): # initialize mock with sample data with azure_client_mock(sample_json_contents=common_task_data, sample_blob_names=['test.json']): @@ -114,6 +116,7 @@ def test_import_multiple_tasks_azure(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.django_db def test_import_multiple_tasks_redis(project, common_task_data): with redis_client_mock() as redis: redis.set('test.json', json.dumps(common_task_data)) @@ -164,8 +167,9 @@ def test_storagelink_fields(project, common_task_data): # -@pytest.fixture(scope='module') -def storage(project): +@pytest.fixture +def storage(): + project = ProjectFactory() storage = S3ImportStorage( project=project, bucket='example', @@ -174,10 +178,12 @@ def storage(project): use_blob_urls=False, ) storage.save() - return storage + return project, storage -def create_tasks(project, storage, params_list: list[StorageObjectParams]): +@pytest.mark.django_db +def create_tasks(storage, params_list: list[StorageObjectParams]): + project, storage = storage # check that no errors are raised during task creation; not checking the task itself for params in params_list: _ = S3ImportStorage.add_task(project, 1, 0, storage, params, S3ImportStorageLink) @@ -231,7 +237,8 @@ def create_tasks(project, storage, params_list: list[StorageObjectParams]): ] -def test_bare_task(project, storage): +@pytest.mark.django_db +def test_bare_task(storage): task_data = bare_task_list[0] blob_str = json.dumps(task_data).encode() @@ -239,10 +246,11 @@ def test_bare_task(project, storage): expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) -def test_data_key(project, storage): +@pytest.mark.django_db +def test_data_key(storage): task_data = {'data': bare_task_list[0]} blob_str = json.dumps(task_data).encode() @@ -250,10 +258,11 @@ def test_data_key(project, storage): expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) -def test_1elem_list(project, storage): +@pytest.mark.django_db +def test_1elem_list(storage): task_data = bare_task_list[:1] blob_str = json.dumps(task_data).encode() @@ -263,10 +272,11 @@ def test_1elem_list(project, storage): ] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) -def test_2elem_list(project, storage): +@pytest.mark.django_db +def test_2elem_list(storage): task_data = bare_task_list blob_str = json.dumps(task_data).encode() @@ -277,25 +287,29 @@ def test_2elem_list(project, storage): ] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) -def test_preds_and_annots_list(project, storage): +@pytest.mark.django_db +def 
test_preds_and_annots_list(storage): task_data = annots_preds_task_list blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') + + fixed_task_data_1 = task_data[1].copy() + fixed_task_data_1['annotations'] = None # this key exists in the output, since preds exist expected_output = [ StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + StorageObjectParams(key='test.json', task_data=fixed_task_data_1, row_index=1), ] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) -@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) -def test_list_jsonl(project, storage): +@pytest.mark.django_db +def test_list_jsonl(storage): task_data = bare_task_list blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() @@ -306,24 +320,29 @@ def test_list_jsonl(project, storage): ] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) +@pytest.mark.django_db @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) -def test_list_jsonl_with_preds_and_annots(project, storage): +def test_list_jsonl_with_preds_and_annots(storage): task_data = annots_preds_task_list blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() output = load_tasks_json(blob_str, 'test.jsonl') + + fixed_task_data_1 = task_data[1].copy() + fixed_task_data_1['annotations'] = None # this key exists in the output, since preds exist expected_output = [ StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), ] assert output == expected_output - create_tasks(project, storage, output) + create_tasks(storage, output) +@pytest.mark.django_db @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) def test_ff_blocks_jsonl(): with pytest.raises(ValueError): From 78fbdd4759b798fece44bfb2b04e03ff63e89e65 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Tue, 20 May 2025 17:09:39 -0400 Subject: [PATCH 48/77] place django_db mark at file level --- .../io_storages/tests/test_multitask_import.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 67fafe00d4db..be0c7625ba95 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -14,7 +14,6 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient - from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock # @@ -22,6 +21,9 @@ # +pytestmark = pytest.mark.django_db + + @pytest.fixture def project(): return ProjectFactory() @@ -65,7 +67,6 @@ def _test_storage_import(project, storage_class, task_data, **storage_kwargs): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) -@pytest.mark.django_db def test_import_multiple_tasks_s3(project, common_task_data): with mock_s3(): # Setup S3 bucket and test data @@ -88,7 +89,6 @@ def test_import_multiple_tasks_s3(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) 
-@pytest.mark.django_db def test_import_multiple_tasks_gcs(project, common_task_data): # initialize mock with sample data with gcs_client_mock(): @@ -103,7 +103,6 @@ def test_import_multiple_tasks_gcs(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) -@pytest.mark.django_db def test_import_multiple_tasks_azure(project, common_task_data): # initialize mock with sample data with azure_client_mock(sample_json_contents=common_task_data, sample_blob_names=['test.json']): @@ -116,7 +115,6 @@ def test_import_multiple_tasks_azure(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) -@pytest.mark.django_db def test_import_multiple_tasks_redis(project, common_task_data): with redis_client_mock() as redis: redis.set('test.json', json.dumps(common_task_data)) @@ -131,7 +129,6 @@ def test_import_multiple_tasks_redis(project, common_task_data): @mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) -@pytest.mark.django_db def test_storagelink_fields(project, common_task_data): # use an actual storage and storagelink to test this, since factories aren't connected properly with mock_s3(): @@ -181,7 +178,6 @@ def storage(): return project, storage -@pytest.mark.django_db def create_tasks(storage, params_list: list[StorageObjectParams]): project, storage = storage # check that no errors are raised during task creation; not checking the task itself @@ -237,7 +233,6 @@ def create_tasks(storage, params_list: list[StorageObjectParams]): ] -@pytest.mark.django_db def test_bare_task(storage): task_data = bare_task_list[0] @@ -249,7 +244,6 @@ def test_bare_task(storage): create_tasks(storage, output) -@pytest.mark.django_db def test_data_key(storage): task_data = {'data': bare_task_list[0]} @@ -261,7 +255,6 @@ def test_data_key(storage): create_tasks(storage, output) -@pytest.mark.django_db def test_1elem_list(storage): task_data = bare_task_list[:1] @@ -275,7 +268,6 @@ def test_1elem_list(storage): create_tasks(storage, output) -@pytest.mark.django_db def test_2elem_list(storage): task_data = bare_task_list @@ -290,7 +282,6 @@ def test_2elem_list(storage): create_tasks(storage, output) -@pytest.mark.django_db def test_preds_and_annots_list(storage): task_data = annots_preds_task_list @@ -308,7 +299,6 @@ def test_preds_and_annots_list(storage): create_tasks(storage, output) -@pytest.mark.django_db def test_list_jsonl(storage): task_data = bare_task_list @@ -323,7 +313,6 @@ def test_list_jsonl(storage): create_tasks(storage, output) -@pytest.mark.django_db @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) def test_list_jsonl_with_preds_and_annots(storage): task_data = annots_preds_task_list @@ -342,7 +331,6 @@ def test_list_jsonl_with_preds_and_annots(storage): create_tasks(storage, output) -@pytest.mark.django_db @mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) def test_ff_blocks_jsonl(): with pytest.raises(ValueError): From 882c08a9c045952d7a8c969feb1c95bde8a72834 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Tue, 20 May 2025 18:06:32 -0400 Subject: [PATCH 49/77] handle None --- label_studio/io_storages/base_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 3ae9642b398c..67424cbdfd57 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -347,7 +347,7 @@ def 
add_task(cls, project, maximum_annotations, max_inner_id, storage, link_para data = link_kwargs.pop('task_data') # predictions - predictions = data.get('predictions', []) + predictions = data.get('predictions') or [] if predictions: if 'data' not in data: raise ValueError( @@ -355,7 +355,7 @@ def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_para ) # annotations - annotations = data.get('annotations', []) + annotations = data.get('annotations') or [] cancelled_annotations = 0 if annotations: if 'data' not in data: From cf0c0d3307149b752391a64f45862fb7914d3c9c Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 10:03:01 -0400 Subject: [PATCH 50/77] wip mock ffs --- .../tests/test_multitask_import.py | 61 ++++++++++++++++--- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index be0c7625ba95..adacc495969f 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -1,6 +1,7 @@ import json import boto3 +import mock import pytest from io_storages.models import S3ImportStorage from io_storages.s3.models import S3ImportStorageLink @@ -14,13 +15,55 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient -from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock + +from tests.utils import azure_client_mock, gcs_client_mock, redis_client_mock + +# FIXME: mock_feature_flag is not working in this file for some reason + + +@pytest.fixture(name='fflag_feat_dia_2092_multitasks_per_storage_link_on') +def fflag_feat_dia_2092_multitasks_per_storage_link_on(): + from core.feature_flags import flag_set + + def fake_flag_set(*args, **kwargs): + if args[0] == 'fflag_feat_dia_2092_multitasks_per_storage_link': + return True + return flag_set(*args, **kwargs) + + with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + yield + + +@pytest.fixture(name='fflag_feat_root_11_support_jsonl_cloud_storage_on') +def fflag_feat_root_11_support_jsonl_cloud_storage_on(): + from core.feature_flags import flag_set + + def fake_flag_set(*args, **kwargs): + if args[0] == 'fflag_feat_root_11_support_jsonl_cloud_storage': + return True + return flag_set(*args, **kwargs) + + with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + yield + + +@pytest.fixture(name='fflag_feat_root_11_support_jsonl_cloud_storage_off') +def fflag_feat_root_11_support_jsonl_cloud_storage_off(): + from core.feature_flags import flag_set + + def fake_flag_set(*args, **kwargs): + if args[0] == 'fflag_feat_root_11_support_jsonl_cloud_storage': + return False + return flag_set(*args, **kwargs) + + with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + yield + # # Integration tests for storage.sync() # - pytestmark = pytest.mark.django_db @@ -66,7 +109,7 @@ def _test_storage_import(project, storage_class, task_data, **storage_kwargs): assert task['data'] == expected_data['data'] -@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.fflag_feat_dia_2092_multitasks_per_storage_link_on def test_import_multiple_tasks_s3(project, common_task_data): with mock_s3(): # Setup S3 bucket and test data @@ -88,7 +131,7 @@ def test_import_multiple_tasks_s3(project, common_task_data): ) 
-@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.fflag_feat_dia_2092_multitasks_per_storage_link_on def test_import_multiple_tasks_gcs(project, common_task_data): # initialize mock with sample data with gcs_client_mock(): @@ -102,7 +145,7 @@ def test_import_multiple_tasks_gcs(project, common_task_data): ) -@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.fflag_feat_dia_2092_multitasks_per_storage_link_on def test_import_multiple_tasks_azure(project, common_task_data): # initialize mock with sample data with azure_client_mock(sample_json_contents=common_task_data, sample_blob_names=['test.json']): @@ -114,7 +157,7 @@ def test_import_multiple_tasks_azure(project, common_task_data): ) -@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.fflag_feat_dia_2092_multitasks_per_storage_link_on def test_import_multiple_tasks_redis(project, common_task_data): with redis_client_mock() as redis: redis.set('test.json', json.dumps(common_task_data)) @@ -128,7 +171,7 @@ def test_import_multiple_tasks_redis(project, common_task_data): ) -@mock_feature_flag('fflag_feat_dia_2092_multitasks_per_storage_link', True) +@pytest.mark.fflag_feat_dia_2092_multitasks_per_storage_link_on def test_storagelink_fields(project, common_task_data): # use an actual storage and storagelink to test this, since factories aren't connected properly with mock_s3(): @@ -313,7 +356,7 @@ def test_list_jsonl(storage): create_tasks(storage, output) -@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True) +@pytest.mark.fflag_feat_root_11_support_jsonl_cloud_storage_on def test_list_jsonl_with_preds_and_annots(storage): task_data = annots_preds_task_list @@ -331,7 +374,7 @@ def test_list_jsonl_with_preds_and_annots(storage): create_tasks(storage, output) -@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False) +@pytest.mark.fflag_feat_root_11_support_jsonl_cloud_storage_off def test_ff_blocks_jsonl(): with pytest.raises(ValueError): load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') From bae6ae11e66b759b5fc8310e6846ab75b0769901 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 11:00:33 -0400 Subject: [PATCH 51/77] ensure blob_str enters as bytes --- label_studio/io_storages/azure_blob/models.py | 2 +- label_studio/io_storages/localfiles/models.py | 4 +--- label_studio/io_storages/redis/models.py | 3 ++- label_studio/io_storages/s3/models.py | 2 +- label_studio/io_storages/utils.py | 4 ++-- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index 84fc57914a2e..a2f61b4b4ffc 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -222,7 +222,7 @@ def get_data(self, key) -> list[StorageObjectParams]: container = self.get_container() blob = container.download_blob(key) - blob_str = blob.content_as_text() + blob_str = blob.content_as_bytes() return load_tasks_json(blob_str, key) def scan_and_create_links(self): diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 244d8a6880cf..3971f50b3d52 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -93,10 +93,8 @@ def get_data(self, key) -> list[StorageObjectParams]: try: with open(path, 'rb') as f: - blob_str = 
f.read().decode('utf-8') + blob_str = f.read() return load_tasks_json(blob_str, key) - except UnicodeDecodeError as e: - raise ValueError(f'Failed to decode file {path} as UTF-8: {str(e)}') except OSError as e: raise ValueError(f'Failed to read file {path}: {str(e)}') diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index c002ab609f1d..56f95b4fba4c 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -95,7 +95,8 @@ def get_data(self, key) -> list[StorageObjectParams]: value_str = client.get(key) if not value_str: return [] - return load_tasks_json(value_str, key) + blob_str = value_str.encode('utf-8') + return load_tasks_json(blob_str, key) def scan_and_create_links(self): return self._scan_and_create_links(RedisImportStorageLink) diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index 3ab85dda8df1..c2b436db9a7f 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -227,7 +227,7 @@ def get_data(self, key) -> list[StorageObjectParams]: # read task json from bucket and validate it _, s3 = self.get_client_and_resource() bucket = s3.Bucket(self.bucket) - obj = s3.Object(bucket.name, key).get()['Body'].read().decode('utf-8') + obj = s3.Object(bucket.name, key).get()['Body'].read() return load_tasks_json(obj, key) @catch_and_reraise_from_none diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index 0cd40bf6a881..d4f177109b53 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -139,12 +139,12 @@ def bulk_create( ] -def _load_tasks_json(blob_str: str, key: str) -> list[StorageObjectParams]: +def _load_tasks_json(blob_str: bytes, key: str) -> list[StorageObjectParams]: """ Parse blob_str containing task JSON(s) and return the validated result or raise an error. Args: - blob_str (str): The blob string to parse. + blob_str (bytes): The blob string to parse. key (str): The key of the blob. Used for error messages. 
Returns: From 900142363785c6cdd9bc8ee499f6c49c5c174e29 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 14:08:40 -0400 Subject: [PATCH 52/77] add debug log, fix azure test --- label_studio/io_storages/base_models.py | 1 + label_studio/io_storages/tests/test_multitask_import.py | 2 +- label_studio/tests/utils.py | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 67424cbdfd57..4e36e05a828d 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -520,6 +520,7 @@ def sync(self): self.info_set_queued() import_sync_background(self.__class__, self.id) except Exception: + logger.debug(f'Storage {self} failed', exc_info=True) storage_background_failure(self) class Meta: diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index adacc495969f..2dc6b21f709e 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -342,7 +342,7 @@ def test_preds_and_annots_list(storage): create_tasks(storage, output) -def test_list_jsonl(storage): +def test_list_jsonl(storage, fflag_feat_root_11_support_jsonl_cloud_storage_on): task_data = bare_task_list blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() diff --git a/label_studio/tests/utils.py b/label_studio/tests/utils.py index a15cdfde7b51..f7aed71d6da6 100644 --- a/label_studio/tests/utils.py +++ b/label_studio/tests/utils.py @@ -192,6 +192,9 @@ def generate_signed_url(self, **kwargs): def content_as_text(self): return json.dumps(sample_json_contents) + def content_as_bytes(self): + return json.dumps(sample_json_contents).encode('utf-8') + class DummyAzureContainer: def __init__(self, container_name, **kwargs): self.name = container_name From 10e972532e051ceaf1b263639192c1b0a0dc33b4 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 16:07:47 -0400 Subject: [PATCH 53/77] fix imputed field in task from other tasks in file --- label_studio/io_storages/base_models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 4e36e05a828d..b308453bf26f 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -365,7 +365,10 @@ def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_para cancelled_annotations = len([a for a in annotations if a.get('was_cancelled', False)]) if 'data' in data and isinstance(data['data'], dict): - data = data['data'] + if data['data'] is not None: + data = data['data'] + else: + data.pop('data') with transaction.atomic(): task = Task.objects.create( From 2109679755fb043a3d0b38042f0345dfbc903900 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 16:30:46 -0400 Subject: [PATCH 54/77] fix redis --- label_studio/io_storages/redis/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index 56f95b4fba4c..c002ab609f1d 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -95,8 +95,7 @@ def get_data(self, key) -> list[StorageObjectParams]: value_str = client.get(key) if not value_str: return [] - blob_str = value_str.encode('utf-8') - return 
load_tasks_json(blob_str, key) + return load_tasks_json(value_str, key) def scan_and_create_links(self): return self._scan_and_create_links(RedisImportStorageLink) From 3f065f5ac399aa66d79b7f3f813122bf1a5b52bb Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 16:57:26 -0400 Subject: [PATCH 55/77] fix mixed task formats and tests --- .../tests/test_multitask_import.py | 65 +++++++++++++------ label_studio/io_storages/utils.py | 8 +-- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 2dc6b21f709e..96c5db1619a9 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -16,12 +16,10 @@ from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient -from tests.utils import azure_client_mock, gcs_client_mock, redis_client_mock +from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock -# FIXME: mock_feature_flag is not working in this file for some reason - -@pytest.fixture(name='fflag_feat_dia_2092_multitasks_per_storage_link_on') +@pytest.fixture(name='fflag_feat_dia_2092_multitasks_per_storage_link_on', autouse=True) def fflag_feat_dia_2092_multitasks_per_storage_link_on(): from core.feature_flags import flag_set @@ -30,7 +28,7 @@ def fake_flag_set(*args, **kwargs): return True return flag_set(*args, **kwargs) - with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + with mock.patch('io_storages.base_models.flag_set', wraps=fake_flag_set): yield @@ -43,7 +41,7 @@ def fake_flag_set(*args, **kwargs): return True return flag_set(*args, **kwargs) - with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + with mock.patch('io_storages.utils.flag_set', wraps=fake_flag_set): yield @@ -56,7 +54,7 @@ def fake_flag_set(*args, **kwargs): return False return flag_set(*args, **kwargs) - with mock.patch('core.feature_flags.flag_set', wraps=fake_flag_set): + with mock.patch('io_storages.utils.flag_set', wraps=fake_flag_set): yield @@ -331,18 +329,32 @@ def test_preds_and_annots_list(storage): blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') - fixed_task_data_1 = task_data[1].copy() - fixed_task_data_1['annotations'] = None # this key exists in the output, since preds exist expected_output = [ StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=fixed_task_data_1, row_index=1), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), ] assert output == expected_output create_tasks(storage, output) -def test_list_jsonl(storage, fflag_feat_root_11_support_jsonl_cloud_storage_on): +def test_mixed_formats(storage): + task_data = [bare_task_list[0], annots_preds_task_list[0]] + + blob_str = json.dumps(task_data).encode() + output = load_tasks_json(blob_str, 'test.json') + + expected_output = [ + StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + ] + assert output == expected_output + + create_tasks(storage, output) + + +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True, 'io_storages.utils') +def test_list_jsonl(storage): task_data = bare_task_list blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() @@ -356,7 +368,7 @@ 
def test_list_jsonl(storage, fflag_feat_root_11_support_jsonl_cloud_storage_on): create_tasks(storage, output) -@pytest.mark.fflag_feat_root_11_support_jsonl_cloud_storage_on +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True, 'io_storages.utils') def test_list_jsonl_with_preds_and_annots(storage): task_data = annots_preds_task_list @@ -374,19 +386,32 @@ def test_list_jsonl_with_preds_and_annots(storage): create_tasks(storage, output) -@pytest.mark.fflag_feat_root_11_support_jsonl_cloud_storage_off +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', False, 'io_storages.utils') def test_ff_blocks_jsonl(): with pytest.raises(ValueError): load_tasks_json(b'{"text": "Test task 1"}\n{"text": "Test task 2"}', 'test.jsonl') -def test_mixed_formats_invalid(): +@mock_feature_flag('fflag_feat_root_11_support_jsonl_cloud_storage', True, 'io_storages.utils') +def test_mixed_formats_jsonl(storage): task_data = [bare_task_list[0], annots_preds_task_list[0]] - with pytest.raises(ValueError): - blob_str = json.dumps(task_data).encode() - load_tasks_json(blob_str, 'test.json') + blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() + output = load_tasks_json(blob_str, 'test.jsonl') - with pytest.raises(ValueError): - blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() - load_tasks_json(blob_str, 'test.jsonl') + # keys are copied across tasks; empty keys are correctly ignored in create_tasks() + fixed_task_data_0 = task_data[0].copy() + fixed_task_data_0['data'] = None + fixed_task_data_0['predictions'] = None + fixed_task_data_0['annotations'] = None + + fixed_task_data_1 = task_data[1].copy() + fixed_task_data_1['text'] = None + + expected_output = [ + StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_0, row_index=0), + StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), + ] + assert output == expected_output + + create_tasks(storage, output) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index d4f177109b53..a88b0ded89c9 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -177,13 +177,7 @@ def _error_wrapper(exc: Optional[Exception] = None): if isinstance(value, dict): return [StorageObjectParams(key=key, task_data=value)] if isinstance(value, list): - # validate tasks by briefly converting to table - try: - table = pa.Table.from_pylist(value) - values = table.to_pylist() - except Exception as e: - _error_wrapper(e) - return StorageObjectParams.bulk_create(values, key, range(len(values))) + return StorageObjectParams.bulk_create(value, key, range(len(value))) _error_wrapper() From 2806f0ceff7257a835df900e737b1c214382fc4b Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Wed, 21 May 2025 21:07:09 +0000 Subject: [PATCH 56/77] Apply pre-commit linters --- label_studio/io_storages/tests/test_multitask_import.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 96c5db1619a9..673c2853bd71 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -15,7 +15,6 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient - from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock From 2aef19eb34695f32fa7cd5cfb58a6fbd1552dd74 Mon 
Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Wed, 21 May 2025 17:09:18 -0400 Subject: [PATCH 57/77] fix docstring Co-authored-by: Jo Booth --- label_studio/io_storages/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index a88b0ded89c9..a706e1667ea0 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -148,7 +148,6 @@ def _load_tasks_json(blob_str: bytes, key: str) -> list[StorageObjectParams]: key (str): The key of the blob. Used for error messages. Returns: - list[dict]: parsed tasks. list[StorageLinkParams]: link params for each task. """ From 1e82dd6b37ff6d958d6ec8b14683874ca0b6ea15 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 17:14:05 -0400 Subject: [PATCH 58/77] rename symbols --- label_studio/core/settings/base.py | 2 +- label_studio/io_storages/azure_blob/models.py | 6 ++-- label_studio/io_storages/base_models.py | 4 +-- label_studio/io_storages/gcs/models.py | 6 ++-- label_studio/io_storages/localfiles/models.py | 6 ++-- label_studio/io_storages/redis/models.py | 4 +-- label_studio/io_storages/s3/models.py | 6 ++-- .../tests/test_multitask_import.py | 35 ++++++++++--------- label_studio/io_storages/utils.py | 24 ++++++------- 9 files changed, 47 insertions(+), 46 deletions(-) diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py index 1d30361b0679..53248a8b77d6 100644 --- a/label_studio/core/settings/base.py +++ b/label_studio/core/settings/base.py @@ -598,7 +598,7 @@ MEMBER_PERM = 'core.api_permissions.MemberHasOwnerPermission' RECALCULATE_ALL_STATS = None GET_STORAGE_LIST = 'io_storages.functions.get_storage_list' -STORAGE_LOAD_TASKS_JSON = 'io_storages.utils._load_tasks_json' +STORAGE_LOAD_TASKS_JSON = 'io_storages.utils.load_tasks_json_lso' STORAGE_ANNOTATION_SERIALIZER = 'io_storages.serializers.StorageAnnotationSerializer' TASK_SERIALIZER_BULK = 'tasks.serializers.BaseTaskSerializerBulk' PREPROCESS_FIELD_NAME = 'data_manager.functions.preprocess_field_name' diff --git a/label_studio/io_storages/azure_blob/models.py b/label_studio/io_storages/azure_blob/models.py index a2f61b4b4ffc..78f2986a365e 100644 --- a/label_studio/io_storages/azure_blob/models.py +++ b/label_studio/io_storages/azure_blob/models.py @@ -26,7 +26,7 @@ ProjectStorageMixin, ) from io_storages.utils import ( - StorageObjectParams, + StorageObject, load_tasks_json, parse_range, storage_can_resolve_bucket_url, @@ -214,11 +214,11 @@ def iterkeys(self): continue yield file.name - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: f'{self.url_scheme}://{self.container}/{key}'} - return [StorageObjectParams(key=key, task_data=task)] + return [StorageObject(key=key, task_data=task)] container = self.get_container() blob = container.download_blob(key) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index b308453bf26f..0303f0e83219 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -28,7 +28,7 @@ from django.utils import timezone from django.utils.translation import gettext_lazy as _ from django_rq import job -from io_storages.utils import StorageObjectParams, get_uri_via_regex, parse_bucket_uri +from io_storages.utils import StorageObject, get_uri_via_regex, 
parse_bucket_uri from rq.job import Job from tasks.models import Annotation, Task from tasks.serializers import AnnotationSerializer, PredictionSerializer @@ -231,7 +231,7 @@ class ImportStorage(Storage): def iterkeys(self): return iter(()) - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: raise NotImplementedError def generate_http_url(self, url): diff --git a/label_studio/io_storages/gcs/models.py b/label_studio/io_storages/gcs/models.py index 39cf81085132..bc87b6d39943 100644 --- a/label_studio/io_storages/gcs/models.py +++ b/label_studio/io_storages/gcs/models.py @@ -24,7 +24,7 @@ ) from io_storages.gcs.utils import GCS from io_storages.utils import ( - StorageObjectParams, + StorageObject, load_tasks_json, parse_range, storage_can_resolve_bucket_url, @@ -185,10 +185,10 @@ def iterkeys(self): return_key=True, ) - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: if self.use_blob_urls: task = {settings.DATA_UNDEFINED_NAME: GCS.get_uri(self.bucket, key)} - return [StorageObjectParams(key=key, task_data=task)] + return [StorageObject(key=key, task_data=task)] blob_str = GCS.read_file( client=self.get_client(), bucket_name=self.bucket, diff --git a/label_studio/io_storages/localfiles/models.py b/label_studio/io_storages/localfiles/models.py index 3971f50b3d52..1ca73f7cfb43 100644 --- a/label_studio/io_storages/localfiles/models.py +++ b/label_studio/io_storages/localfiles/models.py @@ -20,7 +20,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import StorageObjectParams, load_tasks_json +from io_storages.utils import StorageObject, load_tasks_json from rest_framework.exceptions import ValidationError from tasks.models import Annotation @@ -79,7 +79,7 @@ def iterkeys(self): continue yield str(file) - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: path = Path(key) if self.use_blob_urls: # include self-hosted links pointed to local resources via @@ -89,7 +89,7 @@ def get_data(self, key) -> list[StorageObjectParams]: task = { settings.DATA_UNDEFINED_NAME: f'{settings.HOSTNAME}/data/local-files/?d={quote(str(relative_path))}' } - return [StorageObjectParams(key=key, task_data=task)] + return [StorageObject(key=key, task_data=task)] try: with open(path, 'rb') as f: diff --git a/label_studio/io_storages/redis/models.py b/label_studio/io_storages/redis/models.py index c002ab609f1d..1557752b7609 100644 --- a/label_studio/io_storages/redis/models.py +++ b/label_studio/io_storages/redis/models.py @@ -16,7 +16,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.utils import StorageObjectParams, load_tasks_json +from io_storages.utils import StorageObject, load_tasks_json from tasks.models import Annotation logger = logging.getLogger(__name__) @@ -90,7 +90,7 @@ def iterkeys(self): for key in client.keys(path + '*'): yield key - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: client = self.get_client() value_str = client.get(key) if not value_str: diff --git a/label_studio/io_storages/s3/models.py b/label_studio/io_storages/s3/models.py index c2b436db9a7f..1aa217598329 100644 --- a/label_studio/io_storages/s3/models.py +++ b/label_studio/io_storages/s3/models.py @@ -27,7 +27,7 @@ get_client_and_resource, resolve_s3_url, ) -from io_storages.utils import StorageObjectParams, load_tasks_json, storage_can_resolve_bucket_url +from io_storages.utils import 
StorageObject, load_tasks_json, storage_can_resolve_bucket_url from tasks.models import Annotation from label_studio.io_storages.s3.utils import AWS @@ -217,12 +217,12 @@ def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @catch_and_reraise_from_none - def get_data(self, key) -> list[StorageObjectParams]: + def get_data(self, key) -> list[StorageObject]: uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: data_key = settings.DATA_UNDEFINED_NAME task = {data_key: uri} - return [StorageObjectParams(key=key, task_data=task)] + return [StorageObject(key=key, task_data=task)] # read task json from bucket and validate it _, s3 = self.get_client_and_resource() diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 673c2853bd71..942a9a275ca8 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -11,10 +11,11 @@ RedisImportStorageFactory, S3ImportStorageFactory, ) -from io_storages.utils import StorageObjectParams, load_tasks_json +from io_storages.utils import StorageObject, load_tasks_json from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient + from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock @@ -218,7 +219,7 @@ def storage(): return project, storage -def create_tasks(storage, params_list: list[StorageObjectParams]): +def create_tasks(storage, params_list: list[StorageObject]): project, storage = storage # check that no errors are raised during task creation; not checking the task itself for params in params_list: @@ -278,7 +279,7 @@ def test_bare_task(storage): blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') - expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + expected_output = [StorageObject(key='test.json', task_data=task_data)] assert output == expected_output create_tasks(storage, output) @@ -289,7 +290,7 @@ def test_data_key(storage): blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') - expected_output = [StorageObjectParams(key='test.json', task_data=task_data)] + expected_output = [StorageObject(key='test.json', task_data=task_data)] assert output == expected_output create_tasks(storage, output) @@ -301,7 +302,7 @@ def test_1elem_list(storage): blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), + StorageObject(key='test.json', task_data=task_data[0], row_index=0), ] assert output == expected_output @@ -314,8 +315,8 @@ def test_2elem_list(storage): blob_str = json.dumps(task_data).encode() output = load_tasks_json(blob_str, 'test.json') expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + StorageObject(key='test.json', task_data=task_data[0], row_index=0), + StorageObject(key='test.json', task_data=task_data[1], row_index=1), ] assert output == expected_output @@ -329,8 +330,8 @@ def test_preds_and_annots_list(storage): output = load_tasks_json(blob_str, 'test.json') expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', 
task_data=task_data[1], row_index=1), + StorageObject(key='test.json', task_data=task_data[0], row_index=0), + StorageObject(key='test.json', task_data=task_data[1], row_index=1), ] assert output == expected_output @@ -344,8 +345,8 @@ def test_mixed_formats(storage): output = load_tasks_json(blob_str, 'test.json') expected_output = [ - StorageObjectParams(key='test.json', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.json', task_data=task_data[1], row_index=1), + StorageObject(key='test.json', task_data=task_data[0], row_index=0), + StorageObject(key='test.json', task_data=task_data[1], row_index=1), ] assert output == expected_output @@ -359,8 +360,8 @@ def test_list_jsonl(storage): blob_str = '\n'.join([json.dumps(task) for task in task_data]).encode() output = load_tasks_json(blob_str, 'test.jsonl') expected_output = [ - StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.jsonl', task_data=task_data[1], row_index=1), + StorageObject(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObject(key='test.jsonl', task_data=task_data[1], row_index=1), ] assert output == expected_output @@ -377,8 +378,8 @@ def test_list_jsonl_with_preds_and_annots(storage): fixed_task_data_1 = task_data[1].copy() fixed_task_data_1['annotations'] = None # this key exists in the output, since preds exist expected_output = [ - StorageObjectParams(key='test.jsonl', task_data=task_data[0], row_index=0), - StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), + StorageObject(key='test.jsonl', task_data=task_data[0], row_index=0), + StorageObject(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), ] assert output == expected_output @@ -408,8 +409,8 @@ def test_mixed_formats_jsonl(storage): fixed_task_data_1['text'] = None expected_output = [ - StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_0, row_index=0), - StorageObjectParams(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), + StorageObject(key='test.jsonl', task_data=fixed_task_data_0, row_index=0), + StorageObject(key='test.jsonl', task_data=fixed_task_data_1, row_index=1), ] assert output == expected_output diff --git a/label_studio/io_storages/utils.py b/label_studio/io_storages/utils.py index a706e1667ea0..8999282b6d5f 100644 --- a/label_studio/io_storages/utils.py +++ b/label_studio/io_storages/utils.py @@ -119,7 +119,7 @@ def parse_range(range_header): @dataclass -class StorageObjectParams: +class StorageObject: task_data: dict key: str row_index: int | None = None @@ -127,19 +127,19 @@ class StorageObjectParams: @classmethod def bulk_create( - cls, task_datas: list[dict], key, row_idxs: list[int] | None = None, row_groups: list[int] | None = None - ) -> list['StorageObjectParams']: - if row_idxs is None: - row_idxs = [None] * len(task_datas) + cls, task_datas: list[dict], key, row_indexes: list[int] | None = None, row_groups: list[int] | None = None + ) -> list['StorageObject']: + if row_indexes is None: + row_indexes = [None] * len(task_datas) if row_groups is None: row_groups = [None] * len(task_datas) return [ cls(key=key, row_index=row_idx, row_group=row_group, task_data=task_data) - for row_idx, row_group, task_data in zip(row_idxs, row_groups, task_datas) + for row_idx, row_group, task_data in zip(row_indexes, row_groups, task_datas) ] -def _load_tasks_json(blob_str: bytes, key: str) -> list[StorageObjectParams]: +def load_tasks_json_lso(blob_str: bytes, key: str) -> list[StorageObject]: """ Parse 
blob_str containing task JSON(s) and return the validated result or raise an error. @@ -167,21 +167,21 @@ def _error_wrapper(exc: Optional[Exception] = None): table = pyarrow.json.read_json( pa.py_buffer(blob_str), parse_options=pa.json.ParseOptions(newlines_in_values=True) ) - return StorageObjectParams.bulk_create(table.to_pylist(), key, range(table.num_rows)) + return StorageObject.bulk_create(table.to_pylist(), key, range(table.num_rows)) except Exception as e: _error_wrapper(e) else: _error_wrapper(e) if isinstance(value, dict): - return [StorageObjectParams(key=key, task_data=value)] + return [StorageObject(key=key, task_data=value)] if isinstance(value, list): - return StorageObjectParams.bulk_create(value, key, range(len(value))) + return StorageObject.bulk_create(value, key, range(len(value))) _error_wrapper() -def load_tasks_json(blob_str: str, key: str) -> tuple[list[dict], list[StorageObjectParams]]: - # uses _load_tasks_json here and an LSE-specific implementation in LSE +def load_tasks_json(blob_str: str, key: str) -> list[StorageObject]: + # uses load_tasks_json_lso here and an LSE-specific implementation in LSE load_tasks_json_func = load_func(settings.STORAGE_LOAD_TASKS_JSON) return load_tasks_json_func(blob_str, key) From c15d15e2dfc17af2b2056f33c2b4b1fd9802cb51 Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Wed, 21 May 2025 17:16:07 -0400 Subject: [PATCH 59/77] add comment Co-authored-by: Jo Booth --- label_studio/io_storages/base_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 0303f0e83219..49333d4b2a85 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -523,6 +523,7 @@ def sync(self): self.info_set_queued() import_sync_background(self.__class__, self.id) except Exception: + # needed to facilitate debugging storage-related testcases, since otherwise no exception is logged logger.debug(f'Storage {self} failed', exc_info=True) storage_background_failure(self) From d58650bd72c097c2bb13ded838b55bdc26137e83 Mon Sep 17 00:00:00 2001 From: matt-bernstein <60152561+matt-bernstein@users.noreply.github.com> Date: Wed, 21 May 2025 17:16:51 -0400 Subject: [PATCH 60/77] remove comment Co-authored-by: Jo Booth --- label_studio/io_storages/base_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 49333d4b2a85..f0be48f6f65d 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -438,7 +438,6 @@ def _scan_and_create_links(self, link_class): logger.debug(f'{self}: found new key {key}') try: - # list of (task data + ImportStorageLink details) links_params = self.get_data(key) except (UnicodeDecodeError, json.decoder.JSONDecodeError) as exc: logger.debug(exc, exc_info=True) From a50ffe928b83515a0e4b7936f428e3f0088f84fc Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 17:18:37 -0400 Subject: [PATCH 61/77] update min version --- poetry.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 60a6be507662..d5b3a8a75328 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -5013,4 +5013,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "610d20d68612bf7c3a6c6d9b8b383d5740008f8e0217e2789fbb332a334b8bdc" +content-hash = "6a0e5ec3c931bebf890cbada29d19917ee51ae4f59bf71472c586688d86bd617" diff --git a/pyproject.toml b/pyproject.toml index 2063d03b60ba..1a110ea6d2de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ "ordered-set (==4.0.2)", "pandas (>=2.2.3)", "psycopg2-binary (==2.9.10)", - "pyarrow (>=18.0.0,<19.0.0)", + "pyarrow (>=18.1.0,<19.0.0)", "pydantic (>=2.9.2)", "python-dateutil (>=2.8.1)", "pytz (>=2022.1,<2023.0)", From 9bd9639b1419270c60e7f20b560786ce6d517d02 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 17:21:42 -0400 Subject: [PATCH 62/77] rename --- label_studio/io_storages/base_models.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index f0be48f6f65d..1a2d7f81f19e 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -342,8 +342,8 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_params, link_class): - link_kwargs = asdict(link_params) + def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_object, link_class): + link_kwargs = asdict(link_object) data = link_kwargs.pop('task_data') # predictions @@ -438,7 +438,7 @@ def _scan_and_create_links(self, link_class): logger.debug(f'{self}: found new key {key}') try: - links_params = self.get_data(key) + link_objects = self.get_data(key) except (UnicodeDecodeError, json.decoder.JSONDecodeError) as exc: logger.debug(exc, exc_info=True) raise ValueError( @@ -448,9 +448,9 @@ def _scan_and_create_links(self, link_class): ) if not flag_set('fflag_feat_dia_2092_multitasks_per_storage_link'): - links_params = links_params[:1] + link_objects = link_objects[:1] - for link_params in links_params: + for link_object in link_objects: # TODO: batch this loop body with add_task -> add_tasks in a single bulk write. 
# See DIA-2062 for prerequisites task = self.add_task( @@ -458,7 +458,7 @@ def _scan_and_create_links(self, link_class): maximum_annotations, max_inner_id, self, - link_params, + link_object, link_class=link_class, ) max_inner_id += 1 From b1aae0283c85c7b61231b90db93c5907eed24da2 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Wed, 21 May 2025 17:25:02 -0400 Subject: [PATCH 63/77] defensive fix --- label_studio/io_storages/base_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index 1a2d7f81f19e..86db4c1be859 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -342,9 +342,9 @@ def _scan_and_create_links_v2(self): raise NotImplementedError @classmethod - def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_object, link_class): + def add_task(cls, project, maximum_annotations, max_inner_id, storage, link_object: StorageObject, link_class): link_kwargs = asdict(link_object) - data = link_kwargs.pop('task_data') + data = link_kwargs.pop('task_data', None) # predictions predictions = data.get('predictions') or [] From bd189df9a886db20b8c1c677229428c2b63d5ff9 Mon Sep 17 00:00:00 2001 From: robot-ci-heartex Date: Wed, 21 May 2025 21:33:53 +0000 Subject: [PATCH 64/77] Apply pre-commit linters --- label_studio/io_storages/tests/test_multitask_import.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_studio/io_storages/tests/test_multitask_import.py b/label_studio/io_storages/tests/test_multitask_import.py index 942a9a275ca8..ac520b270048 100644 --- a/label_studio/io_storages/tests/test_multitask_import.py +++ b/label_studio/io_storages/tests/test_multitask_import.py @@ -15,7 +15,6 @@ from moto import mock_s3 from projects.tests.factories import ProjectFactory from rest_framework.test import APIClient - from tests.utils import azure_client_mock, gcs_client_mock, mock_feature_flag, redis_client_mock From 933b27ec0371248309f089d21e585a85ab559bc6 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 22 May 2025 09:48:53 -0400 Subject: [PATCH 65/77] lockfile --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 1d4593f9c307..7846ed71cc1b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5013,4 +5013,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "6a0e5ec3c931bebf890cbada29d19917ee51ae4f59bf71472c586688d86bd617" +content-hash = "710f79c673fbcd50c5c13f04466b15e2aeadefe563c2025b75328adef4af093f" From 0cb56ec42e425f947183fce31c057655721572a7 Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Thu, 22 May 2025 13:52:56 +0000 Subject: [PATCH 66/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15188365408 From 885da860bffb63bf8c3ceae63ce66995c0edb494 Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Thu, 22 May 2025 16:43:29 +0000 Subject: [PATCH 67/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15192136431 From 6320d8a6127bc506732d5feee50760ec83b47014 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 14:36:51 -0400 Subject: [PATCH 68/77] add some debugging prints --- label_studio/data_export/api.py | 1 + label_studio/data_export/models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index 
c283b6175de0..50f2d4ae05ef 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -430,6 +430,7 @@ def delete(self, *args, **kwargs): if converted_format.file: converted_format.file.delete() except Exception as e: + logger.error(f'Failed to delete export file: {e}', exc_info=True) return Response( status=status.HTTP_500_INTERNAL_SERVER_ERROR, data={ diff --git a/label_studio/data_export/models.py b/label_studio/data_export/models.py index 81e034ad5984..9be4be88f744 100644 --- a/label_studio/data_export/models.py +++ b/label_studio/data_export/models.py @@ -248,6 +248,7 @@ class Status(models.TextChoices): def delete(self, *args, **kwargs): if flag_set('ff_back_dev_4664_remove_storage_file_on_export_delete_29032023_short'): + logger.info(f'Deleting file from storage {args=} {kwargs=}') if self.file: self.file.delete() super().delete(*args, **kwargs) From 8f9dee8712126b12bd0c8de4cf56dfeaadca8ac7 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 15:12:09 -0400 Subject: [PATCH 69/77] more logging, add close call --- label_studio/data_export/api.py | 3 +++ label_studio/data_export/mixins.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index 50f2d4ae05ef..ed33974e25e4 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -595,6 +595,9 @@ def async_convert(converted_format_id, export_type, project, hostname, download_ converted_format.status = ConvertedFormat.Status.COMPLETED converted_format.save(update_fields=['file', 'status']) + logger.info('closing file_') + file_.close() + def set_convert_background_failure(job, connection, type, value, traceback_obj): from data_export.models import ConvertedFormat diff --git a/label_studio/data_export/mixins.py b/label_studio/data_export/mixins.py index 50f64016cb50..29b77c3ff292 100644 --- a/label_studio/data_export/mixins.py +++ b/label_studio/data_export/mixins.py @@ -340,11 +340,14 @@ def convert_file(self, to_format, download_resources=False, hostname=None): files = get_all_files_from_dir(out_dir) dirs = get_all_dirs_from_dir(out_dir) + logger.info(f'Export files: {files}') if len(files) == 0 and len(dirs) == 0: return None elif len(files) == 1 and len(dirs) == 0: + logger.info(f'Export file: {files[0]}') output_file = files[0] filename = pathlib.Path(input_name).stem + pathlib.Path(output_file).suffix + logger.info(f'Export filename: {filename=} {output_file=} {input_name=}') else: shutil.make_archive(out_dir, 'zip', out_dir) output_file = pathlib.Path(tmp_dir) / (str(out_dir.stem) + '.zip') From b2484af1318a3de2fa479c4966befcf954aeace6 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 15:34:48 -0400 Subject: [PATCH 70/77] more logging --- label_studio/data_export/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index ed33974e25e4..56476164b025 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -595,11 +595,12 @@ def async_convert(converted_format_id, export_type, project, hostname, download_ converted_format.status = ConvertedFormat.Status.COMPLETED converted_format.save(update_fields=['file', 'status']) - logger.info('closing file_') + logger.error('closing file_') file_.close() def set_convert_background_failure(job, connection, type, value, traceback_obj): + logger.error(f'set_convert_background_failure {job=} {connection=} {type=} {value=} {traceback_obj=}') 
from data_export.models import ConvertedFormat convert_id = job.args[0] From f2665f94ce48791fbb8fd522eb383374cab287b2 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 15:59:49 -0400 Subject: [PATCH 71/77] Revert "more logging" This reverts commit b2484af1318a3de2fa479c4966befcf954aeace6. --- label_studio/data_export/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index 56476164b025..ed33974e25e4 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -595,12 +595,11 @@ def async_convert(converted_format_id, export_type, project, hostname, download_ converted_format.status = ConvertedFormat.Status.COMPLETED converted_format.save(update_fields=['file', 'status']) - logger.error('closing file_') + logger.info('closing file_') file_.close() def set_convert_background_failure(job, connection, type, value, traceback_obj): - logger.error(f'set_convert_background_failure {job=} {connection=} {type=} {value=} {traceback_obj=}') from data_export.models import ConvertedFormat convert_id = job.args[0] From 6a5b397f77e2c337b6c39ae0136d46309f718eb5 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 16:00:02 -0400 Subject: [PATCH 72/77] Revert "more logging, add close call" This reverts commit 8f9dee8712126b12bd0c8de4cf56dfeaadca8ac7. --- label_studio/data_export/api.py | 3 --- label_studio/data_export/mixins.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index ed33974e25e4..50f2d4ae05ef 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -595,9 +595,6 @@ def async_convert(converted_format_id, export_type, project, hostname, download_ converted_format.status = ConvertedFormat.Status.COMPLETED converted_format.save(update_fields=['file', 'status']) - logger.info('closing file_') - file_.close() - def set_convert_background_failure(job, connection, type, value, traceback_obj): from data_export.models import ConvertedFormat diff --git a/label_studio/data_export/mixins.py b/label_studio/data_export/mixins.py index 29b77c3ff292..50f64016cb50 100644 --- a/label_studio/data_export/mixins.py +++ b/label_studio/data_export/mixins.py @@ -340,14 +340,11 @@ def convert_file(self, to_format, download_resources=False, hostname=None): files = get_all_files_from_dir(out_dir) dirs = get_all_dirs_from_dir(out_dir) - logger.info(f'Export files: {files}') if len(files) == 0 and len(dirs) == 0: return None elif len(files) == 1 and len(dirs) == 0: - logger.info(f'Export file: {files[0]}') output_file = files[0] filename = pathlib.Path(input_name).stem + pathlib.Path(output_file).suffix - logger.info(f'Export filename: {filename=} {output_file=} {input_name=}') else: shutil.make_archive(out_dir, 'zip', out_dir) output_file = pathlib.Path(tmp_dir) / (str(out_dir.stem) + '.zip') From a8a1c22d8dead3d8024b6ba55dfca54953704ed6 Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Thu, 22 May 2025 16:00:10 -0400 Subject: [PATCH 73/77] Revert "add some debugging prints" This reverts commit 6320d8a6127bc506732d5feee50760ec83b47014. 
--- label_studio/data_export/api.py | 1 - label_studio/data_export/models.py | 1 - 2 files changed, 2 deletions(-) diff --git a/label_studio/data_export/api.py b/label_studio/data_export/api.py index 50f2d4ae05ef..c283b6175de0 100644 --- a/label_studio/data_export/api.py +++ b/label_studio/data_export/api.py @@ -430,7 +430,6 @@ def delete(self, *args, **kwargs): if converted_format.file: converted_format.file.delete() except Exception as e: - logger.error(f'Failed to delete export file: {e}', exc_info=True) return Response( status=status.HTTP_500_INTERNAL_SERVER_ERROR, data={ diff --git a/label_studio/data_export/models.py b/label_studio/data_export/models.py index 9be4be88f744..81e034ad5984 100644 --- a/label_studio/data_export/models.py +++ b/label_studio/data_export/models.py @@ -248,7 +248,6 @@ class Status(models.TextChoices): def delete(self, *args, **kwargs): if flag_set('ff_back_dev_4664_remove_storage_file_on_export_delete_29032023_short'): - logger.info(f'Deleting file from storage {args=} {kwargs=}') if self.file: self.file.delete() super().delete(*args, **kwargs) From 3d3c8e454a8ceefc24178832fbeba051c812fa29 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 22 May 2025 16:14:26 -0400 Subject: [PATCH 74/77] manually toggle logging FF --- label_studio/feature_flags.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/feature_flags.json b/label_studio/feature_flags.json index 0f75eba69ff5..a8305077e556 100644 --- a/label_studio/feature_flags.json +++ b/label_studio/feature_flags.json @@ -3500,7 +3500,7 @@ }, "fflag_fix_back_leap_1818_set_convert_background_failure_logging_02062025_short": { "key": "fflag_fix_back_leap_1818_set_convert_background_failure_logging_02062025_short", - "on": false, + "on": true, "prerequisites": [], "targets": [], "contextTargets": [], From 80b3492ae79428e0152f43159b568227169812c1 Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Thu, 22 May 2025 16:36:27 -0400 Subject: [PATCH 75/77] conditionally skip test --- label_studio/tests/data_export.tavern.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/label_studio/tests/data_export.tavern.yml b/label_studio/tests/data_export.tavern.yml index 7a4b0cec1021..7550e99396a0 100644 --- a/label_studio/tests/data_export.tavern.yml +++ b/label_studio/tests/data_export.tavern.yml @@ -361,6 +361,7 @@ stages: test_name: export_delete strict: false marks: + - skipif: '"Windows" in platform.system()' - usefixtures: - django_live_url - ff_back_dev_4664_remove_storage_file_on_export_delete_29032023_short_on From 77c3f27a3b83c3d132879d25ab8e57892b29705d Mon Sep 17 00:00:00 2001 From: Matt Bernstein Date: Fri, 23 May 2025 08:55:36 -0400 Subject: [PATCH 76/77] Revert "manually toggle logging FF" This reverts commit 3d3c8e454a8ceefc24178832fbeba051c812fa29. 
--- label_studio/feature_flags.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_studio/feature_flags.json b/label_studio/feature_flags.json index a8305077e556..0f75eba69ff5 100644 --- a/label_studio/feature_flags.json +++ b/label_studio/feature_flags.json @@ -3500,7 +3500,7 @@ }, "fflag_fix_back_leap_1818_set_convert_background_failure_logging_02062025_short": { "key": "fflag_fix_back_leap_1818_set_convert_background_failure_logging_02062025_short", - "on": true, + "on": false, "prerequisites": [], "targets": [], "contextTargets": [], From 046f897e5e5015d4d8f862012eb7f80a83ddaa00 Mon Sep 17 00:00:00 2001 From: matt-bernstein Date: Fri, 23 May 2025 13:02:10 +0000 Subject: [PATCH 77/77] Sync Follow Merge dependencies Workflow run: https://github.com/HumanSignal/label-studio/actions/runs/15210874380
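
The storage changes in patches 51-58 of this series converge on a bytes-in parsing path: get_data() hands raw bytes to load_tasks_json(), which treats a JSON object as a single task, a JSON array as many tasks, and falls back to newline-delimited JSON via pyarrow, emitting one StorageObject per task along with its row index. Below is a minimal standalone sketch of that strategy, assuming pyarrow (pinned to >=18.1 in patch 61) is installed; StorageObject and load_tasks_json_sketch here are simplified stand-ins for illustration, not the shipped io_storages helpers.

import json
from dataclasses import dataclass

import pyarrow as pa
import pyarrow.json as pa_json


@dataclass
class StorageObject:
    # Simplified stand-in for io_storages.utils.StorageObject
    task_data: dict
    key: str
    row_index: int | None = None
    row_group: int | None = None


def load_tasks_json_sketch(blob: bytes, key: str) -> list[StorageObject]:
    """Parse a raw bytes blob as JSON (object or array), falling back to JSONL."""
    try:
        value = json.loads(blob)
    except json.JSONDecodeError:
        # Not a single JSON document: try newline-delimited JSON via pyarrow,
        # which tolerates newlines inside string values.
        table = pa_json.read_json(
            pa.py_buffer(blob),
            parse_options=pa_json.ParseOptions(newlines_in_values=True),
        )
        rows = table.to_pylist()
        return [StorageObject(task_data=row, key=key, row_index=i) for i, row in enumerate(rows)]

    if isinstance(value, dict):
        # A single task object: no row index needed.
        return [StorageObject(task_data=value, key=key)]
    if isinstance(value, list):
        # One file containing many tasks: record each task's position in the file.
        return [StorageObject(task_data=row, key=key, row_index=i) for i, row in enumerate(value)]
    raise ValueError(f'Unsupported JSON structure in {key}: expected object or array')


if __name__ == '__main__':
    single = b'{"text": "Task 1"}'
    many = b'[{"text": "Task 1"}, {"text": "Task 2"}]'
    jsonl = b'{"text": "Task 1"}\n{"text": "Task 2"}'
    for blob in (single, many, jsonl):
        print(load_tasks_json_sketch(blob, 'example.json'))

Running the demo prints one StorageObject (row_index None) for the single-object blob and two (row_index 0 and 1) for both the array and JSONL blobs, mirroring the expected outputs asserted in test_multitask_import.py above.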