Skip to content

Commit 5d75317

Browse files
thiago-aixplain, Thiago Castro Ferreira, Thiago Castro Ferreira
authored
Interval label new structures (#157)
* Interval label new structures
* Fixes on data type
* Docs for onboarding an image label detection corpus
* Adding label detection dataset onboarding example in documentation
* Fixing documentation link
* Pixel to percentage

---------

Co-authored-by: Thiago Castro Ferreira <[email protected]>
Co-authored-by: Thiago Castro Ferreira <[email protected]>
1 parent b45f12e commit 5d75317

File tree

11 files changed

+451
-4
lines changed

11 files changed

+451
-4
lines changed

aixplain/enums/data_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,4 @@ class DataType(Enum):
3535
VIDEO = "video"
3636

3737
def __str__(self):
38-
return self._value_
38+
return self._value_

aixplain/processes/data_onboarding/onboard_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,11 @@ def process_data_files(
9797
-1,
9898
0,
9999
)
100-
if metadata.dtype in [DataType.AUDIO, DataType.IMAGE] or metadata.dsubtype == DataSubtype.INTERVAL:
100+
if metadata.dtype in [DataType.AUDIO, DataType.IMAGE, DataType.LABEL] or metadata.dsubtype == DataSubtype.INTERVAL:
101101
files, data_column_idx, start_column_idx, end_column_idx, nrows = process_media_files.run(
102102
metadata=metadata, paths=paths, folder=folder
103103
)
104-
elif metadata.dtype in [DataType.TEXT, DataType.LABEL]:
104+
elif metadata.dtype in [DataType.TEXT]:
105105
files, data_column_idx, nrows = process_text_files.run(metadata=metadata, paths=paths, folder=folder)
106106
return files, data_column_idx, start_column_idx, end_column_idx, nrows
107107

aixplain/processes/data_onboarding/process_media_files.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pathlib import Path
1717
from tqdm import tqdm
1818
from typing import List, Tuple
19+
from urllib.parse import urlparse
1920

2021
AUDIO_MAX_SIZE = 50000000
2122
IMAGE_TEXT_MAX_SIZE = 25000000
@@ -45,6 +46,15 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
4546
Returns:
4647
Tuple[List[File], int, int, int]: list of s3 links; data, start and end columns index, and number of rows
4748
"""
49+
if metadata.dtype == DataType.LABEL:
50+
assert (
51+
metadata.storage_type != StorageType.TEXT
52+
), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text. Label data should be stored in a JSON file.'
53+
else:
54+
assert (
55+
metadata.storage_type != StorageType.TEXT
56+
), f'Data Asset Onboarding Error: Column "{metadata.name}" of type "{metadata.dtype}" can not be stored in text.'
57+
4858
# if files are stored locally, create a folder to store it
4959
media_folder = Path(".")
5060
if metadata.storage_type == StorageType.FILE:
@@ -95,6 +105,14 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
95105
assert (
96106
os.path.getsize(media_path) <= AUDIO_MAX_SIZE
97107
), f'Data Asset Onboarding Error: Local audio file "{media_path}" exceeds the size limit of 50 MB.'
108+
elif metadata.dtype == DataType.LABEL:
109+
assert (
110+
os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
111+
), f'Data Asset Onboarding Error: JSON file with labels "{media_path}" exceeds the size limit of 25 MB.'
112+
_, extension = os.path.splitext(media_path)
113+
assert (
114+
extension == ".json"
115+
), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.'
98116
else:
99117
assert (
100118
os.path.getsize(media_path) <= IMAGE_TEXT_MAX_SIZE
@@ -105,6 +123,12 @@ def run(metadata: MetaData, paths: List, folder: Path, batch_size: int = 100) ->
105123
shutil.copy2(media_path, new_path)
106124
batch.append(fname)
107125
else:
126+
if metadata.dtype == DataType.LABEL:
127+
path = urlparse(media_path).path
128+
_, extension = os.path.splitext(path)
129+
assert (
130+
extension == ".json"
131+
), f'Data Asset Onboarding Error: Label data should be stored in a JSON file and "{media_path}" is not one.'
108132
batch.append(media_path)
109133

110134
# crop intervals can not be used with interval data types

aixplain/processes/data_onboarding/process_text_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def process_text(content: str, storage_type: StorageType) -> Text:
2626
Text: textual content
2727
"""
2828
if storage_type == StorageType.FILE:
29-
# Check the size of file and assert a limit of 50 MB
29+
# Check the size of file and assert a limit of 25 MB
3030
assert (
3131
os.path.getsize(content) <= 25000000
3232
), f'Data Asset Onboarding Error: Local text file "{content}" exceeds the size limit of 25 MB.'
622 KB
Loading
6.24 MB
Loading
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
,images,labels
2+
0,corpus/images/1.jpg,corpus/labels/1.json
3+
1,corpus/images/2.png,corpus/labels/2.json
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"data": "arcade",
3+
"boundingBox": {
4+
"top": 0,
5+
"bottom": 0,
6+
"left": 0,
7+
"right": 0
8+
}
9+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"data": "building",
3+
"boundingBox": {
4+
"top": 0,
5+
"bottom": 0,
6+
"left": 0,
7+
"right": 0
8+
}
9+
}

0 commit comments

Comments
 (0)