-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_script_laion_mi.py
More file actions
71 lines (54 loc) · 1.9 KB
/
data_script_laion_mi.py
File metadata and controls
71 lines (54 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from tqdm.auto import tqdm
from datasets import load_dataset
import requests
from PIL import Image
import io
import pathlib
import numpy as np
import pandas as pd
# Timeout, in seconds, passed as ``timeout=`` to ``requests.get`` in get_image().
timeout_seconds = 15
def get_image(image_url):
    """Download an image and return it as a 3-channel NumPy array.

    The URL is fetched with ``requests`` (using ``timeout_seconds``), decoded
    with Pillow and converted to an array.  Only arrays of rank 3 whose last
    axis has length 3 are accepted.

    Args:
        image_url: URL of the image to download.

    Returns:
        The decoded image as a ``numpy.ndarray`` with ``ndim == 3`` and
        ``shape[-1] == 3``.

    Raises:
        ValueError: If the request fails, the payload cannot be decoded as an
            image, or the decoded array does not have the accepted shape.
            The message names the URL and the reason.
    """
    try:
        response = requests.get(image_url, timeout=timeout_seconds)
        response.raise_for_status()
        payload = response.content
    except requests.exceptions.RequestException as exc:
        print(f"Error fetching image: {exc}")
        raise ValueError(f"could not fetch {image_url!r}: {exc}") from exc

    try:
        # The context manager releases Pillow's file handle; np.array copies
        # the pixel data, so the array stays valid after the image is closed.
        with Image.open(io.BytesIO(payload)) as pil_image:
            img = np.array(pil_image)
    except (IOError, Image.DecompressionBombError) as exc:
        # DecompressionBombError is not an OSError subclass; without catching
        # it a single oversized image would abort the caller's whole run.
        print("Error opening or processing image data.")
        raise ValueError(f"could not decode {image_url!r}: {exc}") from exc

    if img.ndim != 3 or img.shape[-1] != 3:
        raise ValueError(
            f"unsupported image shape {img.shape} for {image_url!r}"
        )
    return img
def _download_split(rows, label, image_dir, start_index):
    """Download each row's image, save it as JPEG and return its CSV records.

    Rows whose image cannot be obtained (``get_image`` raises ``ValueError``)
    are skipped.  Files are named ``image_<n>.jpg`` where ``n`` counts saved
    images starting at ``start_index``.
    """
    records = []
    for row in tqdm(rows):
        try:
            pixels = get_image(row["url"])
        except ValueError:
            continue
        index = start_index + len(records)
        target = image_dir.joinpath(f"image_{index}.jpg").as_posix()
        # Write immediately so decoded arrays never accumulate in memory.
        Image.fromarray(pixels).convert("RGB").save(target)
        records.append(
            {
                # Path stored without its first component ("data/").
                "image_paths": target.split("/", 1)[-1],
                "caption": row["caption"],
                "label": label,
            }
        )
    return records


def main():
    """Download the ``antoniaaa/laion_mi`` images and write an index CSV.

    Images from the ``members`` split get label 1 and images from the
    ``nonmembers`` split get label 0.  JPEGs are written to
    ``data/laion_mi_image/images`` and the index, with columns
    ``image_paths``, ``caption`` and ``label``, to
    ``data/laion_mi_image/laion_mi_image.csv``.
    """
    # Opening dataset
    dataset = load_dataset("antoniaaa/laion_mi")

    image_dir = pathlib.Path("data/laion_mi_image/images")
    image_dir.mkdir(exist_ok=True, parents=True)

    records = []
    for split, label in (("members", 1), ("nonmembers", 0)):
        records.extend(
            _download_split(dataset[split], label, image_dir, len(records))
        )

    # Explicit columns keep the CSV header intact even when nothing was saved.
    frame = pd.DataFrame(records, columns=["image_paths", "caption", "label"])
    frame.to_csv("data/laion_mi_image/laion_mi_image.csv", index=False)


if __name__ == "__main__":
    main()