
Commit c2b717d

Don't write metadata file (#875)
We reintroduced writing the metadata file in #864 to preserve the divisions of the data when writing and reading again. We had turned this behavior off in the past, but without properly documenting the reason. I'm now running into issues with Dask workers dying when writing large datasets, presumably because of the metadata file, as documented in these Dask issues:

- dask/dask#6600
- dask/dask#3873
- dask/dask#8901

Also, while I previously ran into issues with the preservation of divisions, I can't reproduce them locally with a small example. Let's turn writing the metadata file off again and validate whether we still run into issues.
1 parent: bd2a218
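For context, `write_metadata_file` is the `dask.dataframe.to_parquet` flag this commit stops setting. A minimal sketch (toy data, not Fondant code; the `data/` path is just an example) of what the flag controls:

```python
import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

# When write_metadata_file is enabled, Dask aggregates the parquet footer
# of every partition into a single _metadata file next to the data. For
# datasets with many partitions this aggregation happens in one place and
# can exhaust worker memory (see the linked dask issues).
df.to_parquet("data/", write_metadata_file=False)
```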

2 files changed (+8 -25 lines)

src/fondant/component/data_io.py (+1 -4)
```diff
@@ -5,7 +5,6 @@
 
 import dask.dataframe as dd
 from dask.diagnostics import ProgressBar
-from dask.distributed import Client
 
 from fondant.core.component_spec import OperationSpec
 from fondant.core.manifest import Manifest
@@ -157,7 +156,6 @@ def __init__(
     def write_dataframe(
         self,
         dataframe: dd.DataFrame,
-        dask_client: t.Optional[Client] = None,
     ) -> None:
         dataframe.index = dataframe.index.rename(DEFAULT_INDEX_NAME)
 
@@ -176,7 +174,7 @@ def write_dataframe(
 
         with ProgressBar():
             logging.info("Writing data...")
-            dd.compute(write_task, scheduler=dask_client)
+            dd.compute(write_task)
 
     @staticmethod
     def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]):
@@ -234,7 +232,6 @@ def _create_write_task(
             schema=schema,
             overwrite=False,
             compute=False,
-            write_metadata_file=True,
         )
         logging.info(f"Creating write task for: {location}")
         return write_task
```
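Dropping the explicit `scheduler=dask_client` argument should be behavior-preserving: creating a `dask.distributed.Client` registers it as the default scheduler, so a bare `dd.compute(...)` still runs on the cluster, and without a client Dask falls back to its default local scheduler. A small sketch of that behavior, using a toy dataframe:

```python
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

client = Client()  # registers itself as the default scheduler

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

# No scheduler= argument needed: compute picks up the active client.
(total,) = dd.compute(ddf["x"].sum())
assert total == sum(range(10))

client.close()
```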

tests/component/test_data_io.py (+7 -21)
```diff
@@ -4,7 +4,6 @@
 import dask.dataframe as dd
 import pyarrow as pa
 import pytest
-from dask.distributed import Client
 from fondant.component.data_io import DaskDataLoader, DaskDataWriter
 from fondant.core.component_spec import ComponentSpec, OperationSpec
 from fondant.core.manifest import Manifest
@@ -21,13 +20,6 @@
 NUMBER_OF_TEST_ROWS = 151
 
 
-@pytest.fixture()
-def dask_client():  # noqa: PT004
-    client = Client()
-    yield
-    client.close()
-
-
 @pytest.fixture()
 def manifest():
     return Manifest.from_file(manifest_path)
@@ -121,7 +113,6 @@ def test_write_dataset(
     dataframe,
     manifest,
     component_spec,
-    dask_client,
 ):
     """Test writing out subsets."""
     # Dictionary specifying the expected subsets to write and their column names
@@ -134,7 +125,7 @@ def test_write_dataset(
         operation_spec=OperationSpec(component_spec),
     )
     # write dataframe to temp dir
-    data_writer.write_dataframe(dataframe, dask_client)
+    data_writer.write_dataframe(dataframe)
     # read written data and assert
     dataframe = dd.read_parquet(
         temp_dir
@@ -152,7 +143,6 @@ def test_write_dataset_custom_produces(
     dataframe,
     manifest,
     component_spec_produces,
-    dask_client,
 ):
     """Test writing out subsets."""
     produces = {
@@ -175,7 +165,7 @@ def test_write_dataset_custom_produces(
     )
 
     # write dataframe to temp dir
-    data_writer.write_dataframe(dataframe, dask_client)
+    data_writer.write_dataframe(dataframe)
     # # read written data and assert
     dataframe = dd.read_parquet(
         temp_dir
@@ -194,7 +184,6 @@ def test_write_reset_index(
     dataframe,
     manifest,
     component_spec,
-    dask_client,
 ):
     """Test writing out the index and fields that have no dask index and checking
     if the id index was created.
@@ -207,19 +196,18 @@ def test_write_reset_index(
         manifest=manifest,
         operation_spec=OperationSpec(component_spec),
     )
-    data_writer.write_dataframe(dataframe, dask_client)
+    data_writer.write_dataframe(dataframe)
     dataframe = dd.read_parquet(fn)
     assert dataframe.index.name == "id"
 
 
 @pytest.mark.parametrize("partitions", list(range(1, 5)))
-def test_write_divisions(  # noqa: PLR0913
+def test_write_divisions(
     tmp_path_factory,
     dataframe,
     manifest,
     component_spec,
     partitions,
-    dask_client,
 ):
     """Test writing out index and subsets and asserting they have the divisions of the dataframe."""
     # repartition the dataframe (default is 3 partitions)
@@ -233,7 +221,7 @@ def test_write_divisions(  # noqa: PLR0913
         operation_spec=OperationSpec(component_spec),
     )
 
-    data_writer.write_dataframe(dataframe, dask_client)
+    data_writer.write_dataframe(dataframe)
 
     dataframe = dd.read_parquet(fn)
     assert dataframe.index.name == "id"
@@ -245,7 +233,6 @@ def test_write_fields_invalid(
     dataframe,
     manifest,
     component_spec,
-    dask_client,
 ):
     """Test writing out fields but the dataframe columns are incomplete."""
     with tmp_path_factory.mktemp("temp") as fn:
@@ -262,15 +249,14 @@ def test_write_fields_invalid(
             r"but not found in dataframe"
         )
         with pytest.raises(ValueError, match=expected_error_msg):
-            data_writer.write_dataframe(dataframe, dask_client)
+            data_writer.write_dataframe(dataframe)
 
 
 def test_write_fields_invalid_several_fields_missing(
     tmp_path_factory,
     dataframe,
     manifest,
     component_spec,
-    dask_client,
 ):
     """Test writing out fields but the dataframe columns are incomplete."""
     with tmp_path_factory.mktemp("temp") as fn:
@@ -288,4 +274,4 @@ def test_write_fields_invalid_several_fields_missing(
             r"but not found in dataframe"
         )
         with pytest.raises(ValueError, match=expected_error_msg):
-            data_writer.write_dataframe(dataframe, dask_client)
+            data_writer.write_dataframe(dataframe)
```
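As a follow-up to the "can't reproduce locally" note in the commit message, this is a sketch of the kind of small local check involved, similar to `test_write_divisions`. It assumes a recent Dask (`calculate_divisions` replaced the older `gather_statistics` argument) and `out/` is just an example path:

```python
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(100)}, index=pd.RangeIndex(100, name="id"))
ddf = dd.from_pandas(pdf, npartitions=3)
assert ddf.known_divisions

# Write without the global _metadata file, as this commit does.
ddf.to_parquet("out/", write_metadata_file=False)

# Divisions can still be recovered at read time from per-file parquet
# statistics instead of the _metadata file.
ddf2 = dd.read_parquet("out/", calculate_divisions=True)
assert ddf2.index.name == "id"
assert ddf2.known_divisions
```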
