Skip to content

Remove numpy as a dependency #1259

Closed
@kevinjqliu

Description

@kevinjqliu

Feature Request / Improvement

#1256 mentioned removing numpy as a dependency. numpy is currently used in only one function:

def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array:
    """Return the non-deleted row positions of a slice, relative to its start.

    Args:
        positional_deletes: One or more arrays holding the positions to drop.
        start_index: First position of the slice (inclusive).
        end_index: End of the slice (exclusive).

    Returns:
        The sorted, unique positions in ``[start_index, end_index)`` that do
        not occur in any of the delete arrays, each shifted down by
        ``start_index`` so the slice's first row maps to 0.

        NOTE(review): the annotation says ``pa.Array``, but ``np.subtract``
        produces a NumPy array -- confirm callers accept that.
    """
    if len(positional_deletes) == 1:
        # A single input can be used directly; no re-chunking needed.
        all_chunks = positional_deletes[0]
    else:
        # Flatten the chunks of every input into one chunked array.
        all_chunks = pa.chunked_array(itertools.chain.from_iterable(arr.chunks for arr in positional_deletes))

    # Set difference: every position of the slice that is not marked deleted.
    surviving = np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False)
    # Re-base so the result indexes into the slice rather than the whole file.
    return np.subtract(surviving, start_index)

and 2 tests
# Excerpt as quoted in the issue (indentation was lost in the quote).
# The visible part recreates the table "default.scores" and then builds
# random input columns with numpy/pandas; the excerpt stops there.
def test_delete_threshold(session_catalog: Catalog) -> None:
# Three-column schema; only "id" is required.
schema = Schema(
NestedField(field_id=101, name="id", field_type=LongType(), required=True),
NestedField(field_id=103, name="created_at", field_type=DateType(), required=False),
NestedField(field_id=104, name="relevancy_score", field_type=DoubleType(), required=False),
)
# Partition by day of "created_at" (source_id=103 is that field's id).
partition_spec = PartitionSpec(PartitionField(source_id=103, field_id=2000, transform=DayTransform(), name="created_at_day"))
# Drop any table left over from an earlier run; a missing table is fine.
try:
session_catalog.drop_table(
identifier="default.scores",
)
except NoSuchTableError:
pass
session_catalog.create_table(
identifier="default.scores",
schema=schema,
partition_spec=partition_spec,
)
# Parameters
num_rows = 100 # Number of rows in the dataframe
id_min, id_max = 1, 10000
date_start, date_end = date(2024, 1, 1), date(2024, 2, 1)
# Generate the 'id' column (random integers from id_min up to, but not including, id_max)
id_column = np.random.randint(id_min, id_max, num_rows)
# Generate the 'created_at' column as dates only
date_range = pd.date_range(start=date_start, end=date_end, freq="D") # Daily frequency for dates
created_at_column = np.random.choice(date_range, num_rows) # Sample num_rows dates from date_range (with replacement)
# Generate the 'relevancy_score' column with a peak around 0.1
relevancy_score_column = np.random.beta(a=2, b=20, size=num_rows) # Adjusting parameters to peak around 0.1

# Excerpt as quoted in the issue (indentation was lost in the quote).
# Only the construction of the input columns is visible here.
@pytest.mark.integration
def test_rewrite_manifest_after_partition_evolution(session_catalog: Catalog) -> None:
# Fixed seed so the random columns below are reproducible.
np.random.seed(876)
# 1440 rows, one per minute starting at 2023-01-01 00:00:00 (1440 minutes = 24 hours).
N = 1440
d = {
"timestamp": pa.array([datetime(2023, 1, 1, 0, 0, 0) + timedelta(minutes=i) for i in range(N)]),
# One independent random pick of "A", "B" or "C" per row.
"category": pa.array([np.random.choice(["A", "B", "C"]) for _ in range(N)]),
# N samples drawn with np.random.normal's default parameters.
"value": pa.array(np.random.normal(size=N)),
}

It seems we can replace the use of numpy in _combine_positional_deletes with pure Python or pyarrow operations.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions