Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: new tests added for tsne to expand test coverage #2229

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 157 additions & 0 deletions sklearnex/manifold/tests/test_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,169 @@
# ===============================================================================

import numpy as np
import pytest
from numpy.testing import assert_allclose

# Note: n_components must be 2 for now
david-cortes-intel marked this conversation as resolved.
Show resolved Hide resolved
from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)


def test_sklearnex_import():
    """Fit the TSNE imported from ``sklearnex.manifold`` on a tiny dataset.

    The check passes when the module name of the fitted estimator
    contains ``"daal4py"``.
    """
    from sklearnex.manifold import TSNE

    # Four 3-D points; perplexity is kept below the number of samples.
    points = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    estimator = TSNE(n_components=2, perplexity=2.0)
    fitted = estimator.fit(points)
    assert "daal4py" in fitted.__module__


from sklearnex.manifold import TSNE


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
def test_sklearnex_tsne_import(dataframe, queue):
    """Fit TSNE for every dataframe/queue combination and inspect the result.

    The fitted object must come from a module whose name contains
    ``"daal4py"`` and must expose ``n_components`` equal to 2.
    """
    points = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    points_df = _convert_to_dataframe(points, sycl_queue=queue, target_df=dataframe)
    estimator = TSNE(n_components=2, perplexity=2.0)
    fitted = estimator.fit(points_df)
    assert "daal4py" in fitted.__module__
    assert hasattr(fitted, "n_components"), "TSNE missing 'n_components' attribute."
    assert fitted.n_components == 2, "TSNE 'n_components' attribute is incorrect."


def _sparse_like_matrix(rng):
    """Return a 50x500 matrix of uniform samples with about 99% of entries zeroed."""
    values = rng.random((50, 500))
    keep = rng.random((50, 500)) > 0.99
    return values * keep


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "make_data,tsne_params,check",
    [
        pytest.param(
            lambda rng: [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]],
            {"perplexity": 2.0, "random_state": 42},
            "shape",
            id="basic",
        ),
        pytest.param(
            lambda rng: rng.random((100, 10)),
            {"perplexity": 30.0, "random_state": 42},
            "shape",
            id="random",
        ),
        pytest.param(
            lambda rng: rng.random((50, 10)),
            {"random_state": 42},
            "reproducible",
            id="reproducibility",
        ),
        pytest.param(
            lambda rng: rng.random((1000, 50)),
            {"perplexity": 50.0, "random_state": 42},
            "shape",
            id="large",
        ),
        pytest.param(
            lambda rng: [[0, 0], [1, 1], [2, 2]],
            {"perplexity": 2, "random_state": 42},
            "shape",
            id="minimal",
        ),
        pytest.param(
            lambda rng: np.ones((10, 10)),
            {"perplexity": 5, "random_state": 42},
            "shape",
            id="constant",
        ),
        pytest.param(
            lambda rng: np.empty((0, 10)),
            {},
            "raises",
            id="empty",
        ),
        pytest.param(
            lambda rng: [[0, 0], [1, np.nan], [2, np.inf]],
            {},
            "raises",
            id="nan-inf",
        ),
        pytest.param(
            _sparse_like_matrix,
            {"perplexity": 30.0},
            "shape",
            id="sparse-like",
        ),
        pytest.param(
            lambda rng: rng.random((10, 5)),
            {"perplexity": 0.5},
            "shape",
            id="low-perplexity",
        ),
    ],
)
def test_tsne_functionality_and_edge_cases(
    dataframe, queue, dtype, make_data, tsne_params, check
):
    """Exercise TSNE on one scenario per parametrization.

    Each scenario is its own test item, so a failure in one of them does not
    prevent the others from running.

    Parameters
    ----------
    dataframe, queue
        Target container and SYCL queue supplied by ``get_dataframes_and_queues``.
    dtype
        Floating-point type the input matrix is cast to.
    make_data
        Callable that receives a seeded ``numpy.random.Generator`` and returns
        the input matrix (array or nested list).
    tsne_params
        Keyword arguments forwarded to ``TSNE`` in addition to ``n_components=2``.
    check
        ``"shape"``: the embedding must have shape ``(n_samples, 2)``.
        ``"reproducible"``: as ``"shape"``, and a second fit with the same
        parameters must match the first within ``rtol=1e-5``.
        ``"raises"``: fitting must raise ``ValueError``.
    """
    # A local, seeded generator keeps inputs identical between runs and leaves
    # the global NumPy random state untouched for other tests.
    rng = np.random.default_rng(42)
    X = np.asarray(make_data(rng), dtype=dtype)

    if check == "raises":
        # The conversion stays inside the context manager so that the scenario
        # accepts a ValueError from either the conversion or the fit.
        with pytest.raises(ValueError):
            TSNE(n_components=2, **tsne_params).fit(
                _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
            )
        return

    X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
    # No try/except here: an unexpected exception should surface with its
    # original traceback instead of being re-wrapped by pytest.fail.
    embedding = TSNE(n_components=2, **tsne_params).fit_transform(X_df)
    assert embedding.shape == (X.shape[0], 2)

    if check == "reproducible":
        repeat = TSNE(n_components=2, **tsne_params).fit_transform(X_df)
        assert_allclose(_as_numpy(embedding), _as_numpy(repeat), rtol=1e-5)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_with_specific_complex_dataset(dataframe, queue, dtype):
    """Fit TSNE on a fixed 18x4 matrix mixing very large and very small values.

    The rows combine magnitudes from 1e-9 up to 2e9 with both signs, plus a few
    plain rows (zeros, ones, a constant row). The test passes when
    ``fit_transform`` returns an embedding with one 2-D point per input row.
    """
    complex_array = np.array(
        [
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [-1e-9, 1e-9, -1e-9, 1e-9],
            [-1e9, 1e9, -1e9, 1e9],
            [1e-3, 1e3, -1e3, -1e-3],
            [0, 1e9, -1e-9, 1],
            [1, -1, 1, -1],
            [42, 42, 42, 42],
            [0, 0, 1, -1],
            [-1e5, 0, 1e5, -1],
            [2e9, 2e-9, -2e9, -2e-9],
            [3, -3, 3e3, -3e-3],
            [5e-5, 5e5, -5e-5, -5e5],
            [1, 0, -1e8, 1e8],
            [9e-7, -9e7, 9e-7, -9e7],
            [4e-4, 4e4, -4e-4, -4e4],
            [6e-6, -6e6, 6e6, -6e-6],
            [8, -8, 8e8, -8e-8],
        ],
        dtype=dtype,
    )

    complex_array_df = _convert_to_dataframe(
        complex_array, sycl_queue=queue, target_df=dataframe
    )

    # No try/except wrapper: a blanket ``except Exception`` would also catch the
    # AssertionError below and replace it with a generic failure message, hiding
    # the original traceback and pytest's assertion details.
    tsne = TSNE(n_components=2, perplexity=5.0, random_state=42)
    embedding = tsne.fit_transform(complex_array_df)
    assert embedding.shape == (
        complex_array.shape[0],
        2,
    ), "TSNE embedding shape is incorrect."
Loading