diff --git a/codebeaver.yml b/codebeaver.yml
new file mode 100644
index 000000000..ac19b7a37
--- /dev/null
+++ b/codebeaver.yml
@@ -0,0 +1,2 @@
+from: pytest
+# This file was generated automatically by CodeBeaver based on your repository. Learn how to customize it here: https://docs.codebeaver.ai/open-source/codebeaver-yml/
\ No newline at end of file
diff --git a/mlxtend/data/tests/test_iris.py b/mlxtend/data/tests/test_iris.py
index 2669f9ba4..b85f37d5a 100644
--- a/mlxtend/data/tests/test_iris.py
+++ b/mlxtend/data/tests/test_iris.py
@@ -35,3 +35,88 @@ def test_iris_invalid_choice():
     with pytest.raises(ValueError) as excinfo:
         iris_data(version="bla")
     assert excinfo.value.message == "version must be 'uci' or 'corrected'."
+
+def test_iris_data_invalid_version_type():
+    """Test that providing a non-string version value raises a ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        iris_data(version=None)
+    # Check that the error message contains the expected text
+    assert "version must be 'uci' or 'corrected'" in str(excinfo.value)
+
+def test_iris_data_dtype_and_shape():
+    """Test that iris_data returns numpy arrays with expected dtypes and shapes for both versions."""
+    for version in ["uci", "corrected"]:
+        X, y = iris_data(version=version)
+        # Check types: X should be a float array, y an integer array.
+        assert isinstance(X, np.ndarray)
+        assert isinstance(y, np.ndarray)
+        # Check expected shapes: there are 150 samples and 4 features
+        assert X.shape == (150, 4)
+        assert y.shape == (150,)
+        # Check that X's dtype is float and y's dtype is an integer type.
+        assert X.dtype in [np.float64, np.float32]
+        assert np.issubdtype(y.dtype, np.integer)
+
+def test_iris_data_file_not_found(monkeypatch):
+    """Test that iris_data propagates file not found errors from np.genfromtxt."""
+    def fake_genfromtxt(*args, **kwargs):
+        raise IOError("File not found")
+    # Patch np.genfromtxt so that it raises an IOError to simulate a missing file.
+    monkeypatch.setattr(np, "genfromtxt", fake_genfromtxt)
+    with pytest.raises(IOError) as excinfo:
+        iris_data(version="uci")
+    assert "File not found" in str(excinfo.value)
+def test_iris_data_empty_version(monkeypatch):
+    """Test that providing an empty string as version raises a ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        iris_data(version="")
+    # Check that the error message contains the expected text
+    assert "version must be 'uci' or 'corrected'" in str(excinfo.value)
+
+def test_iris_data_uppercase_version(monkeypatch):
+    """Test that providing an uppercase version string (e.g., 'UCI') raises a ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        iris_data(version="UCI")
+    # Check that the error message contains the expected text
+    assert "version must be 'uci' or 'corrected'" in str(excinfo.value)
+
+def test_iris_data_incorrect_shape(monkeypatch):
+    """Test that iris_data raises an IndexError when the data shape is insufficient for the 'corrected' version.
+    This simulates a scenario where np.genfromtxt returns an array with too few rows.
+    """
+    def fake_genfromtxt(*args, **kwargs):
+        # Simulate a small array with only 30 rows (instead of the expected 150) and 5 columns
+        return np.zeros((30, 5))
+
+    monkeypatch.setattr(np, "genfromtxt", fake_genfromtxt)
+    with pytest.raises(IndexError):
+        iris_data(version="corrected")
+
+def test_iris_data_returns_distinct_arrays():
+    """Test that iris_data returns distinct array objects on consecutive calls,
+    so that modifications to one do not affect the other.
+    """
+    iris_x1, iris_y1 = iris_data()
+    iris_x2, iris_y2 = iris_data()
+    # Check that the returned arrays are not the same objects in memory
+    assert iris_x1 is not iris_x2
+    assert iris_y1 is not iris_y2
+def test_iris_data_numeric_version():
+    """Test that providing a non-string numeric version (e.g., 123) raises a ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        iris_data(version=123)
+    # Check that the error message contains the expected text
+    assert "version must be 'uci' or 'corrected'" in str(excinfo.value)
+
+def test_iris_data_whitespace_version():
+    """Test that providing a version string with extra whitespace (e.g., ' uci ') raises a ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        iris_data(version=" uci ")
+    # Check that the error message contains the expected text
+    assert "version must be 'uci' or 'corrected'" in str(excinfo.value)
+
+def test_iris_data_empty_file(monkeypatch):
+    """Test that iris_data raises an IndexError when np.genfromtxt returns an empty array (simulating an empty data file)."""
+    monkeypatch.setattr(np, "genfromtxt", lambda *args, **kwargs: np.array([]))
+    with pytest.raises(IndexError):
+        iris_data(version="uci")
\ No newline at end of file
diff --git a/tests/test_autompg.py b/tests/test_autompg.py
new file mode 100644
index 000000000..cd58b4eed
--- /dev/null
+++ b/tests/test_autompg.py
@@ -0,0 +1,140 @@
+import numpy as np
+import pytest
+import os
+from mlxtend.data.autompg import autompg_data
+
+def test_autompg_data_returns_correct_arrays(monkeypatch):
+    """Test that autompg_data returns correct X and y arrays given valid input data."""
+    # Create dummy data with shape (3, 6): 5 features + 1 target (3 samples, 6 columns)
+    dummy_data = np.array([
+        [1, 2, 3, 4, 5, 6],
+        [7, 8, 9, 10, 11, 12],
+        [13, 14, 15, 16, 17, 18]
+    ])
+
+    # Monkeypatch np.genfromtxt to return dummy_data regardless of file name or delimiter.
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+
+    # X should be all columns except the last one, y should be the last column.
+    expected_X = dummy_data[:, :-1]
+    expected_y = dummy_data[:, -1]
+    np.testing.assert_array_equal(X, expected_X)
+    np.testing.assert_array_equal(y, expected_y)
+
+def test_autompg_data_empty(monkeypatch):
+    """Test that autompg_data returns empty arrays when input data is empty."""
+    # Create empty dummy data with 5 columns (4 features + 1 target).
+    dummy_data = np.empty((0, 5))
+
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+
+    assert X.shape == (0, 4)  # since dummy_data has 5 columns and X excludes the last column
+    assert y.shape == (0,)
+
+def test_autompg_data_invalid_input(monkeypatch):
+    """Test that autompg_data raises an error when data is invalid (e.g., 1-dimensional)."""
+    # Return a 1D array instead of a 2D array, which will cause slicing to fail.
+    dummy_data = np.array([1, 2, 3, 4, 5])
+
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    with pytest.raises(IndexError):
+        autompg_data()
+def test_autompg_data_single_sample(monkeypatch):
+    """Test that autompg_data correctly parses a dataset with a single sample."""
+    # Create dummy data with a single sample (row) with 6 columns (5 features + 1 target)
+    dummy_data = np.array([[10, 20, 30, 40, 50, 60]])
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+    expected_X = dummy_data[:, :-1]
+    expected_y = dummy_data[:, -1]
+    np.testing.assert_array_equal(X, expected_X)
+    np.testing.assert_array_equal(y, expected_y)
+
+def test_autompg_data_one_column(monkeypatch):
+    """Test that autompg_data returns correct shapes when the input data has only one column.
+    In such a case, since X is taken as all columns except the last and the only column is the target,
+    X will be an empty array of shape (n, 0) and y will have shape (n,)."""
+    dummy_data = np.array([[100], [200], [300]])
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+    # X should have 0 columns since dummy_data has only one column
+    assert X.shape == (3, 0)
+    # y should be a 1-dimensional array of length 3
+    assert y.shape == (3,)
+
+def test_autompg_data_none(monkeypatch):
+    """Test that autompg_data raises a TypeError when np.genfromtxt returns None (no data)."""
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: None)
+    with pytest.raises(TypeError):
+        autompg_data()
+def test_autompg_data_calls_genfromtxt(monkeypatch):
+    """Test that autompg_data calls np.genfromtxt with the correct file path and delimiter."""
+    calls = []
+
+    def dummy_genfromtxt(fname, delimiter):
+        calls.append((fname, delimiter))
+        # Return dummy data with two samples:
+        # 2 features (all columns except target) and 1 target column.
+        return np.array([[1, 2, 3], [4, 5, 6]])
+
+    monkeypatch.setattr(np, "genfromtxt", dummy_genfromtxt)
+
+    X, y = autompg_data()
+
+    assert calls, "np.genfromtxt was not called"
+    fname, delim = calls[0]
+    # Check that the file path ends with the expected subdirectory/filename.
+    expected_ending = os.path.join("data", "autompg.csv.gz")
+    assert fname.endswith(expected_ending), "The file path used is incorrect."
+    assert delim == ",", "The delimiter used is not a comma."
+
+    # Check that X and y are correctly parsed:
+    expected_X = np.array([[1, 2], [4, 5]])
+    expected_y = np.array([3, 6])
+    np.testing.assert_array_equal(X, expected_X)
+    np.testing.assert_array_equal(y, expected_y)
+def test_autompg_data_with_nans(monkeypatch):
+    """Test that autompg_data returns arrays that correctly preserve np.nan values."""
+    # Create dummy data with np.nan values.
+    dummy_data = np.array([
+        [1.0, np.nan, 3.0, 4.0],
+        [5.0, 6.0, np.nan, 8.0]
+    ])
+    # Monkeypatch np.genfromtxt to return dummy_data
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+    expected_X = dummy_data[:, :-1]
+    expected_y = dummy_data[:, -1]
+    np.testing.assert_array_equal(X, expected_X)
+    np.testing.assert_array_equal(y, expected_y)
+
+def test_autompg_data_list_input(monkeypatch):
+    """Test that autompg_data raises a TypeError when np.genfromtxt returns a list instead of an ndarray."""
+    # Return a normal Python list rather than a NumPy array.
+    dummy_data = [[1, 2, 3], [4, 5, 6]]
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    with pytest.raises(TypeError):
+        autompg_data()
+
+def test_autompg_data_non_numeric(monkeypatch):
+    """Test that autompg_data correctly parses datasets containing non-numeric (string) values."""
+    dummy_data = np.array([
+        ["a", "b", "c"],
+        ["d", "e", "f"]
+    ])
+    monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+
+    X, y = autompg_data()
+    expected_X = dummy_data[:, :-1]
+    expected_y = dummy_data[:, -1]
+    np.testing.assert_array_equal(X, expected_X)
+    np.testing.assert_array_equal(y, expected_y)
\ No newline at end of file
diff --git a/tests/test_boston_housing.py b/tests/test_boston_housing.py
new file mode 100644
index 000000000..6beb92092
--- /dev/null
+++ b/tests/test_boston_housing.py
@@ -0,0 +1,167 @@
+import os
+import pytest
+import numpy as np
+from mlxtend.data.boston_housing import boston_housing_data
+
+
+def test_boston_housing_data_valid(tmp_path, monkeypatch):
+    """Test that boston_housing_data returns correct arrays when given a valid CSV file."""
+    # Create a temporary CSV file with 2 rows and 14 columns (13 features and 1 target)
+    row1 = ",".join(str(x) for x in range(14))
+    row2 = ",".join(str(x) for x in range(14, 28))
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+
+    # Monkeypatch the module's DATA_PATH to point to the temporary CSV file
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+
+    # Call the function and check the returned arrays
+    X, y = boston_housing_data()
+    assert isinstance(X, np.ndarray)
+    assert isinstance(y, np.ndarray)
+    assert X.shape == (2, 13)
+    assert y.shape == (2,)
+
+    # Validate some values: first row's features and target value
+    expected_row1 = [float(x) for x in range(13)]
+    assert np.allclose(X[0], expected_row1)
+    assert y[0] == 13.0
+
+def test_boston_housing_data_file_not_found(monkeypatch):
+    """Test that boston_housing_data raises an error if the CSV file is missing."""
+    # Set the DATA_PATH to a non-existent file
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", "non_existent_file.csv")
+    with pytest.raises(Exception):
+        boston_housing_data()
+def test_boston_housing_data_single_row(tmp_path, monkeypatch):
+    """Test that the function raises an error when the CSV file has a single row (resulting in a 1D array)."""
+    single_row = ",".join(str(x) for x in range(14))
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(single_row)
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    with pytest.raises(IndexError):
+        boston_housing_data()
+
+def test_boston_housing_data_trailing_newline(tmp_path, monkeypatch):
+    """Test that a CSV file with a trailing newline is read correctly."""
+    row1 = ",".join(str(x) for x in range(14))
+    row2 = ",".join(str(x) for x in range(14, 28))
+    content = row1 + "\n" + row2 + "\n"
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    X, y = boston_housing_data()
+    assert isinstance(X, np.ndarray)
+    assert isinstance(y, np.ndarray)
+    assert X.shape == (2, 13)
+    assert y.shape == (2,)
+
+def test_boston_housing_data_invalid_numeric(tmp_path, monkeypatch):
+    """Test that non-numeric data in the CSV file results in NaN in the returned arrays."""
+    # Construct row1 with "non_numeric" at column index 12 (feature column) and row2 with valid floats.
+    row1 = ",".join(str(x) if x != 12 else "non_numeric" for x in range(14))
+    row2 = ",".join(str(x) for x in range(14, 28))
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    X, y = boston_housing_data()
+    # Verify that the non-numeric field is converted to NaN
+    assert np.isnan(X[0, 12]), "Expected a NaN value where conversion failed"
+def test_boston_housing_data_empty_file(tmp_path, monkeypatch):
+    """Test that an empty CSV file causes an IndexError during slicing due to no data."""
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text("")
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    with pytest.raises(IndexError):
+        boston_housing_data()
+
+def test_boston_housing_data_extra_whitespaces(tmp_path, monkeypatch):
+    """Test that a CSV file with extra whitespace surrounding the numbers is read correctly."""
+    row1 = " , ".join(str(x) for x in range(14))
+    row2 = " , ".join(str(x) for x in range(14, 28))
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    X, y = boston_housing_data()
+    assert isinstance(X, np.ndarray)
+    assert isinstance(y, np.ndarray)
+    assert X.shape == (2, 13)
+    assert y.shape == (2,)
+
+def test_boston_housing_data_inconsistent_columns(tmp_path, monkeypatch):
+    """Test that a CSV file with rows having an inconsistent number of columns raises an error."""
+    # Row1 has 14 columns, row2 has 13 columns
+    row1 = ",".join(str(x) for x in range(14))
+    row2 = ",".join(str(x) for x in range(13, 26))  # has 13 numbers
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    with pytest.raises(Exception):
+        boston_housing_data()
+def test_boston_housing_data_with_header(tmp_path, monkeypatch):
+    """Test that a CSV file containing a header row returns arrays where the header row is converted to NaN."""
+    # Create a CSV file with a header row followed by two valid data rows (14 columns each)
+    header = ",".join("Header"+str(i) for i in range(14))
+    row1 = ",".join(str(x) for x in range(14))
+    row2 = ",".join(str(x) for x in range(14, 28))
+    content = header + "\n" + row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+
+    # Redirect the function to use the temporary CSV file
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+
+    X, y = boston_housing_data()
+
+    # Check that the header row was converted to NaN values
+    assert np.all(np.isnan(X[0])), "Header row conversion should result in NaNs for features"
+    assert np.isnan(y[0]), "Header row conversion should result in NaN for target"
+
+    # Validate the remaining rows are parsed correctly
+    expected_row1 = [float(x) for x in range(13)]
+    assert np.allclose(X[1], expected_row1)
+    assert y[1] == 13.0
+
+def test_boston_housing_data_exponential_format(tmp_path, monkeypatch):
+    """Test that numbers in exponential notation are correctly parsed."""
+    # Create a CSV file with 2 rows using exponential format
+    fmt_exp = lambda x: f"{float(x):.1e}"
+    row1 = ",".join(fmt_exp(x) for x in range(14))
+    row2 = ",".join(fmt_exp(x) for x in range(14, 28))
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    X, y = boston_housing_data()
+
+    # Validate the first row's features and target using expected float values
+    expected_row1 = [float(x) for x in range(13)]
+    assert np.allclose(X[0], expected_row1)
+    # y[0] is taken from the last column of row1; we compare it to the float conversion of that value.
+    expected_y0 = float(row1.split(",")[-1])
+    assert y[0] == expected_y0
+
+def test_boston_housing_data_extra_columns(tmp_path, monkeypatch):
+    """Test that a CSV file with extra columns (e.g., 15 instead of 14) is parsed consistently using slicing."""
+    # Create a CSV file with 2 rows and 15 columns (14 features and 1 target by slicing)
+    row1 = ",".join(str(x) for x in range(15))
+    row2 = ",".join(str(x) for x in range(15, 30))
+    content = row1 + "\n" + row2
+    tmp_csv = tmp_path / "boston_housing.csv"
+    tmp_csv.write_text(content)
+
+    monkeypatch.setattr("mlxtend.data.boston_housing.DATA_PATH", str(tmp_csv))
+    X, y = boston_housing_data()
+
+    # For each row, X is all but the last column. Therefore, X.shape should be (2, 14) and y.shape should be (2,)
+    assert X.shape == (2, 14)
+    assert y.shape == (2,)
+
+    # Validate the target values extracted from the last column of each row
+    assert y[0] == 14.0
+    assert y[1] == 29.0
\ No newline at end of file
diff --git a/tests/test_local_mnist.py b/tests/test_local_mnist.py
new file mode 100644
index 000000000..a73b901a9
--- /dev/null
+++ b/tests/test_local_mnist.py
@@ -0,0 +1,218 @@
+import os
+import struct
+import tempfile
+import numpy as np
+import pytest
+
+from mlxtend.data.local_mnist import loadlocal_mnist
+
+def create_labels_file(file_path, n_labels, labels_data):
+    # Write header: magic number and count, then label data
+    with open(file_path, "wb") as f:
+        f.write(struct.pack(">II", 2049, n_labels))
+        f.write(bytearray(labels_data))
+
+def create_images_file(file_path, n_images, rows, cols, images_data):
+    # Write header: magic number, number of images, rows, cols, then image data
+    with open(file_path, "wb") as f:
+        f.write(struct.pack(">IIII", 2051, n_images, rows, cols))
+        f.write(bytearray(images_data))
+
+def test_loadlocal_mnist_success():
+    """Test that loadlocal_mnist correctly loads valid MNIST data."""
+    n_labels = 2
+    rows, cols = 28, 28
+    labels = [7, 3]
+    # Create two dummy images: first image with all zeros and second image with all ones
+    image1 = [0] * (rows * cols)
+    image2 = [1] * (rows * cols)
+    images_data = image1 + image2
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        create_labels_file(labels_path, n_labels, labels)
+        create_images_file(images_path, n_labels, rows, cols, images_data)
+
+        images_loaded, labels_loaded = loadlocal_mnist(images_path, labels_path)
+        assert images_loaded.shape == (n_labels, rows * cols)
+        assert labels_loaded.shape == (n_labels,)
+        np.testing.assert_array_equal(images_loaded[0], np.array(image1, dtype=np.uint8))
+        np.testing.assert_array_equal(images_loaded[1], np.array(image2, dtype=np.uint8))
+        np.testing.assert_array_equal(labels_loaded, np.array(labels, dtype=np.uint8))
+
+def test_loadlocal_mnist_invalid_images():
+    """Test that loadlocal_mnist raises an error when image file data is incomplete."""
+    n_labels = 2
+    rows, cols = 28, 28
+    labels = [0, 1]
+    # Provide image data that is insufficient (only one image's worth)
+    image_data = [0] * (rows * cols)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        create_labels_file(labels_path, n_labels, labels)
+        create_images_file(images_path, 1, rows, cols, image_data)  # Intentionally wrong number of images
+
+        with pytest.raises(ValueError):
+            loadlocal_mnist(images_path, labels_path)
+
+def test_loadlocal_mnist_nonexistent_file():
+    """Test that loadlocal_mnist raises an error when provided file paths do not exist."""
+    with pytest.raises(FileNotFoundError):
+        loadlocal_mnist("nonexistent_images_file", "nonexistent_labels_file")
+
+def test_images_data_type_uint8():
+    """Test that loaded images are of type uint8."""
+    n_labels = 1
+    rows, cols = 28, 28
+    labels = [5]
+    image_data = [123] * (rows * cols)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        create_labels_file(labels_path, n_labels, labels)
+        create_images_file(images_path, n_labels, rows, cols, image_data)
+
+        images_loaded, _ = loadlocal_mnist(images_path, labels_path)
+        assert images_loaded.dtype == np.uint8
+def test_loadlocal_mnist_inconsistent_labels():
+    """Test that loadlocal_mnist handles an inconsistent labels file where the header count
+    does not match the actual number of labels written. The function uses the actual number of labels
+    read from the file to reshape the image file."""
+    n_labels_header = 2
+    actual_labels = [9]  # only one label provided, even though header says 2
+    rows, cols = 28, 28
+    # Provide exactly 1 image because the actual labels read will be 1.
+    image_data = [200] * (rows * cols)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        # Write header that claims 2 labels, but only write 1 label.
+        with open(labels_path, "wb") as f:
+            f.write(struct.pack(">II", 2049, n_labels_header))
+            f.write(bytearray(actual_labels))
+
+        # Now create images file with exactly 1 image.
+        create_images_file(images_path, len(actual_labels), rows, cols, image_data)
+
+        images_loaded, labels_loaded = loadlocal_mnist(images_path, labels_path)
+        assert images_loaded.shape == (1, rows * cols)
+        np.testing.assert_array_equal(labels_loaded, np.array(actual_labels, dtype=np.uint8))
+
+def test_loadlocal_mnist_empty_labels_file():
+    """Test that loadlocal_mnist raises an error when the labels file is empty (i.e. the header cannot be read)."""
+    rows, cols = 28, 28
+    image_data = [100] * (rows * cols)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        # Create an empty labels file.
+        open(labels_path, "wb").close()
+
+        create_images_file(images_path, 1, rows, cols, image_data)
+
+        with pytest.raises(struct.error):
+            loadlocal_mnist(images_path, labels_path)
+
+def test_loadlocal_mnist_empty_images_file():
+    """Test that loadlocal_mnist raises an error when the images file is empty (i.e., header cannot be read)."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        create_labels_file(labels_path, 1, [2])
+        # Create an empty images file.
+        open(images_path, "wb").close()
+
+        with pytest.raises(struct.error):
+            loadlocal_mnist(images_path, labels_path)
+def test_loadlocal_mnist_extra_labels():
+    """Test that loadlocal_mnist correctly loads MNIST data when the labels file contains more labels than indicated by its header.
+    The function should load as many images as there are actual labels read, even if the header's count is lower.
+    """
+    rows, cols = 28, 28
+    header_count = 1
+    extra_labels = [4, 5]  # two labels are actually written even though header indicates one
+    # Create two dummy images using distinct pixel values.
+    image1 = [50] * (rows * cols)
+    image2 = [100] * (rows * cols)
+    images_data = image1 + image2
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        # Write the labels file with the header count set to header_count but write extra_labels (length 2)
+        with open(labels_path, "wb") as f:
+            f.write(struct.pack(">II", 2049, header_count))
+            f.write(bytearray(extra_labels))
+
+        # Create the images file with 2 images (matching the actual labels written)
+        create_images_file(images_path, len(extra_labels), rows, cols, images_data)
+
+        images_loaded, labels_loaded = loadlocal_mnist(images_path, labels_path)
+        assert images_loaded.shape == (len(extra_labels), rows * cols)
+        np.testing.assert_array_equal(labels_loaded, np.array(extra_labels, dtype=np.uint8))
+
+def test_loadlocal_mnist_extra_images():
+    """Test that loadlocal_mnist raises a ValueError when the images file contains more image data than expected.
+    In this case, the labels file indicates a single label, but the images file contains two images' worth of data;
+    attempting to reshape the extra bytes into (1, 784) will fail.
+    """
+    n_labels = 1
+    rows, cols = 28, 28
+    labels = [8]
+    # Create two dummy images even though we have only one label.
+    image1 = [77] * (rows * cols)
+    image2 = [88] * (rows * cols)
+    images_data = image1 + image2
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+
+        create_labels_file(labels_path, n_labels, labels)
+        # Create an images file with header indicating 2 images (rather than 1) and supply data for 2 images.
+        create_images_file(images_path, 2, rows, cols, images_data)
+
+        with pytest.raises(ValueError):
+            loadlocal_mnist(images_path, labels_path)
+def test_loadlocal_mnist_corrupted_labels_header():
+    """Test that loadlocal_mnist raises an error when the labels file header is corrupted (i.e., incomplete header bytes)."""
+    rows, cols = 28, 28
+    # Create a valid images file with one image of constant value.
+    image_data = [150] * (rows * cols)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "corrupted_labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "images.idx3-ubyte")
+        # Write corrupted labels file with only 4 bytes (should be 8 bytes)
+        with open(labels_path, "wb") as f:
+            f.write(b'\x00\x00\x08\x01')
+        create_images_file(images_path, 1, rows, cols, image_data)
+        with pytest.raises(struct.error):
+            loadlocal_mnist(images_path, labels_path)
+
+def test_loadlocal_mnist_corrupted_images_header():
+    """Test that loadlocal_mnist raises an error when the images file header is corrupted (i.e., incomplete header bytes)."""
+    n_labels = 1
+    rows, cols = 28, 28
+    labels = [3]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        labels_path = os.path.join(tmpdir, "labels.idx1-ubyte")
+        images_path = os.path.join(tmpdir, "corrupted_images.idx3-ubyte")
+        create_labels_file(labels_path, n_labels, labels)
+        # Write a corrupted images file with only 10 bytes (less than the expected 16 bytes for header)
+        with open(images_path, "wb") as f:
+            f.write(b'\x00' * 10)
+        with pytest.raises(struct.error):
+            loadlocal_mnist(images_path, labels_path)
\ No newline at end of file
diff --git a/tests/test_mnist.py b/tests/test_mnist.py
new file mode 100644
index 000000000..004eb2ec7
--- /dev/null
+++ b/tests/test_mnist.py
@@ -0,0 +1,115 @@
+import numpy as np
+import pytest
+from mlxtend.data.mnist import mnist_data
+
+class TestMNIST:
+    """Tests for the mnist_data function using monkeypatch to simulate input data."""
+
+    @staticmethod
+    def dummy_genfromtxt(fname, delimiter):
+        # Create dummy data: 10 samples with 784 features each and 1 label column.
+        # The labels will be sequential numbers from 0 to 9.
+        data = np.hstack([np.random.rand(10, 784), np.arange(10).reshape(10, 1)])
+        return data
+
+    @staticmethod
+    def dummy_genfromtxt_invalid(fname, delimiter):
+        # Return an invalid 1D array to simulate erroneous data format.
+        return np.array([1, 2, 3])
+    @staticmethod
+    def dummy_genfromtxt_empty(fname, delimiter):
+        # Return an empty array with shape (0, 785) to simulate a dataset with no samples.
+        return np.empty((0, 785))
+
+    @staticmethod
+    def dummy_genfromtxt_invalid_label(fname, delimiter):
+        # Create dummy data with non-numeric labels that cannot be converted to int.
+        data = np.hstack([np.random.rand(5, 784), np.array(['a']*5).reshape(5, 1)])
+        return data
+    @staticmethod
+    def dummy_genfromtxt_single(fname, delimiter):
+        """Create dummy data with a single sample for testing."""
+        # Create dummy data: 1 sample with 784 features and 1 label column.
+        data = np.hstack([np.zeros((1, 784)), np.array([7]).reshape(1, 1)])
+        return data
+    def test_mnist_returns_correct_shapes_and_types(self, monkeypatch):
+        """Test that mnist_data returns X and y with expected shapes and correct data types."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt)
+        X, y = mnist_data()
+        # Check that X has 10 samples and 784 features.
+        assert X.shape == (10, 784)
+        # Check that y has 10 samples.
+        assert y.shape == (10,)
+        # Check that y's values are of integer type.
+        assert np.issubdtype(y.dtype, np.integer)
+
+    def test_mnist_invalid_data_raises_error(self, monkeypatch):
+        """Test that mnist_data raises an IndexError when the data returned has an unexpected shape."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_invalid)
+        with pytest.raises(IndexError):
+            _ = mnist_data()
+
+    def test_mnist_empty_data(self, monkeypatch):
+        """Test that mnist_data returns empty arrays when no data is provided."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_empty)
+        X, y = mnist_data()
+        # When empty data is provided, X should have shape (0, 784) and y should have shape (0,)
+        assert X.shape == (0, 784)
+        assert y.shape == (0,)
+        assert isinstance(X, np.ndarray)
+        assert isinstance(y, np.ndarray)
+
+    def test_mnist_invalid_labels_raises_error(self, monkeypatch):
+        """Test that mnist_data raises a ValueError when labels cannot be converted to int."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_invalid_label)
+        with pytest.raises(ValueError):
+            _ = mnist_data()
+    def test_mnist_single_sample(self, monkeypatch):
+        """Test that mnist_data correctly handles a single sample input."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_single)
+        X, y = mnist_data()
+        # Check that X has shape (1, 784) and y has shape (1,)
+        assert X.shape == (1, 784)
+        assert y.shape == (1,)
+        # Check that the label can be correctly converted and that its value is as expected (7 in this case)
+        assert np.issubdtype(y.dtype, np.integer)
+        assert y[0] == 7
+
+    def test_mnist_labels_preservation(self, monkeypatch):
+        """Test that mnist_data returns labels that are exactly preserved from the input after conversion."""
+        # Create a fixed dummy dataset where X is filled with a constant and labels are 0..9.
+        dummy_data = np.hstack([np.full((10, 784), 3.14), np.arange(10).reshape(10, 1)])
+        monkeypatch.setattr(np, "genfromtxt", lambda fname, delimiter: dummy_data)
+        _, y = mnist_data()
+        expected_labels = np.arange(10)
+        np.testing.assert_array_equal(y, expected_labels)
+    def dummy_genfromtxt_float_labels(self, fname, delimiter):
+        """Return dummy data with float labels that should be convertible to int."""
+        data = np.hstack([np.random.rand(10, 784), np.arange(10, dtype=float).reshape(10, 1)])
+        return data
+
+    def test_mnist_float_labels(self, monkeypatch):
+        """Test that mnist_data correctly converts float labels to int."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_float_labels)
+        X, y = mnist_data()
+        # Check that X has 10 samples and 784 features.
+        assert X.shape == (10, 784)
+        # Check that y has 10 samples, is of integer type and has the expected values.
+        assert y.shape == (10,)
+        assert np.issubdtype(y.dtype, np.integer)
+        np.testing.assert_array_equal(y, np.arange(10))
+
+    def dummy_genfromtxt_unexpected_features(self, fname, delimiter):
+        """Return dummy data with an unexpected number of feature columns (500 instead of 784)."""
+        data = np.hstack([np.random.rand(10, 500), np.arange(10).reshape(10, 1)])
+        return data
+
+    def test_mnist_unexpected_feature_count(self, monkeypatch):
+        """Test that mnist_data handles data with an unexpected number of features."""
+        monkeypatch.setattr(np, "genfromtxt", self.dummy_genfromtxt_unexpected_features)
+        X, y = mnist_data()
+        # Check that X shape is (10, 500) as provided by dummy_genfromtxt_unexpected_features
+        assert X.shape == (10, 500)
+        # y should still have 10 samples.
+        assert y.shape == (10,)
+        assert np.issubdtype(y.dtype, np.integer)
\ No newline at end of file
diff --git a/tests/test_three_blobs.py b/tests/test_three_blobs.py
new file mode 100644
index 000000000..334e96602
--- /dev/null
+++ b/tests/test_three_blobs.py
@@ -0,0 +1,103 @@
+import numpy as np
+import pytest
+from mlxtend.data.three_blobs import three_blobs_data
+
+def test_returned_shape_and_values():
+    """Test that three_blobs_data returns correct shape and appropriate label values."""
+    X, y = three_blobs_data()
+    # Check if X is a 2D array with exactly two feature columns
+    assert X.ndim == 2, "X should be a 2D array"
+    assert X.shape[1] == 2, "X should have exactly two columns"
+    # Check that y is a 1D array and the number of samples match
+    assert y.ndim == 1, "y should be a 1D array"
+    assert X.shape[0] == y.shape[0], "X and y should have the same number of samples"
+    # Check that the unique labels (if any) are within the set {0, 1, 2}
+    unique_labels = set(np.unique(y))
+    expected_labels = {0, 1, 2}
+    assert unique_labels.issubset(expected_labels), "Labels must be a subset of {0, 1, 2}"
+
+def fake_genfromtxt(fname, delimiter):
+    """Fake np.genfromtxt to simulate a known dataset for testing."""
+    return np.array([[1, 2, 0],
+                     [3, 4, 1],
+                     [5, 6, 2]])
+
+def test_three_blobs_data_with_monkeypatch(monkeypatch):
+    """Test three_blobs_data using monkeypatch to simulate file input."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt)
+    X, y = tb.three_blobs_data()
+    np.testing.assert_array_equal(X, np.array([[1, 2],
+                                               [3, 4],
+                                               [5, 6]]))
+    np.testing.assert_array_equal(y, np.array([0, 1, 2]))
+
+def fake_genfromtxt_float(fname, delimiter):
+    """Fake np.genfromtxt that returns float labels to test integer conversion."""
+    return np.array([[7, 8, 0.0],
+                     [9, 10, 1.0],
+                     [11, 12, 2.0]])
+
+def test_three_blobs_data_label_conversion(monkeypatch):
+    """Test that three_blobs_data converts the label column to integer dtype even when provided as float."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt_float)
+    X, y = tb.three_blobs_data()
+    # Ensure that the label array is converted to an integer type
+    assert issubclass(y.dtype.type, np.integer), "Labels should be of integer type"
+    np.testing.assert_array_equal(y, np.array([0, 1, 2]))
+def test_data_path():
+    """Test that the DATA_PATH variable points to the expected file location."""
+    from mlxtend.data import three_blobs
+    # Check that the file path contains a 'data' subdirectory and ends with the expected filename
+    assert "data" in three_blobs.DATA_PATH, "DATA_PATH should include 'data' directory"
+    assert three_blobs.DATA_PATH.endswith("three_blobs.csv.gz"), "DATA_PATH should end with 'three_blobs.csv.gz'"
+
+def fake_genfromtxt_empty(fname, delimiter):
+    """Fake np.genfromtxt that returns an empty array to simulate missing data."""
+    return np.array([])
+
+def test_three_blobs_data_empty(monkeypatch):
+    """Test that three_blobs_data raises an error when no data is returned (empty array)."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt_empty)
+    with pytest.raises(IndexError):
+        tb.three_blobs_data()
+
+def fake_genfromtxt_1d(fname, delimiter):
+    """Fake np.genfromtxt that returns a 1D array to simulate an unexpected data format."""
+    return np.array([1, 2, 0])
+
+def test_three_blobs_data_1d(monkeypatch):
+    """Test that three_blobs_data raises an error when the data from
+    genfromtxt is not 2D."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt_1d)
+    with pytest.raises(IndexError):
+        tb.three_blobs_data()
+def fake_genfromtxt_invalid(fname, delimiter):
+    """Fake np.genfromtxt that returns non-numeric labels to simulate invalid label conversion."""
+    return np.array([[13, 14, 'a'],
+                     [15, 16, 'b'],
+                     [17, 18, 'c']])
+
+def test_three_blobs_data_invalid_dtype(monkeypatch):
+    """Test that three_blobs_data raises a ValueError when labels cannot be converted to integers."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt_invalid)
+    with pytest.raises(ValueError):
+        tb.three_blobs_data()
+
+def fake_genfromtxt_one_sample(fname, delimiter):
+    """Fake np.genfromtxt that returns a 1-sample dataset."""
+    return np.array([[21, 22, 0]])
+
+def test_three_blobs_data_one_sample(monkeypatch):
+    """Test that three_blobs_data correctly processes a dataset with only one sample."""
+    import mlxtend.data.three_blobs as tb
+    monkeypatch.setattr(tb.np, "genfromtxt", fake_genfromtxt_one_sample)
+    X, y = tb.three_blobs_data()
+    # X should be 2D with one row and two columns and y should be 1D with a single element.
+    assert X.ndim == 2 and X.shape == (1,2), "X should be a 2D array with one row and two columns"
+    assert y.ndim == 1 and y.shape[0] == 1, "y should be a 1D array with one element"
+    np.testing.assert_array_equal(X, np.array([[21, 22]]))
+    np.testing.assert_array_equal(y, np.array([0]))
\ No newline at end of file
diff --git a/tests/test_wine.py b/tests/test_wine.py
new file mode 100644
index 000000000..49ddc7137
--- /dev/null
+++ b/tests/test_wine.py
@@ -0,0 +1,32 @@
+import os
+import numpy as np
+import pytest
+from mlxtend.data.wine import wine_data, DATA_PATH
+
+def test_wine_data_returns_correct_values(tmp_path, monkeypatch):
+    """Test that wine_data returns the correct feature matrix and labels after monkeypatching DATA_PATH to a custom test CSV file."""
+    # Create a temporary CSV file with sample data:
+    # Each row has 13 feature columns and 1 label column
+    data = np.array([
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 0],
+        [1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 1],
+        [1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 2],
+    ])
+    temp_csv = tmp_path / "wine.csv"
+    np.savetxt(temp_csv, data, delimiter=",")
+
+    # Patch the DATA_PATH to point to the temporary CSV file
+    monkeypatch.setattr("mlxtend.data.wine.DATA_PATH", str(temp_csv))
+
+    X, y = wine_data()
+    # Check that the feature matrix matches the expected shape and values
+    np.testing.assert_array_equal(X, data[:, :-1])
+    # Check that the label vector matches and is of integer type
+    np.testing.assert_array_equal(y, data[:, -1].astype(int))
+
+def test_wine_data_file_not_found(monkeypatch):
+    """Test that wine_data raises an error when the CSV file is not found."""
+    # Monkeypatch DATA_PATH to point to an invalid file path
+    monkeypatch.setattr("mlxtend.data.wine.DATA_PATH", "non_existing_file.csv")
+    with pytest.raises(OSError):
+        wine_data()
\ No newline at end of file