From a572a3a7b6e89c65e3eeb24fd715cf3e95889190 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 25 Jun 2025 14:53:15 +0100 Subject: [PATCH 01/14] deprecate parameters warning --- sdgym/benchmark.py | 28 ++++++++++++++++++++++++++ tests/unit/test_benchmark.py | 38 ++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 8614078b..a0c0292e 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -779,6 +779,31 @@ def _create_instance_on_ec2(script_content): print(f'Job kicked off for SDGym on {instance_id}') # noqa +def _handle_deprecated_parameters( + output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 +): + """Handle deprecated parameters and issue warnings.""" + parameters_to_deprecate = { + 'output_filepath': output_filepath, + 'detailed_results_folder': detailed_results_folder, + 'multi_processing_config': multi_processing_config, + 'run_on_ec2': run_on_ec2, + } + parameters = [] + for name, value in parameters_to_deprecate.items(): + if value is not None and value: + parameters.append(name) + + if parameters: + parameters = "', '".join(sorted(parameters)) + message = ( + f"Parameters '{parameters}' are deprecated in the `benchmark_single_table` " + 'function and will be removed in October 2025. ' + 'Please consider using `output_destination` instead.' + ) + warnings.warn(message, FutureWarning) + + def benchmark_single_table( synthesizers=DEFAULT_SYNTHESIZERS, custom_synthesizers=None, @@ -863,6 +888,9 @@ def benchmark_single_table( pandas.DataFrame: A table containing one row per synthesizer + dataset + metric. """ + _handle_deprecated_parameters( + output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 + ) if run_on_ec2: print("This will create an instance for the current AWS user's account.") # noqa if output_filepath is not None: diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 10b98d12..24fd47b0 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -1,3 +1,4 @@ +import re from unittest.mock import ANY, MagicMock, patch import pandas as pd @@ -9,6 +10,7 @@ _create_sdgym_script, _directory_exists, _format_output, + _handle_deprecated_parameters, ) from sdgym.synthesizers import GaussianCopulaSynthesizer @@ -33,7 +35,8 @@ def test_output_file_exists(path_mock): @patch('sdgym.benchmark.tqdm.tqdm') -def test_progress_bar_updates(tqdm_mock): +@patch('sdgym.benchmark._handle_deprecated_parameters') +def test_benchmark_single_table_deprecated_params(mock_handle_deprecated, tqdm_mock): """Test that the benchmarking function updates the progress bar on one line.""" # Setup scores_mock = MagicMock() @@ -48,9 +51,11 @@ def test_progress_bar_updates(tqdm_mock): ) # Assert + mock_handle_deprecated.assert_called_once_with( + None, None, None, False + ) tqdm_mock.assert_called_once_with(ANY, total=1, position=0, leave=True) - @patch('sdgym.benchmark._score') @patch('sdgym.benchmark.multiprocessing') def test_benchmark_single_table_with_timeout(mock_multiprocessing, mock__score): @@ -301,3 +306,32 @@ def test__format_output(): 'NewMetric': [0.998], }) pd.testing.assert_frame_equal(scores, expected_scores) + + +def test__handle_deprecated_parameters(): + """Test the ``_handle_deprecated_parameters`` function.""" + # Setup + output_filepath = 's3://BucketName/path' + detailed_results_folder = 'mock/path' + multi_processing_config = {'num_processes': 4} + run_on_ec2 = True + expected_message_1 = re.escape( + "Parameters 'detailed_results_folder', 'output_filepath' are deprecated in the " + '`benchmark_single_table` function and will be removed in October 2025. Please ' + 'consider using `output_destination` instead.' + ) + expected_message_2 = re.escape( + "Parameters 'detailed_results_folder', 'multi_processing_config', 'output_filepath'" + ", 'run_on_ec2' are deprecated in the `benchmark_single_table` function and will be" + ' removed in October 2025. Please consider using `output_destination` instead.' + ) + + # Run and Assert + _handle_deprecated_parameters(None, None, None, False) + with pytest.warns(FutureWarning, match=expected_message_1): + _handle_deprecated_parameters(output_filepath, detailed_results_folder, None, False) + + with pytest.warns(FutureWarning, match=expected_message_2): + _handle_deprecated_parameters( + output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 + ) From 8e967cc68015962fed95d5b6e7f571dc75f2f29d Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 25 Jun 2025 16:55:01 +0100 Subject: [PATCH 02/14] define _setup_output_destination --- sdgym/benchmark.py | 78 +++++++++++++++++++++++++++++++++ tests/unit/test_benchmark.py | 83 ++++++++++++++++++++++++++++++++++-- 2 files changed, 158 insertions(+), 3 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index a0c0292e..feaaac06 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -8,7 +8,9 @@ import pickle import tracemalloc import warnings +from collections import defaultdict from contextlib import contextmanager +from datetime import datetime from pathlib import Path import boto3 @@ -804,6 +806,74 @@ def _handle_deprecated_parameters( warnings.warn(message, FutureWarning) +def _validate_output_destination(output_destination): + if not isinstance(output_destination, str): + raise ValueError( + 'The `output_destination` parameter must be a string representing the output path.' + ) + + if is_s3_path(output_destination): + raise ValueError( + 'The `output_destination` parameter cannot be an S3 path. ' + 'Please use `benchmark_single_table_aws` instead.' + ) + + if os.path.exists(output_destination): + raise ValueError(f'The output path {output_destination} already exists.') + + +def _setup_output_destination( + output_destination, synthesizers, custom_synthesizers, sdv_datasets, additional_datasets_folder +): + """Set up the output destination for the benchmark results. + + Args: + output_destination (str or None): + The path to the output directory where results will be saved. + If None, no output will be saved. + synthesizers (list): + The list of synthesizers to benchmark. + custom_synthesizers (list): + The list of custom synthesizers to benchmark. + sdv_datasets (list): + The list of SDV datasets to benchmark. + additional_datasets_folder (str or None): + The path to a folder containing additional datasets. + """ + if output_destination is None: + return None + + _validate_output_destination(output_destination) + os.makedirs(output_destination, exist_ok=True) + today = datetime.today().strftime('%m_%d_%Y') + top_folder = os.path.join(output_destination, f'SDGym_results_{today}') + os.makedirs(top_folder, exist_ok=True) + all_synthesizers = synthesizers + custom_synthesizers + all_datasets = sdv_datasets.copy() + if additional_datasets_folder: + additional_datasets = [ + f'additional_datasets_folder/{name.split(".")[0]}' + for name in os.listdir(additional_datasets_folder) + if name.endswith('.csv') + ] + all_datasets.extend(additional_datasets) + + paths = defaultdict(dict) + for dataset in all_datasets: + dataset_folder_name = f'{dataset}_{today}' + dataset_folder_path = os.path.join(top_folder, dataset_folder_name) + os.makedirs(dataset_folder_path, exist_ok=True) + paths[dataset]['meta'] = os.path.join(dataset_folder_path, 'meta.yaml') + for synth_name in all_synthesizers: + synth_folder_path = os.path.join(dataset_folder_path, synth_name) + os.makedirs(synth_folder_path, exist_ok=True) + synth_file = os.path.join(synth_folder_path, 'synthesizer.pkl') + data_file = os.path.join(synth_folder_path, 'synthetic_data.csv') + paths[dataset][synth_name] = {'synthesizer': synth_file, 'synthetic_data': data_file} + + return paths + + def benchmark_single_table( synthesizers=DEFAULT_SYNTHESIZERS, custom_synthesizers=None, @@ -815,6 +885,7 @@ def benchmark_single_table( compute_privacy_score=True, sdmetrics=None, timeout=None, + output_destination=None, output_filepath=None, detailed_results_folder=None, show_progress=False, @@ -891,6 +962,13 @@ def benchmark_single_table( _handle_deprecated_parameters( output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 ) + paths = _setup_output_destination( + output_destination, + synthesizers, + custom_synthesizers, + sdv_datasets, + additional_datasets_folder, + ) if run_on_ec2: print("This will create an instance for the current AWS user's account.") # noqa if output_filepath is not None: diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 24fd47b0..5278d679 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -1,4 +1,6 @@ +import json import re +from datetime import datetime from unittest.mock import ANY, MagicMock, patch import pandas as pd @@ -11,6 +13,8 @@ _directory_exists, _format_output, _handle_deprecated_parameters, + _setup_output_destination, + _validate_output_destination, ) from sdgym.synthesizers import GaussianCopulaSynthesizer @@ -51,11 +55,10 @@ def test_benchmark_single_table_deprecated_params(mock_handle_deprecated, tqdm_m ) # Assert - mock_handle_deprecated.assert_called_once_with( - None, None, None, False - ) + mock_handle_deprecated.assert_called_once_with(None, None, None, False) tqdm_mock.assert_called_once_with(ANY, total=1, position=0, leave=True) + @patch('sdgym.benchmark._score') @patch('sdgym.benchmark.multiprocessing') def test_benchmark_single_table_with_timeout(mock_multiprocessing, mock__score): @@ -335,3 +338,77 @@ def test__handle_deprecated_parameters(): _handle_deprecated_parameters( output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 ) + + +def test__validate_output_destination(tmp_path): + """Test the `_validate_output_destination` function.""" + # Setup + wrong_type = 12345 + aws_destination = 's3://valid-bucket/path/to/file' + valid_destination = tmp_path / 'valid-destination' + err_1 = re.escape( + 'The `output_destination` parameter must be a string representing the output path.' + ) + err_2 = re.escape( + 'The `output_destination` parameter cannot be an S3 path. ' + 'Please use `benchmark_single_table_aws` instead.' + ) + err_3 = re.escape(f'The output path {valid_destination} already exists.') + + # Run and Assert + _validate_output_destination(str(valid_destination)) + with pytest.raises(ValueError, match=err_1): + _validate_output_destination(wrong_type) + + with pytest.raises(ValueError, match=err_2): + _validate_output_destination(aws_destination) + + valid_destination.mkdir() + with pytest.raises(ValueError, match=err_3): + _validate_output_destination(str(valid_destination)) + + +@patch('sdgym.benchmark._validate_output_destination') +def test__setup_output_destination(mock_validate, tmp_path): + """Test the `_setup_output_destination` function.""" + # Setup + output_destination = tmp_path / 'output_destination' + synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer'] + customsynthesizers = ['CustomSynthesizer'] + datasets = ['adult', 'census'] + additional_datasets_folder = tmp_path / 'additional_datasets' + additional_datasets_folder.mkdir() + additional_data = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + additional_data.to_csv(additional_datasets_folder / 'additional_data.csv', index=False) + today = datetime.today().strftime('%m_%d_%Y') + base_path = output_destination / f'SDGym_results_{today}' + + # Run + result_1 = _setup_output_destination( + None, synthesizers, customsynthesizers, datasets, additional_datasets_folder + ) + result_2 = _setup_output_destination( + output_destination, synthesizers, customsynthesizers, datasets, additional_datasets_folder + ) + + # Assert + expected = { + dataset: { + 'meta': str(base_path / f'{dataset}_{today}' / 'meta.yaml'), + **{ + synth: { + 'synthesizer': str( + base_path / f'{dataset}_{today}' / synth / 'synthesizer.pkl' + ), + 'synthetic_data': str( + base_path / f'{dataset}_{today}' / synth / 'synthetic_data.csv' + ), + } + for synth in synthesizers + customsynthesizers + }, + } + for dataset in datasets + ['additional_datasets_folder/additional_data'] + } + assert result_1 is None + mock_validate.assert_called_once_with(output_destination) + assert json.loads(json.dumps(result_2)) == expected From cdf3fceafd3f5b08e613b4766d57efd1df7eda8e Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 26 Jun 2025 11:49:14 +0100 Subject: [PATCH 03/14] define run_id.yaml file --- sdgym/benchmark.py | 65 ++++++++++++++++++++++++++++++++++-- tests/unit/test_benchmark.py | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index feaaac06..542d4526 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -7,10 +7,12 @@ import os import pickle import tracemalloc +import uuid import warnings from collections import defaultdict from contextlib import contextmanager from datetime import datetime +from importlib.metadata import version from pathlib import Path import boto3 @@ -19,6 +21,7 @@ import numpy as np import pandas as pd import tqdm +import yaml from sdmetrics.reports.multi_table import ( DiagnosticReport as MultiTableDiagnosticReport, ) @@ -65,6 +68,15 @@ 'covtype', ] N_BYTES_IN_MB = 1000 * 1000 +EXTERNAL_SYNTHESIZER_TO_LIBRARY = { + 'RealTabFormerSynthesizer': 'realtabformer', +} +SDV_SINGLE_TABLE_SYNTHESIZERS = [ + 'GaussianCopulaSynthesizer', + 'CTGANSynthesizer', + 'CopulaGANSynthesizer', + 'TVAESynthesizer', +] def _validate_inputs(output_filepath, detailed_results_folder, synthesizers, custom_synthesizers): @@ -151,7 +163,7 @@ def _generate_job_args_list( return job_args_list -def _synthesize(synthesizer_dict, real_data, metadata): +def _synthesize(synthesizer_dict, real_data, metadata, path=None): synthesizer = synthesizer_dict['synthesizer'] if isinstance(synthesizer, type): assert issubclass(synthesizer, BaselineSynthesizer), ( @@ -874,6 +886,39 @@ def _setup_output_destination( return paths +def _write_run_id_file(output_destination, synthesizers, job_args_list): + run_id = str(uuid.uuid4())[:8] + metadata = { + 'run_id': run_id, + 'starting_date': datetime.today().strftime('%m_%d_%Y %H:%M:%S'), + 'completed_date': None, + 'sdgym_version': version('sdgym'), + 'jobs': job_args_list, + } + for synthesizer in synthesizers: + if synthesizer not in SDV_SINGLE_TABLE_SYNTHESIZERS: + ext_lib = EXTERNAL_SYNTHESIZER_TO_LIBRARY[synthesizer] + library_version = version(ext_lib) + metadata[f'{ext_lib}_version'] = library_version + elif 'sdv' not in metadata.keys(): + metadata['sdv_version'] = version('sdv') + + with open(f'{output_destination}/run_{run_id}.yaml', 'w') as file: + yaml.dump(metadata, file) + + return run_id + + +def _update_run_id_file(output_destination, run_id): + run_file = Path(output_destination) / f'run_{run_id}.yaml' + with open(run_file, 'r') as f: + run_data = yaml.safe_load(f) or {} + + run_data['completed_date'] = datetime.today().strftime('%m_%d_%Y %H:%M:%S') + with open(run_file, 'w') as f: + yaml.dump(run_data, f) + + def benchmark_single_table( synthesizers=DEFAULT_SYNTHESIZERS, custom_synthesizers=None, @@ -933,6 +978,18 @@ def benchmark_single_table( timeout (int or ``None``): The maximum number of seconds to wait for synthetic data creation. If ``None``, no timeout is enforced. + output_destination (str or ``None``): + The path to the output directory where results will be saved. If ``None``, no + output is saved. The results are saved with the following structure: + output_destination/ + run_.yaml + SDGym_results_/ + results.csv + _/ + meta.yaml + / + synthesizer.pkl + synthetic_data.csv output_filepath (str or ``None``): A file path for where to write the output as a csv file. If ``None``, no output is written. If run_on_ec2 flag output_filepath needs to be defined and @@ -981,7 +1038,6 @@ def benchmark_single_table( _validate_inputs(output_filepath, detailed_results_folder, synthesizers, custom_synthesizers) _create_detailed_results_directory(detailed_results_folder) - job_args_list = _generate_job_args_list( limit_dataset_size, sdv_datasets, @@ -995,6 +1051,8 @@ def benchmark_single_table( synthesizers, custom_synthesizers, ) + if paths is not None: + run_id = _write_run_id_file(output_destination, synthesizers, job_args_list) if job_args_list: scores = _run_jobs(multi_processing_config, job_args_list, show_progress) @@ -1011,4 +1069,7 @@ def benchmark_single_table( if output_filepath: write_csv(scores, output_filepath, None, None) + if paths is not None: + _update_run_id_file(output_destination, run_id) + return scores diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 5278d679..372ca62e 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -1,10 +1,12 @@ import json import re from datetime import datetime +from importlib.metadata import version from unittest.mock import ANY, MagicMock, patch import pandas as pd import pytest +import yaml from sdgym import benchmark_single_table from sdgym.benchmark import ( @@ -14,7 +16,9 @@ _format_output, _handle_deprecated_parameters, _setup_output_destination, + _update_run_id_file, _validate_output_destination, + _write_run_id_file, ) from sdgym.synthesizers import GaussianCopulaSynthesizer @@ -412,3 +416,57 @@ def test__setup_output_destination(mock_validate, tmp_path): assert result_1 is None mock_validate.assert_called_once_with(output_destination) assert json.loads(json.dumps(result_2)) == expected + + +@patch('sdgym.benchmark.uuid.uuid4') +@patch('sdgym.benchmark.datetime') +def test__write_run_id_file(mock_datetime, mock_uuid, tmp_path): + """Test the `_write_run_id_file` method.""" + # Setup + output_destination = tmp_path / 'output_destination' + output_destination.mkdir() + mock_uuid.return_value = '123456789999' + mock_datetime.today.return_value.strftime.return_value = '06_26_2025' + jobs = ['job1', 'job2'] + synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'RealTabFormerSynthesizer'] + + # Run + run_id = _write_run_id_file(output_destination, synthesizers, jobs) + + # Assert + assert run_id == '12345678' + run_id_file = output_destination / 'run_12345678.yaml' + assert run_id_file.exists() + with open(run_id_file, 'r') as file: + run_id_data = yaml.safe_load(file) + assert run_id_data['run_id'] == '12345678' + assert run_id_data['starting_date'] == '06_26_2025' + assert run_id_data['jobs'] == jobs + assert run_id_data['sdgym_version'] == version('sdgym') + assert run_id_data['sdv_version'] == version('sdv') + assert run_id_data['realtabformer_version'] == version('realtabformer') + assert run_id_data['completed_date'] is None + + +@patch('sdgym.benchmark.datetime') +def test__update_run_id_file(mock_datetime, tmp_path): + """Test the `_update_run_id_file` method.""" + # Setup + output_destination = tmp_path / 'output_destination' + output_destination.mkdir() + run_id = '12345678' + mock_datetime.today.return_value.strftime.return_value = '06_26_2025' + metadata = {'run_id': run_id, 'starting_date': '06_25_2025', 'completed_date': None} + run_id_file = output_destination / f'run_{run_id}.yaml' + with open(run_id_file, 'w') as file: + yaml.dump(metadata, file) + + # Run + _update_run_id_file(output_destination, run_id) + + # Assert + with open(run_id_file, 'r') as file: + run_id_data = yaml.safe_load(file) + assert run_id_data['completed_date'] == '06_26_2025' + assert run_id_data['starting_date'] == '06_25_2025' + assert run_id_data['run_id'] == run_id From 6f6b0bcb5f52d7f7d7cb8410813907afa1b3971a Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 26 Jun 2025 16:07:09 +0100 Subject: [PATCH 04/14] save synthetic_data and synthesizer --- sdgym/benchmark.py | 129 +++++++++++++++++------------------ tests/unit/test_benchmark.py | 27 +++----- 2 files changed, 73 insertions(+), 83 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 542d4526..ff0c570d 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -107,6 +107,46 @@ def _create_detailed_results_directory(detailed_results_folder): os.makedirs(detailed_results_folder, exist_ok=True) +def _setup_output_destination(output_destination, synthesizers, datasets): + """Set up the output destination for the benchmark results. + + Args: + output_destination (str or None): + The path to the output directory where results will be saved. + If None, no output will be saved. + synthesizers (list): + The list of synthesizers to benchmark. + datasets (list): + The list of datasets to benchmark. + """ + if output_destination is None: + return {} + + _validate_output_destination(output_destination) + output_path = Path(output_destination) + output_path.mkdir(parents=True, exist_ok=True) + today = datetime.today().strftime('%m_%d_%Y') + top_folder = output_path / f'SDGym_results_{today}' + top_folder.mkdir(parents=True, exist_ok=True) + + paths = defaultdict(dict) + for dataset in datasets: + dataset_folder = top_folder / f'{dataset}_{today}' + dataset_folder.mkdir(parents=True, exist_ok=True) + paths[dataset]['meta'] = str(dataset_folder / 'meta.yaml') + + for synth_name in synthesizers: + synth_folder = dataset_folder / synth_name + synth_folder.mkdir(parents=True, exist_ok=True) + + paths[dataset][synth_name] = { + 'synthesizer': str(synth_folder / 'synthesizer.pkl'), + 'synthetic_data': str(synth_folder / 'synthetic_data.csv'), + } + + return paths + + def _generate_job_args_list( limit_dataset_size, sdv_datasets, @@ -114,6 +154,7 @@ def _generate_job_args_list( sdmetrics, detailed_results_folder, timeout, + output_destination, compute_quality_score, compute_diagnostic_score, compute_privacy_score, @@ -133,7 +174,9 @@ def _generate_job_args_list( else get_dataset_paths(bucket=additional_datasets_folder) ) datasets = sdv_datasets + additional_datasets - + synthesizer_names = [synthesizer['name'] for synthesizer in synthesizers] + dataset_names = [dataset.name for dataset in datasets] + paths = _setup_output_destination(output_destination, synthesizer_names, dataset_names) job_tuples = [] for dataset in datasets: for synthesizer in synthesizers: @@ -144,7 +187,7 @@ def _generate_job_args_list( data, metadata_dict = load_dataset( 'single_table', dataset, limit_dataset_size=limit_dataset_size ) - + path = paths.get(dataset.name, {}).get(synthesizer['name'], None) args = ( synthesizer, data, @@ -157,6 +200,7 @@ def _generate_job_args_list( compute_privacy_score, dataset.name, 'single_table', + path, ) job_args_list.append(args) @@ -191,6 +235,10 @@ def _synthesize(synthesizer_dict, real_data, metadata, path=None): peak_memory = tracemalloc.get_traced_memory()[1] / N_BYTES_IN_MB tracemalloc.stop() tracemalloc.clear_traces() + if path is not None: + synthetic_data.to_csv(path['synthetic_data'], index=False) + with open(path['synthesizer'], 'wb') as f: + pickle.dump(synthesizer_obj, f) return synthetic_data, train_now - now, sample_now - train_now, synthesizer_size, peak_memory @@ -297,6 +345,7 @@ def _score( compute_privacy_score=False, modality=None, dataset_name=None, + path=None, ): if output is None: output = {} @@ -316,7 +365,7 @@ def _score( # To be deleted if there is no error output['error'] = 'Synthesizer Timeout' synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize( - synthesizer, data.copy(), metadata + synthesizer, data.copy(), metadata, path ) output['synthetic_data'] = synthetic_data @@ -397,6 +446,7 @@ def _score_with_timeout( compute_privacy_score=False, modality=None, dataset_name=None, + path=None, ): with multiprocessing_context(): with multiprocessing.Manager() as manager: @@ -414,6 +464,7 @@ def _score_with_timeout( compute_privacy_score, modality, dataset_name, + path, ), ) @@ -514,6 +565,7 @@ def _run_job(args): compute_privacy_score, dataset_name, modality, + path, ) = args name = synthesizer['name'] @@ -524,7 +576,6 @@ def _run_job(args): timeout, used_memory(), ) - output = {} try: if timeout: @@ -539,6 +590,7 @@ def _run_job(args): compute_privacy_score=compute_privacy_score, modality=modality, dataset_name=dataset_name, + path=path, ) else: output = _score( @@ -551,6 +603,7 @@ def _run_job(args): compute_privacy_score=compute_privacy_score, modality=modality, dataset_name=dataset_name, + path=path, ) except Exception as error: output['exception'] = error @@ -834,66 +887,15 @@ def _validate_output_destination(output_destination): raise ValueError(f'The output path {output_destination} already exists.') -def _setup_output_destination( - output_destination, synthesizers, custom_synthesizers, sdv_datasets, additional_datasets_folder -): - """Set up the output destination for the benchmark results. - - Args: - output_destination (str or None): - The path to the output directory where results will be saved. - If None, no output will be saved. - synthesizers (list): - The list of synthesizers to benchmark. - custom_synthesizers (list): - The list of custom synthesizers to benchmark. - sdv_datasets (list): - The list of SDV datasets to benchmark. - additional_datasets_folder (str or None): - The path to a folder containing additional datasets. - """ - if output_destination is None: - return None - - _validate_output_destination(output_destination) - os.makedirs(output_destination, exist_ok=True) - today = datetime.today().strftime('%m_%d_%Y') - top_folder = os.path.join(output_destination, f'SDGym_results_{today}') - os.makedirs(top_folder, exist_ok=True) - all_synthesizers = synthesizers + custom_synthesizers - all_datasets = sdv_datasets.copy() - if additional_datasets_folder: - additional_datasets = [ - f'additional_datasets_folder/{name.split(".")[0]}' - for name in os.listdir(additional_datasets_folder) - if name.endswith('.csv') - ] - all_datasets.extend(additional_datasets) - - paths = defaultdict(dict) - for dataset in all_datasets: - dataset_folder_name = f'{dataset}_{today}' - dataset_folder_path = os.path.join(top_folder, dataset_folder_name) - os.makedirs(dataset_folder_path, exist_ok=True) - paths[dataset]['meta'] = os.path.join(dataset_folder_path, 'meta.yaml') - for synth_name in all_synthesizers: - synth_folder_path = os.path.join(dataset_folder_path, synth_name) - os.makedirs(synth_folder_path, exist_ok=True) - synth_file = os.path.join(synth_folder_path, 'synthesizer.pkl') - data_file = os.path.join(synth_folder_path, 'synthetic_data.csv') - paths[dataset][synth_name] = {'synthesizer': synth_file, 'synthetic_data': data_file} - - return paths - - def _write_run_id_file(output_destination, synthesizers, job_args_list): + jobs = [[job[0]['name'], job[-3]] for job in job_args_list] run_id = str(uuid.uuid4())[:8] metadata = { 'run_id': run_id, 'starting_date': datetime.today().strftime('%m_%d_%Y %H:%M:%S'), 'completed_date': None, 'sdgym_version': version('sdgym'), - 'jobs': job_args_list, + 'jobs': jobs, } for synthesizer in synthesizers: if synthesizer not in SDV_SINGLE_TABLE_SYNTHESIZERS: @@ -1019,13 +1021,6 @@ def benchmark_single_table( _handle_deprecated_parameters( output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 ) - paths = _setup_output_destination( - output_destination, - synthesizers, - custom_synthesizers, - sdv_datasets, - additional_datasets_folder, - ) if run_on_ec2: print("This will create an instance for the current AWS user's account.") # noqa if output_filepath is not None: @@ -1036,7 +1031,6 @@ def benchmark_single_table( return None _validate_inputs(output_filepath, detailed_results_folder, synthesizers, custom_synthesizers) - _create_detailed_results_directory(detailed_results_folder) job_args_list = _generate_job_args_list( limit_dataset_size, @@ -1045,13 +1039,14 @@ def benchmark_single_table( sdmetrics, detailed_results_folder, timeout, + output_destination, compute_quality_score, compute_diagnostic_score, compute_privacy_score, synthesizers, custom_synthesizers, ) - if paths is not None: + if output_destination is not None: run_id = _write_run_id_file(output_destination, synthesizers, job_args_list) if job_args_list: @@ -1069,7 +1064,7 @@ def benchmark_single_table( if output_filepath: write_csv(scores, output_filepath, None, None) - if paths is not None: + if output_destination is not None: _update_run_id_file(output_destination, run_id) return scores diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 372ca62e..ac1c642d 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -378,22 +378,13 @@ def test__setup_output_destination(mock_validate, tmp_path): # Setup output_destination = tmp_path / 'output_destination' synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer'] - customsynthesizers = ['CustomSynthesizer'] datasets = ['adult', 'census'] - additional_datasets_folder = tmp_path / 'additional_datasets' - additional_datasets_folder.mkdir() - additional_data = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - additional_data.to_csv(additional_datasets_folder / 'additional_data.csv', index=False) today = datetime.today().strftime('%m_%d_%Y') base_path = output_destination / f'SDGym_results_{today}' # Run - result_1 = _setup_output_destination( - None, synthesizers, customsynthesizers, datasets, additional_datasets_folder - ) - result_2 = _setup_output_destination( - output_destination, synthesizers, customsynthesizers, datasets, additional_datasets_folder - ) + result_1 = _setup_output_destination(None, synthesizers, datasets) + result_2 = _setup_output_destination(output_destination, synthesizers, datasets) # Assert expected = { @@ -408,12 +399,12 @@ def test__setup_output_destination(mock_validate, tmp_path): base_path / f'{dataset}_{today}' / synth / 'synthetic_data.csv' ), } - for synth in synthesizers + customsynthesizers + for synth in synthesizers }, } - for dataset in datasets + ['additional_datasets_folder/additional_data'] + for dataset in datasets } - assert result_1 is None + assert result_1 == {} mock_validate.assert_called_once_with(output_destination) assert json.loads(json.dumps(result_2)) == expected @@ -427,7 +418,11 @@ def test__write_run_id_file(mock_datetime, mock_uuid, tmp_path): output_destination.mkdir() mock_uuid.return_value = '123456789999' mock_datetime.today.return_value.strftime.return_value = '06_26_2025' - jobs = ['job1', 'job2'] + jobs = [ + ({'name': 'GaussianCopulaSynthesizer'}, 'adult', None, None), + ({'name': 'CTGANSynthesizer'}, 'census', None, None), + ] + expected_jobs = [['GaussianCopulaSynthesizer', 'adult'], ['CTGANSynthesizer', 'census']] synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'RealTabFormerSynthesizer'] # Run @@ -441,7 +436,7 @@ def test__write_run_id_file(mock_datetime, mock_uuid, tmp_path): run_id_data = yaml.safe_load(file) assert run_id_data['run_id'] == '12345678' assert run_id_data['starting_date'] == '06_26_2025' - assert run_id_data['jobs'] == jobs + assert run_id_data['jobs'] == expected_jobs assert run_id_data['sdgym_version'] == version('sdgym') assert run_id_data['sdv_version'] == version('sdv') assert run_id_data['realtabformer_version'] == version('realtabformer') From 8a7037892063d22d1aa827fa158c118502b619f2 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 26 Jun 2025 16:44:10 +0100 Subject: [PATCH 05/14] save final results --- sdgym/benchmark.py | 33 +++++++++++++++++++++------------ tests/unit/test_benchmark.py | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index ff0c570d..c638d2df 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -207,7 +207,7 @@ def _generate_job_args_list( return job_args_list -def _synthesize(synthesizer_dict, real_data, metadata, path=None): +def _synthesize(synthesizer_dict, real_data, metadata, synthesizer_path=None): synthesizer = synthesizer_dict['synthesizer'] if isinstance(synthesizer, type): assert issubclass(synthesizer, BaselineSynthesizer), ( @@ -235,9 +235,9 @@ def _synthesize(synthesizer_dict, real_data, metadata, path=None): peak_memory = tracemalloc.get_traced_memory()[1] / N_BYTES_IN_MB tracemalloc.stop() tracemalloc.clear_traces() - if path is not None: - synthetic_data.to_csv(path['synthetic_data'], index=False) - with open(path['synthesizer'], 'wb') as f: + if synthesizer_path is not None: + synthetic_data.to_csv(synthesizer_path['synthetic_data'], index=False) + with open(synthesizer_path['synthesizer'], 'wb') as f: pickle.dump(synthesizer_obj, f) return synthetic_data, train_now - now, sample_now - train_now, synthesizer_size, peak_memory @@ -345,7 +345,7 @@ def _score( compute_privacy_score=False, modality=None, dataset_name=None, - path=None, + synthesizer_path=None, ): if output is None: output = {} @@ -365,7 +365,7 @@ def _score( # To be deleted if there is no error output['error'] = 'Synthesizer Timeout' synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize( - synthesizer, data.copy(), metadata, path + synthesizer, data.copy(), metadata, synthesizer_path=synthesizer_path ) output['synthetic_data'] = synthetic_data @@ -446,7 +446,7 @@ def _score_with_timeout( compute_privacy_score=False, modality=None, dataset_name=None, - path=None, + synthesizer_path=None, ): with multiprocessing_context(): with multiprocessing.Manager() as manager: @@ -464,7 +464,7 @@ def _score_with_timeout( compute_privacy_score, modality, dataset_name, - path, + synthesizer_path, ), ) @@ -565,7 +565,7 @@ def _run_job(args): compute_privacy_score, dataset_name, modality, - path, + synthesizer_path, ) = args name = synthesizer['name'] @@ -590,7 +590,7 @@ def _run_job(args): compute_privacy_score=compute_privacy_score, modality=modality, dataset_name=dataset_name, - path=path, + synthesizer_path=synthesizer_path, ) else: output = _score( @@ -603,7 +603,7 @@ def _run_job(args): compute_privacy_score=compute_privacy_score, modality=modality, dataset_name=dataset_name, - path=path, + synthesizer_path=synthesizer_path, ) except Exception as error: output['exception'] = error @@ -618,6 +618,15 @@ def _run_job(args): cache_dir, ) + if synthesizer_path is not None: + synth_path = Path(synthesizer_path['synthesizer']) + root_path = synth_path.parents[2] + result_file = root_path / 'results.csv' + if not result_file.exists(): + scores.to_csv(result_file, index=False, mode='w') + else: + scores.to_csv(result_file, index=False, mode='a', header=False) + return scores @@ -888,7 +897,7 @@ def _validate_output_destination(output_destination): def _write_run_id_file(output_destination, synthesizers, job_args_list): - jobs = [[job[0]['name'], job[-3]] for job in job_args_list] + jobs = [[job[-3], job[0]['name']] for job in job_args_list] run_id = str(uuid.uuid4())[:8] metadata = { 'run_id': run_id, diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index ac1c642d..9ed5c144 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -422,7 +422,7 @@ def test__write_run_id_file(mock_datetime, mock_uuid, tmp_path): ({'name': 'GaussianCopulaSynthesizer'}, 'adult', None, None), ({'name': 'CTGANSynthesizer'}, 'census', None, None), ] - expected_jobs = [['GaussianCopulaSynthesizer', 'adult'], ['CTGANSynthesizer', 'census']] + expected_jobs = [['adult', 'GaussianCopulaSynthesizer'], ['census', 'CTGANSynthesizer']] synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'RealTabFormerSynthesizer'] # Run From 98f6ce3891d5ee1d1aa59af19eeb87fc52f552e8 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 26 Jun 2025 17:34:20 +0100 Subject: [PATCH 06/14] integration test --- tests/integration/test_benchmark.py | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index 8d52a993..2302da16 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -2,6 +2,7 @@ import contextlib import io +import os import re import sys import time @@ -10,6 +11,7 @@ import numpy as np import pandas as pd import pytest +import yaml from sdv.metadata.single_table import SingleTableMetadata from sdv.single_table.copulas import GaussianCopulaSynthesizer @@ -585,3 +587,44 @@ def test_benchmark_single_table_no_warnings(): ) future_warnings = [warning for warning in w if issubclass(warning.category, FutureWarning)] assert len(future_warnings) == 0 + + +def test_benchmark_single_table_with_output_destination(tmp_path): + """Test it works with the ``output_destination`` argument.""" + # Setup + output_destination = str(tmp_path / 'benchmark_output') + today_date = pd.Timestamp.now().strftime('%m_%d_%Y') + + # Run + results = benchmark_single_table( + synthesizers=['GaussianCopulaSynthesizer', 'TVAESynthesizer'], + sdv_datasets=['expedia_hotel_logs'], + output_destination=output_destination, + ) + + # Assert + directions = os.listdir(output_destination) + assert f'SDGym_results_{today_date}' in directions + for file in directions: + if file.endswith('.yaml'): + with open(os.path.join(output_destination, file), 'r') as f: + metadata = yaml.safe_load(f) + assert metadata['completed_date'] is not None + assert metadata['sdgym_version'] == sdgym.__version__ + else: + subdirections = os.listdir(os.path.join(output_destination, file)) + assert set(subdirections) == {'results.csv', f'expedia_hotel_logs_{today_date}'} + synthesizer_directions = os.listdir( + os.path.join(output_destination, file, f'expedia_hotel_logs_{today_date}') + ) + assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} + for synthesizer in synthesizer_directions: + synthesizer_files = os.listdir( + os.path.join( + output_destination, file, f'expedia_hotel_logs_{today_date}', synthesizer + ) + ) + assert set(synthesizer_files) == {'synthesizer.pkl', 'synthetic_data.csv'} + + saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') + pd.testing.assert_frame_equal(results, saved_result, check_dtype=False) From d83d96c10d5c4e58a390e9b53de6963e19507ec5 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Fri, 27 Jun 2025 16:43:24 +0100 Subject: [PATCH 07/14] allow existing folder --- sdgym/benchmark.py | 3 -- tests/integration/test_benchmark.py | 53 +++++++++++++++++++++++++++++ tests/unit/test_benchmark.py | 5 --- 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index c638d2df..ee90360b 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -892,9 +892,6 @@ def _validate_output_destination(output_destination): 'Please use `benchmark_single_table_aws` instead.' ) - if os.path.exists(output_destination): - raise ValueError(f'The output path {output_destination} already exists.') - def _write_run_id_file(output_destination, synthesizers, job_args_list): jobs = [[job[-3], job[0]['name']] for job in job_args_list] diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index 2302da16..ff4e63f7 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -628,3 +628,56 @@ def test_benchmark_single_table_with_output_destination(tmp_path): saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') pd.testing.assert_frame_equal(results, saved_result, check_dtype=False) + + +def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): + """Test saving in ``output_destination`` with multiple runs. + + Here two benchmark runs are performed with different synthesizers + on the same dataset, and the results are saved in the same output directory. + The directory contains a `results.csv` file with the combined results + and a subdirectory for each synthesizer with its own results. + """ + # Setup + output_destination = str(tmp_path / 'benchmark_output') + today_date = pd.Timestamp.now().strftime('%m_%d_%Y') + + # Run + result_1 = benchmark_single_table( + synthesizers=['GaussianCopulaSynthesizer'], + sdv_datasets=['expedia_hotel_logs'], + output_destination=output_destination, + ) + result_2 = benchmark_single_table( + synthesizers=['TVAESynthesizer'], + sdv_datasets=['expedia_hotel_logs'], + output_destination=output_destination, + ) + + # Assert + final_results = pd.concat([result_1, result_2], ignore_index=True) + directions = os.listdir(output_destination) + assert f'SDGym_results_{today_date}' in directions + for file in directions: + if file.endswith('.yaml'): + with open(os.path.join(output_destination, file), 'r') as f: + metadata = yaml.safe_load(f) + assert metadata['completed_date'] is not None + assert metadata['sdgym_version'] == sdgym.__version__ + else: + subdirections = os.listdir(os.path.join(output_destination, file)) + assert set(subdirections) == {'results.csv', f'expedia_hotel_logs_{today_date}'} + synthesizer_directions = os.listdir( + os.path.join(output_destination, file, f'expedia_hotel_logs_{today_date}') + ) + assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} + for synthesizer in synthesizer_directions: + synthesizer_files = os.listdir( + os.path.join( + output_destination, file, f'expedia_hotel_logs_{today_date}', synthesizer + ) + ) + assert set(synthesizer_files) == {'synthesizer.pkl', 'synthetic_data.csv'} + + saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') + pd.testing.assert_frame_equal(final_results, saved_result, check_dtype=False) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 9ed5c144..c2a60c66 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -357,7 +357,6 @@ def test__validate_output_destination(tmp_path): 'The `output_destination` parameter cannot be an S3 path. ' 'Please use `benchmark_single_table_aws` instead.' ) - err_3 = re.escape(f'The output path {valid_destination} already exists.') # Run and Assert _validate_output_destination(str(valid_destination)) @@ -367,10 +366,6 @@ def test__validate_output_destination(tmp_path): with pytest.raises(ValueError, match=err_2): _validate_output_destination(aws_destination) - valid_destination.mkdir() - with pytest.raises(ValueError, match=err_3): - _validate_output_destination(str(valid_destination)) - @patch('sdgym.benchmark._validate_output_destination') def test__setup_output_destination(mock_validate, tmp_path): From 995ab2eecffb54bce82ab37e367cceca248c8f14 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Fri, 27 Jun 2025 17:08:00 +0100 Subject: [PATCH 08/14] update naming --- sdgym/benchmark.py | 4 ++-- tests/integration/test_benchmark.py | 28 +++++++++++++++++----------- tests/unit/test_benchmark.py | 4 ++-- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index ee90360b..fe7ee099 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -140,8 +140,8 @@ def _setup_output_destination(output_destination, synthesizers, datasets): synth_folder.mkdir(parents=True, exist_ok=True) paths[dataset][synth_name] = { - 'synthesizer': str(synth_folder / 'synthesizer.pkl'), - 'synthetic_data': str(synth_folder / 'synthetic_data.csv'), + 'synthesizer': str(synth_folder / f'{synth_name}_synthesizer.pkl'), + 'synthetic_data': str(synth_folder / f'{synth_name}_synthetic_data.csv'), } return paths diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index ff4e63f7..a20be7a8 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -598,7 +598,7 @@ def test_benchmark_single_table_with_output_destination(tmp_path): # Run results = benchmark_single_table( synthesizers=['GaussianCopulaSynthesizer', 'TVAESynthesizer'], - sdv_datasets=['expedia_hotel_logs'], + sdv_datasets=['fake_companies'], output_destination=output_destination, ) @@ -613,18 +613,21 @@ def test_benchmark_single_table_with_output_destination(tmp_path): assert metadata['sdgym_version'] == sdgym.__version__ else: subdirections = os.listdir(os.path.join(output_destination, file)) - assert set(subdirections) == {'results.csv', f'expedia_hotel_logs_{today_date}'} + assert set(subdirections) == {'results.csv', f'fake_companies_{today_date}'} synthesizer_directions = os.listdir( - os.path.join(output_destination, file, f'expedia_hotel_logs_{today_date}') + os.path.join(output_destination, file, f'fake_companies_{today_date}') ) assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} for synthesizer in synthesizer_directions: synthesizer_files = os.listdir( os.path.join( - output_destination, file, f'expedia_hotel_logs_{today_date}', synthesizer + output_destination, file, f'fake_companies_{today_date}', synthesizer ) ) - assert set(synthesizer_files) == {'synthesizer.pkl', 'synthetic_data.csv'} + assert set(synthesizer_files) == { + f'{synthesizer}_synthesizer.pkl', + f'{synthesizer}_synthetic_data.csv', + } saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') pd.testing.assert_frame_equal(results, saved_result, check_dtype=False) @@ -645,12 +648,12 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): # Run result_1 = benchmark_single_table( synthesizers=['GaussianCopulaSynthesizer'], - sdv_datasets=['expedia_hotel_logs'], + sdv_datasets=['fake_companies'], output_destination=output_destination, ) result_2 = benchmark_single_table( synthesizers=['TVAESynthesizer'], - sdv_datasets=['expedia_hotel_logs'], + sdv_datasets=['fake_companies'], output_destination=output_destination, ) @@ -666,18 +669,21 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): assert metadata['sdgym_version'] == sdgym.__version__ else: subdirections = os.listdir(os.path.join(output_destination, file)) - assert set(subdirections) == {'results.csv', f'expedia_hotel_logs_{today_date}'} + assert set(subdirections) == {'results.csv', f'fake_companies_{today_date}'} synthesizer_directions = os.listdir( - os.path.join(output_destination, file, f'expedia_hotel_logs_{today_date}') + os.path.join(output_destination, file, f'fake_companies_{today_date}') ) assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} for synthesizer in synthesizer_directions: synthesizer_files = os.listdir( os.path.join( - output_destination, file, f'expedia_hotel_logs_{today_date}', synthesizer + output_destination, file, f'fake_companies_{today_date}', synthesizer ) ) - assert set(synthesizer_files) == {'synthesizer.pkl', 'synthetic_data.csv'} + assert set(synthesizer_files) == { + f'{synthesizer}_synthesizer.pkl', + f'{synthesizer}_synthetic_data.csv', + } saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') pd.testing.assert_frame_equal(final_results, saved_result, check_dtype=False) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index c2a60c66..e003e997 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -388,10 +388,10 @@ def test__setup_output_destination(mock_validate, tmp_path): **{ synth: { 'synthesizer': str( - base_path / f'{dataset}_{today}' / synth / 'synthesizer.pkl' + base_path / f'{dataset}_{today}' / synth / f'{synth}_synthesizer.pkl' ), 'synthetic_data': str( - base_path / f'{dataset}_{today}' / synth / 'synthetic_data.csv' + base_path / f'{dataset}_{today}' / synth / f'{synth}_synthetic_data.csv' ), } for synth in synthesizers From 1b70b044812aa64b6cc025cc827646899ca80c8e Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 7 Jul 2025 12:38:59 +0200 Subject: [PATCH 09/14] update warning message --- sdgym/benchmark.py | 12 +++++------- tests/unit/test_benchmark.py | 23 ++++++++++++----------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index fe7ee099..73edc828 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -856,14 +856,13 @@ def _create_instance_on_ec2(script_content): def _handle_deprecated_parameters( - output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 + output_filepath, detailed_results_folder, multi_processing_config ): """Handle deprecated parameters and issue warnings.""" parameters_to_deprecate = { 'output_filepath': output_filepath, 'detailed_results_folder': detailed_results_folder, 'multi_processing_config': multi_processing_config, - 'run_on_ec2': run_on_ec2, } parameters = [] for name, value in parameters_to_deprecate.items(): @@ -873,9 +872,10 @@ def _handle_deprecated_parameters( if parameters: parameters = "', '".join(sorted(parameters)) message = ( - f"Parameters '{parameters}' are deprecated in the `benchmark_single_table` " + f"Parameters '{parameters}' are deprecated in the 'benchmark_single_table' " 'function and will be removed in October 2025. ' - 'Please consider using `output_destination` instead.' + "For saving results, please use the 'output_destination' parameter. For running SDGym" + " remotely on AWS please use the 'benchmark_single_table_aws' method." ) warnings.warn(message, FutureWarning) @@ -1024,9 +1024,7 @@ def benchmark_single_table( pandas.DataFrame: A table containing one row per synthesizer + dataset + metric. """ - _handle_deprecated_parameters( - output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 - ) + _handle_deprecated_parameters(output_filepath, detailed_results_folder, multi_processing_config) if run_on_ec2: print("This will create an instance for the current AWS user's account.") # noqa if output_filepath is not None: diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index e003e997..397b1889 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -59,7 +59,7 @@ def test_benchmark_single_table_deprecated_params(mock_handle_deprecated, tqdm_m ) # Assert - mock_handle_deprecated.assert_called_once_with(None, None, None, False) + mock_handle_deprecated.assert_called_once_with(None, None, None) tqdm_mock.assert_called_once_with(ANY, total=1, position=0, leave=True) @@ -321,26 +321,27 @@ def test__handle_deprecated_parameters(): output_filepath = 's3://BucketName/path' detailed_results_folder = 'mock/path' multi_processing_config = {'num_processes': 4} - run_on_ec2 = True - expected_message_1 = re.escape( + expected_message_1 = ( "Parameters 'detailed_results_folder', 'output_filepath' are deprecated in the " - '`benchmark_single_table` function and will be removed in October 2025. Please ' - 'consider using `output_destination` instead.' + "'benchmark_single_table' function and will be removed in October 2025. For saving" + " results, please use the 'output_destination' parameter. For running SDGym remotely" + " on AWS please use the 'benchmark_single_table_aws' method." ) - expected_message_2 = re.escape( + expected_message_2 = ( "Parameters 'detailed_results_folder', 'multi_processing_config', 'output_filepath'" - ", 'run_on_ec2' are deprecated in the `benchmark_single_table` function and will be" - ' removed in October 2025. Please consider using `output_destination` instead.' + " are deprecated in the 'benchmark_single_table' function and will be removed in October" + " 2025. For saving results, please use the 'output_destination' parameter. For running" + " SDGym remotely on AWS please use the 'benchmark_single_table_aws' method." ) # Run and Assert - _handle_deprecated_parameters(None, None, None, False) + _handle_deprecated_parameters(None, None, None) with pytest.warns(FutureWarning, match=expected_message_1): - _handle_deprecated_parameters(output_filepath, detailed_results_folder, None, False) + _handle_deprecated_parameters(output_filepath, detailed_results_folder, None) with pytest.warns(FutureWarning, match=expected_message_2): _handle_deprecated_parameters( - output_filepath, detailed_results_folder, multi_processing_config, run_on_ec2 + output_filepath, detailed_results_folder, multi_processing_config ) From 341c77d0b2307e4fa4d355d171b723ae9adcb5cf Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 7 Jul 2025 14:49:30 +0200 Subject: [PATCH 10/14] add a locker to serialize savings --- pyproject.toml | 1 + sdgym/benchmark.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0553c69f..6d37e224 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dependencies = [ 'rdt>=1.17.0', 'sdmetrics>=0.20.1', 'sdv>=1.21.0', + 'portalocker>=3.2.0', ] [project.urls] diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 73edc828..95e078c8 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -14,6 +14,7 @@ from datetime import datetime from importlib.metadata import version from pathlib import Path +import portalocker import boto3 import cloudpickle @@ -548,6 +549,15 @@ def _format_output( return scores +def safe_append(scores, result_file): + result_file = Path(result_file) + result_file.parent.mkdir(parents=True, exist_ok=True) + + with open(result_file, 'a+') as f: + portalocker.lock(f, portalocker.LOCK_EX) + f.seek(0, 2) + is_empty = f.tell() == 0 + scores.to_csv(f, index=False, header=is_empty) def _run_job(args): # Reset random seed @@ -622,10 +632,7 @@ def _run_job(args): synth_path = Path(synthesizer_path['synthesizer']) root_path = synth_path.parents[2] result_file = root_path / 'results.csv' - if not result_file.exists(): - scores.to_csv(result_file, index=False, mode='w') - else: - scores.to_csv(result_file, index=False, mode='a', header=False) + safe_append(scores, result_file) return scores From 0f81febd551a9ef69a03005dfc1b260b3d0cc2ee Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 7 Jul 2025 14:59:46 +0200 Subject: [PATCH 11/14] update portallocker version --- pyproject.toml | 2 +- sdgym/benchmark.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6d37e224..38add8ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dependencies = [ 'rdt>=1.17.0', 'sdmetrics>=0.20.1', 'sdv>=1.21.0', - 'portalocker>=3.2.0', + 'portalocker>=3.0.0', ] [project.urls] diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 95e078c8..b88118b6 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -14,13 +14,13 @@ from datetime import datetime from importlib.metadata import version from pathlib import Path -import portalocker import boto3 import cloudpickle import compress_pickle import numpy as np import pandas as pd +import portalocker import tqdm import yaml from sdmetrics.reports.multi_table import ( @@ -549,7 +549,8 @@ def _format_output( return scores -def safe_append(scores, result_file): + +def _safe_append(scores, result_file): result_file = Path(result_file) result_file.parent.mkdir(parents=True, exist_ok=True) @@ -559,6 +560,7 @@ def safe_append(scores, result_file): is_empty = f.tell() == 0 scores.to_csv(f, index=False, header=is_empty) + def _run_job(args): # Reset random seed np.random.seed() @@ -632,7 +634,7 @@ def _run_job(args): synth_path = Path(synthesizer_path['synthesizer']) root_path = synth_path.parents[2] result_file = root_path / 'results.csv' - safe_append(scores, result_file) + _safe_append(scores, result_file) return scores From a1502a65c8788b0b0ba22cfb20b285787ca45365 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 7 Jul 2025 15:00:41 +0200 Subject: [PATCH 12/14] cleaning --- sdgym/benchmark.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index b88118b6..df255e2c 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -553,12 +553,11 @@ def _format_output( def _safe_append(scores, result_file): result_file = Path(result_file) result_file.parent.mkdir(parents=True, exist_ok=True) - - with open(result_file, 'a+') as f: - portalocker.lock(f, portalocker.LOCK_EX) - f.seek(0, 2) - is_empty = f.tell() == 0 - scores.to_csv(f, index=False, header=is_empty) + with open(result_file, 'a+') as file: + portalocker.lock(file, portalocker.LOCK_EX) + file.seek(0, 2) + is_empty = file.tell() == 0 + scores.to_csv(file, index=False, header=is_empty) def _run_job(args): From 1d77b581bdaa77f7d3a907cfa75063763a4f0c84 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 8 Jul 2025 13:20:58 +0200 Subject: [PATCH 13/14] remove portallocker and save files separately + all together --- pyproject.toml | 1 - sdgym/benchmark.py | 26 ++++++++++------------ tests/integration/test_benchmark.py | 34 +++++++++++++++++++++++++++-- tests/unit/test_benchmark.py | 3 +++ 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 38add8ae..0553c69f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,6 @@ dependencies = [ 'rdt>=1.17.0', 'sdmetrics>=0.20.1', 'sdv>=1.21.0', - 'portalocker>=3.0.0', ] [project.urls] diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index df255e2c..492f90d7 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -20,7 +20,6 @@ import compress_pickle import numpy as np import pandas as pd -import portalocker import tqdm import yaml from sdmetrics.reports.multi_table import ( @@ -143,6 +142,7 @@ def _setup_output_destination(output_destination, synthesizers, datasets): paths[dataset][synth_name] = { 'synthesizer': str(synth_folder / f'{synth_name}_synthesizer.pkl'), 'synthetic_data': str(synth_folder / f'{synth_name}_synthetic_data.csv'), + 'benchmark_result': str(synth_folder / f'{synth_name}_benchmark_result.csv'), } return paths @@ -550,16 +550,6 @@ def _format_output( return scores -def _safe_append(scores, result_file): - result_file = Path(result_file) - result_file.parent.mkdir(parents=True, exist_ok=True) - with open(result_file, 'a+') as file: - portalocker.lock(file, portalocker.LOCK_EX) - file.seek(0, 2) - is_empty = file.tell() == 0 - scores.to_csv(file, index=False, header=is_empty) - - def _run_job(args): # Reset random seed np.random.seed() @@ -630,10 +620,7 @@ def _run_job(args): ) if synthesizer_path is not None: - synth_path = Path(synthesizer_path['synthesizer']) - root_path = synth_path.parents[2] - result_file = root_path / 'results.csv' - _safe_append(scores, result_file) + scores.to_csv(synthesizer_path['benchmark_result'], index=False) return scores @@ -691,6 +678,15 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress): raise SDGymError('No valid Dataset/Synthesizer combination given.') scores = pd.concat(scores, ignore_index=True) + output_directions = job_args_list[0][-1] + if output_directions and isinstance(output_directions, dict): + synth_path = Path(output_directions['synthesizer']) + root_path = synth_path.parents[2] + result_file = root_path / 'results.csv' + if not result_file.exists(): + scores.to_csv(result_file, index=False, mode='w') + else: + scores.to_csv(result_file, index=False, mode='a', header=False) return scores diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index a20be7a8..e6d0dd0c 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -604,6 +604,7 @@ def test_benchmark_single_table_with_output_destination(tmp_path): # Assert directions = os.listdir(output_destination) + score_saved_separately = pd.DataFrame() assert f'SDGym_results_{today_date}' in directions for file in directions: if file.endswith('.yaml'): @@ -618,7 +619,7 @@ def test_benchmark_single_table_with_output_destination(tmp_path): os.path.join(output_destination, file, f'fake_companies_{today_date}') ) assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} - for synthesizer in synthesizer_directions: + for synthesizer in sorted(synthesizer_directions): synthesizer_files = os.listdir( os.path.join( output_destination, file, f'fake_companies_{today_date}', synthesizer @@ -627,10 +628,24 @@ def test_benchmark_single_table_with_output_destination(tmp_path): assert set(synthesizer_files) == { f'{synthesizer}_synthesizer.pkl', f'{synthesizer}_synthetic_data.csv', + f'{synthesizer}_benchmark_result.csv', } + score = pd.read_csv( + os.path.join( + output_destination, + file, + f'fake_companies_{today_date}', + synthesizer, + f'{synthesizer}_benchmark_result.csv', + ) + ) + score_saved_separately = pd.concat( + [score_saved_separately, score], ignore_index=True + ) saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') pd.testing.assert_frame_equal(results, saved_result, check_dtype=False) + pd.testing.assert_frame_equal(results, score_saved_separately, check_dtype=False) def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): @@ -659,6 +674,7 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): # Assert final_results = pd.concat([result_1, result_2], ignore_index=True) + score_saved_separately = pd.DataFrame() directions = os.listdir(output_destination) assert f'SDGym_results_{today_date}' in directions for file in directions: @@ -674,7 +690,7 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): os.path.join(output_destination, file, f'fake_companies_{today_date}') ) assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} - for synthesizer in synthesizer_directions: + for synthesizer in sorted(synthesizer_directions): synthesizer_files = os.listdir( os.path.join( output_destination, file, f'fake_companies_{today_date}', synthesizer @@ -683,7 +699,21 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): assert set(synthesizer_files) == { f'{synthesizer}_synthesizer.pkl', f'{synthesizer}_synthetic_data.csv', + f'{synthesizer}_benchmark_result.csv', } + score = pd.read_csv( + os.path.join( + output_destination, + file, + f'fake_companies_{today_date}', + synthesizer, + f'{synthesizer}_benchmark_result.csv', + ) + ) + score_saved_separately = pd.concat( + [score_saved_separately, score], ignore_index=True + ) saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') pd.testing.assert_frame_equal(final_results, saved_result, check_dtype=False) + pd.testing.assert_frame_equal(final_results, score_saved_separately, check_dtype=False) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 397b1889..f85358d7 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -394,6 +394,9 @@ def test__setup_output_destination(mock_validate, tmp_path): 'synthetic_data': str( base_path / f'{dataset}_{today}' / synth / f'{synth}_synthetic_data.csv' ), + 'benchmark_result': str( + base_path / f'{dataset}_{today}' / synth / f'{synth}_benchmark_result.csv' + ), } for synth in synthesizers }, From 662786d61c83a81ccb9d4c77c0537b194de8612f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 10 Jul 2025 10:43:20 +0200 Subject: [PATCH 14/14] restructure files --- sdgym/benchmark.py | 38 ++++--- tests/integration/test_benchmark.py | 170 +++++++++++++++------------- tests/unit/test_benchmark.py | 29 +++-- 3 files changed, 129 insertions(+), 108 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 492f90d7..17db2e6f 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -6,8 +6,8 @@ import multiprocessing import os import pickle +import re import tracemalloc -import uuid import warnings from collections import defaultdict from contextlib import contextmanager @@ -128,12 +128,22 @@ def _setup_output_destination(output_destination, synthesizers, datasets): today = datetime.today().strftime('%m_%d_%Y') top_folder = output_path / f'SDGym_results_{today}' top_folder.mkdir(parents=True, exist_ok=True) + pattern = re.compile(rf'run_{re.escape(today)}_(\d+)\.yaml$') + increments = [] + for file in top_folder.glob(f'run_{today}_*.yaml'): + match = pattern.match(file.name) + if match: + increments.append(int(match.group(1))) + + if increments: + next_increment = max(increments) + 1 + else: + next_increment = 1 paths = defaultdict(dict) for dataset in datasets: dataset_folder = top_folder / f'{dataset}_{today}' dataset_folder.mkdir(parents=True, exist_ok=True) - paths[dataset]['meta'] = str(dataset_folder / 'meta.yaml') for synth_name in synthesizers: synth_folder = dataset_folder / synth_name @@ -143,6 +153,8 @@ def _setup_output_destination(output_destination, synthesizers, datasets): 'synthesizer': str(synth_folder / f'{synth_name}_synthesizer.pkl'), 'synthetic_data': str(synth_folder / f'{synth_name}_synthetic_data.csv'), 'benchmark_result': str(synth_folder / f'{synth_name}_benchmark_result.csv'), + 'run_id': str(top_folder / f'run_{today}_{next_increment}.yaml'), + 'results': str(top_folder / f'results_{today}_{next_increment}.csv'), } return paths @@ -680,9 +692,7 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress): scores = pd.concat(scores, ignore_index=True) output_directions = job_args_list[0][-1] if output_directions and isinstance(output_directions, dict): - synth_path = Path(output_directions['synthesizer']) - root_path = synth_path.parents[2] - result_file = root_path / 'results.csv' + result_file = Path(output_directions['results']) if not result_file.exists(): scores.to_csv(result_file, index=False, mode='w') else: @@ -897,9 +907,11 @@ def _validate_output_destination(output_destination): ) -def _write_run_id_file(output_destination, synthesizers, job_args_list): +def _write_run_id_file(synthesizers, job_args_list): jobs = [[job[-3], job[0]['name']] for job in job_args_list] - run_id = str(uuid.uuid4())[:8] + output_directions = job_args_list[0][-1] + path = output_directions['run_id'] + run_id = Path(path).stem metadata = { 'run_id': run_id, 'starting_date': datetime.today().strftime('%m_%d_%Y %H:%M:%S'), @@ -915,14 +927,11 @@ def _write_run_id_file(output_destination, synthesizers, job_args_list): elif 'sdv' not in metadata.keys(): metadata['sdv_version'] = version('sdv') - with open(f'{output_destination}/run_{run_id}.yaml', 'w') as file: + with open(path, 'w') as file: yaml.dump(metadata, file) - return run_id - -def _update_run_id_file(output_destination, run_id): - run_file = Path(output_destination) / f'run_{run_id}.yaml' +def _update_run_id_file(run_file): with open(run_file, 'r') as f: run_data = yaml.safe_load(f) or {} @@ -1055,7 +1064,7 @@ def benchmark_single_table( custom_synthesizers, ) if output_destination is not None: - run_id = _write_run_id_file(output_destination, synthesizers, job_args_list) + _write_run_id_file(synthesizers, job_args_list) if job_args_list: scores = _run_jobs(multi_processing_config, job_args_list, show_progress) @@ -1073,6 +1082,7 @@ def benchmark_single_table( write_csv(scores, output_filepath, None, None) if output_destination is not None: - _update_run_id_file(output_destination, run_id) + run_id_filename = job_args_list[0][-1]['run_id'] + _update_run_id_file(run_id_filename) return scores diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index e6d0dd0c..e3932a27 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -605,45 +605,49 @@ def test_benchmark_single_table_with_output_destination(tmp_path): # Assert directions = os.listdir(output_destination) score_saved_separately = pd.DataFrame() - assert f'SDGym_results_{today_date}' in directions - for file in directions: - if file.endswith('.yaml'): - with open(os.path.join(output_destination, file), 'r') as f: - metadata = yaml.safe_load(f) - assert metadata['completed_date'] is not None - assert metadata['sdgym_version'] == sdgym.__version__ - else: - subdirections = os.listdir(os.path.join(output_destination, file)) - assert set(subdirections) == {'results.csv', f'fake_companies_{today_date}'} - synthesizer_directions = os.listdir( - os.path.join(output_destination, file, f'fake_companies_{today_date}') + assert directions == [f'SDGym_results_{today_date}'] + subdirections = os.listdir(os.path.join(output_destination, directions[0])) + assert set(subdirections) == { + f'results_{today_date}_1.csv', + f'fake_companies_{today_date}', + f'run_{today_date}_1.yaml', + } + with open( + os.path.join(output_destination, directions[0], f'run_{today_date}_1.yaml'), 'r' + ) as f: + metadata = yaml.safe_load(f) + assert metadata['completed_date'] is not None + assert metadata['sdgym_version'] == sdgym.__version__ + + synthesizer_directions = os.listdir( + os.path.join(output_destination, directions[0], f'fake_companies_{today_date}') + ) + assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} + for synthesizer in sorted(synthesizer_directions): + synthesizer_files = os.listdir( + os.path.join( + output_destination, directions[0], f'fake_companies_{today_date}', synthesizer ) - assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} - for synthesizer in sorted(synthesizer_directions): - synthesizer_files = os.listdir( - os.path.join( - output_destination, file, f'fake_companies_{today_date}', synthesizer - ) - ) - assert set(synthesizer_files) == { - f'{synthesizer}_synthesizer.pkl', - f'{synthesizer}_synthetic_data.csv', - f'{synthesizer}_benchmark_result.csv', - } - score = pd.read_csv( - os.path.join( - output_destination, - file, - f'fake_companies_{today_date}', - synthesizer, - f'{synthesizer}_benchmark_result.csv', - ) - ) - score_saved_separately = pd.concat( - [score_saved_separately, score], ignore_index=True - ) - - saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') + ) + assert set(synthesizer_files) == { + f'{synthesizer}_synthesizer.pkl', + f'{synthesizer}_synthetic_data.csv', + f'{synthesizer}_benchmark_result.csv', + } + score = pd.read_csv( + os.path.join( + output_destination, + directions[0], + f'fake_companies_{today_date}', + synthesizer, + f'{synthesizer}_benchmark_result.csv', + ) + ) + score_saved_separately = pd.concat([score_saved_separately, score], ignore_index=True) + + saved_result = pd.read_csv( + f'{output_destination}/SDGym_results_{today_date}/results_{today_date}_1.csv' + ) pd.testing.assert_frame_equal(results, saved_result, check_dtype=False) pd.testing.assert_frame_equal(results, score_saved_separately, check_dtype=False) @@ -673,47 +677,55 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path): ) # Assert - final_results = pd.concat([result_1, result_2], ignore_index=True) score_saved_separately = pd.DataFrame() directions = os.listdir(output_destination) - assert f'SDGym_results_{today_date}' in directions - for file in directions: - if file.endswith('.yaml'): - with open(os.path.join(output_destination, file), 'r') as f: - metadata = yaml.safe_load(f) - assert metadata['completed_date'] is not None - assert metadata['sdgym_version'] == sdgym.__version__ - else: - subdirections = os.listdir(os.path.join(output_destination, file)) - assert set(subdirections) == {'results.csv', f'fake_companies_{today_date}'} - synthesizer_directions = os.listdir( - os.path.join(output_destination, file, f'fake_companies_{today_date}') + assert directions == [f'SDGym_results_{today_date}'] + subdirections = os.listdir(os.path.join(output_destination, directions[0])) + assert set(subdirections) == { + f'results_{today_date}_1.csv', + f'results_{today_date}_2.csv', + f'fake_companies_{today_date}', + f'run_{today_date}_1.yaml', + f'run_{today_date}_2.yaml', + } + with open( + os.path.join(output_destination, directions[0], f'run_{today_date}_1.yaml'), 'r' + ) as f: + metadata = yaml.safe_load(f) + assert metadata['completed_date'] is not None + assert metadata['sdgym_version'] == sdgym.__version__ + + synthesizer_directions = os.listdir( + os.path.join(output_destination, directions[0], f'fake_companies_{today_date}') + ) + assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} + for synthesizer in sorted(synthesizer_directions): + synthesizer_files = os.listdir( + os.path.join( + output_destination, directions[0], f'fake_companies_{today_date}', synthesizer ) - assert set(synthesizer_directions) == {'TVAESynthesizer', 'GaussianCopulaSynthesizer'} - for synthesizer in sorted(synthesizer_directions): - synthesizer_files = os.listdir( - os.path.join( - output_destination, file, f'fake_companies_{today_date}', synthesizer - ) - ) - assert set(synthesizer_files) == { - f'{synthesizer}_synthesizer.pkl', - f'{synthesizer}_synthetic_data.csv', - f'{synthesizer}_benchmark_result.csv', - } - score = pd.read_csv( - os.path.join( - output_destination, - file, - f'fake_companies_{today_date}', - synthesizer, - f'{synthesizer}_benchmark_result.csv', - ) - ) - score_saved_separately = pd.concat( - [score_saved_separately, score], ignore_index=True - ) - - saved_result = pd.read_csv(f'{output_destination}/SDGym_results_{today_date}/results.csv') - pd.testing.assert_frame_equal(final_results, saved_result, check_dtype=False) - pd.testing.assert_frame_equal(final_results, score_saved_separately, check_dtype=False) + ) + assert set(synthesizer_files) == { + f'{synthesizer}_synthesizer.pkl', + f'{synthesizer}_synthetic_data.csv', + f'{synthesizer}_benchmark_result.csv', + } + score = pd.read_csv( + os.path.join( + output_destination, + directions[0], + f'fake_companies_{today_date}', + synthesizer, + f'{synthesizer}_benchmark_result.csv', + ) + ) + score_saved_separately = pd.concat([score_saved_separately, score], ignore_index=True) + + saved_result_1 = pd.read_csv( + f'{output_destination}/SDGym_results_{today_date}/results_{today_date}_1.csv' + ) + saved_result_2 = pd.read_csv( + f'{output_destination}/SDGym_results_{today_date}/results_{today_date}_2.csv' + ) + pd.testing.assert_frame_equal(result_1, saved_result_1, check_dtype=False) + pd.testing.assert_frame_equal(result_2, saved_result_2, check_dtype=False) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index f85358d7..086b6098 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -2,6 +2,7 @@ import re from datetime import datetime from importlib.metadata import version +from pathlib import Path from unittest.mock import ANY, MagicMock, patch import pandas as pd @@ -385,7 +386,6 @@ def test__setup_output_destination(mock_validate, tmp_path): # Assert expected = { dataset: { - 'meta': str(base_path / f'{dataset}_{today}' / 'meta.yaml'), **{ synth: { 'synthesizer': str( @@ -397,6 +397,8 @@ def test__setup_output_destination(mock_validate, tmp_path): 'benchmark_result': str( base_path / f'{dataset}_{today}' / synth / f'{synth}_benchmark_result.csv' ), + 'run_id': str(base_path / f'run_{today}_1.yaml'), + 'results': str(base_path / f'results_{today}_1.csv'), } for synth in synthesizers }, @@ -408,32 +410,29 @@ def test__setup_output_destination(mock_validate, tmp_path): assert json.loads(json.dumps(result_2)) == expected -@patch('sdgym.benchmark.uuid.uuid4') @patch('sdgym.benchmark.datetime') -def test__write_run_id_file(mock_datetime, mock_uuid, tmp_path): +def test__write_run_id_file(mock_datetime, tmp_path): """Test the `_write_run_id_file` method.""" # Setup output_destination = tmp_path / 'output_destination' output_destination.mkdir() - mock_uuid.return_value = '123456789999' mock_datetime.today.return_value.strftime.return_value = '06_26_2025' + file_name = {'run_id': f'{output_destination}/run_06_26_2025_1.yaml'} jobs = [ - ({'name': 'GaussianCopulaSynthesizer'}, 'adult', None, None), + ({'name': 'GaussianCopulaSynthesizer'}, 'adult', None, file_name), ({'name': 'CTGANSynthesizer'}, 'census', None, None), ] expected_jobs = [['adult', 'GaussianCopulaSynthesizer'], ['census', 'CTGANSynthesizer']] synthesizers = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'RealTabFormerSynthesizer'] # Run - run_id = _write_run_id_file(output_destination, synthesizers, jobs) + _write_run_id_file(synthesizers, jobs) # Assert - assert run_id == '12345678' - run_id_file = output_destination / 'run_12345678.yaml' - assert run_id_file.exists() - with open(run_id_file, 'r') as file: + assert Path(file_name['run_id']).exists() + with open(file_name['run_id'], 'r') as file: run_id_data = yaml.safe_load(file) - assert run_id_data['run_id'] == '12345678' + assert run_id_data['run_id'] == 'run_06_26_2025_1' assert run_id_data['starting_date'] == '06_26_2025' assert run_id_data['jobs'] == expected_jobs assert run_id_data['sdgym_version'] == version('sdgym') @@ -448,15 +447,15 @@ def test__update_run_id_file(mock_datetime, tmp_path): # Setup output_destination = tmp_path / 'output_destination' output_destination.mkdir() - run_id = '12345678' + metadata = {'run_id': 'run_06_25_2025_1', 'starting_date': '06_25_2025', 'completed_date': None} + run_id_file = output_destination / 'run_06_25_2025_1.yaml' + run_id = 'run_06_25_2025_1' mock_datetime.today.return_value.strftime.return_value = '06_26_2025' - metadata = {'run_id': run_id, 'starting_date': '06_25_2025', 'completed_date': None} - run_id_file = output_destination / f'run_{run_id}.yaml' with open(run_id_file, 'w') as file: yaml.dump(metadata, file) # Run - _update_run_id_file(output_destination, run_id) + _update_run_id_file(run_id_file) # Assert with open(run_id_file, 'r') as file: