Use new dataset #2

Open · wants to merge 8 commits into base: main

1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
data/*
results/*
*.bak
3 changes: 3 additions & 0 deletions Makefile
@@ -19,6 +19,9 @@ usage:
build:
docker build -t $(DOCKER_IMAGE_NAME) .

all:
$(DOCKER_RUN_MOUNT) /bin/bash modeling_activity_pace/run_all.sh || true

run-bash: build
$(DOCKER_RUN_MOUNT) /bin/bash || true

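With the new `all` target, the full pipeline can be reproduced with `make all`, which runs `run_all.sh` (added below) inside the mounted Docker container. Note that, unlike `run-bash`, `all` as written does not depend on `build`, and the trailing `|| true` swallows a failing exit code.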
28 changes: 12 additions & 16 deletions README.md
@@ -37,13 +37,12 @@ poetry run python modeling_activity_pace/<script.py>

## Scripts

1. Raw user histories are transformed into time series using `process_raw_streams.py`.
2. User answers to the survey are prepared using `process_raw_answers.py`.
3. Dictionary Learning algorithm is run using `compute_dictionary.py`.
4. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`.
5. `compute_baselines.py` computes baselines scores and scores of PACE embeddings.
6. `analyse_models.py` plots logistic regression coefficients and related statistical reports.
7. `make_fig1.py` saves the plot of Figure 1.
1. User logs are transformed into time series using `compute_time_series.py`.
2. The Dictionary Learning algorithm is run using `compute_dictionary.py`.
3. The best iteration of dictionary learning is selected using `choose_dictionary.py`.
4. `compute_baselines.py` computes baseline scores and scores of PACE embeddings.
5. `analyse_models.py` plots logistic regression coefficients and related statistical reports.
6. `make_fig1.py` saves the plot of Figure 1.

## Data

@@ -54,15 +53,12 @@ Input data folder must be organized as follows:
pace/
└── data/
└── raw/
├── streams/
│ ├── one_year_all_respondents000000000000.csv
│ ├── ...
│ └── one_year_all_respondents0000000000399.csv
├── other/
│ └── user_favorites.csv
└── answers/
└── records.csv
└── answers/
└── processed_records.csv
├── streams/
│ ├── processed_streams_000000000000.csv
│ ├── ...
│ └── processed_streams_000000000091.csv
```

Where ```one_year_all_respondents.csv``` files are stream history CSV files with columns: ```user_id, ts_listen, media_id, context_id, context_type, listening_time, context_4```.
@@ -1,8 +1,10 @@
import pandas as pd
import os

from src.process_raw_data.streams_processor import ProcessStreams
from src.helpers import save_data
from src.settings import (
TIME_SERIES_PATH,
channel_names,
time_labels_full,
MIN_DATE,
@@ -11,10 +13,12 @@
INSTANT_ZERO,
)


if __name__ == "__main__":
# Create destination directory if it doesn't exist
os.makedirs(TIME_SERIES_PATH, exist_ok=True)

# Initialize a ProcessStreams instance
stream_processor = ProcessStreams("data/raw/streams/", usr_drop_rate=0)
stream_processor = ProcessStreams("data/streams/", usr_drop_rate=0)

# Process the streams data
stream_processor.process(MIN_DATE, MAX_DATE, N_SUBDIVISION_1HOUR, INSTANT_ZERO)
@@ -32,4 +36,4 @@
)

# Save the DataFrame to a CSV file
save_data(channel_data, f"data/processed/streams/X_{channel_name}.csv", index=True)
save_data(channel_data, f"{TIME_SERIES_PATH}/X_{channel_name}.csv", index=True)
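The recurring change in this file (and in the ones below) is replacing the hard-coded `data/processed/streams` prefix with the `TIME_SERIES_PATH` constant from `src.settings`. A minimal sketch of the pattern, with assumed values for illustration (the actual constant and channel list live in `src/settings.py`):

```python
# Sketch of the path-centralization pattern, not the project's code.
import os

TIME_SERIES_PATH = "data/processed/streams"  # assumed value for illustration
channel_names = ["volume", "context"]        # assumed; the real list is in src.settings

os.makedirs(TIME_SERIES_PATH, exist_ok=True)  # create the destination once
for channel_name in channel_names:
    out_path = f"{TIME_SERIES_PATH}/X_{channel_name}.csv"
    print(out_path)  # each script derives its paths from the single constant
```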
9 changes: 5 additions & 4 deletions modeling_activity_pace/make_fig1.py
@@ -10,6 +10,7 @@
from src.settings import (
N_SUBDIVISION_1HOUR,
INSTANT_ZERO,
TIME_SERIES_PATH,
time_labels_full,
time_labels,
)
@@ -18,7 +19,6 @@
tqdm.pandas()

build_result_folder()
SELECTED_USER_ID = 3356219324


def add_time_range(df, n_subdivisions, instant_zero):
@@ -31,12 +31,13 @@ def add_time_range(df, n_subdivisions, instant_zero):

# Import data

df_processed = load_data("data/processed/streams/X_volume.csv")
df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv")

SELECTED_USER_ID = df_processed.index[0]

streams_df_list = []
for i in tqdm(os.listdir("data/raw/streams/")):
df_streams = load_data("data/raw/streams/" + i).reset_index()
for i in tqdm(os.listdir("data/streams/")):
df_streams = load_data("data/streams/" + i).reset_index()
streams_df_list.append(df_streams[df_streams["user_id"] == SELECTED_USER_ID])

df = pd.concat(streams_df_list)
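The hard-coded `SELECTED_USER_ID` is replaced by the first id in the processed time-series index, so the figure no longer depends on one specific user being present in the data. A sketch of the idea with invented data (in the project, the index comes from `X_volume.csv`):

```python
# Sketch only: the DataFrame contents here are made up for illustration.
import pandas as pd

df_processed = pd.DataFrame(
    {"h0": [3, 0], "h1": [1, 2]},
    index=pd.Index([101, 202], name="user_id"),
)

SELECTED_USER_ID = df_processed.index[0]  # first user in the processed index
print(SELECTED_USER_ID)  # -> 101
```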
21 changes: 0 additions & 21 deletions modeling_activity_pace/process_raw_answers.py

This file was deleted.

35 changes: 35 additions & 0 deletions modeling_activity_pace/run_all.sh
@@ -0,0 +1,35 @@
#!/bin/bash

echo "**********************************************"
echo "1. User logs are transformed into time series "
echo "**********************************************"
poetry run python modeling_activity_pace/compute_time_series.py

echo "****************************************"
echo "2. Dictionary Learning algorithm is run "
echo "****************************************"
poetry run python modeling_activity_pace/compute_dictionary.py

echo "**********************************************************"
echo "3. Selection of the best iteration in dictionary learning "
echo "**********************************************************"
poetry run python modeling_activity_pace/choose_dictionary.py

echo ""
echo "***********************************************************"
echo "4. Computes baselines scores and scores of PACE embeddings "
echo "***********************************************************"
poetry run python modeling_activity_pace/compute_baselines.py

echo ""
echo "******************************************"
echo "5. Plots logistic regression coefficients "
echo " and related statistical reports "
echo "******************************************"
poetry run python modeling_activity_pace/analyse_models.py

echo -e ""
echo "******************************"
echo "6. Saves the plot of Figure 1 "
echo "******************************"
poetry run python modeling_activity_pace/make_fig1.py
7 changes: 3 additions & 4 deletions modeling_activity_pace/src/baselines/baselines_functions.py
@@ -3,7 +3,7 @@
from tqdm import tqdm

from src.process_answers import ProcessAnswers
from src.settings import channel_names, ANSWERS_PATH
from src.settings import TIME_SERIES_PATH, channel_names, ANSWERS_PATH
from src.helpers import load_data, get_ids_from_signals
from src.process_raw_data.answers_helpers import process_data_for_classifier
from src.modeling_functions import (
@@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict):
Returns:
- List of AUC scores.
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]

# Extract user IDs
ids = get_ids_from_signals(X_list)
@@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict):
"""

# Load volume data
df_volume = load_data("data/processed/streams/X_volume.csv")
df_volume = load_data(f"{TIME_SERIES_PATH}/X_volume.csv")

# Sum the volume across time (transpose and sum)
df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"])
@@ -155,4 +155,3 @@ def compute_baseline_scores(
scores = perform_grid_search(X_train, X_test, y_train_, y_test_, cols_to_predict)

return scores

@@ -105,7 +105,14 @@ def choose_best_dict(X, y, cols_to_predict):
index=True,
)
plot_reconstruction_iterations_perfs(df_reconstruction_scores, "results/figures/reconstruction_scores_over_iterations.pdf")
best_iter = int(input("Choose the best iteration : "))

## Uncomment here to manually select the best iteration
# best_iter = int(input("Choose the best iteration : "))

best_iter_offset = 5
best_iter = best_iter_offset + class_scores[best_iter_offset:].index(max(class_scores[best_iter_offset:]))
print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}")

shutil.copy(
DICT_ITER_PATH
+ [f"D_{i}.npy" for i in range(len(os.listdir(DICT_ITER_PATH)))][best_iter],
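The interactive `input()` prompt is replaced by an automatic rule: skip the first few iterations, then take the argmax of the remaining classification scores. A minimal sketch with invented scores (in `choose_best_dict`, `class_scores` holds one classification score per saved dictionary iteration):

```python
# Sketch of the offset-argmax rule; the scores below are made up.
class_scores = [0.51, 0.55, 0.60, 0.58, 0.62, 0.61, 0.66, 0.64]

best_iter_offset = 5  # ignore early, noisy iterations
tail = class_scores[best_iter_offset:]
best_iter = best_iter_offset + tail.index(max(tail))
print(best_iter, class_scores[best_iter])  # -> 6 0.66
```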
@@ -5,6 +5,7 @@

from src.process_answers import ProcessAnswers
from src.settings import (
TIME_SERIES_PATH,
n_channels,
channel_names,
ANSWERS_ACTIVITY_COLUMNS,
@@ -26,7 +27,7 @@ def process_data_for_DL():
Returns:
tuple: A tuple containing input data (X_list_array_clean) and target labels (y).
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]

ids = get_ids_from_signals(X_list)
cols_to_predict = ANSWERS_ACTIVITY_COLUMNS
24 changes: 0 additions & 24 deletions modeling_activity_pace/src/process_answers.py
@@ -1,11 +1,4 @@
from src.helpers import load_data
from src.process_raw_data.filter import process_filter
from src.process_raw_data.process_activities_deezer import (
process_activities_deezer_feature,
)
from src.process_raw_data.process_age_gender import process_age_gender
from src.helpers import load_data


class ProcessAnswers:
def __init__(self, path) -> None:
@@ -46,20 +39,3 @@ def process(self, ids, cols):
"""
self.import_data()
self.filter(ids, cols)


def process_answer_data_pipeline(df, columns):
"""
Process answers DataFrame by applying filtering and feature processing.

Args:
df (pd.DataFrame): The DataFrame to be processed.
columns (list): List of columns to keep in the processed DataFrame.

Returns:
pd.DataFrame: The processed DataFrame with selected columns.
"""
df = process_filter(df)
df = process_activities_deezer_feature(df)
df = process_age_gender(df)
return df[columns]
@@ -6,6 +6,7 @@
from src.helpers import load_data, get_ids_from_signals
from src.modeling_functions import convolve_signals, normalize_signals, split
from src.settings import (
TIME_SERIES_PATH,
channel_names,
n_channels,
ANSWERS_PATH,
@@ -35,7 +36,7 @@ def process_data_for_classifier(
- y_train_: Training labels.
- y_test_: Test labels.
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]
D = np.load("results/dict_learning/D.npy")[:, :, :166]

# Extract user IDs

This file was deleted.
