From fa199491dcdace0c663e60f138f9737ce739699d Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 16 Jun 2025 10:20:41 +0200 Subject: [PATCH 1/8] WIP use new dataset: compute timeseries from pre-processed streams Add make all --- Makefile | 3 + README.md | 28 ++- ..._raw_streams.py => compute_time_series.py} | 10 +- modeling_activity_pace/make_fig1.py | 6 +- modeling_activity_pace/process_raw_answers.py | 21 --- modeling_activity_pace/run_all.sh | 35 ++++ .../src/baselines/baselines_functions.py | 5 +- .../dict_learning/choose_best_iteration.py | 8 +- .../src/dict_learning/dictionary_helpers.py | 2 +- modeling_activity_pace/src/process_answers.py | 65 ------- .../src/process_raw_data/answers_helpers.py | 2 +- .../process_activities_deezer.py | 34 ---- .../process_raw_data/process_age_gender.py | 142 --------------- .../src/process_raw_data/streams_processor.py | 166 +++++++++--------- modeling_activity_pace/src/settings.py | 5 +- 15 files changed, 157 insertions(+), 375 deletions(-) rename modeling_activity_pace/{process_raw_streams.py => compute_time_series.py} (75%) delete mode 100644 modeling_activity_pace/process_raw_answers.py create mode 100644 modeling_activity_pace/run_all.sh delete mode 100644 modeling_activity_pace/src/process_answers.py delete mode 100644 modeling_activity_pace/src/process_raw_data/process_activities_deezer.py delete mode 100644 modeling_activity_pace/src/process_raw_data/process_age_gender.py diff --git a/Makefile b/Makefile index 86cb40f..cf87ce6 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ usage: build: docker build -t $(DOCKER_IMAGE_NAME) . +all: + $(DOCKER_RUN_MOUNT) /bin/bash modeling_activity_pace/run_all.sh || true + run-bash: build $(DOCKER_RUN_MOUNT) /bin/bash || true diff --git a/README.md b/README.md index 6cc6d2b..a13a4fe 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,12 @@ poetry run python modeling_activity_pace/ ## Scripts -1. Raw user histories are trasformed into time series using `process_raw_streams.py`. -2. User answers to survey are prepared using `process_raw_answers.py`. -3. Dictionary Learning algorithm is run using `compute_dictionary.py`. -4. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. -5. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. -6. `analyse_models.py` plots logistic regression coefficients and related statistical reports. -7. `make_fig1.py` saves the plot of Figure 1. +1. User logs are transformed into time series using `compute_time_series.py`. +2. Dictionary Learning algorithm is run using `compute_dictionary.py`. +3. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. +4. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. +5. `analyse_models.py` plots logistic regression coefficients and related statistical reports. +6. `make_fig1.py` saves the plot of Figure 1. ## Data @@ -54,15 +53,12 @@ Input data folder must be organized as follows : pace/ │ └── data/ - └── raw/ - ├── streams/ - │ ├── one_year_all_respondents000000000000.csv - │ ├── ... - │ └── one_year_all_respondents0000000000399.csv - ├── other/ - │ └── user_favorites.csv - └── answers/ - └── records.csv + └── answers/ + └── processed_records.csv + ├── streams/ + │ ├── processed_streams_000000000000.csv + │ ├── ... 
+ │ └── processed_streams_000000000091.csv ``` Where ```one_year_all_respondents.csv``` files are stream history csv files with columns : ```user_id, ts_listen, media_id, context_id, context_type, listening_time, context_4```. diff --git a/modeling_activity_pace/process_raw_streams.py b/modeling_activity_pace/compute_time_series.py similarity index 75% rename from modeling_activity_pace/process_raw_streams.py rename to modeling_activity_pace/compute_time_series.py index 986ef47..c94048d 100644 --- a/modeling_activity_pace/process_raw_streams.py +++ b/modeling_activity_pace/compute_time_series.py @@ -1,8 +1,10 @@ import pandas as pd +import os from src.process_raw_data.streams_processor import ProcessStreams from src.helpers import save_data from src.settings import ( + TIME_SERIES_PATH, channel_names, time_labels_full, MIN_DATE, @@ -11,10 +13,12 @@ INSTANT_ZERO, ) - if __name__ == "__main__": + # Create destination directory if it doesn't exist + os.makedirs(TIME_SERIES_PATH, exist_ok=True) + # Initialize a ProcessStreams instance - stream_processor = ProcessStreams("data/raw/streams/", usr_drop_rate=0) + stream_processor = ProcessStreams("data/streams/", usr_drop_rate=0) # Process the streams data stream_processor.process(MIN_DATE, MAX_DATE, N_SUBDIVISION_1HOUR, INSTANT_ZERO) @@ -32,4 +36,4 @@ ) # Save the DataFrame to a CSV file - save_data(channel_data, f"data/processed/streams/X_{channel_name}.csv", index=True) + save_data(channel_data, f"{TIME_SERIES_PATH}/X_{channel_name}.csv", index=True) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 06d7a69..95078aa 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -31,12 +31,12 @@ def add_time_range(df, n_subdivisions, instant_zero): # Import data -df_processed = load_data("data/processed/streams/X_volume.csv") +df_processed = load_data("data/timeseries/X_volume.csv") streams_df_list = [] -for i in tqdm(os.listdir("data/raw/streams/")): - df_streams = load_data("data/raw/streams/" + i).reset_index() +for i in tqdm(os.listdir("data/streams/")): + df_streams = load_data("data/streams/" + i).reset_index() streams_df_list.append(df_streams[df_streams["user_id"] == SELECTED_USER_ID]) df = pd.concat(streams_df_list) diff --git a/modeling_activity_pace/process_raw_answers.py b/modeling_activity_pace/process_raw_answers.py deleted file mode 100644 index 53a7a10..0000000 --- a/modeling_activity_pace/process_raw_answers.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.process_raw_data.process_age_gender import NEW_COL_NAMES_AGE_GENDER -from src.process_answers import process_answer_data_pipeline -from src.helpers import load_data, save_data -from src.settings import ANSWERS_ACTIVITY_COLUMNS - - -def main(): - """ - Main function to process data and save the processed DataFrame to a CSV file. - """ - FILE_PATH = "data/raw/answers/records.csv" - COLS_TO_KEEP = ["user_id"] + ANSWERS_ACTIVITY_COLUMNS + NEW_COL_NAMES_AGE_GENDER - COLS_TO_KEEP.remove("annee_naissance") - - df = load_data(FILE_PATH) - df = process_answer_data_pipeline(df, COLS_TO_KEEP) - save_data(df, "data/processed/answers/processed_records.csv") - - -if __name__ == "__main__": - main() diff --git a/modeling_activity_pace/run_all.sh b/modeling_activity_pace/run_all.sh new file mode 100644 index 0000000..965ea92 --- /dev/null +++ b/modeling_activity_pace/run_all.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "**********************************************" +echo "1. 
User logs are transformed into time series " +echo "**********************************************" +poetry run python modeling_activity_pace/compute_time_series.py + +echo "****************************************" +echo "2. Dictionary Learning algorithm is run " +echo "****************************************" +#poetry run python modeling_activity_pace/compute_dictionary.py + +echo "**********************************************************" +echo "3. Selection of the best iteration in dictionary learning " +echo "**********************************************************" +#poetry run python modeling_activity_pace/choose_dictionary.py + +echo "" +echo "***********************************************************" +echo "4. Computes baselines scores and scores of PACE embeddings " +echo "***********************************************************" +#poetry run python modeling_activity_pace/compute_baselines.py + +echo "" +echo "******************************************" +echo "5. Plots logistic regression coefficients " +echo " and related statistical reports " +echo "******************************************" +#poetry run python modeling_activity_pace/analyse_models.py + +echo -e "" +echo "******************************" +echo "6. Saves the plot of Figure 1 " +echo "******************************" +#poetry run python modeling_activity_pace/make_fig1.py diff --git a/modeling_activity_pace/src/baselines/baselines_functions.py b/modeling_activity_pace/src/baselines/baselines_functions.py index 54bc036..5497684 100644 --- a/modeling_activity_pace/src/baselines/baselines_functions.py +++ b/modeling_activity_pace/src/baselines/baselines_functions.py @@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict): Returns: - List of AUC scores. 
""" - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] # Extract user IDs ids = get_ids_from_signals(X_list) @@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict): """ # Load volume data - df_volume = load_data("data/processed/streams/X_volume.csv") + df_volume = load_data("data/timeseries/X_volume.csv") # Sum the volume across time (transpose and sum) df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"]) @@ -155,4 +155,3 @@ def compute_baseline_scores( scores = perform_grid_search(X_train, X_test, y_train_, y_test_, cols_to_predict) return scores - diff --git a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py index 567261f..4ad283d 100644 --- a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py +++ b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py @@ -105,7 +105,13 @@ def choose_best_dict(X, y, cols_to_predict): index=True, ) plot_reconstruction_iterations_perfs(df_reconstruction_scores, "results/figures/reconstruction_scores_over_iterations.pdf") - best_iter = int(input("Choose the best iteration : ")) + + ## Uncomment here to manually select the best iteration + # best_iter = int(input("Choose the best iteration : ")) + + best_iter = class_scores.index(max(class_scores)) + print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}") + shutil.copy( DICT_ITER_PATH + [f"D_{i}.npy" for i in range(len(os.listdir(DICT_ITER_PATH)))][best_iter], diff --git a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py index 3e57f72..d810540 100644 --- a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py +++ b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py @@ -26,7 +26,7 @@ def process_data_for_DL(): Returns: tuple: A tuple containing input data (X_list_array_clean) and target labels (y). """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] ids = get_ids_from_signals(X_list) cols_to_predict = ANSWERS_ACTIVITY_COLUMNS diff --git a/modeling_activity_pace/src/process_answers.py b/modeling_activity_pace/src/process_answers.py deleted file mode 100644 index 78337f8..0000000 --- a/modeling_activity_pace/src/process_answers.py +++ /dev/null @@ -1,65 +0,0 @@ -from src.helpers import load_data -from src.process_raw_data.filter import process_filter -from src.process_raw_data.process_activities_deezer import ( - process_activities_deezer_feature, -) -from src.process_raw_data.process_age_gender import process_age_gender -from src.helpers import load_data - - -class ProcessAnswers: - def __init__(self, path) -> None: - """ - Initialize the ProcessAnswers class. - - Args: - path (str): The path to the CSV file containing answers data. - """ - self.path = path - - def import_data(self): - """ - Import data from the specified CSV file. - """ - self.df = load_data(self.path).reset_index() - - def filter(self, ids, cols): - """ - Filter the DataFrame based on user IDs and selected columns. - - Args: - ids (list): List of user IDs to include in the filtered data. - cols (list): List of column names to include in the filtered data. 
- """ - sorting_dict = dict(list(zip(ids, range(len(ids))))) - self.df = self.df[self.df["user_id"].isin(ids)] - self.df = self.df[["user_id"] + cols] - self.df = self.df.sort_values(by="user_id", key=lambda x: x.map(sorting_dict)) - - def process(self, ids, cols): - """ - Process the answers data by importing and filtering. - - Args: - ids (list): List of user IDs to filter the data. - cols (list): List of column names to filter the data. - """ - self.import_data() - self.filter(ids, cols) - - -def process_answer_data_pipeline(df, columns): - """ - Process answers DataFrame by applying filtering and feature processing. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - columns (list): List of columns to keep in the processed DataFrame. - - Returns: - pd.DataFrame: The processed DataFrame with selected columns. - """ - df = process_filter(df) - df = process_activities_deezer_feature(df) - df = process_age_gender(df) - return df[columns] diff --git a/modeling_activity_pace/src/process_raw_data/answers_helpers.py b/modeling_activity_pace/src/process_raw_data/answers_helpers.py index 5a9c226..57fb5a6 100644 --- a/modeling_activity_pace/src/process_raw_data/answers_helpers.py +++ b/modeling_activity_pace/src/process_raw_data/answers_helpers.py @@ -35,7 +35,7 @@ def process_data_for_classifier( - y_train_: Training labels. - y_test_: Test labels. """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] D = np.load("results/dict_learning/D.npy")[:, :, :166] # Extract user IDs diff --git a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py b/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py deleted file mode 100644 index 563b516..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py +++ /dev/null @@ -1,34 +0,0 @@ -from src.helpers import rename_columns -from src.settings import ANSWERS_ACTIVITY_COLUMNS - -COL_NAMES_CONTEXT = [f"B_contexts_deezer_{i}" for i in [1, 4, 5, 2, 12, 10]] - - -def encode_activities_columns(df, new_col_names): - """ - Encode context-related columns in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame to encode context columns in. - new_col_names (list): List of new context-related column names. - - Returns: - pd.DataFrame: The DataFrame with context columns encoded. - """ - df[new_col_names] = df[new_col_names].map(lambda x: 1 if isinstance(x, str) else 0) - return df - - -def process_activities_deezer_feature(df): - """ - Process context-related features in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame containing context-related features. - - Returns: - pd.DataFrame: The processed DataFrame. 
- """ - df = rename_columns(df, COL_NAMES_CONTEXT, ANSWERS_ACTIVITY_COLUMNS) - df = encode_activities_columns(df, ANSWERS_ACTIVITY_COLUMNS) - return df diff --git a/modeling_activity_pace/src/process_raw_data/process_age_gender.py b/modeling_activity_pace/src/process_raw_data/process_age_gender.py deleted file mode 100644 index a6fcd35..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_age_gender.py +++ /dev/null @@ -1,142 +0,0 @@ -import pandas as pd - -from src.helpers import rename_columns - - -ENCODE_AGE = { - "Entre 45 et 54 ans": 4, - "Entre 18 et 24 ans": 1, - "Moins de 18 ans": 0, - "Entre 25 et 34 ans": 2, - "Entre 55 et 64 ans": 5, - "Entre 35 et 44 ans": 3, - "+ de 65 ans": 6, -} - -ENCODE_GENDER = { - "Un homme": 0, - "Une femme": 1, - "Je ne souhaite pas répondre": 2, - "Je préfère me décrire moi-même": 2, - "Non-binaire/Transgenre": 2, -} - -COL_NAMES_AGE_GENDER = [ - "E_birth_year", - "E_age_range", - "E_gender", -] - -NEW_COL_NAMES_AGE_GENDER = [ - "annee_naissance", - "age_group", - "gender", -] - - -def convert_age_to_int(df): - """ - Convert the 'annee_naissance' column in a DataFrame to integer values. - - Args: - df (pd.DataFrame): The DataFrame in which 'annee_naissance' values should be converted. - - Returns: - pd.DataFrame: The DataFrame with 'annee_naissance' values converted to integers. - """ - df["annee_naissance"] = df["annee_naissance"].apply( - lambda x: int(x) if not pd.isna(x) else -1 - ) - return df - - -def encode_age_category(df): - """ - Encode the 'age_group' column in a DataFrame based on predefined age categories. - - Args: - df (pd.DataFrame): The DataFrame in which 'age_group' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values encoded. - """ - df["age_group"] = df["age_group"].apply( - lambda x: ENCODE_AGE[x] if isinstance(x, str) else -1 - ) - return df - - -def encode_gender(df): - """ - Encode the 'genre' column in a DataFrame based on predefined gender categories. - - Args: - df (pd.DataFrame): The DataFrame in which 'genre' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'genre' values encoded. - """ - df["gender"] = df["gender"].apply( - lambda x: ENCODE_GENDER[x] if isinstance(x, str) else -1 - ) - return df - - -def assign_age_category(df): - """ - Assign age categories based on the 'annee_naissance' column in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame in which age categories should be assigned. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values assigned based on age. - """ - df["age_group"] = df["annee_naissance"].apply( - lambda x: assign_age_category_helper(2023 - x) - ) - return df - - -def assign_age_category_helper(age): - """ - Helper function to assign age categories based on age. - - Args: - age (int): The age value. - - Returns: - int: The assigned age category. - """ - if age < 18: - return 0 - elif age <= 24: - return 1 - elif age <= 34: - return 2 - elif age <= 44: - return 3 - elif age <= 54: - return 4 - elif age <= 64: - return 5 - else: - return 6 - - -def process_age_gender(df): - """ - Process a DataFrame by renaming columns and encoding age and gender categories. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - - Returns: - pd.DataFrame: The processed DataFrame. 
- """ - df = rename_columns(df, COL_NAMES_AGE_GENDER, NEW_COL_NAMES_AGE_GENDER) - df = convert_age_to_int(df) - df = encode_age_category(df) - df = encode_gender(df) - df = assign_age_category(df) - return df diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 6c5d8c5..550e0de 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -22,7 +22,6 @@ def __init__(self, path, usr_drop_rate=0) -> None: """ self.path = path self.usr_drop_rate = usr_drop_rate - self.liked_df = load_data("data/raw/other/user_favorites.csv").reset_index() def import_data(self): """ @@ -35,7 +34,6 @@ def import_data(self): ) ] self.df = pd.concat(df_list) - # self.df = self.df[self.df["user_id"].isin(self.df.user_id.tolist()[:500] + [3356219324])] def convert_timestamps(self): """ @@ -45,19 +43,21 @@ def convert_timestamps(self): dt.datetime.fromtimestamp ) - def filter(self, min_date, max_date): - """ - Filter the DataFrame based on date and listening time. - - Args: - min_date (datetime.date): Minimum date. - max_date (datetime.date): Maximum date. - """ - self.df = self.df[ - (self.df["ts_listen"].dt.date >= min_date) - & (self.df["ts_listen"].dt.date <= max_date) - & (self.df["listening_time"] >= 30) - ] + # def filter(self, min_date, max_date): + # """ + # Filter the DataFrame based on date and listening time. + + # Args: + # min_date (datetime.date): Minimum date. + # max_date (datetime.date): Maximum date. + # """ + # print(f"BEFORE FILTER = {len(self.df)}") + # self.df = self.df[ + # (self.df["ts_listen"].dt.date >= min_date) + # & (self.df["ts_listen"].dt.date <= max_date) + # & (self.df["listening_time"] >= 30) + # ] + # print(f"AFTER FILTER = {len(self.df)}") def build_ids_list(self): """ @@ -99,58 +99,58 @@ def filter_users(self): self.ids = self.ids[:keep_n] self.df = self.df[self.df["user_id"].isin(self.ids)] - def compute_is_organic(self): - """ - Compute an 'is_organic' column based on the 'context_4' column. - """ - self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) - - def identify_context(self, x): - """ - Identify context types based on the 'context_type' column. - - Args: - x (str): The context string. - - Returns: - str: The identified context type. - """ - context_keywords = ["album", "albums", "playlist", "playlists"] - if any(keyword in x for keyword in context_keywords): - return "album" if "album" in x or "albums" in x else "playlist" - return "other" - - def convert_context(self): - """ - Convert the 'context_type' column to 'context_identified'. - """ - self.df["context_identified"] = self.df["context_type"].progress_apply( - self.identify_context - ) - - def is_fav(self, row, dict_liked): - """ - Check if a row is marked as a favorite. - - Args: - row (pd.Series): The row to check. - dict_liked (dict): Dictionary of favorite items. - - Returns: - int: 1 if it's a favorite, 0 otherwise. - """ - song_id = row.media_id - if "song" in dict_liked and song_id in dict_liked["song"]: - return 1 - context_identified = row.context_identified - context_id = row.context_id - if ( - context_identified in ["playlist", "album"] - and context_identified in dict_liked - and context_id in dict_liked[context_identified] - ): - return 1 - return 0 + # def compute_is_organic(self): + # """ + # Compute an 'is_organic' column based on the 'context_4' column. 
+ # """ + # self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) + + # def identify_context(self, x): + # """ + # Identify context types based on the 'context_type' column. + + # Args: + # x (str): The context string. + + # Returns: + # str: The identified context type. + # """ + # context_keywords = ["album", "albums", "playlist", "playlists"] + # if any(keyword in x for keyword in context_keywords): + # return "album" if "album" in x or "albums" in x else "playlist" + # return "other" + + # def convert_context(self): + # """ + # Convert the 'context_type' column to 'context_identified'. + # """ + # self.df["context_identified"] = self.df["context_type"].progress_apply( + # self.identify_context + # ) + + # def is_fav(self, row, dict_liked): + # """ + # Check if a row is marked as a favorite. + + # Args: + # row (pd.Series): The row to check. + # dict_liked (dict): Dictionary of favorite items. + + # Returns: + # int: 1 if it's a favorite, 0 otherwise. + # """ + # song_id = row.media_id + # if "song" in dict_liked and song_id in dict_liked["song"]: + # return 1 + # context_identified = row.context_identified + # context_id = row.context_id + # if ( + # context_identified in ["playlist", "album"] + # and context_identified in dict_liked + # and context_id in dict_liked[context_identified] + # ): + # return 1 + # return 0 def process(self, min_date, max_date, n_subdivisions, instant_zero): """ @@ -164,13 +164,13 @@ def process(self, min_date, max_date, n_subdivisions, instant_zero): """ self.import_data() self.convert_timestamps() - self.filter(min_date, max_date) + # self.filter(min_date, max_date) self.build_ids_list() self.filter_users() self.add_time_range(n_subdivisions, instant_zero) self.add_date() - self.compute_is_organic() - self.convert_context() + # self.compute_is_organic() + # self.convert_context() self.all_time_date_couples = set( [tuple(i) for i in self.df[["time_range", "date"]].to_numpy()] ) @@ -323,19 +323,19 @@ def add_liked(self, df_user, k, id, channel_index): id (int): User ID. channel_index (int): Index of the channel. 
""" - liked_df_user = self.liked_df[self.liked_df["user_id"] == id] - dict_liked = dict( - liked_df_user[ - liked_df_user["item_type"].isin(["album", "playlist", "song"]) - ][["item_id", "item_type"]] - .groupby("item_type") - .agg(list) - .reset_index() - .values - ) - df_user["is_fav"] = df_user.apply( - lambda row: self.is_fav(row, dict_liked), axis=1 - ) + # liked_df_user = self.liked_df[self.liked_df["user_id"] == id] + # dict_liked = dict( + # liked_df_user[ + # liked_df_user["item_type"].isin(["album", "playlist", "song"]) + # ][["item_id", "item_type"]] + # .groupby("item_type") + # .agg(list) + # .reset_index() + # .values + # ) + # df_user["is_fav"] = df_user.apply( + # lambda row: self.is_fav(row, dict_liked), axis=1 + # ) df_user = df_user[df_user["is_fav"] == 1] full_df_liked = self.compute_ratio_df(df_user) diff --git a/modeling_activity_pace/src/settings.py b/modeling_activity_pace/src/settings.py index e625c63..9ee32d5 100644 --- a/modeling_activity_pace/src/settings.py +++ b/modeling_activity_pace/src/settings.py @@ -8,8 +8,9 @@ MAX_DATE = dt.date(2023, 5, 19) INSTANT_ZERO = dt.datetime(2022, 12, 26) -ANSWERS_PATH = "data/processed/answers/processed_records.csv" +ANSWERS_PATH = "data/answers/processed_records.csv" DICT_ITER_PATH = "results/dict_learning/all_iterations/" +TIME_SERIES_PATH = "data/timeseries" ANSWERS_ACTIVITY_COLUMNS = [ "activity_wake_up", @@ -46,4 +47,4 @@ HOURS = [f"{i}h" for i in range(24)] time_labels_full = [f"{day},{hour}" for day in DAYS for hour in HOURS] -time_labels = time_labels_full[1:-1] \ No newline at end of file +time_labels = time_labels_full[1:-1] From 602f77bf0613666be253e04f2d67ce9cca4b9be9 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 16 Jun 2025 10:27:34 +0200 Subject: [PATCH 2/8] Reactive all steps --- modeling_activity_pace/run_all.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modeling_activity_pace/run_all.sh b/modeling_activity_pace/run_all.sh index 965ea92..3e1bc46 100644 --- a/modeling_activity_pace/run_all.sh +++ b/modeling_activity_pace/run_all.sh @@ -8,28 +8,28 @@ poetry run python modeling_activity_pace/compute_time_series.py echo "****************************************" echo "2. Dictionary Learning algorithm is run " echo "****************************************" -#poetry run python modeling_activity_pace/compute_dictionary.py +poetry run python modeling_activity_pace/compute_dictionary.py echo "**********************************************************" echo "3. Selection of the best iteration in dictionary learning " echo "**********************************************************" -#poetry run python modeling_activity_pace/choose_dictionary.py +poetry run python modeling_activity_pace/choose_dictionary.py echo "" echo "***********************************************************" echo "4. Computes baselines scores and scores of PACE embeddings " echo "***********************************************************" -#poetry run python modeling_activity_pace/compute_baselines.py +poetry run python modeling_activity_pace/compute_baselines.py echo "" echo "******************************************" echo "5. Plots logistic regression coefficients " echo " and related statistical reports " echo "******************************************" -#poetry run python modeling_activity_pace/analyse_models.py +poetry run python modeling_activity_pace/analyse_models.py echo -e "" echo "******************************" echo "6. 
Saves the plot of Figure 1 " echo "******************************" -#poetry run python modeling_activity_pace/make_fig1.py +poetry run python modeling_activity_pace/make_fig1.py From 000e44a20f3115e0f8a8a4c2e1f4624f294aac1b Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 17 Jun 2025 09:31:45 +0200 Subject: [PATCH 3/8] Restore missing file --- modeling_activity_pace/src/process_answers.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 modeling_activity_pace/src/process_answers.py diff --git a/modeling_activity_pace/src/process_answers.py b/modeling_activity_pace/src/process_answers.py new file mode 100644 index 0000000..713de2e --- /dev/null +++ b/modeling_activity_pace/src/process_answers.py @@ -0,0 +1,41 @@ +from src.helpers import load_data + +class ProcessAnswers: + def __init__(self, path) -> None: + """ + Initialize the ProcessAnswers class. + + Args: + path (str): The path to the CSV file containing answers data. + """ + self.path = path + + def import_data(self): + """ + Import data from the specified CSV file. + """ + self.df = load_data(self.path).reset_index() + + def filter(self, ids, cols): + """ + Filter the DataFrame based on user IDs and selected columns. + + Args: + ids (list): List of user IDs to include in the filtered data. + cols (list): List of column names to include in the filtered data. + """ + sorting_dict = dict(list(zip(ids, range(len(ids))))) + self.df = self.df[self.df["user_id"].isin(ids)] + self.df = self.df[["user_id"] + cols] + self.df = self.df.sort_values(by="user_id", key=lambda x: x.map(sorting_dict)) + + def process(self, ids, cols): + """ + Process the answers data by importing and filtering. + + Args: + ids (list): List of user IDs to filter the data. + cols (list): List of column names to filter the data. 
+ """ + self.import_data() + self.filter(ids, cols) From 74de55051dfc98480664c2541430ea9a4a609779 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 23 Jun 2025 09:05:30 +0200 Subject: [PATCH 4/8] Ignore 5 first iterations --- .../src/dict_learning/choose_best_iteration.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py index 4ad283d..6f66947 100644 --- a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py +++ b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py @@ -109,7 +109,8 @@ def choose_best_dict(X, y, cols_to_predict): ## Uncomment here to manually select the best iteration # best_iter = int(input("Choose the best iteration : ")) - best_iter = class_scores.index(max(class_scores)) + best_iter_offset = 5 + best_iter = best_iter_offset + class_scores[best_iter_offset:].index(max(class_scores[best_iter_offset:])) print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}") shutil.copy( From e53f12b6ade05c4f894a0daec63a364cc1ce553f Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:29:35 +0200 Subject: [PATCH 5/8] Ignore bak files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7a2a2b5..a3f579f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data/* results/* +*.bak From 9b9713952b6009287852bdcf906548203d6c1690 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:34:23 +0200 Subject: [PATCH 6/8] Remove commented code --- modeling_activity_pace/make_fig1.py | 3 +- .../src/baselines/baselines_functions.py | 6 +- .../src/process_raw_data/answers_helpers.py | 3 +- .../src/process_raw_data/streams_processor.py | 82 ------------------- 4 files changed, 7 insertions(+), 87 deletions(-) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 95078aa..2dfb9f3 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -10,6 +10,7 @@ from src.settings import ( N_SUBDIVISION_1HOUR, INSTANT_ZERO, + TIME_SERIES_PATH, time_labels_full, time_labels, ) @@ -31,7 +32,7 @@ def add_time_range(df, n_subdivisions, instant_zero): # Import data -df_processed = load_data("data/timeseries/X_volume.csv") +df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") streams_df_list = [] diff --git a/modeling_activity_pace/src/baselines/baselines_functions.py b/modeling_activity_pace/src/baselines/baselines_functions.py index 5497684..00b2258 100644 --- a/modeling_activity_pace/src/baselines/baselines_functions.py +++ b/modeling_activity_pace/src/baselines/baselines_functions.py @@ -3,7 +3,7 @@ from tqdm import tqdm from src.process_answers import ProcessAnswers -from src.settings import channel_names, ANSWERS_PATH +from src.settings import TIME_SERIES_PATH, channel_names, ANSWERS_PATH from src.helpers import load_data, get_ids_from_signals from src.process_raw_data.answers_helpers import process_data_for_classifier from src.modeling_functions import ( @@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict): Returns: - List of AUC scores. 
""" - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] # Extract user IDs ids = get_ids_from_signals(X_list) @@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict): """ # Load volume data - df_volume = load_data("data/timeseries/X_volume.csv") + df_volume = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") # Sum the volume across time (transpose and sum) df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"]) diff --git a/modeling_activity_pace/src/process_raw_data/answers_helpers.py b/modeling_activity_pace/src/process_raw_data/answers_helpers.py index 57fb5a6..fc22ebb 100644 --- a/modeling_activity_pace/src/process_raw_data/answers_helpers.py +++ b/modeling_activity_pace/src/process_raw_data/answers_helpers.py @@ -6,6 +6,7 @@ from src.helpers import load_data, get_ids_from_signals from src.modeling_functions import convolve_signals, normalize_signals, split from src.settings import ( + TIME_SERIES_PATH, channel_names, n_channels, ANSWERS_PATH, @@ -35,7 +36,7 @@ def process_data_for_classifier( - y_train_: Training labels. - y_test_: Test labels. """ - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] D = np.load("results/dict_learning/D.npy")[:, :, :166] # Extract user IDs diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 550e0de..6f75fab 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -43,22 +43,6 @@ def convert_timestamps(self): dt.datetime.fromtimestamp ) - # def filter(self, min_date, max_date): - # """ - # Filter the DataFrame based on date and listening time. - - # Args: - # min_date (datetime.date): Minimum date. - # max_date (datetime.date): Maximum date. - # """ - # print(f"BEFORE FILTER = {len(self.df)}") - # self.df = self.df[ - # (self.df["ts_listen"].dt.date >= min_date) - # & (self.df["ts_listen"].dt.date <= max_date) - # & (self.df["listening_time"] >= 30) - # ] - # print(f"AFTER FILTER = {len(self.df)}") - def build_ids_list(self): """ Build a list of user IDs sorted by the number of occurrences. @@ -99,59 +83,6 @@ def filter_users(self): self.ids = self.ids[:keep_n] self.df = self.df[self.df["user_id"].isin(self.ids)] - # def compute_is_organic(self): - # """ - # Compute an 'is_organic' column based on the 'context_4' column. - # """ - # self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) - - # def identify_context(self, x): - # """ - # Identify context types based on the 'context_type' column. - - # Args: - # x (str): The context string. - - # Returns: - # str: The identified context type. - # """ - # context_keywords = ["album", "albums", "playlist", "playlists"] - # if any(keyword in x for keyword in context_keywords): - # return "album" if "album" in x or "albums" in x else "playlist" - # return "other" - - # def convert_context(self): - # """ - # Convert the 'context_type' column to 'context_identified'. - # """ - # self.df["context_identified"] = self.df["context_type"].progress_apply( - # self.identify_context - # ) - - # def is_fav(self, row, dict_liked): - # """ - # Check if a row is marked as a favorite. - - # Args: - # row (pd.Series): The row to check. - # dict_liked (dict): Dictionary of favorite items. 
- - # Returns: - # int: 1 if it's a favorite, 0 otherwise. - # """ - # song_id = row.media_id - # if "song" in dict_liked and song_id in dict_liked["song"]: - # return 1 - # context_identified = row.context_identified - # context_id = row.context_id - # if ( - # context_identified in ["playlist", "album"] - # and context_identified in dict_liked - # and context_id in dict_liked[context_identified] - # ): - # return 1 - # return 0 - def process(self, min_date, max_date, n_subdivisions, instant_zero): """ Process the data by importing, filtering, and adding columns. @@ -323,19 +254,6 @@ def add_liked(self, df_user, k, id, channel_index): id (int): User ID. channel_index (int): Index of the channel. """ - # liked_df_user = self.liked_df[self.liked_df["user_id"] == id] - # dict_liked = dict( - # liked_df_user[ - # liked_df_user["item_type"].isin(["album", "playlist", "song"]) - # ][["item_id", "item_type"]] - # .groupby("item_type") - # .agg(list) - # .reset_index() - # .values - # ) - # df_user["is_fav"] = df_user.apply( - # lambda row: self.is_fav(row, dict_liked), axis=1 - # ) df_user = df_user[df_user["is_fav"] == 1] full_df_liked = self.compute_ratio_df(df_user) From 66d2bf4b25fa517cd28c1d352cf99fb1b2352c74 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:35:45 +0200 Subject: [PATCH 7/8] Remove additional old code --- .../src/process_raw_data/streams_processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 6f75fab..4ed922b 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -95,13 +95,10 @@ def process(self, min_date, max_date, n_subdivisions, instant_zero): """ self.import_data() self.convert_timestamps() - # self.filter(min_date, max_date) self.build_ids_list() self.filter_users() self.add_time_range(n_subdivisions, instant_zero) self.add_date() - # self.compute_is_organic() - # self.convert_context() self.all_time_date_couples = set( [tuple(i) for i in self.df[["time_range", "date"]].to_numpy()] ) From 825d26808a518cbad61b0e1f05ac2618ad067414 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 12:17:53 +0200 Subject: [PATCH 8/8] Select 1st user for fig1 --- modeling_activity_pace/make_fig1.py | 2 +- modeling_activity_pace/src/dict_learning/dictionary_helpers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 2dfb9f3..1b85af3 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -19,7 +19,6 @@ tqdm.pandas() build_result_folder() -SELECTED_USER_ID = 3356219324 def add_time_range(df, n_subdivisions, instant_zero): @@ -34,6 +33,7 @@ def add_time_range(df, n_subdivisions, instant_zero): df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") +SELECTED_USER_ID = df_processed.index[0] streams_df_list = [] for i in tqdm(os.listdir("data/streams/")): diff --git a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py index d810540..b354487 100644 --- a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py +++ b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py @@ -5,6 +5,7 @@ from src.process_answers import ProcessAnswers from src.settings import ( + TIME_SERIES_PATH, n_channels, 
channel_names, ANSWERS_ACTIVITY_COLUMNS, @@ -26,7 +27,7 @@ def process_data_for_DL(): Returns: tuple: A tuple containing input data (X_list_array_clean) and target labels (y). """ - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] ids = get_ids_from_signals(X_list) cols_to_predict = ANSWERS_ACTIVITY_COLUMNS