From fa199491dcdace0c663e60f138f9737ce739699d Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 16 Jun 2025 10:20:41 +0200 Subject: [PATCH 1/8] WIP use new dataset: compute timeseries from pre-processed streams Add make all --- Makefile | 3 + README.md | 28 ++- ..._raw_streams.py => compute_time_series.py} | 10 +- modeling_activity_pace/make_fig1.py | 6 +- modeling_activity_pace/process_raw_answers.py | 21 --- modeling_activity_pace/run_all.sh | 35 ++++ .../src/baselines/baselines_functions.py | 5 +- .../dict_learning/choose_best_iteration.py | 8 +- .../src/dict_learning/dictionary_helpers.py | 2 +- modeling_activity_pace/src/process_answers.py | 65 ------- .../src/process_raw_data/answers_helpers.py | 2 +- .../process_activities_deezer.py | 34 ---- .../process_raw_data/process_age_gender.py | 142 --------------- .../src/process_raw_data/streams_processor.py | 166 +++++++++--------- modeling_activity_pace/src/settings.py | 5 +- 15 files changed, 157 insertions(+), 375 deletions(-) rename modeling_activity_pace/{process_raw_streams.py => compute_time_series.py} (75%) delete mode 100644 modeling_activity_pace/process_raw_answers.py create mode 100644 modeling_activity_pace/run_all.sh delete mode 100644 modeling_activity_pace/src/process_answers.py delete mode 100644 modeling_activity_pace/src/process_raw_data/process_activities_deezer.py delete mode 100644 modeling_activity_pace/src/process_raw_data/process_age_gender.py diff --git a/Makefile b/Makefile index 86cb40f..cf87ce6 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ usage: build: docker build -t $(DOCKER_IMAGE_NAME) . +all: + $(DOCKER_RUN_MOUNT) /bin/bash modeling_activity_pace/run_all.sh || true + run-bash: build $(DOCKER_RUN_MOUNT) /bin/bash || true diff --git a/README.md b/README.md index 6cc6d2b..a13a4fe 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,12 @@ poetry run python modeling_activity_pace/ ## Scripts -1. Raw user histories are trasformed into time series using `process_raw_streams.py`. -2. User answers to survey are prepared using `process_raw_answers.py`. -3. Dictionary Learning algorithm is run using `compute_dictionary.py`. -4. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. -5. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. -6. `analyse_models.py` plots logistic regression coefficients and related statistical reports. -7. `make_fig1.py` saves the plot of Figure 1. +1. User logs are transformed into time series using `compute_time_series.py`. +2. Dictionary Learning algorithm is run using `compute_dictionary.py`. +3. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. +4. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. +5. `analyse_models.py` plots logistic regression coefficients and related statistical reports. +6. `make_fig1.py` saves the plot of Figure 1. ## Data @@ -54,15 +53,12 @@ Input data folder must be organized as follows : pace/ │ └── data/ - └── raw/ - ├── streams/ - │ ├── one_year_all_respondents000000000000.csv - │ ├── ... - │ └── one_year_all_respondents0000000000399.csv - ├── other/ - │ └── user_favorites.csv - └── answers/ - └── records.csv + └── answers/ + └── processed_records.csv + ├── streams/ + │ ├── processed_streams_000000000000.csv + │ ├── ... 
+ │ └── processed_streams_000000000091.csv ``` Where ```one_year_all_respondents.csv``` files are stream history csv files with columns : ```user_id, ts_listen, media_id, context_id, context_type, listening_time, context_4```. diff --git a/modeling_activity_pace/process_raw_streams.py b/modeling_activity_pace/compute_time_series.py similarity index 75% rename from modeling_activity_pace/process_raw_streams.py rename to modeling_activity_pace/compute_time_series.py index 986ef47..c94048d 100644 --- a/modeling_activity_pace/process_raw_streams.py +++ b/modeling_activity_pace/compute_time_series.py @@ -1,8 +1,10 @@ import pandas as pd +import os from src.process_raw_data.streams_processor import ProcessStreams from src.helpers import save_data from src.settings import ( + TIME_SERIES_PATH, channel_names, time_labels_full, MIN_DATE, @@ -11,10 +13,12 @@ INSTANT_ZERO, ) - if __name__ == "__main__": + # Create destination directory if it doesn't exist + os.makedirs(TIME_SERIES_PATH, exist_ok=True) + # Initialize a ProcessStreams instance - stream_processor = ProcessStreams("data/raw/streams/", usr_drop_rate=0) + stream_processor = ProcessStreams("data/streams/", usr_drop_rate=0) # Process the streams data stream_processor.process(MIN_DATE, MAX_DATE, N_SUBDIVISION_1HOUR, INSTANT_ZERO) @@ -32,4 +36,4 @@ ) # Save the DataFrame to a CSV file - save_data(channel_data, f"data/processed/streams/X_{channel_name}.csv", index=True) + save_data(channel_data, f"{TIME_SERIES_PATH}/X_{channel_name}.csv", index=True) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 06d7a69..95078aa 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -31,12 +31,12 @@ def add_time_range(df, n_subdivisions, instant_zero): # Import data -df_processed = load_data("data/processed/streams/X_volume.csv") +df_processed = load_data("data/timeseries/X_volume.csv") streams_df_list = [] -for i in tqdm(os.listdir("data/raw/streams/")): - df_streams = load_data("data/raw/streams/" + i).reset_index() +for i in tqdm(os.listdir("data/streams/")): + df_streams = load_data("data/streams/" + i).reset_index() streams_df_list.append(df_streams[df_streams["user_id"] == SELECTED_USER_ID]) df = pd.concat(streams_df_list) diff --git a/modeling_activity_pace/process_raw_answers.py b/modeling_activity_pace/process_raw_answers.py deleted file mode 100644 index 53a7a10..0000000 --- a/modeling_activity_pace/process_raw_answers.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.process_raw_data.process_age_gender import NEW_COL_NAMES_AGE_GENDER -from src.process_answers import process_answer_data_pipeline -from src.helpers import load_data, save_data -from src.settings import ANSWERS_ACTIVITY_COLUMNS - - -def main(): - """ - Main function to process data and save the processed DataFrame to a CSV file. - """ - FILE_PATH = "data/raw/answers/records.csv" - COLS_TO_KEEP = ["user_id"] + ANSWERS_ACTIVITY_COLUMNS + NEW_COL_NAMES_AGE_GENDER - COLS_TO_KEEP.remove("annee_naissance") - - df = load_data(FILE_PATH) - df = process_answer_data_pipeline(df, COLS_TO_KEEP) - save_data(df, "data/processed/answers/processed_records.csv") - - -if __name__ == "__main__": - main() diff --git a/modeling_activity_pace/run_all.sh b/modeling_activity_pace/run_all.sh new file mode 100644 index 0000000..965ea92 --- /dev/null +++ b/modeling_activity_pace/run_all.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "**********************************************" +echo "1. 
User logs are transformed into time series " +echo "**********************************************" +poetry run python modeling_activity_pace/compute_time_series.py + +echo "****************************************" +echo "2. Dictionary Learning algorithm is run " +echo "****************************************" +#poetry run python modeling_activity_pace/compute_dictionary.py + +echo "**********************************************************" +echo "3. Selection of the best iteration in dictionary learning " +echo "**********************************************************" +#poetry run python modeling_activity_pace/choose_dictionary.py + +echo "" +echo "***********************************************************" +echo "4. Computes baselines scores and scores of PACE embeddings " +echo "***********************************************************" +#poetry run python modeling_activity_pace/compute_baselines.py + +echo "" +echo "******************************************" +echo "5. Plots logistic regression coefficients " +echo " and related statistical reports " +echo "******************************************" +#poetry run python modeling_activity_pace/analyse_models.py + +echo -e "" +echo "******************************" +echo "6. Saves the plot of Figure 1 " +echo "******************************" +#poetry run python modeling_activity_pace/make_fig1.py diff --git a/modeling_activity_pace/src/baselines/baselines_functions.py b/modeling_activity_pace/src/baselines/baselines_functions.py index 54bc036..5497684 100644 --- a/modeling_activity_pace/src/baselines/baselines_functions.py +++ b/modeling_activity_pace/src/baselines/baselines_functions.py @@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict): Returns: - List of AUC scores. 
""" - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] # Extract user IDs ids = get_ids_from_signals(X_list) @@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict): """ # Load volume data - df_volume = load_data("data/processed/streams/X_volume.csv") + df_volume = load_data("data/timeseries/X_volume.csv") # Sum the volume across time (transpose and sum) df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"]) @@ -155,4 +155,3 @@ def compute_baseline_scores( scores = perform_grid_search(X_train, X_test, y_train_, y_test_, cols_to_predict) return scores - diff --git a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py index 567261f..4ad283d 100644 --- a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py +++ b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py @@ -105,7 +105,13 @@ def choose_best_dict(X, y, cols_to_predict): index=True, ) plot_reconstruction_iterations_perfs(df_reconstruction_scores, "results/figures/reconstruction_scores_over_iterations.pdf") - best_iter = int(input("Choose the best iteration : ")) + + ## Uncomment here to manually select the best iteration + # best_iter = int(input("Choose the best iteration : ")) + + best_iter = class_scores.index(max(class_scores)) + print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}") + shutil.copy( DICT_ITER_PATH + [f"D_{i}.npy" for i in range(len(os.listdir(DICT_ITER_PATH)))][best_iter], diff --git a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py index 3e57f72..d810540 100644 --- a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py +++ b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py @@ -26,7 +26,7 @@ def process_data_for_DL(): Returns: tuple: A tuple containing input data (X_list_array_clean) and target labels (y). """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] ids = get_ids_from_signals(X_list) cols_to_predict = ANSWERS_ACTIVITY_COLUMNS diff --git a/modeling_activity_pace/src/process_answers.py b/modeling_activity_pace/src/process_answers.py deleted file mode 100644 index 78337f8..0000000 --- a/modeling_activity_pace/src/process_answers.py +++ /dev/null @@ -1,65 +0,0 @@ -from src.helpers import load_data -from src.process_raw_data.filter import process_filter -from src.process_raw_data.process_activities_deezer import ( - process_activities_deezer_feature, -) -from src.process_raw_data.process_age_gender import process_age_gender -from src.helpers import load_data - - -class ProcessAnswers: - def __init__(self, path) -> None: - """ - Initialize the ProcessAnswers class. - - Args: - path (str): The path to the CSV file containing answers data. - """ - self.path = path - - def import_data(self): - """ - Import data from the specified CSV file. - """ - self.df = load_data(self.path).reset_index() - - def filter(self, ids, cols): - """ - Filter the DataFrame based on user IDs and selected columns. - - Args: - ids (list): List of user IDs to include in the filtered data. - cols (list): List of column names to include in the filtered data. 
- """ - sorting_dict = dict(list(zip(ids, range(len(ids))))) - self.df = self.df[self.df["user_id"].isin(ids)] - self.df = self.df[["user_id"] + cols] - self.df = self.df.sort_values(by="user_id", key=lambda x: x.map(sorting_dict)) - - def process(self, ids, cols): - """ - Process the answers data by importing and filtering. - - Args: - ids (list): List of user IDs to filter the data. - cols (list): List of column names to filter the data. - """ - self.import_data() - self.filter(ids, cols) - - -def process_answer_data_pipeline(df, columns): - """ - Process answers DataFrame by applying filtering and feature processing. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - columns (list): List of columns to keep in the processed DataFrame. - - Returns: - pd.DataFrame: The processed DataFrame with selected columns. - """ - df = process_filter(df) - df = process_activities_deezer_feature(df) - df = process_age_gender(df) - return df[columns] diff --git a/modeling_activity_pace/src/process_raw_data/answers_helpers.py b/modeling_activity_pace/src/process_raw_data/answers_helpers.py index 5a9c226..57fb5a6 100644 --- a/modeling_activity_pace/src/process_raw_data/answers_helpers.py +++ b/modeling_activity_pace/src/process_raw_data/answers_helpers.py @@ -35,7 +35,7 @@ def process_data_for_classifier( - y_train_: Training labels. - y_test_: Test labels. """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] D = np.load("results/dict_learning/D.npy")[:, :, :166] # Extract user IDs diff --git a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py b/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py deleted file mode 100644 index 563b516..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py +++ /dev/null @@ -1,34 +0,0 @@ -from src.helpers import rename_columns -from src.settings import ANSWERS_ACTIVITY_COLUMNS - -COL_NAMES_CONTEXT = [f"B_contexts_deezer_{i}" for i in [1, 4, 5, 2, 12, 10]] - - -def encode_activities_columns(df, new_col_names): - """ - Encode context-related columns in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame to encode context columns in. - new_col_names (list): List of new context-related column names. - - Returns: - pd.DataFrame: The DataFrame with context columns encoded. - """ - df[new_col_names] = df[new_col_names].map(lambda x: 1 if isinstance(x, str) else 0) - return df - - -def process_activities_deezer_feature(df): - """ - Process context-related features in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame containing context-related features. - - Returns: - pd.DataFrame: The processed DataFrame. 
- """ - df = rename_columns(df, COL_NAMES_CONTEXT, ANSWERS_ACTIVITY_COLUMNS) - df = encode_activities_columns(df, ANSWERS_ACTIVITY_COLUMNS) - return df diff --git a/modeling_activity_pace/src/process_raw_data/process_age_gender.py b/modeling_activity_pace/src/process_raw_data/process_age_gender.py deleted file mode 100644 index a6fcd35..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_age_gender.py +++ /dev/null @@ -1,142 +0,0 @@ -import pandas as pd - -from src.helpers import rename_columns - - -ENCODE_AGE = { - "Entre 45 et 54 ans": 4, - "Entre 18 et 24 ans": 1, - "Moins de 18 ans": 0, - "Entre 25 et 34 ans": 2, - "Entre 55 et 64 ans": 5, - "Entre 35 et 44 ans": 3, - "+ de 65 ans": 6, -} - -ENCODE_GENDER = { - "Un homme": 0, - "Une femme": 1, - "Je ne souhaite pas répondre": 2, - "Je préfère me décrire moi-même": 2, - "Non-binaire/Transgenre": 2, -} - -COL_NAMES_AGE_GENDER = [ - "E_birth_year", - "E_age_range", - "E_gender", -] - -NEW_COL_NAMES_AGE_GENDER = [ - "annee_naissance", - "age_group", - "gender", -] - - -def convert_age_to_int(df): - """ - Convert the 'annee_naissance' column in a DataFrame to integer values. - - Args: - df (pd.DataFrame): The DataFrame in which 'annee_naissance' values should be converted. - - Returns: - pd.DataFrame: The DataFrame with 'annee_naissance' values converted to integers. - """ - df["annee_naissance"] = df["annee_naissance"].apply( - lambda x: int(x) if not pd.isna(x) else -1 - ) - return df - - -def encode_age_category(df): - """ - Encode the 'age_group' column in a DataFrame based on predefined age categories. - - Args: - df (pd.DataFrame): The DataFrame in which 'age_group' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values encoded. - """ - df["age_group"] = df["age_group"].apply( - lambda x: ENCODE_AGE[x] if isinstance(x, str) else -1 - ) - return df - - -def encode_gender(df): - """ - Encode the 'genre' column in a DataFrame based on predefined gender categories. - - Args: - df (pd.DataFrame): The DataFrame in which 'genre' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'genre' values encoded. - """ - df["gender"] = df["gender"].apply( - lambda x: ENCODE_GENDER[x] if isinstance(x, str) else -1 - ) - return df - - -def assign_age_category(df): - """ - Assign age categories based on the 'annee_naissance' column in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame in which age categories should be assigned. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values assigned based on age. - """ - df["age_group"] = df["annee_naissance"].apply( - lambda x: assign_age_category_helper(2023 - x) - ) - return df - - -def assign_age_category_helper(age): - """ - Helper function to assign age categories based on age. - - Args: - age (int): The age value. - - Returns: - int: The assigned age category. - """ - if age < 18: - return 0 - elif age <= 24: - return 1 - elif age <= 34: - return 2 - elif age <= 44: - return 3 - elif age <= 54: - return 4 - elif age <= 64: - return 5 - else: - return 6 - - -def process_age_gender(df): - """ - Process a DataFrame by renaming columns and encoding age and gender categories. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - - Returns: - pd.DataFrame: The processed DataFrame. 
- """ - df = rename_columns(df, COL_NAMES_AGE_GENDER, NEW_COL_NAMES_AGE_GENDER) - df = convert_age_to_int(df) - df = encode_age_category(df) - df = encode_gender(df) - df = assign_age_category(df) - return df diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 6c5d8c5..550e0de 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -22,7 +22,6 @@ def __init__(self, path, usr_drop_rate=0) -> None: """ self.path = path self.usr_drop_rate = usr_drop_rate - self.liked_df = load_data("data/raw/other/user_favorites.csv").reset_index() def import_data(self): """ @@ -35,7 +34,6 @@ def import_data(self): ) ] self.df = pd.concat(df_list) - # self.df = self.df[self.df["user_id"].isin(self.df.user_id.tolist()[:500] + [3356219324])] def convert_timestamps(self): """ @@ -45,19 +43,21 @@ def convert_timestamps(self): dt.datetime.fromtimestamp ) - def filter(self, min_date, max_date): - """ - Filter the DataFrame based on date and listening time. - - Args: - min_date (datetime.date): Minimum date. - max_date (datetime.date): Maximum date. - """ - self.df = self.df[ - (self.df["ts_listen"].dt.date >= min_date) - & (self.df["ts_listen"].dt.date <= max_date) - & (self.df["listening_time"] >= 30) - ] + # def filter(self, min_date, max_date): + # """ + # Filter the DataFrame based on date and listening time. + + # Args: + # min_date (datetime.date): Minimum date. + # max_date (datetime.date): Maximum date. + # """ + # print(f"BEFORE FILTER = {len(self.df)}") + # self.df = self.df[ + # (self.df["ts_listen"].dt.date >= min_date) + # & (self.df["ts_listen"].dt.date <= max_date) + # & (self.df["listening_time"] >= 30) + # ] + # print(f"AFTER FILTER = {len(self.df)}") def build_ids_list(self): """ @@ -99,58 +99,58 @@ def filter_users(self): self.ids = self.ids[:keep_n] self.df = self.df[self.df["user_id"].isin(self.ids)] - def compute_is_organic(self): - """ - Compute an 'is_organic' column based on the 'context_4' column. - """ - self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) - - def identify_context(self, x): - """ - Identify context types based on the 'context_type' column. - - Args: - x (str): The context string. - - Returns: - str: The identified context type. - """ - context_keywords = ["album", "albums", "playlist", "playlists"] - if any(keyword in x for keyword in context_keywords): - return "album" if "album" in x or "albums" in x else "playlist" - return "other" - - def convert_context(self): - """ - Convert the 'context_type' column to 'context_identified'. - """ - self.df["context_identified"] = self.df["context_type"].progress_apply( - self.identify_context - ) - - def is_fav(self, row, dict_liked): - """ - Check if a row is marked as a favorite. - - Args: - row (pd.Series): The row to check. - dict_liked (dict): Dictionary of favorite items. - - Returns: - int: 1 if it's a favorite, 0 otherwise. - """ - song_id = row.media_id - if "song" in dict_liked and song_id in dict_liked["song"]: - return 1 - context_identified = row.context_identified - context_id = row.context_id - if ( - context_identified in ["playlist", "album"] - and context_identified in dict_liked - and context_id in dict_liked[context_identified] - ): - return 1 - return 0 + # def compute_is_organic(self): + # """ + # Compute an 'is_organic' column based on the 'context_4' column. 
+ # """ + # self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) + + # def identify_context(self, x): + # """ + # Identify context types based on the 'context_type' column. + + # Args: + # x (str): The context string. + + # Returns: + # str: The identified context type. + # """ + # context_keywords = ["album", "albums", "playlist", "playlists"] + # if any(keyword in x for keyword in context_keywords): + # return "album" if "album" in x or "albums" in x else "playlist" + # return "other" + + # def convert_context(self): + # """ + # Convert the 'context_type' column to 'context_identified'. + # """ + # self.df["context_identified"] = self.df["context_type"].progress_apply( + # self.identify_context + # ) + + # def is_fav(self, row, dict_liked): + # """ + # Check if a row is marked as a favorite. + + # Args: + # row (pd.Series): The row to check. + # dict_liked (dict): Dictionary of favorite items. + + # Returns: + # int: 1 if it's a favorite, 0 otherwise. + # """ + # song_id = row.media_id + # if "song" in dict_liked and song_id in dict_liked["song"]: + # return 1 + # context_identified = row.context_identified + # context_id = row.context_id + # if ( + # context_identified in ["playlist", "album"] + # and context_identified in dict_liked + # and context_id in dict_liked[context_identified] + # ): + # return 1 + # return 0 def process(self, min_date, max_date, n_subdivisions, instant_zero): """ @@ -164,13 +164,13 @@ def process(self, min_date, max_date, n_subdivisions, instant_zero): """ self.import_data() self.convert_timestamps() - self.filter(min_date, max_date) + # self.filter(min_date, max_date) self.build_ids_list() self.filter_users() self.add_time_range(n_subdivisions, instant_zero) self.add_date() - self.compute_is_organic() - self.convert_context() + # self.compute_is_organic() + # self.convert_context() self.all_time_date_couples = set( [tuple(i) for i in self.df[["time_range", "date"]].to_numpy()] ) @@ -323,19 +323,19 @@ def add_liked(self, df_user, k, id, channel_index): id (int): User ID. channel_index (int): Index of the channel. 
""" - liked_df_user = self.liked_df[self.liked_df["user_id"] == id] - dict_liked = dict( - liked_df_user[ - liked_df_user["item_type"].isin(["album", "playlist", "song"]) - ][["item_id", "item_type"]] - .groupby("item_type") - .agg(list) - .reset_index() - .values - ) - df_user["is_fav"] = df_user.apply( - lambda row: self.is_fav(row, dict_liked), axis=1 - ) + # liked_df_user = self.liked_df[self.liked_df["user_id"] == id] + # dict_liked = dict( + # liked_df_user[ + # liked_df_user["item_type"].isin(["album", "playlist", "song"]) + # ][["item_id", "item_type"]] + # .groupby("item_type") + # .agg(list) + # .reset_index() + # .values + # ) + # df_user["is_fav"] = df_user.apply( + # lambda row: self.is_fav(row, dict_liked), axis=1 + # ) df_user = df_user[df_user["is_fav"] == 1] full_df_liked = self.compute_ratio_df(df_user) diff --git a/modeling_activity_pace/src/settings.py b/modeling_activity_pace/src/settings.py index e625c63..9ee32d5 100644 --- a/modeling_activity_pace/src/settings.py +++ b/modeling_activity_pace/src/settings.py @@ -8,8 +8,9 @@ MAX_DATE = dt.date(2023, 5, 19) INSTANT_ZERO = dt.datetime(2022, 12, 26) -ANSWERS_PATH = "data/processed/answers/processed_records.csv" +ANSWERS_PATH = "data/answers/processed_records.csv" DICT_ITER_PATH = "results/dict_learning/all_iterations/" +TIME_SERIES_PATH = "data/timeseries" ANSWERS_ACTIVITY_COLUMNS = [ "activity_wake_up", @@ -46,4 +47,4 @@ HOURS = [f"{i}h" for i in range(24)] time_labels_full = [f"{day},{hour}" for day in DAYS for hour in HOURS] -time_labels = time_labels_full[1:-1] \ No newline at end of file +time_labels = time_labels_full[1:-1] From 602f77bf0613666be253e04f2d67ce9cca4b9be9 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 16 Jun 2025 10:27:34 +0200 Subject: [PATCH 2/8] Reactive all steps --- modeling_activity_pace/run_all.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modeling_activity_pace/run_all.sh b/modeling_activity_pace/run_all.sh index 965ea92..3e1bc46 100644 --- a/modeling_activity_pace/run_all.sh +++ b/modeling_activity_pace/run_all.sh @@ -8,28 +8,28 @@ poetry run python modeling_activity_pace/compute_time_series.py echo "****************************************" echo "2. Dictionary Learning algorithm is run " echo "****************************************" -#poetry run python modeling_activity_pace/compute_dictionary.py +poetry run python modeling_activity_pace/compute_dictionary.py echo "**********************************************************" echo "3. Selection of the best iteration in dictionary learning " echo "**********************************************************" -#poetry run python modeling_activity_pace/choose_dictionary.py +poetry run python modeling_activity_pace/choose_dictionary.py echo "" echo "***********************************************************" echo "4. Computes baselines scores and scores of PACE embeddings " echo "***********************************************************" -#poetry run python modeling_activity_pace/compute_baselines.py +poetry run python modeling_activity_pace/compute_baselines.py echo "" echo "******************************************" echo "5. Plots logistic regression coefficients " echo " and related statistical reports " echo "******************************************" -#poetry run python modeling_activity_pace/analyse_models.py +poetry run python modeling_activity_pace/analyse_models.py echo -e "" echo "******************************" echo "6. 
Saves the plot of Figure 1 " echo "******************************" -#poetry run python modeling_activity_pace/make_fig1.py +poetry run python modeling_activity_pace/make_fig1.py From 000e44a20f3115e0f8a8a4c2e1f4624f294aac1b Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 17 Jun 2025 09:31:45 +0200 Subject: [PATCH 3/8] Restore missing file --- modeling_activity_pace/src/process_answers.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 modeling_activity_pace/src/process_answers.py diff --git a/modeling_activity_pace/src/process_answers.py b/modeling_activity_pace/src/process_answers.py new file mode 100644 index 0000000..713de2e --- /dev/null +++ b/modeling_activity_pace/src/process_answers.py @@ -0,0 +1,41 @@ +from src.helpers import load_data + +class ProcessAnswers: + def __init__(self, path) -> None: + """ + Initialize the ProcessAnswers class. + + Args: + path (str): The path to the CSV file containing answers data. + """ + self.path = path + + def import_data(self): + """ + Import data from the specified CSV file. + """ + self.df = load_data(self.path).reset_index() + + def filter(self, ids, cols): + """ + Filter the DataFrame based on user IDs and selected columns. + + Args: + ids (list): List of user IDs to include in the filtered data. + cols (list): List of column names to include in the filtered data. + """ + sorting_dict = dict(list(zip(ids, range(len(ids))))) + self.df = self.df[self.df["user_id"].isin(ids)] + self.df = self.df[["user_id"] + cols] + self.df = self.df.sort_values(by="user_id", key=lambda x: x.map(sorting_dict)) + + def process(self, ids, cols): + """ + Process the answers data by importing and filtering. + + Args: + ids (list): List of user IDs to filter the data. + cols (list): List of column names to filter the data. 
+ """ + self.import_data() + self.filter(ids, cols) From 74de55051dfc98480664c2541430ea9a4a609779 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Mon, 23 Jun 2025 09:05:30 +0200 Subject: [PATCH 4/8] Ignore 5 first iterations --- .../src/dict_learning/choose_best_iteration.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py index 4ad283d..6f66947 100644 --- a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py +++ b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py @@ -109,7 +109,8 @@ def choose_best_dict(X, y, cols_to_predict): ## Uncomment here to manually select the best iteration # best_iter = int(input("Choose the best iteration : ")) - best_iter = class_scores.index(max(class_scores)) + best_iter_offset = 5 + best_iter = best_iter_offset + class_scores[best_iter_offset:].index(max(class_scores[best_iter_offset:])) print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}") shutil.copy( From e53f12b6ade05c4f894a0daec63a364cc1ce553f Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:29:35 +0200 Subject: [PATCH 5/8] Ignore bak files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7a2a2b5..a3f579f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data/* results/* +*.bak From 9b9713952b6009287852bdcf906548203d6c1690 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:34:23 +0200 Subject: [PATCH 6/8] Remove commented code --- modeling_activity_pace/make_fig1.py | 3 +- .../src/baselines/baselines_functions.py | 6 +- .../src/process_raw_data/answers_helpers.py | 3 +- .../src/process_raw_data/streams_processor.py | 82 ------------------- 4 files changed, 7 insertions(+), 87 deletions(-) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 95078aa..2dfb9f3 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -10,6 +10,7 @@ from src.settings import ( N_SUBDIVISION_1HOUR, INSTANT_ZERO, + TIME_SERIES_PATH, time_labels_full, time_labels, ) @@ -31,7 +32,7 @@ def add_time_range(df, n_subdivisions, instant_zero): # Import data -df_processed = load_data("data/timeseries/X_volume.csv") +df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") streams_df_list = [] diff --git a/modeling_activity_pace/src/baselines/baselines_functions.py b/modeling_activity_pace/src/baselines/baselines_functions.py index 5497684..00b2258 100644 --- a/modeling_activity_pace/src/baselines/baselines_functions.py +++ b/modeling_activity_pace/src/baselines/baselines_functions.py @@ -3,7 +3,7 @@ from tqdm import tqdm from src.process_answers import ProcessAnswers -from src.settings import channel_names, ANSWERS_PATH +from src.settings import TIME_SERIES_PATH, channel_names, ANSWERS_PATH from src.helpers import load_data, get_ids_from_signals from src.process_raw_data.answers_helpers import process_data_for_classifier from src.modeling_functions import ( @@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict): Returns: - List of AUC scores. 
""" - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] # Extract user IDs ids = get_ids_from_signals(X_list) @@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict): """ # Load volume data - df_volume = load_data("data/timeseries/X_volume.csv") + df_volume = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") # Sum the volume across time (transpose and sum) df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"]) diff --git a/modeling_activity_pace/src/process_raw_data/answers_helpers.py b/modeling_activity_pace/src/process_raw_data/answers_helpers.py index 57fb5a6..fc22ebb 100644 --- a/modeling_activity_pace/src/process_raw_data/answers_helpers.py +++ b/modeling_activity_pace/src/process_raw_data/answers_helpers.py @@ -6,6 +6,7 @@ from src.helpers import load_data, get_ids_from_signals from src.modeling_functions import convolve_signals, normalize_signals, split from src.settings import ( + TIME_SERIES_PATH, channel_names, n_channels, ANSWERS_PATH, @@ -35,7 +36,7 @@ def process_data_for_classifier( - y_train_: Training labels. - y_test_: Test labels. """ - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] D = np.load("results/dict_learning/D.npy")[:, :, :166] # Extract user IDs diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 550e0de..6f75fab 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -43,22 +43,6 @@ def convert_timestamps(self): dt.datetime.fromtimestamp ) - # def filter(self, min_date, max_date): - # """ - # Filter the DataFrame based on date and listening time. - - # Args: - # min_date (datetime.date): Minimum date. - # max_date (datetime.date): Maximum date. - # """ - # print(f"BEFORE FILTER = {len(self.df)}") - # self.df = self.df[ - # (self.df["ts_listen"].dt.date >= min_date) - # & (self.df["ts_listen"].dt.date <= max_date) - # & (self.df["listening_time"] >= 30) - # ] - # print(f"AFTER FILTER = {len(self.df)}") - def build_ids_list(self): """ Build a list of user IDs sorted by the number of occurrences. @@ -99,59 +83,6 @@ def filter_users(self): self.ids = self.ids[:keep_n] self.df = self.df[self.df["user_id"].isin(self.ids)] - # def compute_is_organic(self): - # """ - # Compute an 'is_organic' column based on the 'context_4' column. - # """ - # self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) - - # def identify_context(self, x): - # """ - # Identify context types based on the 'context_type' column. - - # Args: - # x (str): The context string. - - # Returns: - # str: The identified context type. - # """ - # context_keywords = ["album", "albums", "playlist", "playlists"] - # if any(keyword in x for keyword in context_keywords): - # return "album" if "album" in x or "albums" in x else "playlist" - # return "other" - - # def convert_context(self): - # """ - # Convert the 'context_type' column to 'context_identified'. - # """ - # self.df["context_identified"] = self.df["context_type"].progress_apply( - # self.identify_context - # ) - - # def is_fav(self, row, dict_liked): - # """ - # Check if a row is marked as a favorite. - - # Args: - # row (pd.Series): The row to check. - # dict_liked (dict): Dictionary of favorite items. 
- - # Returns: - # int: 1 if it's a favorite, 0 otherwise. - # """ - # song_id = row.media_id - # if "song" in dict_liked and song_id in dict_liked["song"]: - # return 1 - # context_identified = row.context_identified - # context_id = row.context_id - # if ( - # context_identified in ["playlist", "album"] - # and context_identified in dict_liked - # and context_id in dict_liked[context_identified] - # ): - # return 1 - # return 0 - def process(self, min_date, max_date, n_subdivisions, instant_zero): """ Process the data by importing, filtering, and adding columns. @@ -323,19 +254,6 @@ def add_liked(self, df_user, k, id, channel_index): id (int): User ID. channel_index (int): Index of the channel. """ - # liked_df_user = self.liked_df[self.liked_df["user_id"] == id] - # dict_liked = dict( - # liked_df_user[ - # liked_df_user["item_type"].isin(["album", "playlist", "song"]) - # ][["item_id", "item_type"]] - # .groupby("item_type") - # .agg(list) - # .reset_index() - # .values - # ) - # df_user["is_fav"] = df_user.apply( - # lambda row: self.is_fav(row, dict_liked), axis=1 - # ) df_user = df_user[df_user["is_fav"] == 1] full_df_liked = self.compute_ratio_df(df_user) From 66d2bf4b25fa517cd28c1d352cf99fb1b2352c74 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 10:35:45 +0200 Subject: [PATCH 7/8] Remove additional old code --- .../src/process_raw_data/streams_processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 6f75fab..4ed922b 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -95,13 +95,10 @@ def process(self, min_date, max_date, n_subdivisions, instant_zero): """ self.import_data() self.convert_timestamps() - # self.filter(min_date, max_date) self.build_ids_list() self.filter_users() self.add_time_range(n_subdivisions, instant_zero) self.add_date() - # self.compute_is_organic() - # self.convert_context() self.all_time_date_couples = set( [tuple(i) for i in self.df[["time_range", "date"]].to_numpy()] ) From 825d26808a518cbad61b0e1f05ac2618ad067414 Mon Sep 17 00:00:00 2001 From: jpasdeloup Date: Tue, 24 Jun 2025 12:17:53 +0200 Subject: [PATCH 8/8] Select 1st user for fig1 --- modeling_activity_pace/make_fig1.py | 2 +- modeling_activity_pace/src/dict_learning/dictionary_helpers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 2dfb9f3..1b85af3 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -19,7 +19,6 @@ tqdm.pandas() build_result_folder() -SELECTED_USER_ID = 3356219324 def add_time_range(df, n_subdivisions, instant_zero): @@ -34,6 +33,7 @@ def add_time_range(df, n_subdivisions, instant_zero): df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") +SELECTED_USER_ID = df_processed.index[0] streams_df_list = [] for i in tqdm(os.listdir("data/streams/")): diff --git a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py index d810540..b354487 100644 --- a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py +++ b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py @@ -5,6 +5,7 @@ from src.process_answers import ProcessAnswers from src.settings import ( + TIME_SERIES_PATH, n_channels, 
channel_names, ANSWERS_ACTIVITY_COLUMNS, @@ -26,7 +27,7 @@ def process_data_for_DL(): Returns: tuple: A tuple containing input data (X_list_array_clean) and target labels (y). """ - X_list = [load_data(f"data/timeseries/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] ids = get_ids_from_signals(X_list) cols_to_predict = ANSWERS_ACTIVITY_COLUMNS