diff --git a/.gitignore b/.gitignore index 7a2a2b5..a3f579f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data/* results/* +*.bak diff --git a/Makefile b/Makefile index 86cb40f..cf87ce6 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ usage: build: docker build -t $(DOCKER_IMAGE_NAME) . +all: + $(DOCKER_RUN_MOUNT) /bin/bash modeling_activity_pace/run_all.sh || true + run-bash: build $(DOCKER_RUN_MOUNT) /bin/bash || true diff --git a/README.md b/README.md index 6cc6d2b..a13a4fe 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,12 @@ poetry run python modeling_activity_pace/ ## Scripts -1. Raw user histories are trasformed into time series using `process_raw_streams.py`. -2. User answers to survey are prepared using `process_raw_answers.py`. -3. Dictionary Learning algorithm is run using `compute_dictionary.py`. -4. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. -5. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. -6. `analyse_models.py` plots logistic regression coefficients and related statistical reports. -7. `make_fig1.py` saves the plot of Figure 1. +1. User logs are transformed into time series using `compute_time_series.py`. +2. Dictionary Learning algorithm is run using `compute_dictionary.py`. +3. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`. +4. `compute_baselines.py` computes baselines scores and scores of PACE embeddings. +5. `analyse_models.py` plots logistic regression coefficients and related statistical reports. +6. `make_fig1.py` saves the plot of Figure 1. ## Data @@ -54,15 +53,12 @@ Input data folder must be organized as follows : pace/ │ └── data/ - └── raw/ - ├── streams/ - │ ├── one_year_all_respondents000000000000.csv - │ ├── ... 
- │ └── one_year_all_respondents0000000000399.csv - ├── other/ - │ └── user_favorites.csv - └── answers/ - └── records.csv + ├── answers/ + │ └── processed_records.csv + └── streams/ + ├── processed_streams_000000000000.csv + ├── ... + └── processed_streams_000000000091.csv ``` Where ```one_year_all_respondents.csv``` files are stream history csv files with columns : ```user_id, ts_listen, media_id, context_id, context_type, listening_time, context_4```. diff --git a/modeling_activity_pace/process_raw_streams.py b/modeling_activity_pace/compute_time_series.py similarity index 75% rename from modeling_activity_pace/process_raw_streams.py rename to modeling_activity_pace/compute_time_series.py index 986ef47..c94048d 100644 --- a/modeling_activity_pace/process_raw_streams.py +++ b/modeling_activity_pace/compute_time_series.py @@ -1,8 +1,10 @@ import pandas as pd +import os from src.process_raw_data.streams_processor import ProcessStreams from src.helpers import save_data from src.settings import ( + TIME_SERIES_PATH, channel_names, time_labels_full, MIN_DATE, @@ -11,10 +13,12 @@ INSTANT_ZERO, ) - if __name__ == "__main__": + # Create destination directory if it doesn't exist + os.makedirs(TIME_SERIES_PATH, exist_ok=True) + # Initialize a ProcessStreams instance - stream_processor = ProcessStreams("data/raw/streams/", usr_drop_rate=0) + stream_processor = ProcessStreams("data/streams/", usr_drop_rate=0) # Process the streams data stream_processor.process(MIN_DATE, MAX_DATE, N_SUBDIVISION_1HOUR, INSTANT_ZERO) @@ -32,4 +36,4 @@ ) # Save the DataFrame to a CSV file - save_data(channel_data, f"data/processed/streams/X_{channel_name}.csv", index=True) + save_data(channel_data, f"{TIME_SERIES_PATH}/X_{channel_name}.csv", index=True) diff --git a/modeling_activity_pace/make_fig1.py b/modeling_activity_pace/make_fig1.py index 06d7a69..1b85af3 100644 --- a/modeling_activity_pace/make_fig1.py +++ b/modeling_activity_pace/make_fig1.py @@ -10,6 +10,7 @@ from src.settings 
import ( N_SUBDIVISION_1HOUR, INSTANT_ZERO, + TIME_SERIES_PATH, time_labels_full, time_labels, ) @@ -18,7 +19,6 @@ tqdm.pandas() build_result_folder() -SELECTED_USER_ID = 3356219324 def add_time_range(df, n_subdivisions, instant_zero): @@ -31,12 +31,13 @@ def add_time_range(df, n_subdivisions, instant_zero): # Import data -df_processed = load_data("data/processed/streams/X_volume.csv") +df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") +SELECTED_USER_ID = df_processed.index[0] streams_df_list = [] -for i in tqdm(os.listdir("data/raw/streams/")): - df_streams = load_data("data/raw/streams/" + i).reset_index() +for i in tqdm(os.listdir("data/streams/")): + df_streams = load_data("data/streams/" + i).reset_index() streams_df_list.append(df_streams[df_streams["user_id"] == SELECTED_USER_ID]) df = pd.concat(streams_df_list) diff --git a/modeling_activity_pace/process_raw_answers.py b/modeling_activity_pace/process_raw_answers.py deleted file mode 100644 index 53a7a10..0000000 --- a/modeling_activity_pace/process_raw_answers.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.process_raw_data.process_age_gender import NEW_COL_NAMES_AGE_GENDER -from src.process_answers import process_answer_data_pipeline -from src.helpers import load_data, save_data -from src.settings import ANSWERS_ACTIVITY_COLUMNS - - -def main(): - """ - Main function to process data and save the processed DataFrame to a CSV file. 
- """ - FILE_PATH = "data/raw/answers/records.csv" - COLS_TO_KEEP = ["user_id"] + ANSWERS_ACTIVITY_COLUMNS + NEW_COL_NAMES_AGE_GENDER - COLS_TO_KEEP.remove("annee_naissance") - - df = load_data(FILE_PATH) - df = process_answer_data_pipeline(df, COLS_TO_KEEP) - save_data(df, "data/processed/answers/processed_records.csv") - - -if __name__ == "__main__": - main() diff --git a/modeling_activity_pace/run_all.sh b/modeling_activity_pace/run_all.sh new file mode 100644 index 0000000..3e1bc46 --- /dev/null +++ b/modeling_activity_pace/run_all.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "**********************************************" +echo "1. User logs are transformed into time series " +echo "**********************************************" +poetry run python modeling_activity_pace/compute_time_series.py + +echo "****************************************" +echo "2. Dictionary Learning algorithm is run " +echo "****************************************" +poetry run python modeling_activity_pace/compute_dictionary.py + +echo "**********************************************************" +echo "3. Selection of the best iteration in dictionary learning " +echo "**********************************************************" +poetry run python modeling_activity_pace/choose_dictionary.py + +echo "" +echo "***********************************************************" +echo "4. Computes baselines scores and scores of PACE embeddings " +echo "***********************************************************" +poetry run python modeling_activity_pace/compute_baselines.py + +echo "" +echo "******************************************" +echo "5. Plots logistic regression coefficients " +echo " and related statistical reports " +echo "******************************************" +poetry run python modeling_activity_pace/analyse_models.py + +echo -e "" +echo "******************************" +echo "6. 
Saves the plot of Figure 1 " +echo "******************************" +poetry run python modeling_activity_pace/make_fig1.py diff --git a/modeling_activity_pace/src/baselines/baselines_functions.py b/modeling_activity_pace/src/baselines/baselines_functions.py index 54bc036..00b2258 100644 --- a/modeling_activity_pace/src/baselines/baselines_functions.py +++ b/modeling_activity_pace/src/baselines/baselines_functions.py @@ -3,7 +3,7 @@ from tqdm import tqdm from src.process_answers import ProcessAnswers -from src.settings import channel_names, ANSWERS_PATH +from src.settings import TIME_SERIES_PATH, channel_names, ANSWERS_PATH from src.helpers import load_data, get_ids_from_signals from src.process_raw_data.answers_helpers import process_data_for_classifier from src.modeling_functions import ( @@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict): Returns: - List of AUC scores. """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] # Extract user IDs ids = get_ids_from_signals(X_list) @@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict): """ # Load volume data - df_volume = load_data("data/processed/streams/X_volume.csv") + df_volume = load_data(f"{TIME_SERIES_PATH}/X_volume.csv") # Sum the volume across time (transpose and sum) df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"]) @@ -155,4 +155,3 @@ def compute_baseline_scores( scores = perform_grid_search(X_train, X_test, y_train_, y_test_, cols_to_predict) return scores - diff --git a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py index 567261f..6f66947 100644 --- a/modeling_activity_pace/src/dict_learning/choose_best_iteration.py +++ b/modeling_activity_pace/src/dict_learning/choose_best_iteration.py @@ -105,7 +105,14 @@ def choose_best_dict(X, y, cols_to_predict): index=True, 
) plot_reconstruction_iterations_perfs(df_reconstruction_scores, "results/figures/reconstruction_scores_over_iterations.pdf") - best_iter = int(input("Choose the best iteration : ")) + + ## Uncomment here to manually select the best iteration + # best_iter = int(input("Choose the best iteration : ")) + + best_iter_offset = 5 + best_iter = best_iter_offset + class_scores[best_iter_offset:].index(max(class_scores[best_iter_offset:])) + print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}") + shutil.copy( DICT_ITER_PATH + [f"D_{i}.npy" for i in range(len(os.listdir(DICT_ITER_PATH)))][best_iter], diff --git a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py index 3e57f72..b354487 100644 --- a/modeling_activity_pace/src/dict_learning/dictionary_helpers.py +++ b/modeling_activity_pace/src/dict_learning/dictionary_helpers.py @@ -5,6 +5,7 @@ from src.process_answers import ProcessAnswers from src.settings import ( + TIME_SERIES_PATH, n_channels, channel_names, ANSWERS_ACTIVITY_COLUMNS, @@ -26,7 +27,7 @@ def process_data_for_DL(): Returns: tuple: A tuple containing input data (X_list_array_clean) and target labels (y). 
""" - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] ids = get_ids_from_signals(X_list) cols_to_predict = ANSWERS_ACTIVITY_COLUMNS diff --git a/modeling_activity_pace/src/process_answers.py b/modeling_activity_pace/src/process_answers.py index 78337f8..713de2e 100644 --- a/modeling_activity_pace/src/process_answers.py +++ b/modeling_activity_pace/src/process_answers.py @@ -1,11 +1,4 @@ from src.helpers import load_data -from src.process_raw_data.filter import process_filter -from src.process_raw_data.process_activities_deezer import ( - process_activities_deezer_feature, -) -from src.process_raw_data.process_age_gender import process_age_gender -from src.helpers import load_data - class ProcessAnswers: def __init__(self, path) -> None: @@ -46,20 +39,3 @@ def process(self, ids, cols): """ self.import_data() self.filter(ids, cols) - - -def process_answer_data_pipeline(df, columns): - """ - Process answers DataFrame by applying filtering and feature processing. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - columns (list): List of columns to keep in the processed DataFrame. - - Returns: - pd.DataFrame: The processed DataFrame with selected columns. 
- """ - df = process_filter(df) - df = process_activities_deezer_feature(df) - df = process_age_gender(df) - return df[columns] diff --git a/modeling_activity_pace/src/process_raw_data/answers_helpers.py b/modeling_activity_pace/src/process_raw_data/answers_helpers.py index 5a9c226..fc22ebb 100644 --- a/modeling_activity_pace/src/process_raw_data/answers_helpers.py +++ b/modeling_activity_pace/src/process_raw_data/answers_helpers.py @@ -6,6 +6,7 @@ from src.helpers import load_data, get_ids_from_signals from src.modeling_functions import convolve_signals, normalize_signals, split from src.settings import ( + TIME_SERIES_PATH, channel_names, n_channels, ANSWERS_PATH, @@ -35,7 +36,7 @@ def process_data_for_classifier( - y_train_: Training labels. - y_test_: Test labels. """ - X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names] + X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names] D = np.load("results/dict_learning/D.npy")[:, :, :166] # Extract user IDs diff --git a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py b/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py deleted file mode 100644 index 563b516..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_activities_deezer.py +++ /dev/null @@ -1,34 +0,0 @@ -from src.helpers import rename_columns -from src.settings import ANSWERS_ACTIVITY_COLUMNS - -COL_NAMES_CONTEXT = [f"B_contexts_deezer_{i}" for i in [1, 4, 5, 2, 12, 10]] - - -def encode_activities_columns(df, new_col_names): - """ - Encode context-related columns in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame to encode context columns in. - new_col_names (list): List of new context-related column names. - - Returns: - pd.DataFrame: The DataFrame with context columns encoded. 
- """ - df[new_col_names] = df[new_col_names].map(lambda x: 1 if isinstance(x, str) else 0) - return df - - -def process_activities_deezer_feature(df): - """ - Process context-related features in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame containing context-related features. - - Returns: - pd.DataFrame: The processed DataFrame. - """ - df = rename_columns(df, COL_NAMES_CONTEXT, ANSWERS_ACTIVITY_COLUMNS) - df = encode_activities_columns(df, ANSWERS_ACTIVITY_COLUMNS) - return df diff --git a/modeling_activity_pace/src/process_raw_data/process_age_gender.py b/modeling_activity_pace/src/process_raw_data/process_age_gender.py deleted file mode 100644 index a6fcd35..0000000 --- a/modeling_activity_pace/src/process_raw_data/process_age_gender.py +++ /dev/null @@ -1,142 +0,0 @@ -import pandas as pd - -from src.helpers import rename_columns - - -ENCODE_AGE = { - "Entre 45 et 54 ans": 4, - "Entre 18 et 24 ans": 1, - "Moins de 18 ans": 0, - "Entre 25 et 34 ans": 2, - "Entre 55 et 64 ans": 5, - "Entre 35 et 44 ans": 3, - "+ de 65 ans": 6, -} - -ENCODE_GENDER = { - "Un homme": 0, - "Une femme": 1, - "Je ne souhaite pas répondre": 2, - "Je préfère me décrire moi-même": 2, - "Non-binaire/Transgenre": 2, -} - -COL_NAMES_AGE_GENDER = [ - "E_birth_year", - "E_age_range", - "E_gender", -] - -NEW_COL_NAMES_AGE_GENDER = [ - "annee_naissance", - "age_group", - "gender", -] - - -def convert_age_to_int(df): - """ - Convert the 'annee_naissance' column in a DataFrame to integer values. - - Args: - df (pd.DataFrame): The DataFrame in which 'annee_naissance' values should be converted. - - Returns: - pd.DataFrame: The DataFrame with 'annee_naissance' values converted to integers. - """ - df["annee_naissance"] = df["annee_naissance"].apply( - lambda x: int(x) if not pd.isna(x) else -1 - ) - return df - - -def encode_age_category(df): - """ - Encode the 'age_group' column in a DataFrame based on predefined age categories. 
- - Args: - df (pd.DataFrame): The DataFrame in which 'age_group' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values encoded. - """ - df["age_group"] = df["age_group"].apply( - lambda x: ENCODE_AGE[x] if isinstance(x, str) else -1 - ) - return df - - -def encode_gender(df): - """ - Encode the 'genre' column in a DataFrame based on predefined gender categories. - - Args: - df (pd.DataFrame): The DataFrame in which 'genre' values should be encoded. - - Returns: - pd.DataFrame: The DataFrame with 'genre' values encoded. - """ - df["gender"] = df["gender"].apply( - lambda x: ENCODE_GENDER[x] if isinstance(x, str) else -1 - ) - return df - - -def assign_age_category(df): - """ - Assign age categories based on the 'annee_naissance' column in a DataFrame. - - Args: - df (pd.DataFrame): The DataFrame in which age categories should be assigned. - - Returns: - pd.DataFrame: The DataFrame with 'age_group' values assigned based on age. - """ - df["age_group"] = df["annee_naissance"].apply( - lambda x: assign_age_category_helper(2023 - x) - ) - return df - - -def assign_age_category_helper(age): - """ - Helper function to assign age categories based on age. - - Args: - age (int): The age value. - - Returns: - int: The assigned age category. - """ - if age < 18: - return 0 - elif age <= 24: - return 1 - elif age <= 34: - return 2 - elif age <= 44: - return 3 - elif age <= 54: - return 4 - elif age <= 64: - return 5 - else: - return 6 - - -def process_age_gender(df): - """ - Process a DataFrame by renaming columns and encoding age and gender categories. - - Args: - df (pd.DataFrame): The DataFrame to be processed. - - Returns: - pd.DataFrame: The processed DataFrame. 
- """ - df = rename_columns(df, COL_NAMES_AGE_GENDER, NEW_COL_NAMES_AGE_GENDER) - df = convert_age_to_int(df) - df = encode_age_category(df) - df = encode_gender(df) - df = assign_age_category(df) - return df diff --git a/modeling_activity_pace/src/process_raw_data/streams_processor.py b/modeling_activity_pace/src/process_raw_data/streams_processor.py index 6c5d8c5..4ed922b 100644 --- a/modeling_activity_pace/src/process_raw_data/streams_processor.py +++ b/modeling_activity_pace/src/process_raw_data/streams_processor.py @@ -22,7 +22,6 @@ def __init__(self, path, usr_drop_rate=0) -> None: """ self.path = path self.usr_drop_rate = usr_drop_rate - self.liked_df = load_data("data/raw/other/user_favorites.csv").reset_index() def import_data(self): """ @@ -35,7 +34,6 @@ def import_data(self): ) ] self.df = pd.concat(df_list) - # self.df = self.df[self.df["user_id"].isin(self.df.user_id.tolist()[:500] + [3356219324])] def convert_timestamps(self): """ @@ -45,20 +43,6 @@ def convert_timestamps(self): dt.datetime.fromtimestamp ) - def filter(self, min_date, max_date): - """ - Filter the DataFrame based on date and listening time. - - Args: - min_date (datetime.date): Minimum date. - max_date (datetime.date): Maximum date. - """ - self.df = self.df[ - (self.df["ts_listen"].dt.date >= min_date) - & (self.df["ts_listen"].dt.date <= max_date) - & (self.df["listening_time"] >= 30) - ] - def build_ids_list(self): """ Build a list of user IDs sorted by the number of occurrences. @@ -99,59 +83,6 @@ def filter_users(self): self.ids = self.ids[:keep_n] self.df = self.df[self.df["user_id"].isin(self.ids)] - def compute_is_organic(self): - """ - Compute an 'is_organic' column based on the 'context_4' column. - """ - self.df["is_organic"] = (self.df["context_4"] == "organic").astype(int) - - def identify_context(self, x): - """ - Identify context types based on the 'context_type' column. - - Args: - x (str): The context string. - - Returns: - str: The identified context type. 
- """ - context_keywords = ["album", "albums", "playlist", "playlists"] - if any(keyword in x for keyword in context_keywords): - return "album" if "album" in x or "albums" in x else "playlist" - return "other" - - def convert_context(self): - """ - Convert the 'context_type' column to 'context_identified'. - """ - self.df["context_identified"] = self.df["context_type"].progress_apply( - self.identify_context - ) - - def is_fav(self, row, dict_liked): - """ - Check if a row is marked as a favorite. - - Args: - row (pd.Series): The row to check. - dict_liked (dict): Dictionary of favorite items. - - Returns: - int: 1 if it's a favorite, 0 otherwise. - """ - song_id = row.media_id - if "song" in dict_liked and song_id in dict_liked["song"]: - return 1 - context_identified = row.context_identified - context_id = row.context_id - if ( - context_identified in ["playlist", "album"] - and context_identified in dict_liked - and context_id in dict_liked[context_identified] - ): - return 1 - return 0 - def process(self, min_date, max_date, n_subdivisions, instant_zero): """ Process the data by importing, filtering, and adding columns. @@ -164,13 +95,10 @@ def process(self, min_date, max_date, n_subdivisions, instant_zero): """ self.import_data() self.convert_timestamps() - self.filter(min_date, max_date) self.build_ids_list() self.filter_users() self.add_time_range(n_subdivisions, instant_zero) self.add_date() - self.compute_is_organic() - self.convert_context() self.all_time_date_couples = set( [tuple(i) for i in self.df[["time_range", "date"]].to_numpy()] ) @@ -323,19 +251,6 @@ def add_liked(self, df_user, k, id, channel_index): id (int): User ID. channel_index (int): Index of the channel. 
""" - liked_df_user = self.liked_df[self.liked_df["user_id"] == id] - dict_liked = dict( - liked_df_user[ - liked_df_user["item_type"].isin(["album", "playlist", "song"]) - ][["item_id", "item_type"]] - .groupby("item_type") - .agg(list) - .reset_index() - .values - ) - df_user["is_fav"] = df_user.apply( - lambda row: self.is_fav(row, dict_liked), axis=1 - ) df_user = df_user[df_user["is_fav"] == 1] full_df_liked = self.compute_ratio_df(df_user) diff --git a/modeling_activity_pace/src/settings.py b/modeling_activity_pace/src/settings.py index e625c63..9ee32d5 100644 --- a/modeling_activity_pace/src/settings.py +++ b/modeling_activity_pace/src/settings.py @@ -8,8 +8,9 @@ MAX_DATE = dt.date(2023, 5, 19) INSTANT_ZERO = dt.datetime(2022, 12, 26) -ANSWERS_PATH = "data/processed/answers/processed_records.csv" +ANSWERS_PATH = "data/answers/processed_records.csv" DICT_ITER_PATH = "results/dict_learning/all_iterations/" +TIME_SERIES_PATH = "data/timeseries" ANSWERS_ACTIVITY_COLUMNS = [ "activity_wake_up", @@ -46,4 +47,4 @@ HOURS = [f"{i}h" for i in range(24)] time_labels_full = [f"{day},{hour}" for day in DAYS for hour in HOURS] -time_labels = time_labels_full[1:-1] \ No newline at end of file +time_labels = time_labels_full[1:-1]