Use new dataset #2

Open · wants to merge 8 commits into base: main

1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
data/*
results/*
*.bak
3 changes: 3 additions & 0 deletions Makefile
@@ -19,6 +19,9 @@ usage:
build:
docker build -t $(DOCKER_IMAGE_NAME) .

all:
$(DOCKER_RUN_MOUNT) /bin/bash modeling_activity_pace/run_all.sh || true

run-bash: build
$(DOCKER_RUN_MOUNT) /bin/bash || true

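With the new `all` target, the full pipeline can be reproduced with `make all`, which runs `run_all.sh` (added below) inside the mounted Docker container. Note that, unlike `run-bash`, `all` as written does not depend on `build`, and the trailing `|| true` swallows a failing exit code.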
28 changes: 12 additions & 16 deletions README.md
@@ -37,13 +37,12 @@ poetry run python modeling_activity_pace/<script.py>

## Scripts

1. Raw user histories are transformed into time series using `process_raw_streams.py`.
2. User answers to the survey are prepared using `process_raw_answers.py`.
3. Dictionary Learning algorithm is run using `compute_dictionary.py`.
4. The selection of the best iteration in dictionary learning is done using `choose_dictionary.py`.
5. `compute_baselines.py` computes baselines scores and scores of PACE embeddings.
6. `analyse_models.py` plots logistic regression coefficients and related statistical reports.
7. `make_fig1.py` saves the plot of Figure 1.
1. User logs are transformed into time series using `compute_time_series.py`.
2. The Dictionary Learning algorithm is run using `compute_dictionary.py`.
3. The best iteration of dictionary learning is selected using `choose_dictionary.py`.
4. `compute_baselines.py` computes baseline scores and scores of PACE embeddings.
5. `analyse_models.py` plots logistic regression coefficients and related statistical reports.
6. `make_fig1.py` saves the plot of Figure 1.

## Data

@@ -54,15 +53,12 @@ Input data folder must be organized as follows:
pace/
└── data/
└── raw/
├── streams/
│ ├── one_year_all_respondents000000000000.csv
│ ├── ...
│ └── one_year_all_respondents0000000000399.csv
├── other/
│ └── user_favorites.csv
└── answers/
└── records.csv
└── answers/
└── processed_records.csv
├── streams/
│ ├── processed_streams_000000000000.csv
│ ├── ...
│ └── processed_streams_000000000091.csv
```

Where ```one_year_all_respondents.csv``` files are stream history CSV files with columns: ```user_id, ts_listen, media_id, context_id, context_type, listening_time, context_4```.
@@ -1,8 +1,10 @@
import pandas as pd
import os

from src.process_raw_data.streams_processor import ProcessStreams
from src.helpers import save_data
from src.settings import (
TIME_SERIES_PATH,
channel_names,
time_labels_full,
MIN_DATE,
@@ -11,10 +13,12 @@
INSTANT_ZERO,
)


if __name__ == "__main__":
# Create destination directory if it doesn't exist
os.makedirs(TIME_SERIES_PATH, exist_ok=True)

# Initialize a ProcessStreams instance
stream_processor = ProcessStreams("data/raw/streams/", usr_drop_rate=0)
stream_processor = ProcessStreams("data/streams/", usr_drop_rate=0)

# Process the streams data
stream_processor.process(MIN_DATE, MAX_DATE, N_SUBDIVISION_1HOUR, INSTANT_ZERO)
@@ -32,4 +36,4 @@
)

# Save the DataFrame to a CSV file
save_data(channel_data, f"data/processed/streams/X_{channel_name}.csv", index=True)
save_data(channel_data, f"{TIME_SERIES_PATH}/X_{channel_name}.csv", index=True)
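The recurring change in this file (and in the ones below) is replacing the hard-coded `data/processed/streams` prefix with the `TIME_SERIES_PATH` constant from `src.settings`. A minimal sketch of the pattern, with assumed values for illustration (the actual constant and channel list live in `src/settings.py`):

```python
# Sketch of the path-centralization pattern, not the project's code.
import os

TIME_SERIES_PATH = "data/processed/streams"  # assumed value for illustration
channel_names = ["volume", "context"]        # assumed; the real list is in src.settings

os.makedirs(TIME_SERIES_PATH, exist_ok=True)  # create the destination once
for channel_name in channel_names:
    out_path = f"{TIME_SERIES_PATH}/X_{channel_name}.csv"
    print(out_path)  # each script derives its paths from the single constant
```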
9 changes: 5 additions & 4 deletions modeling_activity_pace/make_fig1.py
@@ -10,6 +10,7 @@
from src.settings import (
N_SUBDIVISION_1HOUR,
INSTANT_ZERO,
TIME_SERIES_PATH,
time_labels_full,
time_labels,
)
@@ -18,7 +19,6 @@
tqdm.pandas()

build_result_folder()
SELECTED_USER_ID = 3356219324


def add_time_range(df, n_subdivisions, instant_zero):
@@ -31,12 +31,13 @@ def add_time_range(df, n_subdivisions, instant_zero):

# Import data

df_processed = load_data("data/processed/streams/X_volume.csv")
df_processed = load_data(f"{TIME_SERIES_PATH}/X_volume.csv")

SELECTED_USER_ID = df_processed.index[0]

streams_df_list = []
for i in tqdm(os.listdir("data/raw/streams/")):
df_streams = load_data("data/raw/streams/" + i).reset_index()
for i in tqdm(os.listdir("data/streams/")):
df_streams = load_data("data/streams/" + i).reset_index()
streams_df_list.append(df_streams[df_streams["user_id"] == SELECTED_USER_ID])

df = pd.concat(streams_df_list)
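The hard-coded `SELECTED_USER_ID` is replaced by the first id in the processed time-series index, so the figure no longer depends on one specific user being present in the data. A sketch of the idea with invented data (in the project, the index comes from `X_volume.csv`):

```python
# Sketch only: the DataFrame contents here are made up for illustration.
import pandas as pd

df_processed = pd.DataFrame(
    {"h0": [3, 0], "h1": [1, 2]},
    index=pd.Index([101, 202], name="user_id"),
)

SELECTED_USER_ID = df_processed.index[0]  # first user in the processed index
print(SELECTED_USER_ID)  # -> 101
```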
21 changes: 0 additions & 21 deletions modeling_activity_pace/process_raw_answers.py

This file was deleted.

35 changes: 35 additions & 0 deletions modeling_activity_pace/run_all.sh
@@ -0,0 +1,35 @@
#!/bin/bash

echo "**********************************************"
echo "1. User logs are transformed into time series "
echo "**********************************************"
poetry run python modeling_activity_pace/compute_time_series.py

echo "****************************************"
echo "2. Dictionary Learning algorithm is run "
echo "****************************************"
poetry run python modeling_activity_pace/compute_dictionary.py

echo "**********************************************************"
echo "3. Selection of the best iteration in dictionary learning "
echo "**********************************************************"
poetry run python modeling_activity_pace/choose_dictionary.py

echo ""
echo "***********************************************************"
echo "4. Computes baselines scores and scores of PACE embeddings "
echo "***********************************************************"
poetry run python modeling_activity_pace/compute_baselines.py

echo ""
echo "******************************************"
echo "5. Plots logistic regression coefficients "
echo " and related statistical reports "
echo "******************************************"
poetry run python modeling_activity_pace/analyse_models.py

echo -e ""
echo "******************************"
echo "6. Saves the plot of Figure 1 "
echo "******************************"
poetry run python modeling_activity_pace/make_fig1.py
7 changes: 3 additions & 4 deletions modeling_activity_pace/src/baselines/baselines_functions.py
@@ -3,7 +3,7 @@
from tqdm import tqdm

from src.process_answers import ProcessAnswers
from src.settings import channel_names, ANSWERS_PATH
from src.settings import TIME_SERIES_PATH, channel_names, ANSWERS_PATH
from src.helpers import load_data, get_ids_from_signals
from src.process_raw_data.answers_helpers import process_data_for_classifier
from src.modeling_functions import (
@@ -25,7 +25,7 @@ def compute_other_activities_baseline(cols_to_predict):
Returns:
- List of AUC scores.
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]

# Extract user IDs
ids = get_ids_from_signals(X_list)
@@ -80,7 +80,7 @@ def compute_total_volume_baseline(cols_to_predict):
"""

# Load volume data
df_volume = load_data("data/processed/streams/X_volume.csv")
df_volume = load_data(f"{TIME_SERIES_PATH}/X_volume.csv")

# Sum the volume across time (transpose and sum)
df = pd.DataFrame(df_volume.T.sum(), columns=["Total volume"])
@@ -155,4 +155,3 @@ def compute_baseline_scores(
scores = perform_grid_search(X_train, X_test, y_train_, y_test_, cols_to_predict)

return scores

@@ -105,7 +105,14 @@ def choose_best_dict(X, y, cols_to_predict):
index=True,
)
plot_reconstruction_iterations_perfs(df_reconstruction_scores, "results/figures/reconstruction_scores_over_iterations.pdf")
best_iter = int(input("Choose the best iteration : "))

## Uncomment here to manually select the best iteration
# best_iter = int(input("Choose the best iteration : "))

best_iter_offset = 5
best_iter = best_iter_offset + class_scores[best_iter_offset:].index(max(class_scores[best_iter_offset:]))
print(f"Best iteration is {best_iter} with score {class_scores[best_iter]}")

shutil.copy(
DICT_ITER_PATH
+ [f"D_{i}.npy" for i in range(len(os.listdir(DICT_ITER_PATH)))][best_iter],
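The interactive `input()` prompt is replaced by an automatic rule: skip the first few iterations, then take the argmax of the remaining classification scores. A minimal sketch with invented scores (in `choose_best_dict`, `class_scores` holds one classification score per saved dictionary iteration):

```python
# Sketch of the offset-argmax rule; the scores below are made up.
class_scores = [0.51, 0.55, 0.60, 0.58, 0.62, 0.61, 0.66, 0.64]

best_iter_offset = 5  # ignore early, noisy iterations
tail = class_scores[best_iter_offset:]
best_iter = best_iter_offset + tail.index(max(tail))
print(best_iter, class_scores[best_iter])  # -> 6 0.66
```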
@@ -5,6 +5,7 @@

from src.process_answers import ProcessAnswers
from src.settings import (
TIME_SERIES_PATH,
n_channels,
channel_names,
ANSWERS_ACTIVITY_COLUMNS,
@@ -26,7 +27,7 @@ def process_data_for_DL():
Returns:
tuple: A tuple containing input data (X_list_array_clean) and target labels (y).
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]

ids = get_ids_from_signals(X_list)
cols_to_predict = ANSWERS_ACTIVITY_COLUMNS
24 changes: 0 additions & 24 deletions modeling_activity_pace/src/process_answers.py
@@ -1,11 +1,4 @@
from src.helpers import load_data
from src.process_raw_data.filter import process_filter
from src.process_raw_data.process_activities_deezer import (
process_activities_deezer_feature,
)
from src.process_raw_data.process_age_gender import process_age_gender
from src.helpers import load_data


class ProcessAnswers:
def __init__(self, path) -> None:
@@ -46,20 +39,3 @@ def process(self, ids, cols):
"""
self.import_data()
self.filter(ids, cols)


def process_answer_data_pipeline(df, columns):
"""
Process answers DataFrame by applying filtering and feature processing.

Args:
df (pd.DataFrame): The DataFrame to be processed.
columns (list): List of columns to keep in the processed DataFrame.

Returns:
pd.DataFrame: The processed DataFrame with selected columns.
"""
df = process_filter(df)
df = process_activities_deezer_feature(df)
df = process_age_gender(df)
return df[columns]
@@ -6,6 +6,7 @@
from src.helpers import load_data, get_ids_from_signals
from src.modeling_functions import convolve_signals, normalize_signals, split
from src.settings import (
TIME_SERIES_PATH,
channel_names,
n_channels,
ANSWERS_PATH,
@@ -35,7 +36,7 @@ def process_data_for_classifier(
- y_train_: Training labels.
- y_test_: Test labels.
"""
X_list = [load_data(f"data/processed/streams/X_{i}.csv") for i in channel_names]
X_list = [load_data(f"{TIME_SERIES_PATH}/X_{i}.csv") for i in channel_names]
D = np.load("results/dict_learning/D.npy")[:, :, :166]

# Extract user IDs

This file was deleted.
