
Commit e3ae8c7

Merge pull request #5 from fpavy/main
work on table
2 parents: c8c9367 + 0ae459a

File tree: 9 files changed (+579, -122 lines)


coverage.json: 1 addition, 0 deletions (large diff not rendered by default)

crep/base.py: 63 additions, 69 deletions
@@ -86,7 +86,7 @@ def merge(
     df.index = range(len(df))
     if remove_duplicates:
         df = suppress_duplicates(df, id_discrete=id_discrete,
-                                 continuous_index=id_continuous)
+                                 id_continuous=id_continuous)
     if verbose:
         print("[merge] nb rows left table frame ", data_left.shape[0])
         print("[merge] nb rows right table frame ", data_right.shape[0])
@@ -96,7 +96,9 @@ def merge(
 
 
 def unbalanced_merge(
         data_admissible: pd.DataFrame,
-        data_not_admissible: pd.DataFrame, id_discrete: iter, id_continuous: [Any, Any]) -> pd.DataFrame:
+        data_not_admissible: pd.DataFrame,
+        id_discrete: iter,
+        id_continuous: [Any, Any]) -> pd.DataFrame:
     """
     Merge admissible and non-admissible dataframes based on discrete and continuous identifiers.
 
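Only the signature layout changes here, but it pins down the call shape: two frames, the discrete keys, and a two-element [start, end] pair. A hedged usage sketch (data and column names are hypothetical):

import pandas as pd
from crep.base import unbalanced_merge

admissible = pd.DataFrame(
    {"track": ["a", "a"], "start": [0, 10], "end": [10, 20], "speed": [80, 100]})
not_admissible = pd.DataFrame(
    {"track": ["a"], "start": [3], "end": [12], "state": ["worn"]})

# id_continuous is a [start, end] pair, per the [Any, Any] annotation.
merged = unbalanced_merge(
    data_admissible=admissible,
    data_not_admissible=not_admissible,
    id_discrete=["track"],
    id_continuous=["start", "end"],
)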
@@ -464,7 +466,8 @@ def aggregate_constant(df: pd.DataFrame,
     return data_merge[df.columns].drop_duplicates().astype(dtypes)
 
 
-def __merge_index(data_left, data_right,
+def __merge_index(data_left,
+                  data_right,
                   id_discrete,
                   id_continuous,
                   names=("left", "right")):
@@ -479,19 +482,15 @@ def __merge_index(data_left, data_right,
 
 
 def merge_event(
-        data_left: pd.DataFrame, data_right: pd.DataFrame,
+        data_left: pd.DataFrame,
+        data_right: pd.DataFrame,
         id_discrete: iter,
         id_continuous: [Any, Any],
        id_event
 ):
     """
-    Merges two dataframes on both discrete and continuous indices, with forward-filling of missing data.
-
-    This function merges two Pandas DataFrames (`data_left` and `data_right`) based on discrete and continuous keys.
-    It assigns the event data from data_right to the correct segment in data_left, if the event is not "out-of-bound"
-    relative to the segments in data_left. The result is a dataframe with a new row for each event. Rows with NaN
-    event data are kept to represent the segment state prior to the occurrence of any event (as such the returned
-    dataframe contains duplicates based on subsets of columns id_discrete and id_continuous).
+    Assigns the details of events occurring at specific points, in data_right, to the corresponding segment
+    in data_left.
 
     Parameters
     ----------
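The rewritten docstring is terser, so a toy call may help. This sketch assumes only the signature visible above; the column names and data are hypothetical:

import pandas as pd
from crep.base import merge_event

# Admissible segments (left) and point events (right).
segments = pd.DataFrame(
    {"track": ["a", "a"], "start": [0, 10], "end": [10, 20], "state": ["ok", "ok"]})
events = pd.DataFrame(
    {"track": ["a", "a"], "event_pos": [4, 15], "event": ["crack", "weld"]})

out = merge_event(
    data_left=segments,
    data_right=events,
    id_discrete=["track"],
    id_continuous=["start", "end"],
    id_event="event_pos",
)
# Expected, per the docstring: one extra row per in-bound event, attached to
# the segment that contains it.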
@@ -514,51 +513,46 @@ def merge_event(
     A merged dataframe that combines `data_left` and `data_right`.
 
     """
-    data_left_ = data_left.__deepcopy__()
-    data_right_ = data_right.__deepcopy__()
-    data_left_ = _increasing_continuous_index(data_left_, id_continuous)
-
-    data_left_ = data_left_.reset_index(drop=True)
-    data_right_ = data_right_.reset_index(drop=True)
-
-    data_left_["__t__"] = True
-    data_right_["__t__"] = False
-
-    df_merge = pd.concat([data_left_, data_right_], axis=0)
-    df_merge.loc[df_merge["__t__"], id_event] = df_merge.loc[df_merge["__t__"], id_continuous[1]]
-    df_merge = df_merge.sort_values(by=[*id_discrete, id_event]).reset_index(drop=True)
-
-    # event in data_right_ can be out-of-bound based on segments in data_left_.
-    mask = (~df_merge[id_discrete].eq(df_merge[id_discrete].shift()))
-    df_merge["__new_seg__"] = mask.sum(axis=1) > 0
-    df_merge["__new_seg_b_"] = np.nan
-    df_merge.loc[df_merge["__t__"], "__new_seg_b_"] = df_merge.loc[df_merge["__t__"], "__new_seg__"]
-    df_merge["__new_seg_b_"] = df_merge["__new_seg_b_"].bfill()
-    df_merge["__oob__"] = False
-    df_merge.loc[(~df_merge["__t__"]) & df_merge["__new_seg_b_"], "__oob__"] = True
-
-    if df_merge["__oob__"].sum() > 1:
-        warnings.warn("Not all events in data_right could be associated with a segment from data_left.")
-        print(f"Dropped: {df_merge['__oob__'].sum()}/{data_left_.shape[0]} rows")
-    df_merge = df_merge.loc[~df_merge["__oob__"], :].reset_index(drop=True)
-
-    # assign event data to
-    df_merge.loc[~df_merge["__t__"], id_continuous] = np.nan
-    df_merge[data_left_.columns] = df_merge[data_left_.columns].bfill()
-    df_merge.loc[df_merge["__t__"], id_event] = np.nan
-
-    df_merge = df_merge.sort_values(
-        by=[*id_discrete, id_continuous[1], "__t__"],
-        ascending=[*[True] * len(id_discrete), True, False]
-    ).reset_index(drop=True)
-    df_merge = df_merge.drop(columns=[col for col in df_merge.columns if "__" in col])
-
-    cols_left = [col for col in data_left.columns if col not in data_right.columns]
-    cols_right = [col for col in data_right.columns if col not in data_left.columns]
-
-    df_merge = df_merge[id_discrete + id_continuous + cols_left + cols_right]
+    if not tools.admissible_dataframe(data=data_left, id_discrete=id_discrete, id_continuous=id_continuous):
+        raise Exception("The left dataframe is not admissible. Consider using aggregate_duplicates() and "
+                        "tools.build_admissible_data() if you want to make the dataframe admissible.")
+    else:
+        df1 = data_left.copy()
+        df2 = data_right.copy()
+        df1 = df1.fillna(1234.56789)
+        df1["__t__"] = True
+        df2["__t__"] = False
+        df = pd.concat([df1, df2], axis=0)
+        df.loc[df["__t__"], id_event] = df.loc[df["__t__"], id_continuous[1]]
+        df = df.sort_values(by=[*id_discrete, id_event]).reset_index(drop=True)
+
+        # ========== identify match with pk event in df2 and concerned row in df1 ==========
+        mask = (~df[id_discrete].eq(df[id_discrete].shift()))
+        df["__new_seg__"] = mask.sum(axis=1) > 0
+        df["__new_seg_b_"] = np.nan
+        df.loc[df["__t__"], "__new_seg_b_"] = df.loc[df["__t__"], "__new_seg__"]
+        df["__new_seg_b_"] = df["__new_seg_b_"].bfill()
+        df["__no_match__"] = False
+        df.loc[(~df["__t__"]) & df["__new_seg_b_"], "__no_match__"] = True
+
+        if df["__no_match__"].sum() > 1:
+            warnings.warn("Not all events in data_right could be associated with a segment in data_left.")
+            print(f"In merge_event, dropped: {df['__no_match__'].sum()}/{df2.shape[0]} rows")
+        df = df.loc[~df["__no_match__"], :].reset_index(drop=True)
+
+        # =========================== merge info from df1 and df2 ===========================
+        df.loc[~df["__t__"], id_continuous] = np.nan
+        df[df1.columns] = df[df1.columns].bfill()
+        df.loc[df["__t__"], id_event] = np.nan
+        df = df.replace(1234.56789, np.nan)
+
+        df = df.sort_values(
+            by=[*id_discrete, id_continuous[1], "__t__"],
+            ascending=[*[True] * len(id_discrete), True, False]
+        ).reset_index(drop=True)
+        df = df.drop(columns=[col for col in df.columns if "__" in col])
 
-    return df_merge
+        return df
 
 
 def create_regular_segmentation(
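The new body protects pre-existing NaNs in data_left with a sentinel (1234.56789) before the bfill, then restores them; without that, the backward fill would overwrite genuine missing values with data from later rows. A standalone illustration of the round trip (plain pandas, not crep code):

import numpy as np
import pandas as pd

SENTINEL = 1234.56789  # same magic value as in the diff

df = pd.DataFrame({"x": [np.nan, 2.0, np.nan, 4.0]})

# Without protection, bfill() replaces the genuine NaNs:
filled_naively = df.bfill()  # x = [2, 2, 4, 4]

# Sentinel round trip: mark real NaNs, fill, then restore them.
protected = df.fillna(SENTINEL).bfill().replace(SENTINEL, np.nan)
# x = [NaN, 2, NaN, 4] -- the original missing values survive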
@@ -745,14 +739,14 @@ def __fix_discrete_index(
     return data_left, data_right
 
 
-def suppress_duplicates(df, id_discrete, continuous_index):
-    df = df.sort_values([*id_discrete, *continuous_index])
-    df_duplicated = df.drop([*id_discrete, *continuous_index], axis=1)
+def suppress_duplicates(df, id_discrete, id_continuous):
+    df = df.sort_values([*id_discrete, *id_continuous])
+    df_duplicated = df.drop([*id_discrete, *id_continuous], axis=1)
     mat_duplicated = pd.DataFrame(
         df_duplicated.iloc[1:].values == df_duplicated.iloc[
                                          :-1].values)
-    id1 = continuous_index[0]
-    id2 = continuous_index[1]
+    id1 = id_continuous[0]
+    id2 = id_continuous[1]
     index = mat_duplicated.sum(axis=1) == df_duplicated.shape[1]
     index = np.where(index)[0]
     df1 = df.iloc[index]
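The duplicate test above compares each row's payload against the previous row's via an offset value comparison. A standalone sketch of that idiom (toy data, not the crep function):

import numpy as np
import pandas as pd

payload = pd.DataFrame({"u": [1, 1, 2], "v": [5, 5, 5]})

# Row i equals row i-1 on every payload column -> candidate duplicate pair.
same = payload.iloc[1:].values == payload.iloc[:-1].values
dup_mask = same.sum(axis=1) == payload.shape[1]
print(np.where(dup_mask)[0])  # [0]: rows 0 and 1 carry identical payloads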
@@ -1026,7 +1020,7 @@ def split_segment(
         id_discrete: list[Any],
         id_continuous: [Any, Any],
         target_size: int,
-        col_sum_agg: list[str] = None,
+        columns_sum_aggregation: list[str] = None,
         verbose: bool = False
 ) -> pd.DataFrame:
     """
@@ -1042,7 +1036,7 @@ def split_segment(
         continuous columns that delimit the segments' start and end
     target_size: integer > 0
         targeted segment size
-    col_sum_agg: list[str], optional
+    columns_sum_aggregation: list[str], optional
         Defaults to an empty list. Some columns may have to be summed over several segments when creating super segments.
         If so, splitting a row and assigning to each new row the same value as in the original non-split row may
         result in inflated sums later on. To counter that, the columns that should later be summed are specified in
@@ -1056,8 +1050,8 @@ def split_segment(
     df: pandas dataframe
     """
     df = df.copy()
-    if col_sum_agg is None:
-        col_sum_agg = []
+    if columns_sum_aggregation is None:
+        columns_sum_aggregation = []
 
     df["__n_cut__"] = tools.n_cut_finder(
         df=df,
@@ -1071,7 +1065,7 @@ def split_segment(
     if "__diff__" not in df.columns:
         df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]
 
-    for col in col_sum_agg:
+    for col in columns_sum_aggregation:
         df[col] = df[col] / df["__diff__"]
 
     new_rows = []
@@ -1090,7 +1084,7 @@ def split_segment(
     df = pd.concat(new_rows, axis=0).sort_values(by=[*id_discrete, id_continuous[1]]).reset_index(drop=True)
 
     df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]
-    for col in col_sum_agg:
+    for col in columns_sum_aggregation:
         df[col] = df[col] * df["__diff__"]
 
     df = df.drop(["__diff__", "__n_cut__", "__n_cut_dyn__"], axis=1)
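Together with the division before the split, this multiplication preserves column totals: values become per-unit densities, the rows are cut, and each new row scales the density back by its own length. A toy check of the arithmetic (numbers invented):

# Segment [0, 30) with defects = 9, split at target_size 10 into three rows.
length = 30 - 0
density = 9 / length  # 0.3 defects per unit, stored during the split

pieces = [(0, 10), (10, 20), (20, 30)]
restored = [density * (b - a) for a, b in pieces]  # [3.0, 3.0, 3.0]
assert sum(restored) == 9  # the total is preserved, not inflated to 27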
@@ -1186,7 +1180,7 @@ def homogenize_within(
        initial_ts = f"{target_size}"
        target_size = max(int(df["__diff__"].min() * 1.33), 20)
        warnings.warn(f"Specified target_size for method {method} was not congruent with segment sizes in the"
-                     " dataframe. target_size has been modified from " + initial_ts + f" to{target_size}.")
+                     " dataframe. target_size has been modified from " + initial_ts + f" to {target_size}.")
 
     if "__diff__" in df.columns:
         df = df.drop("__diff__", axis=1)
@@ -1204,7 +1198,7 @@ def homogenize_within(
         id_discrete=id_discrete,
         id_continuous=id_continuous,
         target_size=target_size // 3 if "agg" in method else target_size,
-        col_sum_agg=col_sum_agg,
+        columns_sum_aggregation=col_sum_agg,
         verbose=verbose
     )
 
@@ -1337,7 +1331,7 @@ def segmentation_irregular(
         adjusted to be as close as possible to length_target.
     """
 
-    df_new = tools.create_continuity_modified(
+    df_new = tools.create_continuity(
         df=df,
         id_discrete=id_discrete,
         id_continuous=id_continuous,
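segmentation_irregular now calls tools.create_continuity instead of tools.create_continuity_modified. A hedged call sketch of the renamed helper, assuming tools is importable as crep.tools and using only the keyword arguments visible in this hunk (the data is hypothetical):

import pandas as pd
from crep import tools

df = pd.DataFrame(
    {"track": ["a", "a"], "start": [0, 15], "end": [10, 25]})  # gap at [10, 15)

df_new = tools.create_continuity(
    df=df,
    id_discrete=["track"],
    id_continuous=["start", "end"],
)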
