@@ -86,7 +86,7 @@ def merge(
     df.index = range(len(df))
     if remove_duplicates:
         df = suppress_duplicates(df, id_discrete=id_discrete,
-                                  continuous_index=id_continuous)
+                                  id_continuous=id_continuous)
     if verbose:
         print("[merge] nb rows left table frame ", data_left.shape[0])
         print("[merge] nb rows right table frame ", data_right.shape[0])
@@ -96,7 +96,9 @@ def merge(

 def unbalanced_merge(
         data_admissible: pd.DataFrame,
-        data_not_admissible: pd.DataFrame, id_discrete: iter, id_continuous: [Any, Any]) -> pd.DataFrame:
+        data_not_admissible: pd.DataFrame,
+        id_discrete: iter,
+        id_continuous: [Any, Any]) -> pd.DataFrame:
     """
     Merge admissible and non-admissible dataframes based on discrete and continuous identifiers.

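# Illustrative usage sketch (not part of the patch): the call shape of unbalanced_merge
# after the signature reflow, assuming the function is imported from this module.
# Column names ("track", "start", "end", "speed", "defect") and the toy data are
# hypothetical; "admissible" presumably means a clean, non-overlapping segmentation
# per discrete id, while the other frame may not satisfy that.
import pandas as pd

admissible = pd.DataFrame({
    "track": ["A", "A"], "start": [0, 50], "end": [50, 100], "speed": [60, 80],
})
not_admissible = pd.DataFrame({
    "track": ["A", "A"], "start": [10, 30], "end": [40, 90], "defect": ["x", "y"],
})
result = unbalanced_merge(
    data_admissible=admissible,
    data_not_admissible=not_admissible,
    id_discrete=["track"],
    id_continuous=["start", "end"],
)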
@@ -464,7 +466,8 @@ def aggregate_constant(df: pd.DataFrame,
     return data_merge[df.columns].drop_duplicates().astype(dtypes)


-def __merge_index(data_left, data_right,
+def __merge_index(data_left,
+                  data_right,
                   id_discrete,
                   id_continuous,
                   names=("left", "right")):
@@ -479,19 +482,15 @@ def __merge_index(data_left, data_right,


 def merge_event(
-        data_left: pd.DataFrame, data_right: pd.DataFrame,
+        data_left: pd.DataFrame,
+        data_right: pd.DataFrame,
         id_discrete: iter,
         id_continuous: [Any, Any],
         id_event
 ):
     """
-    Merges two dataframes on both discrete and continuous indices, with forward-filling of missing data.
-
-    This function merges two Pandas DataFrames (`data_left` and `data_right`) based on discrete and continuous keys.
-    It assigns the event data from data_right to the correct segment in data_left, if the event is not "out-of-bound"
-    relative to the segments in data_left. The result is a dataframe with a new row for each event. Rows with NaN
-    event data are kept to represent the segment state prior to the occurrence of any event (as such the returned
-    dataframe contains duplicates based on subsets of columns id_discrete and id_continuous).
+    Assigns the details of events occurring at specific points in data_right to the corresponding
+    segment in data_left.

     Parameters
     ----------
@@ -514,51 +513,46 @@ def merge_event(
         A merged dataframe that combines `data_left` and `data_right`.

     """
-    data_left_ = data_left.__deepcopy__()
-    data_right_ = data_right.__deepcopy__()
-    data_left_ = _increasing_continuous_index(data_left_, id_continuous)
-
-    data_left_ = data_left_.reset_index(drop=True)
-    data_right_ = data_right_.reset_index(drop=True)
-
-    data_left_["__t__"] = True
-    data_right_["__t__"] = False
-
-    df_merge = pd.concat([data_left_, data_right_], axis=0)
-    df_merge.loc[df_merge["__t__"], id_event] = df_merge.loc[df_merge["__t__"], id_continuous[1]]
-    df_merge = df_merge.sort_values(by=[*id_discrete, id_event]).reset_index(drop=True)
-
-    # event in data_right_ can be out-of-bound based on segments in data_left_.
-    mask = (~df_merge[id_discrete].eq(df_merge[id_discrete].shift()))
-    df_merge["__new_seg__"] = mask.sum(axis=1) > 0
-    df_merge["__new_seg_b_"] = np.nan
-    df_merge.loc[df_merge["__t__"], "__new_seg_b_"] = df_merge.loc[df_merge["__t__"], "__new_seg__"]
-    df_merge["__new_seg_b_"] = df_merge["__new_seg_b_"].bfill()
-    df_merge["__oob__"] = False
-    df_merge.loc[(~df_merge["__t__"]) & df_merge["__new_seg_b_"], "__oob__"] = True
-
-    if df_merge["__oob__"].sum() > 1:
-        warnings.warn("Not all events in data_right could be associated with a segment from data_left.")
-        print(f"Dropped: {df_merge['__oob__'].sum()}/{data_left_.shape[0]} rows")
-    df_merge = df_merge.loc[~df_merge["__oob__"], :].reset_index(drop=True)
-
-    # assign event data to
-    df_merge.loc[~df_merge["__t__"], id_continuous] = np.nan
-    df_merge[data_left_.columns] = df_merge[data_left_.columns].bfill()
-    df_merge.loc[df_merge["__t__"], id_event] = np.nan
-
-    df_merge = df_merge.sort_values(
-        by=[*id_discrete, id_continuous[1], "__t__"],
-        ascending=[*[True] * len(id_discrete), True, False]
-    ).reset_index(drop=True)
-    df_merge = df_merge.drop(columns=[col for col in df_merge.columns if "__" in col])
-
-    cols_left = [col for col in data_left.columns if col not in data_right.columns]
-    cols_right = [col for col in data_right.columns if col not in data_left.columns]
-
-    df_merge = df_merge[id_discrete + id_continuous + cols_left + cols_right]
+    if not tools.admissible_dataframe(data=data_left, id_discrete=id_discrete, id_continuous=id_continuous):
+        raise Exception("The left dataframe is not admissible. Consider using aggregate_duplicates() and "
+                        "tools.build_admissible_data() if you want to make the dataframe admissible.")
+    else:
+        df1 = data_left.copy()
+        df2 = data_right.copy()
+        df1 = df1.fillna(1234.56789)
+        df1["__t__"] = True
+        df2["__t__"] = False
+        df = pd.concat([df1, df2], axis=0)
+        df.loc[df["__t__"], id_event] = df.loc[df["__t__"], id_continuous[1]]
+        df = df.sort_values(by=[*id_discrete, id_event]).reset_index(drop=True)
+
+        # ========== identify match with pk event in df2 and concerned row in df1 ==========
+        mask = (~df[id_discrete].eq(df[id_discrete].shift()))
+        df["__new_seg__"] = mask.sum(axis=1) > 0
+        df["__new_seg_b_"] = np.nan
+        df.loc[df["__t__"], "__new_seg_b_"] = df.loc[df["__t__"], "__new_seg__"]
+        df["__new_seg_b_"] = df["__new_seg_b_"].bfill()
+        df["__no_match__"] = False
+        df.loc[(~df["__t__"]) & df["__new_seg_b_"], "__no_match__"] = True
+
+        if df["__no_match__"].sum() > 1:
+            warnings.warn("Not all events in data_right could be associated with a segment in data_left.")
+            print(f"In merge_event, dropped: {df['__no_match__'].sum()}/{df2.shape[0]} rows")
+        df = df.loc[~df["__no_match__"], :].reset_index(drop=True)
+
+        # =========================== merge info from df1 and df2 ===========================
+        df.loc[~df["__t__"], id_continuous] = np.nan
+        df[df1.columns] = df[df1.columns].bfill()
+        df.loc[df["__t__"], id_event] = np.nan
+        df = df.replace(1234.56789, np.nan)
+
+        df = df.sort_values(
+            by=[*id_discrete, id_continuous[1], "__t__"],
+            ascending=[*[True] * len(id_discrete), True, False]
+        ).reset_index(drop=True)
+        df = df.drop(columns=[col for col in df.columns if "__" in col])

-    return df_merge
+        return df


 def create_regular_segmentation(
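# Illustrative usage sketch (not part of the patch), assuming merge_event is imported
# from this module. Column names ("track", "start", "end", "pos", "severity") are
# hypothetical; data_left must now be an admissible segmentation, otherwise the
# reworked function raises.
import pandas as pd

segments = pd.DataFrame({
    "track": ["A", "A"], "start": [0, 100], "end": [100, 200], "state": ["good", "worn"],
})
events = pd.DataFrame({
    "track": ["A", "A"], "pos": [40, 150], "severity": [2, 5],
})
merged = merge_event(
    data_left=segments,
    data_right=events,
    id_discrete=["track"],
    id_continuous=["start", "end"],
    id_event="pos",
)
# Each event that falls inside a segment becomes an extra row carrying that segment's
# attributes; events that cannot be matched are dropped with a warning.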
@@ -745,14 +739,14 @@ def __fix_discrete_index(
     return data_left, data_right


-def suppress_duplicates(df, id_discrete, continuous_index):
-    df = df.sort_values([*id_discrete, *continuous_index])
-    df_duplicated = df.drop([*id_discrete, *continuous_index], axis=1)
+def suppress_duplicates(df, id_discrete, id_continuous):
+    df = df.sort_values([*id_discrete, *id_continuous])
+    df_duplicated = df.drop([*id_discrete, *id_continuous], axis=1)
     mat_duplicated = pd.DataFrame(
         df_duplicated.iloc[1:].values == df_duplicated.iloc[
             :-1].values)
-    id1 = continuous_index[0]
-    id2 = continuous_index[1]
+    id1 = id_continuous[0]
+    id2 = id_continuous[1]
     index = mat_duplicated.sum(axis=1) == df_duplicated.shape[1]
     index = np.where(index)[0]
     df1 = df.iloc[index]
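# Illustrative sketch (not part of the patch) of the renamed keyword, matching how
# merge() now calls suppress_duplicates. Column names and data are hypothetical;
# consecutive segments whose non-id columns are identical (here the first two rows)
# are the duplicates this helper is meant to collapse.
import pandas as pd

df = pd.DataFrame({
    "track": ["A", "A", "A"],
    "start": [0, 50, 100],
    "end":   [50, 100, 150],
    "speed": [60, 60, 80],
})
deduplicated = suppress_duplicates(df, id_discrete=["track"], id_continuous=["start", "end"])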
@@ -1026,7 +1020,7 @@ def split_segment(
         id_discrete: list[Any],
         id_continuous: [Any, Any],
         target_size: int,
-        col_sum_agg: list[str] = None,
+        columns_sum_aggregation: list[str] = None,
         verbose: bool = False
 ) -> pd.DataFrame:
     """
@@ -1042,7 +1036,7 @@ def split_segment(
         continuous columns that delimit the segments' start and end
     target_size: integer > 0
         targeted segment size
-    col_sum_agg : list[str], optional
+    columns_sum_aggregation : list[str], optional
         Default to empty list. Some columns may have to be summed over several segments when creating super segments.
         If so, splitting a row and assigning to each new row the same value as in the original non-split row may
         result in inflated sums later on. To counter that, the columns that should later be summed are specified in
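# Worked sketch (not part of the patch) of the rescaling described in the docstring
# above: a column listed in columns_sum_aggregation is turned into a per-unit-length
# density before the split and re-multiplied by the new lengths afterwards, so the
# total is not inflated. Numbers and column names are illustrative.
import pandas as pd

seg = pd.DataFrame({"start": [0], "end": [100], "defect_count": [10.0]})
density = seg["defect_count"] / (seg["end"] - seg["start"])        # 0.1 per unit length

halves = pd.DataFrame({"start": [0, 50], "end": [50, 100]})
halves["defect_count"] = density.iloc[0] * (halves["end"] - halves["start"])
assert halves["defect_count"].sum() == seg["defect_count"].sum()   # still 10.0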
@@ -1056,8 +1050,8 @@ def split_segment(
     df: pandas dataframe
     """
     df = df.copy()
-    if col_sum_agg is None:
-        col_sum_agg = []
+    if columns_sum_aggregation is None:
+        columns_sum_aggregation = []

     df["__n_cut__"] = tools.n_cut_finder(
         df=df,
@@ -1071,7 +1065,7 @@ def split_segment(
     if "__diff__" not in df.columns:
         df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]

-    for col in col_sum_agg:
+    for col in columns_sum_aggregation:
         df[col] = df[col] / df["__diff__"]

     new_rows = []
@@ -1090,7 +1084,7 @@ def split_segment(
     df = pd.concat(new_rows, axis=0).sort_values(by=[*id_discrete, id_continuous[1]]).reset_index(drop=True)

     df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]
-    for col in col_sum_agg:
+    for col in columns_sum_aggregation:
         df[col] = df[col] * df["__diff__"]

     df = df.drop(["__diff__", "__n_cut__", "__n_cut_dyn__"], axis=1)
@@ -1186,7 +1180,7 @@ def homogenize_within(
         initial_ts = f"{target_size}"
         target_size = max(int(df["__diff__"].min() * 1.33), 20)
         warnings.warn(f"Specified target_size for method {method} was not congruent with segment sizes in the"
-                      " dataframe. target_size has been modified from " + initial_ts + f" to{target_size}.")
+                      " dataframe. target_size has been modified from " + initial_ts + f" to {target_size}.")

     if "__diff__" in df.columns:
         df = df.drop("__diff__", axis=1)
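# Illustrative arithmetic (not part of the patch) for the fallback above: when the
# requested target_size is not congruent with the existing segment lengths, it is
# replaced by max(int(smallest_segment * 1.33), 20). Values are hypothetical.
smallest_segment = 45                                    # stands in for df["__diff__"].min()
target_size = max(int(smallest_segment * 1.33), 20)      # -> 59
assert target_size == 59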
@@ -1204,7 +1198,7 @@ def homogenize_within(
         id_discrete=id_discrete,
         id_continuous=id_continuous,
         target_size=target_size // 3 if "agg" in method else target_size,
-        col_sum_agg=col_sum_agg,
+        columns_sum_aggregation=col_sum_agg,
         verbose=verbose
     )

@@ -1337,7 +1331,7 @@ def segmentation_irregular(
         adjusted to be as close as possible to length_target.
     """

-    df_new = tools.create_continuity_modified(
+    df_new = tools.create_continuity(
         df=df,
         id_discrete=id_discrete,
         id_continuous=id_continuous,