1010import pandas as pd
1111
1212from crep import tools
13+ from crep .tools import concretize_aggregation
1314
1415
1516def merge (
@@ -1023,6 +1024,7 @@ def split_segment(
10231024 id_discrete : list [Any ],
10241025 id_continuous : [Any , Any ],
10251026 target_size : int ,
1027+ col_sum_agg : list [str ] = [],
10261028 verbose : bool = False
10271029) -> pd .DataFrame :
10281030 """
@@ -1038,6 +1040,14 @@ def split_segment(
10381040 continuous columns that delimit the segments' start and end
10391041 target_size: integer > 0
10401042 targeted segment size
1043+ col_sum_agg: list[str], optional
1044+ Default to empty list. Some columns may have to be summed over several segments when creating super segments.
1045+ If so, splitting a row and assigning to each new row the same value as in the original non-split row may
1046+ result in inflated sums later on. To counter that, the columns that should later be summed are specified in
1047+ this list. The values are transformed into ratios relative to the segment size, then the row is split, and
1048+ then an inverse transformation is done to reassign a non-ratio value.
1049+ hist : optional. boolean
1050+ if True, display a histogram of the segment size post aggregation
10411051 verbose: optional. boolean
10421052 whether to print shape of df and if df is admissible at the end of the function.
10431053
@@ -1059,6 +1069,9 @@ def split_segment(
10591069 if "__diff__" not in df .columns :
10601070 df ["__diff__" ] = df [id_continuous [1 ]] - df [id_continuous [0 ]]
10611071
1072+ for col in col_sum_agg :
1073+ df [col ] = df [col ] / df ["__diff__" ]
1074+
10621075 new_rows = []
10631076 while df ["__n_cut_dyn__" ].max () > 0 :
10641077 df_temp = df .loc [df ["__n_cut_dyn__" ] >= 1 , :].copy ()
@@ -1073,6 +1086,11 @@ def split_segment(
10731086 new_rows .append (df_temp )
10741087 df ["__n_cut_dyn__" ] -= 1
10751088 df = pd .concat (new_rows , axis = 0 ).sort_values (by = [* id_discrete , id_continuous [1 ]]).reset_index (drop = True )
1089+
1090+ df ["__diff__" ] = df [id_continuous [1 ]] - df [id_continuous [0 ]]
1091+ for col in col_sum_agg :
1092+ df [col ] = df [col ] * df ["__diff__" ]
1093+
10761094 df = df .drop (["__diff__" , "__n_cut__" , "__n_cut_dyn__" ], axis = 1 )
10771095
10781096 if verbose :
@@ -1087,15 +1105,15 @@ def homogenize_within(
10871105 df : pd .DataFrame ,
10881106 id_discrete : list [Any ],
10891107 id_continuous : [Any , Any ],
1108+ target_size : float | int | None = None ,
10901109 method : Literal ["agg" , "split" ] | list [Literal ["agg" , "split" ]] | set [Literal ["agg" , "split" ]] | None = None ,
1091- target_size : None | int = None ,
10921110 dict_agg : dict [str , list [Any ]] | None = None ,
10931111 strict_size : bool = False ,
10941112 verbose : bool = False
10951113) -> pd .DataFrame :
10961114 """
10971115 Uniformizes segment size by splitting them into shorter segments close to target size. The uniformization aims
1098- to get a close a possible to target_size with +- 1.33 * target_size as maximum error margin.
1116+ to get as close as possible to target_size with +- 1.33 * target_size as maximum error margin.
10991117
11001118 Parameters
11011119 ----------
@@ -1105,11 +1123,11 @@ def homogenize_within(
11051123 discrete columns (object or categorical)
11061124 id_continuous : list of 2 column names
11071125 continuous columns that delimit the segments' start and end
1126+ target_size: optional, integer > 0 or None
1127+ targeted segment size. If None, the median is selected.
11081128 method : optional str, either "agg" or "split"
11091129 Whether to homogenize segment length by splitting long segments ("split") or by aggregating short segments ("agg") or both.
11101130 Default to None lets the function define the method.
1111- target_size: optional, integer > 0 or None
1112- targeted segment size. Default to None lets the function define the target size.
11131131 strict_size: whether to strictly respect target_size specified in argument, if any specified.
11141132 The function can change the target size if the value is not congruent with the method
11151133 dict_agg: optional. dict, keys: agg operator, values: list of columns or None,
@@ -1118,6 +1136,11 @@ def homogenize_within(
11181136 verbose: optional. boolean
11191137 whether to print shape of df and if df is admissible at the end of the function.
11201138
1139+ Raises
1140+ ------
1141+ Exception:
1142+ If method is not defined and the function fails to select a method automatically.
1143+
11211144 Returns
11221145 -------
11231146 df: pandas dataframe
@@ -1145,41 +1168,41 @@ def homogenize_within(
11451168 "not specified and 'agg' method was not specified either." )
11461169
11471170 if len (method ) == 0 :
1148- if df ["__diff__" ].min () < 54 and agg_applicable :
1171+ if df ["__diff__" ].min () < target_size / 1.5 and agg_applicable :
11491172 method .add ("agg" )
1150- elif df ["__diff__" ].max () > 216 :
1173+ elif df ["__diff__" ].max () > target_size * 1.33 :
11511174 method .add ("split" )
1152- elif df ["__diff__" ].min () > 108 :
1153- method .add ("split" )
1154- elif agg_applicable :
1155- method .add ("agg" )
11561175 else :
1157- method .add ("split" )
1176+ warnings .warn ("No method selected. Please, check whether the dataframe is admissible and "
1177+ "whether the target size is coherent given the size of the segments in the dataframe." )
11581178
11591179 if target_size is None :
1160- if df ["__diff__" ].min () < 108 < df ["__diff__" ].max ():
1161- target_size = 108
1162- else :
1163- target_size = int (df ["__diff__" ].median ())
1180+ target_size = int (df ["__diff__" ].median ())
1181+ warnings .warn (f"Unspecified target size set at median: { target_size } " )
11641182
11651183 if "agg" not in method and target_size > min_thresh and not strict_size :
11661184 initial_ts = f"{ target_size } "
11671185 target_size = max (int (df ["__diff__" ].min () * 1.33 ), 20 )
11681186 warnings .warn (f"Specified target_size for method { method } was not congruent with segment sizes in the"
1169- f" dataframe. "
1170- "target_size has been modified from " + initial_ts + f" to{ target_size } ." )
1187+ " dataframe. target_size has been modified from " + initial_ts + f" to{ target_size } ." )
11711188
11721189 if "__diff__" in df .columns :
11731190 df = df .drop ("__diff__" , axis = 1 )
11741191
11751192 # ==================
11761193 # apply method(s)
1194+ col_sum_agg = []
1195+ if dict_agg is not None :
1196+ if "sum" in dict_agg .keys ():
1197+ col_sum_agg = dict_agg ["sum" ]
1198+
11771199 if "split" in method or ("agg" in method and target_size < min_thresh ):
11781200 df = split_segment (
11791201 df = df ,
11801202 id_discrete = id_discrete ,
11811203 id_continuous = id_continuous ,
11821204 target_size = target_size // 3 if "agg" in method else target_size ,
1205+ col_sum_agg = col_sum_agg ,
11831206 verbose = verbose
11841207 )
11851208
@@ -1200,7 +1223,10 @@ def homogenize_between(
12001223 df1 : pd .DataFrame ,
12011224 df2 : pd .DataFrame ,
12021225 id_discrete : list [Any ],
1203- id_continuous : [Any , Any ],
1226+ id_continuous : list [Any ],
1227+ dict_agg_df1 : dict [str , list [str ]] | None = None ,
1228+ dict_agg_df2 : dict [str , list [str ]] | None = None ,
1229+ keep_df1 : bool = False ,
12041230 verbose : bool = False
12051231) -> tuple [pd .DataFrame , pd .DataFrame ]:
12061232 """
@@ -1230,6 +1256,12 @@ def homogenize_between(
12301256 discrete columns (object or categorical)
12311257 id_continuous : list of 2 column names
12321258 continuous columns that delimit the segments' start and end
1259+ dict_agg_df1: optional, dict[str, list[str]] | None
1260+ dictionary with settings about how to handle the columns in df1 that are neither id_discrete nor id_continuous
1261+ dict_agg_df2: optional, dict[str, list[str]] | None
1262+ dictionary with settings about how to handle the columns in df2 that are neither id_discrete nor id_continuous
1263+ keep_df1: optional, bool
1264+ default to False. If True, the segmentation in df1 does not change. Only df2 adapts to df1.
12331265 verbose: optional. boolean
12341266 whether to print shape of df and if df is admissible at the end of the function.
12351267
@@ -1250,22 +1282,26 @@ def homogenize_between(
12501282 target_size = int (1.33 * min_diff )
12511283 else :
12521284 target_size = int (1.33 * min_diff_ref )
1285+ print (f"homogenize_between: chosen target size: { target_size } " )
12531286
12541287 df2 = homogenize_within (
12551288 df = df2 .drop ("__diff__" , axis = 1 ),
12561289 id_discrete = id_discrete ,
12571290 id_continuous = id_continuous ,
12581291 target_size = target_size ,
1292+ dict_agg = dict_agg_df2 ,
12591293 verbose = verbose
12601294 )
12611295
1262- df1 = homogenize_within (
1263- df = df1 .drop ("__diff__" , axis = 1 ),
1264- id_discrete = id_discrete ,
1265- id_continuous = id_continuous ,
1266- target_size = target_size ,
1267- verbose = verbose
1268- )
1296+ if not keep_df1 :
1297+ df1 = homogenize_within (
1298+ df = df1 .drop ("__diff__" , axis = 1 ),
1299+ id_discrete = id_discrete ,
1300+ id_continuous = id_continuous ,
1301+ target_size = target_size ,
1302+ dict_agg = dict_agg_df1 ,
1303+ verbose = verbose
1304+ )
12691305
12701306 return df1 , df2
12711307
@@ -1309,7 +1345,48 @@ def segmentation_irregular(
13091345 length_target ,
13101346 length_minimal ,
13111347) -> pd .DataFrame :
1312- return df
1348+ """
1349+ Parameters
1350+ ----------
1351+ df: pd.DataFrame
1352+ id_discrete: list[str]
1353+ list of name of columns of categorical type
1354+ id_continuous: list[str, str]
1355+ list of name of 2 columns of numerical type, indicating the start and the end of the segment
1356+ length_target
1357+ length to obtain at the end of the segmentation
1358+ length_minimal
1359+ When there are gaps in the dataframe, define the length beyond which this could be considered as a
1360+ deliberate break in the segmentation and not as missing data. Under this threshold, a new row will
1361+ be created to ensure the continuity between successive segments in the dataframe.
1362+
1363+ Returns
1364+ -------
1365+ pd.DataFrame
1366+ New dataframe containing only the columns id_discrete and id_continuous, with the length of the segments
1367+ adjusted to be as close as possible to length_target.
1368+ """
1369+
1370+ df_new = tools .create_continuity_modified (
1371+ df = df ,
1372+ id_discrete = id_discrete ,
1373+ id_continuous = id_continuous ,
1374+ limit = length_minimal ,
1375+ sort = False
1376+ )
1377+
1378+ df_new = homogenize_within (
1379+ df = df_new [[* id_discrete , * id_continuous ]],
1380+ id_discrete = id_discrete ,
1381+ id_continuous = id_continuous ,
1382+ method = ["agg" , "split" ],
1383+ target_size = length_target ,
1384+ dict_agg = None ,
1385+ strict_size = False ,
1386+ verbose = False
1387+ )
1388+
1389+ return df_new
13131390
13141391
13151392def segmentation_regular (
@@ -1358,3 +1435,92 @@ def segmentation_regular(
13581435 df_new .index = range (len (df_new ))
13591436
13601437 return df_new
1438+
1439+
1440+ def fill_segmentation (
1441+ df_segmentation : pd .DataFrame ,
1442+ df_features : pd .DataFrame ,
1443+ id_discrete : list [str ],
1444+ id_continuous : list [str ],
1445+ dict_agg : dict [str , list [str ]] | None = None
1446+ ):
1447+ """
1448+ adds data to segmentation
1449+
1450+ Parameters
1451+ ----------
1452+ df_segmentation: pd.DataFrame
1453+ the dataframe containing the segmentation. Should contain only columns id_discrete and id_continuous
1454+ df_features: pd.DataFrame
1455+ the dataframe containing the features to fit to the segmentation. Should contain the columns
1456+ id_discrete and id_continuous as well as other columns for the features of interest.
1457+ id_discrete
1458+ id_continuous
1459+ dict_agg:
1460+
1461+ Returns
1462+ -------
1463+ pd.DataFrame:
1464+ a dataframe with the feature data fitted to the new segmentation.
1465+ """
1466+ # verification of requirements
1467+ for col in id_continuous + id_discrete :
1468+ if col not in df_segmentation .columns or col not in df_features .columns :
1469+ raise Exception (f"Error: { col } is not present in both dataframes df_segm and df_feat." )
1470+
1471+ is_df_segm_admissible = tools .admissible_dataframe (
1472+ data = df_segmentation , id_discrete = id_discrete , id_continuous = id_continuous
1473+ )
1474+ is_df_feat_admissible = tools .admissible_dataframe (
1475+ data = df_features , id_discrete = id_discrete , id_continuous = id_continuous
1476+ )
1477+ if not is_df_segm_admissible or not is_df_feat_admissible :
1478+ raise Exception ("Error: Both dataframes should be admissible:"
1479+ f"Is df_segm admissible? { is_df_segm_admissible } "
1480+ f"Is df_feat admissible? { is_df_feat_admissible } " )
1481+
1482+ # homogenize_between() reduces the difference in segment size between df_feat and df_segm. More precisely, it
1483+ # adjusts df_feat to df_segm. This may reduce the risk of error when using merge().
1484+ df_segmentation , df_features = homogenize_between (
1485+ df1 = df_segmentation ,
1486+ df2 = df_features ,
1487+ id_discrete = id_discrete ,
1488+ id_continuous = id_continuous ,
1489+ dict_agg_df1 = None ,
1490+ dict_agg_df2 = dict_agg ,
1491+ keep_df1 = True ,
1492+ verbose = False
1493+ )
1494+
1495+ df_segmentation ["__id__" ] = 1
1496+ df_segmentation ["__id__" ] = df_segmentation ["__id__" ].cumsum ()
1497+
1498+ # merging the segmentations in both df
1499+ df_merge = merge (
1500+ data_left = df_segmentation ,
1501+ data_right = df_features ,
1502+ id_continuous = id_continuous ,
1503+ id_discrete = id_discrete ,
1504+ how = "left" ,
1505+ remove_duplicates = False ,
1506+ verbose = False
1507+ )
1508+
1509+ # groupby based on the settings in dict_agg and based on grouping variable __id__
1510+ df_merge = concretize_aggregation (
1511+ df = df_merge ,
1512+ id_discrete = id_discrete ,
1513+ id_continuous = id_continuous ,
1514+ dict_agg = dict_agg ,
1515+ add_group_by = "__id__" ,
1516+ verbose = False
1517+ )
1518+
1519+ df_merge = df_merge .drop (columns = ["__id__" ])
1520+
1521+ df_merge = tools .reorder_columns (df_merge , id_discrete , id_continuous )
1522+
1523+ return df_merge
1524+
1525+
1526+
0 commit comments