Skip to content

Commit c545e7b

Browse files
Merge pull request #4 from fpavy/main
function to fill (regular) segmentation
2 parents 5b3e598 + 73e110d commit c545e7b

File tree

5 files changed

+230
-34
lines changed

5 files changed

+230
-34
lines changed

crep/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44
# You may obtain a copy of the License at
55
# https://cecill.info/
66
from crep.base import (merge, aggregate_constant, unbalanced_merge, unbalanced_concat, homogenize_within,
7-
aggregate_duplicates, merge_event)
7+
aggregate_duplicates, merge_event, fill_segmentation)
88
from crep.tools import compute_discontinuity

crep/base.py

Lines changed: 192 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pandas as pd
1111

1212
from crep import tools
13+
from crep.tools import concretize_aggregation
1314

1415

1516
def merge(
@@ -1023,6 +1024,7 @@ def split_segment(
10231024
id_discrete: list[Any],
10241025
id_continuous: [Any, Any],
10251026
target_size: int,
1027+
col_sum_agg: list[str] = [],
10261028
verbose: bool = False
10271029
) -> pd.DataFrame:
10281030
"""
@@ -1038,6 +1040,14 @@ def split_segment(
10381040
continuous columns that delimit the segments' start and end
10391041
target_size: integer > 0
10401042
targeted segment size
1043+
col_sum_agg: list[str], optional
1044+
Defaults to an empty list. Some columns may have to be summed over several segments when creating super segments.
1045+
If so, splitting a row and assigning to each new row the same value as in the original non-split row may
1046+
result in inflated sums later on. To counter that, the columns that should later be summed are specified in
1047+
this list. The values are transformed into ratios relative to the segment size, then the row is split, and
1048+
then an inverse transformation is done to reassign a non-ratio value.
1049+
hist : optional. boolean
1050+
if True, display a histogram of the segment size post aggregation
10411051
verbose: optional. boolean
10421052
whether to print shape of df and if df is admissible at the end of the function.
10431053
@@ -1059,6 +1069,9 @@ def split_segment(
10591069
if "__diff__" not in df.columns:
10601070
df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]
10611071

1072+
for col in col_sum_agg:
1073+
df[col] = df[col] / df["__diff__"]
1074+
10621075
new_rows = []
10631076
while df["__n_cut_dyn__"].max() > 0:
10641077
df_temp = df.loc[df["__n_cut_dyn__"] >= 1, :].copy()
@@ -1073,6 +1086,11 @@ def split_segment(
10731086
new_rows.append(df_temp)
10741087
df["__n_cut_dyn__"] -= 1
10751088
df = pd.concat(new_rows, axis=0).sort_values(by=[*id_discrete, id_continuous[1]]).reset_index(drop=True)
1089+
1090+
df["__diff__"] = df[id_continuous[1]] - df[id_continuous[0]]
1091+
for col in col_sum_agg:
1092+
df[col] = df[col] * df["__diff__"]
1093+
10761094
df = df.drop(["__diff__", "__n_cut__", "__n_cut_dyn__"], axis=1)
10771095

10781096
if verbose:
@@ -1087,15 +1105,15 @@ def homogenize_within(
10871105
df: pd.DataFrame,
10881106
id_discrete: list[Any],
10891107
id_continuous: [Any, Any],
1108+
target_size: float | int | None = None,
10901109
method: Literal["agg", "split"] | list[Literal["agg", "split"]] | set[Literal["agg", "split"]] | None = None,
1091-
target_size: None | int = None,
10921110
dict_agg: dict[str, list[Any]] | None = None,
10931111
strict_size: bool = False,
10941112
verbose: bool = False
10951113
) -> pd.DataFrame:
10961114
"""
10971115
Uniformizes segment size by splitting them into shorter segments close to target size. The uniformization aims
1098-
to get a close a possible to target_size with +- 1.33 * target_size as maximum error margin.
1116+
to get as close as possible to target_size with +- 1.33 * target_size as maximum error margin.
10991117
11001118
Parameters
11011119
----------
@@ -1105,11 +1123,11 @@ def homogenize_within(
11051123
discrete columns (object or categorical)
11061124
id_continuous : list of 2 column names
11071125
continuous columns that delimit the segments' start and end
1126+
target_size: optional, integer > 0 or None
1127+
targeted segment size. If None, the median is selected.
11081128
method : optional str, either "agg" or "split"
11091129
Whether to homogenize segment length by splitting long segments ("split") or by aggregating short segments ("agg") or both.
11101130
Default to None lets the function define the method.
1111-
target_size: optional, integer > 0 or None
1112-
targeted segment size. Default to None lets the function define the target size.
11131131
strict_size: whether to strictly respect target_size specified in argument, if any specified.
11141132
The function can change the target size if the value is not congruent with the method
11151133
dict_agg: optional. dict, keys: agg operator, values: list of columns or None,
@@ -1118,6 +1136,11 @@ def homogenize_within(
11181136
verbose: optional. boolean
11191137
whether to print shape of df and if df is admissible at the end of the function.
11201138
1139+
Raises
1140+
------
1141+
Exception:
1142+
If method is not defined and if the function failed to select automatically a method.
1143+
11211144
Returns
11221145
-------
11231146
df: pandas dataframe
@@ -1145,41 +1168,41 @@ def homogenize_within(
11451168
"not specified and 'agg' method was not specified either.")
11461169

11471170
if len(method) == 0:
1148-
if df["__diff__"].min() < 54 and agg_applicable:
1171+
if df["__diff__"].min() < target_size / 1.5 and agg_applicable:
11491172
method.add("agg")
1150-
elif df["__diff__"].max() > 216:
1173+
elif df["__diff__"].max() > target_size * 1.33:
11511174
method.add("split")
1152-
elif df["__diff__"].min() > 108:
1153-
method.add("split")
1154-
elif agg_applicable:
1155-
method.add("agg")
11561175
else:
1157-
method.add("split")
1176+
warnings.warn("No method selected. Please, check whether the dataframe is admissible and "
1177+
"whether the target size is coherent given the size of the segments in the dataframe.")
11581178

11591179
if target_size is None:
1160-
if df["__diff__"].min() < 108 < df["__diff__"].max():
1161-
target_size = 108
1162-
else:
1163-
target_size = int(df["__diff__"].median())
1180+
target_size = int(df["__diff__"].median())
1181+
warnings.warn(f"Unspecified target size set at median: {target_size}")
11641182

11651183
if "agg" not in method and target_size > min_thresh and not strict_size:
11661184
initial_ts = f"{target_size}"
11671185
target_size = max(int(df["__diff__"].min() * 1.33), 20)
11681186
warnings.warn(f"Specified target_size for method {method} was not congruent with segment sizes in the"
1169-
f" dataframe. "
1170-
"target_size has been modified from " + initial_ts + f" to{target_size}.")
1187+
" dataframe. target_size has been modified from " + initial_ts + f" to{target_size}.")
11711188

11721189
if "__diff__" in df.columns:
11731190
df = df.drop("__diff__", axis=1)
11741191

11751192
# ==================
11761193
# apply method(s)
1194+
col_sum_agg = []
1195+
if dict_agg is not None:
1196+
if "sum" in dict_agg.keys():
1197+
col_sum_agg = dict_agg["sum"]
1198+
11771199
if "split" in method or ("agg" in method and target_size < min_thresh):
11781200
df = split_segment(
11791201
df=df,
11801202
id_discrete=id_discrete,
11811203
id_continuous=id_continuous,
11821204
target_size=target_size // 3 if "agg" in method else target_size,
1205+
col_sum_agg=col_sum_agg,
11831206
verbose=verbose
11841207
)
11851208

@@ -1200,7 +1223,10 @@ def homogenize_between(
12001223
df1: pd.DataFrame,
12011224
df2: pd.DataFrame,
12021225
id_discrete: list[Any],
1203-
id_continuous: [Any, Any],
1226+
id_continuous: list[Any],
1227+
dict_agg_df1: dict[str, list[str]] | None = None,
1228+
dict_agg_df2: dict[str, list[str]] | None = None,
1229+
keep_df1: bool = False,
12041230
verbose: bool = False
12051231
) -> tuple[pd.DataFrame, pd.DataFrame]:
12061232
"""
@@ -1230,6 +1256,12 @@ def homogenize_between(
12301256
discrete columns (object or categorical)
12311257
id_continuous : list of 2 column names
12321258
continuous columns that delimit the segments' start and end
1259+
dict_agg_df1: optional, dict[str, list[str]] | None
1260+
dictionary with settings about how to handle the columns in df1 that are neither id_discrete nor id_continuous
1261+
dict_agg_df2: optional, dict[str, list[str]] | None
1262+
dictionary with settings about how to handle the columns in df2 that are neither id_discrete nor id_continuous
1263+
keep_df1: optional, bool
1264+
default to False. If True, the segmentation in df1 does not change. Only df2 adapts to df1.
12331265
verbose: optional. boolean
12341266
whether to print shape of df and if df is admissible at the end of the function.
12351267
@@ -1250,22 +1282,26 @@ def homogenize_between(
12501282
target_size = int(1.33 * min_diff)
12511283
else:
12521284
target_size = int(1.33 * min_diff_ref)
1285+
print(f"homogenize_between: chosen target size: {target_size}")
12531286

12541287
df2 = homogenize_within(
12551288
df=df2.drop("__diff__", axis=1),
12561289
id_discrete=id_discrete,
12571290
id_continuous=id_continuous,
12581291
target_size=target_size,
1292+
dict_agg=dict_agg_df2,
12591293
verbose=verbose
12601294
)
12611295

1262-
df1 = homogenize_within(
1263-
df=df1.drop("__diff__", axis=1),
1264-
id_discrete=id_discrete,
1265-
id_continuous=id_continuous,
1266-
target_size=target_size,
1267-
verbose=verbose
1268-
)
1296+
if not keep_df1:
1297+
df1 = homogenize_within(
1298+
df=df1.drop("__diff__", axis=1),
1299+
id_discrete=id_discrete,
1300+
id_continuous=id_continuous,
1301+
target_size=target_size,
1302+
dict_agg=dict_agg_df1,
1303+
verbose=verbose
1304+
)
12691305

12701306
return df1, df2
12711307

@@ -1309,7 +1345,48 @@ def segmentation_irregular(
13091345
length_target,
13101346
length_minimal,
13111347
) -> pd.DataFrame:
1312-
return df
1348+
"""
1349+
Parameters
1350+
----------
1351+
df: pd.DataFrame
1352+
id_discrete: list[str]
1353+
list of name of columns of categorical type
1354+
id_continuous: list[str, str]
1355+
list of name of 2 columns of numerical type, indicating the start and the end of the segment
1356+
length_target
1357+
length to obtain at the end of the segmentation
1358+
length_minimal
1359+
When there are gaps in the dataframe, define the length beyond which this could be considered as a
1360+
deliberate break in the segmentation and not as missing data. Under this threshold, a new row will
1361+
be created to ensure the continuity between successive segments in the dataframe.
1362+
1363+
Returns
1364+
-------
1365+
pd.DataFrame
1366+
New dataframe containing only the columns id_discrete and id_continuous, with the length of the segments
1367+
adjusted to be as close as possible to length_target.
1368+
"""
1369+
1370+
df_new = tools.create_continuity_modified(
1371+
df=df,
1372+
id_discrete=id_discrete,
1373+
id_continuous=id_continuous,
1374+
limit=length_minimal,
1375+
sort=False
1376+
)
1377+
1378+
df_new = homogenize_within(
1379+
df=df_new[[*id_discrete, *id_continuous]],
1380+
id_discrete=id_discrete,
1381+
id_continuous=id_continuous,
1382+
method=["agg", "split"],
1383+
target_size=length_target,
1384+
dict_agg=None,
1385+
strict_size=False,
1386+
verbose=False
1387+
)
1388+
1389+
return df_new
13131390

13141391

13151392
def segmentation_regular(
@@ -1358,3 +1435,92 @@ def segmentation_regular(
13581435
df_new.index = range(len(df_new))
13591436

13601437
return df_new
1438+
1439+
1440+
def fill_segmentation(
1441+
df_segmentation: pd.DataFrame,
1442+
df_features: pd.DataFrame,
1443+
id_discrete: list[str],
1444+
id_continuous: list[str],
1445+
dict_agg: dict[str, list[str]] | None = None
1446+
):
1447+
"""
1448+
Adds feature data from df_features onto the segmentation defined by df_segmentation.
1449+
1450+
Parameters
1451+
----------
1452+
df_segmentation: pd.DataFrame
1453+
the dataframe containing the segmentation. Should contain only columns id_discrete and id_continuous
1454+
df_features: pd.DataFrame
1455+
the dataframe containing the features to fit to the segmentation. Should contain the columns
1456+
id_discrete and id_continuous as well as other columns for the features of interest.
1457+
id_discrete
1458+
id_continuous
1459+
dict_agg:
1460+
1461+
Returns
1462+
-------
1463+
pd.DataFrame:
1464+
a dataframe with the feature data fitted to the new segmentation.
1465+
"""
1466+
# verification of requirements
1467+
for col in id_continuous + id_discrete:
1468+
if col not in df_segmentation.columns or col not in df_features.columns:
1469+
raise Exception(f"Error: {col} is not present in both dataframes df_segm and df_feat.")
1470+
1471+
is_df_segm_admissible = tools.admissible_dataframe(
1472+
data=df_segmentation, id_discrete=id_discrete, id_continuous=id_continuous
1473+
)
1474+
is_df_feat_admissible = tools.admissible_dataframe(
1475+
data=df_features, id_discrete=id_discrete, id_continuous=id_continuous
1476+
)
1477+
if not is_df_segm_admissible or not is_df_feat_admissible:
1478+
raise Exception("Error: Both dataframes should be admissible:"
1479+
f"Is df_segm admissible? {is_df_segm_admissible}"
1480+
f"Is df_feat admissible? {is_df_feat_admissible}")
1481+
1482+
# homogenize_between() reduces the difference in segment size between df_feat and df_segm. More precisely, it
1483+
# adjusts df_feat to df_segm. This may reduce the risk of error when using merge().
1484+
df_segmentation, df_features = homogenize_between(
1485+
df1=df_segmentation,
1486+
df2=df_features,
1487+
id_discrete=id_discrete,
1488+
id_continuous=id_continuous,
1489+
dict_agg_df1=None,
1490+
dict_agg_df2=dict_agg,
1491+
keep_df1=True,
1492+
verbose=False
1493+
)
1494+
1495+
df_segmentation["__id__"] = 1
1496+
df_segmentation["__id__"] = df_segmentation["__id__"].cumsum()
1497+
1498+
# merging the segmentations in both df
1499+
df_merge = merge(
1500+
data_left=df_segmentation,
1501+
data_right=df_features,
1502+
id_continuous=id_continuous,
1503+
id_discrete=id_discrete,
1504+
how="left",
1505+
remove_duplicates=False,
1506+
verbose=False
1507+
)
1508+
1509+
# groupby based on the settings in dict_agg and based on grouping variable __id__
1510+
df_merge = concretize_aggregation(
1511+
df=df_merge,
1512+
id_discrete=id_discrete,
1513+
id_continuous=id_continuous,
1514+
dict_agg=dict_agg,
1515+
add_group_by="__id__",
1516+
verbose=False
1517+
)
1518+
1519+
df_merge = df_merge.drop(columns=["__id__"])
1520+
1521+
df_merge = tools.reorder_columns(df_merge, id_discrete, id_continuous)
1522+
1523+
return df_merge
1524+
1525+
1526+

0 commit comments

Comments
 (0)