From 155ec9fc5b415ee9209b556a02b25969b8056ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Sch=C3=BCler?= Date: Tue, 3 Jun 2025 15:10:15 +0200 Subject: [PATCH 1/9] edge attributes in pandas dataframes are now also stored in the TemporalGraph in the function df_to_temporal_graph() --- src/pathpyG/core/temporal_graph.py | 4 +- src/pathpyG/io/pandas.py | 148 ++++++++++++++++++++--------- 2 files changed, 103 insertions(+), 49 deletions(-) diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index a019f12b..ef4f1caa 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -65,7 +65,7 @@ def __init__(self, data: Data, mapping: IndexMap | None = None) -> None: @staticmethod def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: - edge_array = np.array(edge_list) + edge_array = np.array(edge_list).astype(int)#changed the type to enable the right ordering in the index_map, before the ordering was lexical over the strings which lead to the wrong ordering, for example '1' '10' '2' instead of '1' '2' '10' ts = edge_array[:, 2].astype(np.number) index_map = IndexMap(np.unique(edge_array[:, :2])) @@ -77,7 +77,7 @@ def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: return TemporalGraph( data=Data( edge_index=edge_index, - time=torch.Tensor(ts), + time=torch.tensor(ts, dtype=torch.float64), #before it was float32, where the precision is to low num_nodes=num_nodes, ), mapping=index_map, diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index d4b1a789..28359eb7 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -189,61 +189,115 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph) -> None: """Add edge attributes from pandas data frame to existing graph, where source/target node IDs are given in columns `v` and `w` and edge attributes x are given in columns `edge_x` """ + if "t" in df: + if "v" not in df or "w" not in df or "t" not in df: + print("data frame must have columns `v` and `w` and `t`") + return - if "v" not in df or "w" not in df: - print("data frame must have columns `v` and `w`") - return + attributed_edges = list(zip(df["v"], df["w"], df["t"])) - attributed_edges = list(zip(df["v"], df["w"])) - # check for duplicated edge attributes - if len(set(attributed_edges)) < len(attributed_edges): - print("data frame contains multiple attribute values for single edge") - return + # extract indices of source/target node of edges + src = [g.mapping.to_idx(x) for x in df["v"]] + tgt = [g.mapping.to_idx(x) for x in df["w"]] + time = [x for x in df["t"]] - # check for difference between edges in graph and edges in attributes - if set(attributed_edges) != set([(v, w) for v, w in g.edges]): - print("Mismatch between edges in DataFrame and edges in graph") - return + # unique index for each edge independent of v,w,t because there exist temporal networks with duplicated temporal edges + edge_idx = list(range(len(src))) + + #sort the edge_index for the case that the data_frame is not sorted + paired = list(zip(time, edge_idx)) + paired.sort(key=lambda x: x[0]) + edge_idx = [idx for _, idx in paired] + + for attr in df.columns: + if attr != "v" and attr != "w" and attr != "t": + prefix = "" + if not attr.startswith("edge_"): + prefix = "edge_" + + # eval values for array-valued attributes + try: + values = np.array([eval(x) for x in df[attr].values]) + + g.data[prefix + attr] = torch.from_numpy(values[edge_idx]).to(device=g.data.edge_index.device) + continue + except: + pass + + # try to directly construct tensor for scalar values + try: + g.data[prefix + attr] = torch.from_numpy(df[attr].values[edge_idx]).to(device=g.data.edge_index.device) + continue + except: + pass + + # numpy array of strings + try: + g.data[prefix + attr] = np.array(df[attr].values.astype(str)[edge_idx]) + except: + t = df[attr].dtype + print(f"Could not assign edge attribute {attr} of type {t}") - # extract indices of source/target node of edges - src = [g.mapping.to_idx(x) for x in df["v"]] - tgt = [g.mapping.to_idx(x) for x in df["w"]] - # find indices of edges in edge_index - edge_idx = [] - for i in range(len(src)): - x = torch.where((g.data.edge_index[0, :] == src[i]) & (g.data.edge_index[1, :] == tgt[i]))[0].item() - edge_idx.append(x) - for attr in df.columns: - if attr != "v" and attr != "w": - prefix = "" - if not attr.startswith("edge_"): - prefix = "edge_" - - # eval values for array-valued attributes - try: - values = np.array([eval(x) for x in df[attr].values]) - g.data[prefix + attr] = torch.from_numpy(values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - # try to directly construct tensor for scalar values - try: - g.data[prefix + attr] = torch.from_numpy(df[attr].values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - # numpy array of strings - try: - g.data[prefix + attr] = np.array(df[attr].values.astype(str)[edge_idx]) - except: - t = df[attr].dtype - print(f"Could not assign edge attribute {attr} of type {t}") - # g.data[prefix+attr] = df[attr].values[edge_idx] + else: + if "v" not in df or "w" not in df: + print("data frame must have columns `v` and `w`") + return + + attributed_edges = list(zip(df["v"], df["w"])) + + # check for duplicated edge attributes + if len(set(attributed_edges)) < len(attributed_edges): + print("data frame contains multiple attribute values for single edge") + return + + # check for difference between edges in graph and edges in attributes + if set(attributed_edges) != set([(v, w) for v, w in g.edges]): + print("Mismatch between edges in DataFrame and edges in graph") + return + + # extract indices of source/target node of edges + src = [g.mapping.to_idx(x) for x in df["v"]] + tgt = [g.mapping.to_idx(x) for x in df["w"]] + + # find indices of edges in edge_index + edge_idx = [] + for i in range(len(src)): + x = torch.where((g.data.edge_index[0, :] == src[i]) & (g.data.edge_index[1, :] == tgt[i]))[0].item() + edge_idx.append(x) + for attr in df.columns: + if attr != "v" and attr != "w": + prefix = "" + if not attr.startswith("edge_"): + prefix = "edge_" + + # eval values for array-valued attributes + try: + values = np.array([eval(x) for x in df[attr].values]) + g.data[prefix + attr] = torch.from_numpy(values[edge_idx]).to(device=g.data.edge_index.device) + continue + except: + pass + + # try to directly construct tensor for scalar values + try: + g.data[prefix + attr] = torch.from_numpy(df[attr].values[edge_idx]).to(device=g.data.edge_index.device) + continue + except: + pass + + # numpy array of strings + try: + g.data[prefix + attr] = np.array(df[attr].values.astype(str)[edge_idx]) + except: + t = df[attr].dtype + print(f"Could not assign edge attribute {attr} of type {t}") + + # g.data[prefix+attr] = df[attr].values[edge_idx] def df_to_temporal_graph( @@ -314,7 +368,7 @@ def df_to_temporal_graph( tedges.append((_v, _w, int(t / time_rescale))) g = TemporalGraph.from_edge_list(tedges, **kwargs) - + add_edge_attributes(df, g) if is_undirected: return g.to_undirected() else: From 4cd0ee9067a7fea22be69ba50a885e7d03a7da59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Sch=C3=BCler?= Date: Tue, 3 Jun 2025 21:44:00 +0200 Subject: [PATCH 2/9] fixed add_edge_attributes bug from the last commit --- src/pathpyG/core/temporal_graph.py | 2 +- src/pathpyG/io/pandas.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index ef4f1caa..92b97df8 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -65,7 +65,7 @@ def __init__(self, data: Data, mapping: IndexMap | None = None) -> None: @staticmethod def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: - edge_array = np.array(edge_list).astype(int)#changed the type to enable the right ordering in the index_map, before the ordering was lexical over the strings which lead to the wrong ordering, for example '1' '10' '2' instead of '1' '2' '10' + edge_array = np.array(edge_list) ts = edge_array[:, 2].astype(np.number) index_map = IndexMap(np.unique(edge_array[:, :2])) diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index 28359eb7..5d59a4f5 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -198,8 +198,8 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph) -> None: # extract indices of source/target node of edges - src = [g.mapping.to_idx(x) for x in df["v"]] - tgt = [g.mapping.to_idx(x) for x in df["w"]] + src = [g.mapping.to_idx(str(x)) for x in df["v"]] + tgt = [g.mapping.to_idx(str(x)) for x in df["w"]] time = [x for x in df["t"]] # unique index for each edge independent of v,w,t because there exist temporal networks with duplicated temporal edges From 284578d6e32eca96e0ad5c992059dbc297dedfc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Sch=C3=BCler?= Date: Wed, 4 Jun 2025 10:22:09 +0200 Subject: [PATCH 3/9] changed a dtype in TemporalGraph.from_edge_list() from float64 to long --- src/pathpyG/core/temporal_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index 92b97df8..82f405ff 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -77,7 +77,7 @@ def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: return TemporalGraph( data=Data( edge_index=edge_index, - time=torch.tensor(ts, dtype=torch.float64), #before it was float32, where the precision is to low + time=torch.tensor(ts, dtype=torch.long), #before it was float32, where the precision is to low num_nodes=num_nodes, ), mapping=index_map, From 8f6e784ec8752c3acff72ec00ec06bf5098d258c Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Wed, 4 Jun 2025 10:07:57 +0000 Subject: [PATCH 4/9] assign tensor dtype automatically --- src/pathpyG/core/temporal_graph.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index 82f405ff..2ccdd3ab 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -68,6 +68,12 @@ def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: edge_array = np.array(edge_list) ts = edge_array[:, 2].astype(np.number) + # Convert timestamps to tensor + if np.issubdtype(ts.dtype, np.integer): + ts = torch.tensor(ts, dtype=torch.long) + else: + ts = torch.tensor(ts, dtype=torch.float32) + index_map = IndexMap(np.unique(edge_array[:, :2])) edge_index = index_map.to_idxs(edge_array[:, :2].T) @@ -77,7 +83,7 @@ def from_edge_list(edge_list, num_nodes: Optional[int] = None) -> TemporalGraph: return TemporalGraph( data=Data( edge_index=edge_index, - time=torch.tensor(ts, dtype=torch.long), #before it was float32, where the precision is to low + time=ts, num_nodes=num_nodes, ), mapping=index_map, From 07734459272ed0f2e6ba28d2747b9479abcd6eb2 Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Thu, 5 Jun 2025 10:00:26 +0000 Subject: [PATCH 5/9] fix indexing bugs --- src/pathpyG/io/pandas.py | 319 +++++++++++++++------------------ tests/io/test_netzschleuder.py | 9 + 2 files changed, 151 insertions(+), 177 deletions(-) diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index 5d59a4f5..aaabc6d9 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -1,18 +1,30 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union +from typing import Any, Optional, Union -from collections import Counter +import ast +import re +import warnings import pandas as pd import torch import numpy as np -import datetime -from time import mktime - from pathpyG.core.graph import Graph from pathpyG.core.temporal_graph import TemporalGraph -from pathpyG.utils.config import config + +# Regex to check if the attribute is iterable (e.g., list, dict, etc.) +_iterable_re = re.compile(r"^\s*[\[\{\(].*[\]\}\)]\s*$") +_number_re = re.compile( + r"""^\s* # optional leading whitespace + [+-]? # optional sign + ( # start group + (\d+\.\d*)|(\.\d+)|(\d+) # float or int + ) + ([eE][+-]?\d+)? # optional exponent + \s*$ # optional trailing whitespace +""" +) +_integer_re = re.compile(r"^\s*[+-]?\d+\s*$") def _check_column_name(frame: pd.DataFrame, name: str, synonyms: list) -> pd.DataFrame: @@ -25,6 +37,30 @@ def _check_column_name(frame: pd.DataFrame, name: str, synonyms: list) -> pd.Dat return frame +def _parse_df_column(df: pd.DataFrame, g: Graph, idx: list, attr: str, prefix: str = "") -> None: + """Helper function to parse a column in a DataFrame and add it as an attribute to the graph.""" + if df[attr].dtype == "object": + if _iterable_re.match(str(df[attr].values[0])): + g.data[prefix + attr] = torch.tensor( + [ast.literal_eval(x) for x in df[attr].values[idx]], device=g.data.edge_index.device + ) + elif _number_re.match(str(df[attr].values[0])): + # if the attribute is a number, convert it to a tensor + if _integer_re.match(str(df[attr].values[0])): + g.data[prefix + attr] = torch.tensor( + df[attr].values.astype(int)[idx], device=g.data.edge_index.device + ) + else: + g.data[prefix + attr] = torch.tensor( + df[attr].values.astype(float)[idx], device=g.data.edge_index.device + ) + else: + # if the attribute is not iterable, convert it to a string + g.data[prefix + attr] = np.array(df[attr].values.astype(str)[idx]) + else: + g.data[prefix + attr] = torch.tensor(df[attr].values[idx], device=g.data.edge_index.device) + + def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, **kwargs: Any) -> Graph: """Reads a network from a pandas data frame. @@ -65,7 +101,6 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool print(n) ``` """ - # assign column names if no header is present no_header = all(isinstance(x, int) for x in df.columns.values.tolist()) @@ -77,38 +112,13 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool col_names += ["edge_attr_{0}".format(i - 2)] df.columns = col_names - df["v"] = df["v"].astype(str) - df["w"] = df["w"].astype(str) - - edges: list = [] - edge_set: set = set() - - # counter for multiple edges - counter: Counter = Counter() - - for row in df.to_dict(orient="records"): - _v, _w = row.pop("v"), row.pop("w") - - # check if edge was already generated - if (_v, _w) in edge_set and not multiedges: - counter[(_v, _w)] += 1 - else: - # add edge - edges.append((_v, _w)) - edge_set.add((_v, _w)) - - # check for multi-edges - if len(counter) > 0: - print( - "%i edges existed already " - "and were not considered. " - "To capture those edges, consider creating " - "a multiedge and/or directed network.", - sum(counter.values()), - ) + edge_df = df[["v", "w"]].drop_duplicates() + if not multiedges and (len(edge_df) != len(df)): + print("Data frame contains multiple edges, but multiedges is set to False. Removing duplicates.") + df = df.drop_duplicates(subset=["v", "w"]) # create graph - g = Graph.from_edge_list(edges, is_undirected=is_undirected, **kwargs) + g = Graph.from_edge_list(edge_list=edge_df.values, is_undirected=is_undirected, **kwargs) # assign edge attributes add_edge_attributes(df, g) @@ -116,8 +126,14 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool def add_node_attributes(df: pd.DataFrame, g: Graph): - """Add node attributes from pandas data frame to existing graph, where node - IDs or indices are given in column `v` and node attributes x are given in columns `node_x` + """Add node attributes from pandas data frame to existing `Graph`. + + Add node attributes from pandas data frame to existing graph, where node + IDs or indices are given in column `v` and node attributes x are given in columns `node_x`. + + Args: + df: A DataFrame with rows containing nodes and optional node attributes. + g: The graph to which the node attributes should be added. """ if "v" in df: print("Mapping node attributes based on node names in column `v`") @@ -141,7 +157,7 @@ def add_node_attributes(df: pd.DataFrame, g: Graph): return # get indices of nodes in tensor - node_idx = [g.mapping.to_idx(x) for x in df["v"]] + node_idx = g.mapping.to_idxs(attributed_nodes) else: if set(attributed_nodes) != set([i for i in range(g.n)]): print("Mismatch between nodes in DataFrame and nodes in graph") @@ -158,146 +174,87 @@ def add_node_attributes(df: pd.DataFrame, g: Graph): continue # prefix attribute names that are not already prefixed - prefix = "" - if not attr.startswith("node_"): + if attr.startswith("node_"): + prefix = "" + else: prefix = "node_" - # eval values for array-valued attributes - try: - values = np.array([eval(x) for x in df[attr].values]) - g.data[prefix + attr] = torch.from_numpy(values[node_idx]).to(device=g.data.edge_index.device) - continue - except: - pass + _parse_df_column( + df=df, + g=g, + idx=node_idx, + attr=attr, + prefix=prefix, + ) - # try to directly construct tensor for scalar values - try: - g.data[prefix + attr] = torch.from_numpy(df[attr].values[node_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - # numpy array of strings - try: - g.data[prefix + attr] = np.array(df[attr].values.astype(str)[node_idx]) - except: - t = df[attr].dtype - print(f"Could not assign node attribute {attr} of type {t}") +def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None) -> None: + """Add (temporal) edge attributes from pandas data frame to existing `Graph`. + Add edge attributes from `pandas.DataFrame` to existing `Graph`, where source/target node + IDs are given in columns `v` and `w` and edge attributes x are given in columns `edge_x`. + If `time_attr` is not None, the dataframe is expected to contain temporal data with a timestamp + in a column named as specified in `time_attr`. -def add_edge_attributes(df: pd.DataFrame, g: Graph) -> None: - """Add edge attributes from pandas data frame to existing graph, where source/target node - IDs are given in columns `v` and `w` and edge attributes x are given in columns `edge_x` + Args: + df: A DataFrame with rows containing edges and optional edge attributes. + g: The graph to which the edge attributes should be added. + time_attr: If not None, the name of the column containing time stamps for temporal edges. """ - if "t" in df: - if "v" not in df or "w" not in df or "t" not in df: - print("data frame must have columns `v` and `w` and `t`") - return - - attributed_edges = list(zip(df["v"], df["w"], df["t"])) - - - # extract indices of source/target node of edges - src = [g.mapping.to_idx(str(x)) for x in df["v"]] - tgt = [g.mapping.to_idx(str(x)) for x in df["w"]] - time = [x for x in df["t"]] - - # unique index for each edge independent of v,w,t because there exist temporal networks with duplicated temporal edges - edge_idx = list(range(len(src))) - - #sort the edge_index for the case that the data_frame is not sorted - paired = list(zip(time, edge_idx)) - paired.sort(key=lambda x: x[0]) - edge_idx = [idx for _, idx in paired] - - for attr in df.columns: - if attr != "v" and attr != "w" and attr != "t": - prefix = "" - if not attr.startswith("edge_"): - prefix = "edge_" - - # eval values for array-valued attributes - try: - values = np.array([eval(x) for x in df[attr].values]) - - g.data[prefix + attr] = torch.from_numpy(values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - - # try to directly construct tensor for scalar values - try: - g.data[prefix + attr] = torch.from_numpy(df[attr].values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - - # numpy array of strings - try: - g.data[prefix + attr] = np.array(df[attr].values.astype(str)[edge_idx]) - except: - t = df[attr].dtype - print(f"Could not assign edge attribute {attr} of type {t}") + assert "v" in df and "w" in df, "Data frame must have columns `v` and `w` for source and target nodes" + # extract indices of source/target node of edges + src = g.mapping.to_idxs(df["v"].tolist()) + tgt = g.mapping.to_idxs(df["w"].tolist()) + edge_attrs = list(df.columns) + edge_attrs.remove("v") + edge_attrs.remove("w") + if time_attr is not None: + assert time_attr in df, f"Data frame must have column `{time_attr}` for time stamps" + time = df[time_attr].values + edge_attrs.remove(time_attr) + # find indices of edges in edge_index + edge_idx = [] + for src_i, tgt_i, time_i in zip(src, tgt, time): + matching_idx = torch.where( + (g.data.edge_index[0, :] == src_i) & (g.data.edge_index[1, :] == tgt_i) & (g.data.time == time_i) + )[0] + if matching_idx.numel() == 1: + edge_idx.append(matching_idx.item()) + else: + # if the edge is not unique, raise a warning + if matching_idx.numel() > 1: + # if there are multiple edges, take the first one + edge_idx.append(matching_idx[0].item()) + warnings.warn(f"Edge ({src_i}, {tgt_i}) exists {matching_idx.numel()} times in the graph", stacklevel=2) else: - if "v" not in df or "w" not in df: - print("data frame must have columns `v` and `w`") - return - - attributed_edges = list(zip(df["v"], df["w"])) - - # check for duplicated edge attributes - if len(set(attributed_edges)) < len(attributed_edges): - print("data frame contains multiple attribute values for single edge") - return - - # check for difference between edges in graph and edges in attributes - if set(attributed_edges) != set([(v, w) for v, w in g.edges]): - print("Mismatch between edges in DataFrame and edges in graph") - return - - # extract indices of source/target node of edges - src = [g.mapping.to_idx(x) for x in df["v"]] - tgt = [g.mapping.to_idx(x) for x in df["w"]] - # find indices of edges in edge_index edge_idx = [] - for i in range(len(src)): - x = torch.where((g.data.edge_index[0, :] == src[i]) & (g.data.edge_index[1, :] == tgt[i]))[0].item() - edge_idx.append(x) - for attr in df.columns: - if attr != "v" and attr != "w": - prefix = "" - if not attr.startswith("edge_"): - prefix = "edge_" - - # eval values for array-valued attributes - try: - values = np.array([eval(x) for x in df[attr].values]) - g.data[prefix + attr] = torch.from_numpy(values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - - # try to directly construct tensor for scalar values - try: - g.data[prefix + attr] = torch.from_numpy(df[attr].values[edge_idx]).to(device=g.data.edge_index.device) - continue - except: - pass - - # numpy array of strings - try: - g.data[prefix + attr] = np.array(df[attr].values.astype(str)[edge_idx]) - except: - t = df[attr].dtype - print(f"Could not assign edge attribute {attr} of type {t}") - - # g.data[prefix+attr] = df[attr].values[edge_idx] + for src_i, tgt_i in zip(src, tgt): + matching_idx = torch.where((g.data.edge_index[0, :] == src_i) & (g.data.edge_index[1, :] == tgt_i))[0] + assert ( + matching_idx.numel() == 1 + ), f"Edge ({src_i}, {tgt_i}) either does not exist or is duplicated in the graph" + edge_idx.append(matching_idx.item()) + + for attr in edge_attrs: + if attr.startswith("edge_"): + prefix = "" + else: + prefix = "edge_" + + # parse column and add to graph + _parse_df_column( + df=df, + g=g, + idx=edge_idx, + attr=attr, + prefix=prefix, + ) def df_to_temporal_graph( @@ -355,20 +312,28 @@ def df_to_temporal_graph( col_names += ["edge_attr_{0}".format(i - 2)] df.columns = col_names - tedges = [] - for row in df.to_dict(orient="records"): - _v, _w, _t = str(row.pop("v")), str(row.pop("w")), str(row.pop("t")) - try: - t = float(_t) - except: - # if time stamp is a string, use timestamp_format to convert - # it to UNIX timestamp - x = datetime.datetime.strptime(_t, timestamp_format) - t = int(mktime(x.timetuple())) - tedges.append((_v, _w, int(t / time_rescale))) + # optionally parse time stamps + if df["t"].dtype == "object": + # convert time stamps to seconds since epoch + df["t"] = pd.to_datetime(df["t"], format=timestamp_format) + # rescale time stamps + df["t"] = df["t"].astype("int64") // time_rescale + elif df["t"].dtype == "int64" or df["t"].dtype == "float64": + # rescale time stamps + df["t"] = df["t"] // time_rescale + elif pd.api.types.is_datetime64_any_dtype(df["t"]): + df["t"] = df["t"].astype("int64") // time_rescale + else: + raise ValueError( + "Column `t` must be of type `object`, `int64`, `float64`, or a datetime type. " + f"Found {df['t'].dtype} instead." + ) + + tedges = df[["v", "w", "t"]].values + print(tedges) g = TemporalGraph.from_edge_list(tedges, **kwargs) - add_edge_attributes(df, g) + add_edge_attributes(df, g, "t") if is_undirected: return g.to_undirected() else: diff --git a/tests/io/test_netzschleuder.py b/tests/io/test_netzschleuder.py index 0c5d1eb8..8c749a8c 100644 --- a/tests/io/test_netzschleuder.py +++ b/tests/io/test_netzschleuder.py @@ -103,6 +103,15 @@ def test_read_netzschleuder_record(): record = read_netzschleuder_record(record_name, url) +def test_read_netzschleuder_graph(): + """Test the read_netzschleuder_graph() function for timestamped data.""" + + g = read_netzschleuder_graph(name="email_company") + assert isinstance(g, Graph) + assert g.n == 167 + assert g.m == 5784 + + def test_read_netzschleuder_graph_temporal(): """Test the read_netzschleuder_graph() function for timestamped data.""" From 888a54733675e99655f5592c29ccb27d9bf166ad Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Thu, 5 Jun 2025 10:57:21 +0000 Subject: [PATCH 6/9] make netzschleuder temporal read efficient --- src/pathpyG/io/netzschleuder.py | 6 +-- src/pathpyG/io/pandas.py | 89 ++++++++++++++++++++++++--------- tests/io/test_netzschleuder.py | 20 ++++---- 3 files changed, 78 insertions(+), 37 deletions(-) diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index 2597479b..ffca72a5 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -172,11 +172,9 @@ def read_netzschleuder_graph( # construct graph and assign edge attributes if timestamps: - g = df_to_temporal_graph(df=edges, is_undirected=not is_directed, num_nodes=num_nodes) + g = df_to_temporal_graph(df=edges, is_undirected=not is_directed) else: - g = df_to_graph(df=edges, multiedges=True, num_nodes=num_nodes) - if not is_directed: - g = g.to_undirected() + g = df_to_graph(df=edges, multiedges=multiedges, is_undirected=not is_directed, num_nodes=num_nodes) node_attrs = pd.read_csv( f"{temp_dir}/nodes.csv", header=0, sep=",", skip_blank_lines=True, skipinitialspace=True diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index aaabc6d9..03d357bf 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -8,9 +8,11 @@ import pandas as pd import torch import numpy as np +from torch_geometric.data import Data from pathpyG.core.graph import Graph from pathpyG.core.temporal_graph import TemporalGraph +from pathpyG.core.index_map import IndexMap # Regex to check if the attribute is iterable (e.g., list, dict, etc.) _iterable_re = re.compile(r"^\s*[\[\{\(].*[\]\}\)]\s*$") @@ -37,31 +39,34 @@ def _check_column_name(frame: pd.DataFrame, name: str, synonyms: list) -> pd.Dat return frame -def _parse_df_column(df: pd.DataFrame, g: Graph, idx: list, attr: str, prefix: str = "") -> None: +def _parse_df_column(df: pd.DataFrame, data: Data, attr: str, idx: list | None = None, prefix: str = "") -> None: """Helper function to parse a column in a DataFrame and add it as an attribute to the graph.""" + if idx is None: + idx = np.arange(len(df)) + if df[attr].dtype == "object": if _iterable_re.match(str(df[attr].values[0])): - g.data[prefix + attr] = torch.tensor( - [ast.literal_eval(x) for x in df[attr].values[idx]], device=g.data.edge_index.device + data[prefix + attr] = torch.tensor( + [ast.literal_eval(x) for x in df[attr].values[idx]], device=data.edge_index.device ) elif _number_re.match(str(df[attr].values[0])): # if the attribute is a number, convert it to a tensor if _integer_re.match(str(df[attr].values[0])): - g.data[prefix + attr] = torch.tensor( - df[attr].values.astype(int)[idx], device=g.data.edge_index.device + data[prefix + attr] = torch.tensor( + df[attr].values.astype(int)[idx], device=data.edge_index.device ) else: - g.data[prefix + attr] = torch.tensor( - df[attr].values.astype(float)[idx], device=g.data.edge_index.device + data[prefix + attr] = torch.tensor( + df[attr].values.astype(float)[idx], device=data.edge_index.device ) else: # if the attribute is not iterable, convert it to a string - g.data[prefix + attr] = np.array(df[attr].values.astype(str)[idx]) + data[prefix + attr] = np.array(df[attr].values.astype(str)[idx]) else: - g.data[prefix + attr] = torch.tensor(df[attr].values[idx], device=g.data.edge_index.device) + data[prefix + attr] = torch.tensor(df[attr].values[idx], device=data.edge_index.device) -def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, **kwargs: Any) -> Graph: +def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, num_nodes: int | None = None) -> Graph: """Reads a network from a pandas data frame. The data frame is expected to have a minimum of two columns @@ -117,11 +122,29 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool print("Data frame contains multiple edges, but multiedges is set to False. Removing duplicates.") df = df.drop_duplicates(subset=["v", "w"]) - # create graph - g = Graph.from_edge_list(edge_list=edge_df.values, is_undirected=is_undirected, **kwargs) + mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values)) + data = Data( + edge_index=mapping.to_idxs(df[["v", "w"]].values.T), + num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], + ) + cols = df.columns.tolist() + cols.remove("v") + cols.remove("w") + for col in cols: + if col.startswith("edge_"): + prefix = "" + else: + prefix = "edge_" - # assign edge attributes - add_edge_attributes(df, g) + _parse_df_column( + df=df, + data=data, + attr=col, + prefix=prefix + ) + g = Graph(data=data, mapping=mapping) + if is_undirected: + g = g.to_undirected() return g @@ -181,7 +204,7 @@ def add_node_attributes(df: pd.DataFrame, g: Graph): _parse_df_column( df=df, - g=g, + data=g.data, idx=node_idx, attr=attr, prefix=prefix, @@ -250,7 +273,7 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None # parse column and add to graph _parse_df_column( df=df, - g=g, + data=g.data, idx=edge_idx, attr=attr, prefix=prefix, @@ -258,7 +281,7 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None def df_to_temporal_graph( - df: pd.DataFrame, is_undirected: bool = False, timestamp_format="%Y-%m-%d %H:%M:%S", time_rescale=1, **kwargs: Any + df: pd.DataFrame, is_undirected: bool = False, timestamp_format="%Y-%m-%d %H:%M:%S", time_rescale=1, num_nodes: int | None = None ) -> TemporalGraph: """Reads a temporal graph from a pandas data frame. @@ -329,15 +352,33 @@ def df_to_temporal_graph( f"Found {df['t'].dtype} instead." ) - tedges = df[["v", "w", "t"]].values - print(tedges) + mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values)) + data = Data( + edge_index=mapping.to_idxs(df[["v", "w"]].values.T), + time=torch.tensor(df["t"].values), + num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], + ) + cols = df.columns.tolist() + cols.remove("v") + cols.remove("w") + for col in cols: + if col.startswith("edge_"): + prefix = "" + else: + prefix = "edge_" - g = TemporalGraph.from_edge_list(tedges, **kwargs) - add_edge_attributes(df, g, "t") + _parse_df_column( + df=df, + data=data, + attr=col, + prefix=prefix + ) + g = TemporalGraph(data=data, mapping=mapping) + if is_undirected: - return g.to_undirected() - else: - return g + g = g.to_undirected() + + return g def graph_to_df(graph: Graph, node_indices: Optional[bool] = False) -> pd.DataFrame: diff --git a/tests/io/test_netzschleuder.py b/tests/io/test_netzschleuder.py index 8c749a8c..ea080158 100644 --- a/tests/io/test_netzschleuder.py +++ b/tests/io/test_netzschleuder.py @@ -2,7 +2,7 @@ import pytest -from torch import tensor, equal +import torch from pathpyG import Graph, TemporalGraph from pathpyG.io import list_netzschleuder_records, read_netzschleuder_graph, read_netzschleuder_record @@ -32,11 +32,12 @@ def test_node_attrs(): def test_edge_attrs(): """Test the extraction of edge attributes""" - g = read_netzschleuder_graph("ambassador", "1985_1989") + g = read_netzschleuder_graph("ambassador", "1985_1989", multiedges=True) assert "edge_weight" in g.edge_attrs() - assert equal( + print(g.data.edge_weight) + assert torch.equal( g.data.edge_weight, - tensor( + torch.tensor( [ 1, 1, @@ -51,15 +52,15 @@ def test_edge_attrs(): 1, 1, 3, - 3, 1, 1, 1, + 3, 1, 3, 3, 1, - 3, + 1, 3, 2, 1, @@ -70,7 +71,7 @@ def test_edge_attrs(): 1, 1, 1, - 1, + 3, 1, 3, 3, @@ -119,5 +120,6 @@ def test_read_netzschleuder_graph_temporal(): assert isinstance(g, TemporalGraph) assert g.n == 167 assert g.m == 82927 - assert g.start_time == 1262454016.0 - assert g.end_time == 1285884544.0 + assert g.start_time == 1262454010 + assert g.end_time == 1285884492 + assert "edge_weight" in g.edge_attrs() From 9e2c48ea9dd5c54cd112fd78d7e01079e91a52d8 Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Thu, 5 Jun 2025 11:00:22 +0000 Subject: [PATCH 7/9] add num_nodes --- src/pathpyG/io/netzschleuder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index ffca72a5..d4d911e8 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -172,7 +172,7 @@ def read_netzschleuder_graph( # construct graph and assign edge attributes if timestamps: - g = df_to_temporal_graph(df=edges, is_undirected=not is_directed) + g = df_to_temporal_graph(df=edges, is_undirected=not is_directed, num_nodes=num_nodes) else: g = df_to_graph(df=edges, multiedges=multiedges, is_undirected=not is_directed, num_nodes=num_nodes) From 9b5aac8244e2ef22935dfe8da0164793c325222e Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Thu, 5 Jun 2025 12:43:39 +0000 Subject: [PATCH 8/9] fix ordering numpy attributes --- src/pathpyG/core/temporal_graph.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index 2ccdd3ab..90182444 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -31,24 +31,20 @@ def __init__(self, data: Data, mapping: IndexMap | None = None) -> None: print(t) ``` """ - if not isinstance(data.edge_index, EdgeIndex): - data.edge_index = data.edge_index = EdgeIndex( - data=data.edge_index.contiguous(), sparse_size=(data.num_nodes, data.num_nodes) + self.data = data + if not isinstance(self.data.edge_index, EdgeIndex): + self.data.edge_index = EdgeIndex( + data=self.data.edge_index.contiguous(), sparse_size=(self.data.num_nodes, self.data.num_nodes) ) # reorder temporal data - # TODO: Fix in PyG - if data.num_nodes != data.num_edges: - self.data = data.sort_by_time() - else: - sorted_idx = torch.argsort(data.time) - data.time = data.time[sorted_idx] - for edge_attr in data.edge_attrs(): - if edge_attr == "edge_index": - data.edge_index = data.edge_index[:, sorted_idx] - else: - data[edge_attr] = data[edge_attr][sorted_idx] - self.data = data + # Note that we do not use `torch_geometric.self.data.Data.sort_by_time` because it cannot sort numpy arrays` + sorted_idx = torch.argsort(self.data.time) + for edge_attr in set(self.data.edge_attrs()).union(set(["time"])): + if edge_attr == "edge_index": + self.data.edge_index = self.data.edge_index[:, sorted_idx] + else: + data[edge_attr] = data[edge_attr][sorted_idx] if mapping is not None: self.mapping = mapping From a268e833473231132e1b18a4b94a36ec0afb2ab8 Mon Sep 17 00:00:00 2001 From: Moritz Lampert Date: Thu, 5 Jun 2025 13:35:24 +0000 Subject: [PATCH 9/9] add multiedge option for temporal graph --- src/pathpyG/io/netzschleuder.py | 2 +- src/pathpyG/io/pandas.py | 5 ++++- tests/io/test_netzschleuder.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index d4d911e8..64a2c718 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -172,7 +172,7 @@ def read_netzschleuder_graph( # construct graph and assign edge attributes if timestamps: - g = df_to_temporal_graph(df=edges, is_undirected=not is_directed, num_nodes=num_nodes) + g = df_to_temporal_graph(df=edges, is_undirected=not is_directed, multiedges=multiedges, num_nodes=num_nodes) else: g = df_to_graph(df=edges, multiedges=multiedges, is_undirected=not is_directed, num_nodes=num_nodes) diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index 03d357bf..51c4f941 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -281,7 +281,7 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None def df_to_temporal_graph( - df: pd.DataFrame, is_undirected: bool = False, timestamp_format="%Y-%m-%d %H:%M:%S", time_rescale=1, num_nodes: int | None = None + df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, timestamp_format="%Y-%m-%d %H:%M:%S", time_rescale=1, num_nodes: int | None = None ) -> TemporalGraph: """Reads a temporal graph from a pandas data frame. @@ -352,6 +352,9 @@ def df_to_temporal_graph( f"Found {df['t'].dtype} instead." ) + if not multiedges: + df = df.drop_duplicates(subset=["v", "w", "t"]) + mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values)) data = Data( edge_index=mapping.to_idxs(df[["v", "w"]].values.T), diff --git a/tests/io/test_netzschleuder.py b/tests/io/test_netzschleuder.py index ea080158..bfa96982 100644 --- a/tests/io/test_netzschleuder.py +++ b/tests/io/test_netzschleuder.py @@ -116,7 +116,7 @@ def test_read_netzschleuder_graph(): def test_read_netzschleuder_graph_temporal(): """Test the read_netzschleuder_graph() function for timestamped data.""" - g = read_netzschleuder_graph(name="email_company", time_attr="time") + g = read_netzschleuder_graph(name="email_company", time_attr="time", multiedges=True) assert isinstance(g, TemporalGraph) assert g.n == 167 assert g.m == 82927