diff --git a/pyproject.toml b/pyproject.toml index bd7e9413..24b3b995 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,6 +126,10 @@ disable = "R1705" [tool.pyright] reportGeneralTypeIssues = false +reportCallIssue = false +reportOptionalMemberAccess = false +reportArgumentType = false +reportOptionalSubscript = false [tool.mypy] # Static type checker check_untyped_defs = true diff --git a/src/pathpyG/core/graph.py b/src/pathpyG/core/graph.py index 5c149b6e..020a43b6 100644 --- a/src/pathpyG/core/graph.py +++ b/src/pathpyG/core/graph.py @@ -1,6 +1,5 @@ from __future__ import annotations from typing import ( - TYPE_CHECKING, Dict, Iterable, Tuple, @@ -8,7 +7,6 @@ Union, Any, Optional, - Generator, ) import numpy as np @@ -19,10 +17,8 @@ import torch_geometric.utils from torch_geometric import EdgeIndex from torch_geometric.data import Data -from torch_geometric.transforms.to_undirected import ToUndirected -from torch_geometric.utils import scatter +from torch_geometric.utils import scatter, to_undirected -from pathpyG.utils.config import config from pathpyG.core.index_map import IndexMap @@ -180,13 +176,10 @@ def from_edge_list( return Graph(Data(edge_index=edge_index, num_nodes=num_nodes), mapping=mapping) def to_undirected(self) -> Graph: - """ - Returns an undirected version of a directed graph. + """Return an undirected version of this directed graph. - This method transforms the current graph instance into an undirected graph by - adding all directed edges in opposite direction. It applies [`ToUndirected`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.ToUndirected.html#torch_geometric.transforms.ToUndirected) - transform to the underlying [`torch_geometric.Data`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.Data.html#torch_geometric.data.Data) object, which automatically - duplicates edge attributes for newly created directed edges. + This method creates a new undirected Graph from the current graph instance by + adding all directed edges in opposite direction. Examples: >>> import pathpyG as pp @@ -195,15 +188,30 @@ def to_undirected(self) -> Graph: >>> print(g_u) Undirected graph with 3 nodes and 6 (directed) edges """ - tf = ToUndirected() - d = tf(self.data) - # unfortunately, the application of a transform creates a new edge_index of type tensor - # so we have to recreate the EdgeIndex tensor and sort it again + # create undirected edge index by coalescing the directed edges and keep + # track of the original edge index for the edge attributes + attr_idx = torch.arange(self.data.num_edges, device=self.data.edge_index.device) + edge_index, attr_idx = to_undirected( + self.data.edge_index, + edge_attr=attr_idx, + num_nodes=self.data.num_nodes, + reduce="min", + ) - e = EdgeIndex(data=d.edge_index, sparse_size=(self.data.num_nodes, self.data.num_nodes), is_undirected=True) - d.edge_index = e - d.num_nodes = self.data.num_nodes - return Graph(d, self.mapping) + data = Data( + edge_index=EdgeIndex(data=edge_index, sparse_size=(self.data.num_nodes, self.data.num_nodes), is_undirected=True), + num_nodes=self.data.num_nodes + ) + # Note that while the torch_geometric.transforms.ToUndirected function would do this automatically, + # we do it manually since the transform cannot handle numpy arrays as edge attributes. + # make sure to copy all node and (undirected) edge attributes + for node_attr in self.node_attrs(): + data[node_attr] = self.data[node_attr] + for edge_attr in self.edge_attrs(): + if edge_attr != "edge_index": + data[edge_attr] = self.data[edge_attr][attr_idx] + + return Graph(data, self.mapping) def to_weighted_graph(self) -> Graph: """Coalesces multi-edges to single-edges with an additional weight attribute diff --git a/src/pathpyG/core/temporal_graph.py b/src/pathpyG/core/temporal_graph.py index 90182444..415d9426 100644 --- a/src/pathpyG/core/temporal_graph.py +++ b/src/pathpyG/core/temporal_graph.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Any, Optional, Generator +from typing import Tuple, Union, Any, Optional, Generator import numpy as np @@ -53,7 +53,10 @@ def __init__(self, data: Data, mapping: IndexMap | None = None) -> None: # create mapping between edge index and edge tuples self.edge_to_index = { - (e[0].item(), e[1].item()): i for i, e in enumerate([e for e in self.data.edge_index.t()]) + (e[0].item(), e[1].item()): i for i, e in enumerate(self.data.edge_index.t()) + } + self.tedge_to_index = { + (e[0].item(), e[1].item(), t.item()): i for i, (e, t) in enumerate(zip([e for e in self.data.edge_index.t()], self.data.time)) } self.start_time = self.data.time[0].item() @@ -163,6 +166,29 @@ def get_window(self, start_time: int, end_time: int) -> TemporalGraph: return TemporalGraph(data=self.data.snapshot(start_time, end_time), mapping=self.mapping) + def __getitem__(self, key: Union[tuple, str]) -> Any: + """Return node, edge, temporal edge, or graph attribute. + + Args: + key: name of attribute to be returned + """ + if not isinstance(key, tuple): + if key in self.data.keys(): + return self.data[key] + else: + raise KeyError(key + " is not a graph attribute") + elif key[0] in self.node_attrs(): + return self.data[key[0]][self.mapping.to_idx(key[1])] + elif key[0] in self.edge_attrs(): + # TODO: Get item for non-temporal edges will only return the last occurence of the edge + # This is a limitation and should be fixed in the future. + if len(key) == 3: + return self.data[key[0]][self.edge_to_index[self.mapping.to_idx(key[1]), self.mapping.to_idx(key[2])]] + else: + return self.data[key[0]][self.tedge_to_index[self.mapping.to_idx(key[1]), self.mapping.to_idx(key[2]), key[3]]] + else: + raise KeyError(key[0] + " is not a node or edge attribute") + def __str__(self) -> str: """ Return a string representation of the graph diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index 64a2c718..6024b391 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -172,7 +172,7 @@ def read_netzschleuder_graph( # construct graph and assign edge attributes if timestamps: - g = df_to_temporal_graph(df=edges, is_undirected=not is_directed, multiedges=multiedges, num_nodes=num_nodes) + g = df_to_temporal_graph(df=edges, multiedges=multiedges, num_nodes=num_nodes) else: g = df_to_graph(df=edges, multiedges=multiedges, is_undirected=not is_directed, num_nodes=num_nodes) diff --git a/src/pathpyG/io/pandas.py b/src/pathpyG/io/pandas.py index 51c4f941..373798a4 100644 --- a/src/pathpyG/io/pandas.py +++ b/src/pathpyG/io/pandas.py @@ -3,7 +3,7 @@ import ast import re -import warnings +import logging import pandas as pd import torch @@ -13,60 +13,100 @@ from pathpyG.core.graph import Graph from pathpyG.core.temporal_graph import TemporalGraph from pathpyG.core.index_map import IndexMap +from pathpyG.utils.convert import to_numpy -# Regex to check if the attribute is iterable (e.g., list, dict, etc.) -_iterable_re = re.compile(r"^\s*[\[\{\(].*[\]\}\)]\s*$") -_number_re = re.compile( - r"""^\s* # optional leading whitespace - [+-]? # optional sign - ( # start group - (\d+\.\d*)|(\.\d+)|(\d+) # float or int - ) - ([eE][+-]?\d+)? # optional exponent - \s*$ # optional trailing whitespace -""" -) +logger = logging.getLogger("root") + +# Regex to check if the attribute is iterable (e.g., list or tuple), a number (int or float), or an integer. +_iterable_re = re.compile(r"^\s*[\[\(].*[\]\)]\s*$") +_number_re = re.compile(r"^\s*[+-]?((\d+\.\d*)|(\.\d+)|(\d+))([eE][+-]?\d+)?\s*$") _integer_re = re.compile(r"^\s*[+-]?\d+\s*$") -def _check_column_name(frame: pd.DataFrame, name: str, synonyms: list) -> pd.DataFrame: - """Helper function to check column names and change them if needed.""" - if name not in frame.columns: - for col in frame.columns: - if col in synonyms: - frame.rename(columns={col: name}, inplace=True) - continue - return frame +def _parse_timestamp(df: pd.DataFrame, timestamp_format: str = "%Y-%m-%d %H:%M:%S", time_rescale: int = 1) -> None: + """Parse time stamps in a DataFrame. + Parses the time stamps in the DataFrame and rescales using the given time rescale factor. + The time stamps are expected to be in a column named `t`. If the column is of type `object`, it is assumed to + contain time stamps in the specified format. -def _parse_df_column(df: pd.DataFrame, data: Data, attr: str, idx: list | None = None, prefix: str = "") -> None: - """Helper function to parse a column in a DataFrame and add it as an attribute to the graph.""" + Args: + df: The DataFrame containing the time stamps in a column named `t`. + timestamp_format: The format of the time stamps in the `t` column. + time_rescale: The factor by which to rescale the time stamps. Defaults to 1, meaning no rescaling. + """ + # optionally parse time stamps + if df["t"].dtype == "object" and isinstance(df["t"].values[0], str): + # convert time stamps to seconds since epoch + df["t"] = pd.to_datetime(df["t"], format=timestamp_format) + # rescale time stamps + df["t"] = df["t"].astype("int64") // time_rescale + df["t"] = df["t"] - df["t"].min() # rescale to start at 0 + elif df["t"].dtype == "int64" or df["t"].dtype == "float64": + # rescale time stamps + df["t"] = df["t"] // time_rescale + elif pd.api.types.is_datetime64_any_dtype(df["t"]): + df["t"] = df["t"].astype("int64") // time_rescale + df["t"] = df["t"] - df["t"].min() # rescale to start at 0 + else: + raise ValueError( + "Column `t` must be of type `object`, `int64`, `float64`, or a datetime type. " + f"Found {df['t'].dtype} instead." + ) + + +def _parse_df_column( + df: pd.DataFrame, data: Data, attr: str, idx: list | np.ndarray | None = None, prefix: str = "" +) -> None: + """Parse a column in a DataFrame and add it as an attribute to the graph. + + Parses a column in a DataFrame and adds it as an attribute to the graph's data object. We assume that the attribute + in the DataFrame is ordered in the same way as the nodes/edges in the graph if `idx` is not provided. If `idx` is + provided, the order of the attribute values is determined by the indices in `idx`. + + Args: + df: The DataFrame containing the attribute. Attributes are expected to be numeric, string, or iterable types. + data: The Data object of the graph to which the attribute should be added. + attr: The name of the attribute column in the DataFrame. + idx: Indices specifying the order of the attribute values. If None, all values are used in the given order. + prefix: A prefix to be added to the attribute name in the Data object, e.g., "edge_" or "node_". + """ + # if idx is None, use all indices in the given order if idx is None: idx = np.arange(len(df)) + # check if the attribute is a string, list, tuple, etc. if df[attr].dtype == "object": - if _iterable_re.match(str(df[attr].values[0])): - data[prefix + attr] = torch.tensor( - [ast.literal_eval(x) for x in df[attr].values[idx]], device=data.edge_index.device - ) - elif _number_re.match(str(df[attr].values[0])): - # if the attribute is a number, convert it to a tensor - if _integer_re.match(str(df[attr].values[0])): + if isinstance(df[attr].values[0], str): + # if the attribute is a string, check if it is iterable or numeric + if _iterable_re.match(str(df[attr].values[0])): + # if the attribute is a string that can be converted to an iterable, convert it to a tensor data[prefix + attr] = torch.tensor( - df[attr].values.astype(int)[idx], device=data.edge_index.device + [ast.literal_eval(x) for x in df[attr].values[idx]], device=data.edge_index.device ) + elif _number_re.match(str(df[attr].values[0])): + # if the attribute is a number, convert it to a tensor + if _integer_re.match(str(df[attr].values[0])): + data[prefix + attr] = torch.tensor(df[attr].values.astype(int)[idx], device=data.edge_index.device) + else: + data[prefix + attr] = torch.tensor( + df[attr].values.astype(float)[idx], device=data.edge_index.device + ) else: - data[prefix + attr] = torch.tensor( - df[attr].values.astype(float)[idx], device=data.edge_index.device - ) + # if the attribute is not iterable, convert it to a string + data[prefix + attr] = np.array(df[attr].values.astype(str)[idx]) + elif isinstance(df[attr].values[0], (list, tuple)): + data[prefix + attr] = torch.tensor([np.array(x) for x in df[attr].values[idx]]) else: - # if the attribute is not iterable, convert it to a string - data[prefix + attr] = np.array(df[attr].values.astype(str)[idx]) + raise ValueError(f"Unsupported data type for attribute '{attr}': {type(df[attr].values[0])}") else: + # if the attribute is numeric, convert it to a tensor directly data[prefix + attr] = torch.tensor(df[attr].values[idx], device=data.edge_index.device) -def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, num_nodes: int | None = None) -> Graph: +def df_to_graph( + df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, num_nodes: int | None = None +) -> Graph: """Reads a network from a pandas data frame. The data frame is expected to have a minimum of two columns @@ -74,22 +114,15 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool data frame will be mapped to edge attributes. Args: - - df: pandas.DataFrame - - A data frame with rows containing edges and optional edge attributes. If the + df: A data frame with rows containing edges and optional edge attributes. If the data frame contains column names, the source and target columns must be called 'v' and 'w' respectively. If no column names are used the first two columns are interpreted as source and target. - - is_undirected: Optional[bool]=True - - whether or not to interpret edges as undirected - - multiedges: Optional[bool]=False - - whether or not to allow multiple edges between the same node pair. By + is_undirected: Whether or not to interpret edges as undirected. + multiedges: Whether or not to allow multiple edges between the same node pair. By default multi edges are ignored. + num_nodes: The number of nodes in the graph. If None, the number of unique nodes + in the data frame is used. Example: ```py @@ -114,19 +147,22 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool col_names = ["v", "w"] # interpret remaining columns as edge attributes for i in range(2, len(df.columns.values.tolist())): - col_names += ["edge_attr_{0}".format(i - 2)] + col_names += [f"edge_attr_{i - 2}"] df.columns = col_names - edge_df = df[["v", "w"]].drop_duplicates() - if not multiedges and (len(edge_df) != len(df)): - print("Data frame contains multiple edges, but multiedges is set to False. Removing duplicates.") + # optionally remove multiedges + if not multiedges and df[["v", "w"]].duplicated().any(): + logger.debug("Data frame contains multiple edges, but multiedges is set to False. Removing duplicates.") df = df.drop_duplicates(subset=["v", "w"]) - mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values)) + # Create index mapping and data object + mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values).tolist()) data = Data( edge_index=mapping.to_idxs(df[["v", "w"]].values.T), - num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], + num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], # type: ignore ) + + # Parse all columns except 'v' and 'w' as edge attributes cols = df.columns.tolist() cols.remove("v") cols.remove("w") @@ -136,22 +172,21 @@ def df_to_graph(df: pd.DataFrame, is_undirected: bool = False, multiedges: bool else: prefix = "edge_" - _parse_df_column( - df=df, - data=data, - attr=col, - prefix=prefix - ) + _parse_df_column(df=df, data=data, attr=col, prefix=prefix) + + # Create graph object g = Graph(data=data, mapping=mapping) + # If the graph should be undirected, convert it to an undirected graph if is_undirected: g = g.to_undirected() + return g def add_node_attributes(df: pd.DataFrame, g: Graph): - """Add node attributes from pandas data frame to existing `Graph`. + """Add node attributes from `DataFrame` to existing `Graph`. - Add node attributes from pandas data frame to existing graph, where node + Add node attributes from `pandas.DataFrame` to existing graph, where node IDs or indices are given in column `v` and node attributes x are given in columns `node_x`. Args: @@ -159,43 +194,35 @@ def add_node_attributes(df: pd.DataFrame, g: Graph): g: The graph to which the node attributes should be added. """ if "v" in df: - print("Mapping node attributes based on node names in column `v`") + logger.debug("Mapping node attributes based on node names in column `v`") attributed_nodes = list(df["v"]) elif "index" in df: - print("Mapping node attributes based on node indices in column `index`") + logger.debug("Mapping node attributes based on node indices in column `index`") attributed_nodes = list(df["index"]) else: - print("Data frame must either have `index` or `v` column") - return + raise ValueError("DataFrame must either have `index` or `v` column") # check for duplicated node attributes if len(set(attributed_nodes)) < len(attributed_nodes): - print("data frame cannot contain multiple attribute values for single node") - return + raise ValueError("DataFrame cannot contain multiple attribute values for single node") # check for difference between nodes in graph and nodes in attributes if "v" in df: if set(attributed_nodes) != set([v for v in g.nodes]): - print("Mismatch between nodes in DataFrame and nodes in graph") - return + raise ValueError("Mismatch between nodes in DataFrame and nodes in graph") # get indices of nodes in tensor - node_idx = g.mapping.to_idxs(attributed_nodes) + node_idx = g.mapping.to_idxs(attributed_nodes).tolist() else: if set(attributed_nodes) != set([i for i in range(g.n)]): - print("Mismatch between nodes in DataFrame and nodes in graph") - return + raise ValueError("Mismatch between nodes in DataFrame and nodes in graph") # get indices of nodes in tensor node_idx = attributed_nodes # assign node property tensors - for attr in df.columns: - - # skip node column - if attr == "v" or attr == "index": - continue - + cols = [attr for attr in df.columns if attr not in ["v", "index"]] + for attr in cols: # prefix attribute names that are not already prefixed if attr.startswith("node_"): prefix = "" @@ -226,13 +253,26 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None """ assert "v" in df and "w" in df, "Data frame must have columns `v` and `w` for source and target nodes" + # check for non-existent nodes + node_ids = set(df["v"]).union(set(df["w"])) + if not node_ids.issubset(set(g.nodes)): + raise ValueError( + f"DataFrame contains nodes {node_ids - set(g.nodes)} that do not exist in the graph. " + "Please ensure all nodes in the DataFrame are present in the graph." + ) + + # check if the number of edges in the data frame is consistent with the graph + if g.m != len(df): + raise ValueError( + f"DataFrame contains {len(df)} edges, but the graph has {g.m} edges. " + "Please ensure the DataFrame matches the number of edges in the graph." + ) + # extract indices of source/target node of edges src = g.mapping.to_idxs(df["v"].tolist()) tgt = g.mapping.to_idxs(df["w"].tolist()) - edge_attrs = list(df.columns) - edge_attrs.remove("v") - edge_attrs.remove("w") + edge_attrs = [attr for attr in df.columns if attr not in ["v", "w"]] if time_attr is not None: assert time_attr in df, f"Data frame must have column `{time_attr}` for time stamps" @@ -240,29 +280,23 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None time = df[time_attr].values edge_attrs.remove(time_attr) - # find indices of edges in edge_index + # find indices of edges in temporal edge_index edge_idx = [] for src_i, tgt_i, time_i in zip(src, tgt, time): - matching_idx = torch.where( - (g.data.edge_index[0, :] == src_i) & (g.data.edge_index[1, :] == tgt_i) & (g.data.time == time_i) - )[0] - if matching_idx.numel() == 1: - edge_idx.append(matching_idx.item()) - else: - # if the edge is not unique, raise a warning - if matching_idx.numel() > 1: - # if there are multiple edges, take the first one - edge_idx.append(matching_idx[0].item()) - warnings.warn(f"Edge ({src_i}, {tgt_i}) exists {matching_idx.numel()} times in the graph", stacklevel=2) + edge = g.tedge_to_index.get((src_i.item(), tgt_i.item(), time_i.item()), None) # type: ignore + if edge is None: + raise ValueError( + f"Edge ({src_i.item()}, {tgt_i.item()}) does not exist at time {time_i.item()} in the graph." + ) + edge_idx.append(edge) else: # find indices of edges in edge_index edge_idx = [] for src_i, tgt_i in zip(src, tgt): - matching_idx = torch.where((g.data.edge_index[0, :] == src_i) & (g.data.edge_index[1, :] == tgt_i))[0] - assert ( - matching_idx.numel() == 1 - ), f"Edge ({src_i}, {tgt_i}) either does not exist or is duplicated in the graph" - edge_idx.append(matching_idx.item()) + edge = g.edge_to_index.get((src_i.item(), tgt_i.item()), None) + if edge is None: + raise ValueError(f"Edge ({src_i.item()}, {tgt_i.item()}) does not exist in the graph.") + edge_idx.append(edge) for attr in edge_attrs: if attr.startswith("edge_"): @@ -272,57 +306,57 @@ def add_edge_attributes(df: pd.DataFrame, g: Graph, time_attr: str | None = None # parse column and add to graph _parse_df_column( - df=df, + df=df.iloc[edge_idx], data=g.data, - idx=edge_idx, attr=attr, prefix=prefix, ) def df_to_temporal_graph( - df: pd.DataFrame, is_undirected: bool = False, multiedges: bool = False, timestamp_format="%Y-%m-%d %H:%M:%S", time_rescale=1, num_nodes: int | None = None + df: pd.DataFrame, + multiedges: bool = False, + timestamp_format="%Y-%m-%d %H:%M:%S", + time_rescale=1, + num_nodes: int | None = None, ) -> TemporalGraph: - """Reads a temporal graph from a pandas data frame. - - The data frame is expected to have a minimum of two columns `v` and `w` - that give the source and target nodes of edges. Additional column names to - be used can be configured in `config.cfg` as `v_synonyms` and `w` - synonyms. The time information on edges can either be stored in an - additional `timestamp` column (for instantaneous interactions) or in two - columns `start`, `end` or `timestamp`, `duration` respectively for networks - where edges appear and exist for a certain time. Synonyms for those column - names can be configured in config.cfg. Each row in the data frame is - mapped to one temporal edge. Additional columns in the data frame will be + """Read a temporal graph from a DataFrame. + + The DataFrame is expected to have a minimum of two columns `v` and `w` + that give the source and target nodes of edges. Each row in the DataFrame is + mapped to one temporal edge. Additional columns in the DataFrame will be mapped to edge attributes. Args: df: pandas.DataFrame with rows containing time-stamped edges and optional edge - attributes. - timestamp_format: timestamp format - time_rescale: time stamp rescaling factor - **kwargs: Arbitrary keyword arguments that will be set as network-level attributes. + attributes. + multiedges: Whether or not to allow multiple edges between the same node pair. By + default multi edges are ignored. + timestamp_format: The format of the time stamps in the `t` column. + time_rescale: The factor by which to rescale the time stamps. Defaults to 1, meaning no rescaling. + num_nodes: The number of nodes in the graph. If None, the number of unique nodes + in the DataFrame is used. Example: - ```py - - import pathpyG as pp - import pandas as pd - df = pd.DataFrame({ - 'v': ['a', 'b', 'c'], - 'w': ['b', 'c', 'a'], - 't': [1, 2, 3]}) - g = pp.io.df_to_temporal_graph(df) - print(g) - - df = pd.DataFrame([ - ['a', 'b', 'c'], - ['b', 'c', 'a'], - [1, 2, 3] - ]) - g = pp.io.df_to_temporal_graph(df) - print(g) - ``` + ```py + + import pathpyG as pp + import pandas as pd + df = pd.DataFrame({ + 'v': ['a', 'b', 'c'], + 'w': ['b', 'c', 'a'], + 't': [1, 2, 3]}) + g = pp.io.df_to_temporal_graph(df) + print(g) + + df = pd.DataFrame([ + ['a', 'b', 'c'], + ['b', 'c', 'a'], + [1, 2, 3] + ]) + g = pp.io.df_to_temporal_graph(df) + print(g) + ``` """ # assign column names if no header is present no_header = all(isinstance(x, int) for x in df.columns.values.tolist()) @@ -335,59 +369,41 @@ def df_to_temporal_graph( col_names += ["edge_attr_{0}".format(i - 2)] df.columns = col_names - # optionally parse time stamps - if df["t"].dtype == "object": - # convert time stamps to seconds since epoch - df["t"] = pd.to_datetime(df["t"], format=timestamp_format) - # rescale time stamps - df["t"] = df["t"].astype("int64") // time_rescale - elif df["t"].dtype == "int64" or df["t"].dtype == "float64": - # rescale time stamps - df["t"] = df["t"] // time_rescale - elif pd.api.types.is_datetime64_any_dtype(df["t"]): - df["t"] = df["t"].astype("int64") // time_rescale - else: - raise ValueError( - "Column `t` must be of type `object`, `int64`, `float64`, or a datetime type. " - f"Found {df['t'].dtype} instead." - ) + # parse the time stamp column "t" + _parse_timestamp(df=df, timestamp_format=timestamp_format, time_rescale=time_rescale) + # optionally remove multiedges if not multiedges: df = df.drop_duplicates(subset=["v", "w", "t"]) + # Create index mapping and data object mapping = IndexMap(node_ids=np.unique(df[["v", "w"]].values)) data = Data( edge_index=mapping.to_idxs(df[["v", "w"]].values.T), time=torch.tensor(df["t"].values), - num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], + num_nodes=num_nodes if num_nodes is not None else mapping.node_ids.shape[0], # type: ignore ) - cols = df.columns.tolist() - cols.remove("v") - cols.remove("w") + + # add edge attributes + cols = [col for col in df.columns if col not in ["v", "w", "t"]] for col in cols: if col.startswith("edge_"): prefix = "" else: prefix = "edge_" - _parse_df_column( - df=df, - data=data, - attr=col, - prefix=prefix - ) + _parse_df_column(df=df, data=data, attr=col, prefix=prefix) + + # Create temporal graph object g = TemporalGraph(data=data, mapping=mapping) - - if is_undirected: - g = g.to_undirected() - + return g def graph_to_df(graph: Graph, node_indices: Optional[bool] = False) -> pd.DataFrame: - """Returns a pandas data frame for a given graph. + """Return a DataFrame for a given graph. - Returns a pandas dataframe data that contains all edges including edge + Returns a `pandas.DataFrame` that contains all edges including edge attributes. Node and network-level attributes are not included. To facilitate the import into network analysis tools that only support integer node identifiers, node uids can be replaced by a consecutive, zero-based @@ -398,32 +414,29 @@ def graph_to_df(graph: Graph, node_indices: Optional[bool] = False) -> pd.DataFr node_indices: whether nodes should be exported as integer indices Example: - ```py - import pathpyG as pp + ```py + import pathpyG as pp - n = pp.Graph.from_edge_list([('a', 'b'), ('b', 'c'), ('c', 'a')]) - df = pp.io.to_dataframe(n) - print(df) - ``` + n = pp.Graph.from_edge_list([('a', 'b'), ('b', 'c'), ('c', 'a')]) + df = pp.io.to_dataframe(n) + print(df) + ``` """ - df = pd.DataFrame() - - for v, w in graph.edges: - if node_indices: - v = graph.mapping.to_idx(v) - w = graph.mapping.to_idx(w) - edge_frame = pd.DataFrame.from_dict({"v": [v], "w": [w]}) - df = pd.concat([df, edge_frame], ignore_index=True, sort=False) + if node_indices: + vs = to_numpy(graph.data.edge_index[0]) + ws = to_numpy(graph.data.edge_index[1]) + else: + vs = graph.mapping.to_ids(to_numpy(graph.data.edge_index[0])) + ws = graph.mapping.to_ids(to_numpy(graph.data.edge_index[1])) + df = pd.DataFrame({**{"v": vs, "w": ws}, **{a: graph.data[a].tolist() for a in graph.edge_attrs()}}) - edge_attribute_df = pd.DataFrame.from_dict({a: graph.data[a] for a in graph.edge_attrs()}) - df = pd.concat([df, edge_attribute_df], axis=1) return df def temporal_graph_to_df(graph: TemporalGraph, node_indices: Optional[bool] = False) -> pd.DataFrame: - """Returns a pandas data frame for a given temporal graph. + """Return a DataFrame for a given temporal graph. - Returns a pandas dataframe data that contains all edges including edge + Returns a `pandas.DataFrame` that contains all edges including edge attributes. Node and network-level attributes are not included. To facilitate the import into network analysis tools that only support integer node identifiers, node uids can be replaced by a consecutive, zero-based @@ -434,26 +447,27 @@ def temporal_graph_to_df(graph: TemporalGraph, node_indices: Optional[bool] = Fa node_indices: whether nodes should be exported as integer indices Example: - ```py - import pathpyG as pp + ```py + import pathpyG as pp - n = pp.TemporalGraph.from_edge_list([('a', 'b', 1), ('b', 'c', 2), ('c', 'a', 3)]) - df = pp.io.to_df(n) - print(df) - ``` + n = pp.TemporalGraph.from_edge_list([('a', 'b', 1), ('b', 'c', 2), ('c', 'a', 3)]) + df = pp.io.to_df(n) + print(df) + ``` """ - df = pd.DataFrame() - - # export temporal graph - for v, w, t in graph.temporal_edges: - if node_indices: - v = graph.mapping.to_idx(v) - w = graph.mapping.to_idx(w) - edge_frame = pd.DataFrame.from_dict({"v": [v], "w": [w], "t": [t]}) - # data = pd.DataFrame.from_dict( - # {k: [v] for k, v in edge.attributes.items()}) - # edge_frame = pd.concat([edge_frame, data], axis=1) - df = pd.concat([edge_frame, df], ignore_index=True, sort=False) + if node_indices: + vs = to_numpy(graph.data.edge_index[0]) + ws = to_numpy(graph.data.edge_index[1]) + else: + vs = graph.mapping.to_ids(to_numpy(graph.data.edge_index[0])) + ws = graph.mapping.to_ids(to_numpy(graph.data.edge_index[1])) + df = pd.DataFrame( + { + **{"v": vs, "w": ws, "t": graph.data.time.tolist()}, + **{a: graph.data[a].tolist() for a in graph.edge_attrs()}, + } + ) + return df @@ -465,17 +479,20 @@ def read_csv_graph( multiedges: bool = False, **kwargs: Any, ) -> Graph: - """Reads a Graph or TemporalGraph from a csv file. To read a temporal graph, the csv file must have + """Read a `Graph` from a csv file. + + This method reads a graph from a `.csv`-file and converts it to a + `Graph` object. To read a temporal graph, the csv file must have a header with column `t` containing time stamps of edges Args: - loops: whether or not to add self_loops - directed: whether or not to intepret edges as directed - multiedges: whether or not to add multiple edges + filename: The path to the csv file containing the graph data. sep: character separating columns in the csv file header: whether or not the first line of the csv file is interpreted as header with column names - timestamp_format: format of timestamps - time_rescale: rescaling of timestamps + is_undirected: whether or not to interpret edges as undirected + multiedges: whether or not to allow multiple edges between the same node pair. By default multi edges are + ignored. + **kwargs: Additional keyword arguments passed to the `df_to_graph` function. Example: ```py @@ -497,20 +514,24 @@ def read_csv_temporal_graph( filename: str, sep: str = ",", header: bool = True, - is_undirected: bool = True, timestamp_format: str = "%Y-%m-%d %H:%M:%S", time_rescale: int = 1, **kwargs: Any, ) -> TemporalGraph: - """Reads a TemporalGraph from a csv file that minimally has three columns - containin source, target and time. + """Read a `TemporalGraph` from a csv file. + + This method reads a temporal graph from a `.csv`-file and converts it to a + `TemporalGraph` object. The csv file is expected to have a header with columns + `v`, `w`, and `t` containing source nodes, target nodes, and time stamps of edges, + respectively. Additional columns in the csv file will be interpreted as edge attributes. Args: + filename: The path to the csv file containing the temporal graph data. sep: character separating columns in the csv file header: whether or not the first line of the csv file is interpreted as header with column names - directed: whether or not to intepret edges as directed - timestamp_format: format of timestamps - time_rescale: rescaling of timestamps + timestamp_format: The format of the time stamps in the `t` column. + time_rescale: The factor by which to rescale the time stamps. Defaults to 1, meaning no rescaling. + **kwargs: Additional keyword arguments passed to the `df_to_temporal_graph` function. Example: ```py @@ -523,17 +544,25 @@ def read_csv_temporal_graph( df = pd.read_csv(filename, header=0, sep=sep) else: df = pd.read_csv(filename, header=None, sep=sep) - return df_to_temporal_graph( - df, is_undirected=is_undirected, timestamp_format=timestamp_format, time_rescale=time_rescale, **kwargs - ) + return df_to_temporal_graph(df, timestamp_format=timestamp_format, time_rescale=time_rescale, **kwargs) -def write_csv( - graph: Union[Graph, TemporalGraph], path_or_buf: Any = None, node_indices: bool = False, **pdargs: Any -) -> None: - """Stores all edges including edge attributes in a csv file.""" +def write_csv(graph: Union[Graph, TemporalGraph], node_indices: bool = False, **pdargs: Any) -> None: + """Store all edges including edge attributes in a csv file. + + This method stores a `Graph` or `TemporalGraph` as a `.csv` file. The csv file + will contain all edges including edge attributes. Node and network-level attributes + are not included. To facilitate the import into network analysis tools that only + support integer node identifiers, node uids can be replaced by a consecutive, + zero-based index. + + Args: + graph: The graph to export as pandas DataFrame + node_indices: whether nodes should be exported as integer indices + **pdargs: Additional keyword arguments passed to `pandas.DataFrame.to_csv`. + """ if isinstance(graph, TemporalGraph): frame = temporal_graph_to_df(graph=graph, node_indices=node_indices) else: frame = graph_to_df(graph=graph, node_indices=node_indices) - frame.to_csv(path_or_buf=path_or_buf, index=False, **pdargs) + frame.to_csv(index=False, **pdargs) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 026619a6..c7fd2bf5 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -1,41 +1,23 @@ -from __future__ import annotations +"""This module contains fixtures for testing the io module of pathpyG.""" import pytest - -from pathpyG.core.graph import Graph -import pandas as pd - - -@pytest.fixture -def df_graph() -> pd.DataFrame: - """DataFrame for simple graph with header and no edge attributes.""" - df = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"]}) - return df - - -@pytest.fixture -def df_graph_attribute() -> pd.DataFrame: - """DataFrame for simple graph with edge attributes and header.""" - df = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"], "edge_weight": [2.0, 1.0, 42.0]}) - return df +from pathpyG import Graph, TemporalGraph @pytest.fixture -def df_graph_attribute_no_header() -> pd.DataFrame: - """DataFrame for simple graph with edge attributes and no header.""" - df = pd.DataFrame([["a", "b", 2.0], ["b", "c", 1.0], ["c", "a", 42.0]]) - return df +def backward_idx() -> list[int]: + """Return a backward index.""" + return [2, 1, 0] @pytest.fixture -def df_temporal_graph() -> pd.DataFrame: - """DataFrame for simple temporal graph with header.""" - df = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"], "t": [1, 2, 3]}) - return df +def simple_graph() -> Graph: + """Return a simple directed graph.""" + return Graph.from_edge_list([("a", "b"), ("b", "c"), ("a", "c")]) @pytest.fixture -def df_temporal_graph_no_header() -> pd.DataFrame: - """DataFrame for simple temporal graph without header.""" - df = pd.DataFrame([["a", "b", 1], ["b", "c", 2], ["c", "a", 3]]) - return df +def simple_temporal_graph() -> TemporalGraph: + """Return a simple temporal graph.""" + tedges = [("a", "b", 1), ("b", "c", 5), ("c", "d", 9), ("c", "e", 9)] + return TemporalGraph.from_edge_list(tedges) diff --git a/tests/io/test_pandas.py b/tests/io/test_pandas.py index 1e8c9fea..8fb7ce01 100644 --- a/tests/io/test_pandas.py +++ b/tests/io/test_pandas.py @@ -1,40 +1,520 @@ """This module tests high-level functions of the pandas module.""" -import pytest +# pylint: disable=missing-function-docstring -from torch import tensor, equal +import pytest +import pandas as pd import numpy as np +import torch +from torch_geometric.data import Data + +from pathpyG import Graph, TemporalGraph +from pathpyG.io.pandas import ( + _iterable_re, + _number_re, + _integer_re, + _parse_timestamp, + _parse_df_column, + df_to_graph, + add_edge_attributes, + add_node_attributes, + df_to_temporal_graph, + graph_to_df, + temporal_graph_to_df, + read_csv_graph, + read_csv_temporal_graph, + write_csv, +) + + +def test_iterable_regex(): + assert _iterable_re.match("[1, 2, 3]") + assert _iterable_re.match("(1, 2, 3)") + assert not _iterable_re.match("{1, 2, 3}") + assert not _iterable_re.match("1, 2, 3") + assert not _iterable_re.match("1, 2, 3]") + assert not _iterable_re.match("(1, 2, 3") + assert _iterable_re.match("[[1, 2], [3, 4]]") + + +def test_number_regex(): + assert _number_re.match("1") + assert _number_re.match("1.0") + assert _number_re.match("1.0e10") + assert not _number_re.match("1,000") + assert not _number_re.match("one") + assert not _number_re.match("1.0.0") + + +def test_integer_regex(): + assert _integer_re.match("1") + assert _integer_re.match("1000") + assert not _integer_re.match("1.0") + assert not _integer_re.match("1.0e10") + assert not _integer_re.match("1,000") + assert not _integer_re.match("one") + assert not _integer_re.match("1.0.0") + + +def test_parse_timestamp_object_string(): + df = pd.DataFrame({"t": ["2023-01-01 12:00:00", "2023-01-01 13:00:00"]}) + _parse_timestamp(df) + # Should be int64 after conversion + assert np.issubdtype(df["t"].dtype, np.integer) + assert df["t"].iloc[1] > df["t"].iloc[0] + + +def test_parse_timestamp_object_string_with_format(): + df = pd.DataFrame({"t": ["01/01/2023 12:00", "01/01/2023 13:00"]}) + _parse_timestamp(df, timestamp_format="%d/%m/%Y %H:%M") + assert np.issubdtype(df["t"].dtype, np.integer) + assert df["t"].iloc[1] > df["t"].iloc[0] + + +def test_parse_timestamp_int64(): + df = pd.DataFrame({"t": [1000, 2000, 3000]}) + _parse_timestamp(df) + assert np.all(df["t"] == np.array([1000, 2000, 3000])) + + +def test_parse_timestamp_float64(): + df = pd.DataFrame({"t": [1000.0, 2000.0, 3000.0]}) + _parse_timestamp(df) + assert np.all(df["t"] == np.array([1000.0, 2000.0, 3000.0])) + + +def test_parse_timestamp_datetime64(): + df = pd.DataFrame({"t": pd.to_datetime(["2023-01-01", "2023-01-02"])}) + _parse_timestamp(df) + assert np.issubdtype(df["t"].dtype, np.integer) + assert df["t"].iloc[1] > df["t"].iloc[0] + + +def test_parse_timestamp_rescale(): + df = pd.DataFrame({"t": ["2023-01-01 12:00:00", "2023-01-01 13:00:00"]}) + _parse_timestamp(df, time_rescale=10**9) # convert to seconds + # Should be seconds since epoch + assert np.all(df["t"].diff().dropna() == 3600) + + +def test_parse_timestamp_invalid_type(): + df = pd.DataFrame({"t": [None, None]}) + with pytest.raises(ValueError, match="Column `t` must be of type"): + _parse_timestamp(df) + + +def test_parse_df_column_numeric(backward_idx): + df = pd.DataFrame({"attr": [1, 2, 3]}) + data = Data(edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]])) + _parse_df_column(df, data, "attr") + assert torch.equal(data["attr"], torch.tensor([1, 2, 3])) + + df = pd.DataFrame({"attr": ["1", "2", "3"]}) + _parse_df_column(df, data, "attr", prefix="edge_") + assert torch.equal(data["edge_attr"], torch.tensor([1, 2, 3], device=data.edge_index.device)) + + _parse_df_column(df, data, "attr", prefix="node_", idx=backward_idx) + expected_idx = torch.tensor([3, 2, 1]) + assert torch.equal(data["node_attr"], expected_idx) + + +def test_parse_df_column_float(backward_idx): + df = pd.DataFrame({"attr": [1.1, 2.2, 3.3]}) + data = Data(edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]])) + _parse_df_column(df, data, "attr") + assert torch.allclose(data["attr"], torch.tensor([1.1, 2.2, 3.3], dtype=torch.double)) + + df = pd.DataFrame({"attr": ["1.1", "2.2", "3.3"]}) + _parse_df_column(df, data, "attr", prefix="node_") + assert torch.allclose( + data["node_attr"], torch.tensor([1.1, 2.2, 3.3], dtype=torch.double, device=data.edge_index.device) + ) -from pathpyG.core.graph import Graph -from pathpyG.core.temporal_graph import TemporalGraph -from pathpyG.io.pandas import df_to_graph, df_to_temporal_graph + _parse_df_column(df, data, "attr", prefix="edge_", idx=backward_idx) + expected_idx = torch.tensor([3.3, 2.2, 1.1], dtype=torch.double) + assert torch.allclose(data["edge_attr"], expected_idx) -def test_df_to_graph(df_graph, df_graph_attribute, df_graph_attribute_no_header): - g: Graph = df_to_graph(df_graph) +def test_parse_df_column_iterable(backward_idx): + df = pd.DataFrame({"attr": [[1, 2], [3, 4], [5, 6]]}) + data = Data(edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]])) + _parse_df_column(df, data, "attr") + expected = torch.tensor([[1, 2], [3, 4], [5, 6]]) + assert torch.equal(data["attr"], expected) + + df = pd.DataFrame({"attr": ["[1, 2]", "[3, 4]", "[5, 6]"]}) + _parse_df_column(df, data, "attr", prefix="edge_") + assert torch.equal(data["edge_attr"], expected) + + df = pd.DataFrame({"attr": [(1, 2), (3, 4), (5, 6)]}) + _parse_df_column(df, data, "attr", prefix="node_", idx=backward_idx) + expected_idx = torch.tensor([[5, 6], [3, 4], [1, 2]]) + assert torch.equal(data["node_attr"], expected_idx) + + +def test_parse_df_column_string(backward_idx): + df = pd.DataFrame({"attr": ["foo", "bar", "baz"]}) + data = Data(edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]])) + _parse_df_column(df, data, "attr") + assert np.array_equal(data["attr"], np.array(["foo", "bar", "baz"])) + + _parse_df_column(df, data, "attr", prefix="edge_", idx=backward_idx) + expected_idx = np.array(["baz", "bar", "foo"]) + assert np.array_equal(data["edge_attr"], expected_idx) + + +def test_df_to_graph(): + df_graph = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"]}) + g = df_to_graph(df_graph) assert g.n == 3 assert g.m == 3 - g: Graph = df_to_graph(df_graph_attribute) + df_graph_attribute = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"], "edge_weight": [2.0, 1.0, 42.0]}) + g = df_to_graph(df_graph_attribute) assert g.n == 3 assert g.m == 3 assert "edge_weight" in g.edge_attrs() - assert equal(g.data.edge_weight, tensor([2.0, 1.0, 42.0])) + assert torch.equal(g.data.edge_weight, torch.tensor([2.0, 1.0, 42.0])) - g: Graph = df_to_graph(df_graph_attribute_no_header) + df_graph_attribute_no_header = pd.DataFrame([["a", "b", 2.0], ["b", "c", 1.0], ["c", "a", 42.0]]) + g = df_to_graph(df_graph_attribute_no_header) assert g.n == 3 assert g.m == 3 assert "edge_attr_0" in g.edge_attrs() - assert equal(g.data.edge_attr_0, tensor([2.0, 1.0, 42.0])) + assert torch.equal(g.data.edge_attr_0, torch.tensor([2.0, 1.0, 42.0])) + + df_graph_with_multi_edges = pd.DataFrame( + {"v": ["a", "b", "c", "a"], "w": ["b", "c", "a", "b"], "edge_weight": [2.0, 1.0, 42.0, 3.0]} + ) + g = df_to_graph(df_graph_with_multi_edges, multiedges=False) + assert g.n == 3 + assert g.m == 3 + + g = df_to_graph(df_graph_with_multi_edges, multiedges=True) + assert g.n == 3 + assert g.m == 4 + + df_graph_with_string_attr = pd.DataFrame( + {"v": ["a", "b", "c"], "w": ["b", "c", "a"], "edge_weight": ["a", "b", "c"]} + ) + g = df_to_graph(df_graph_with_string_attr, is_undirected=True) + assert g.n == 3 + assert g.m == 6 + + +def test_add_node_attributes_by_name(simple_graph): + df = pd.DataFrame({"v": ["b", "a", "c"], "x": [2, 1, 3], "node_y": [0.2, 0.1, 0.3]}) + add_node_attributes(df, simple_graph) + assert torch.equal(simple_graph.data["node_x"], torch.tensor([1, 2, 3])) + assert torch.allclose(simple_graph.data["node_y"], torch.tensor([0.1, 0.2, 0.3], dtype=torch.double)) + + +def test_add_node_attributes_by_index(simple_graph): + df = pd.DataFrame({"index": [1, 0, 2], "x": [20, 10, 30]}) + add_node_attributes(df, simple_graph) + assert torch.equal(simple_graph.data["node_x"], torch.tensor([10, 20, 30])) + + +def test_duplicate_node_attribute_raises(simple_graph): + df = pd.DataFrame( + { + "v": ["a", "a", "b", "c"], + "x": [1, 2, 3, 4], + } + ) + with pytest.raises(ValueError, match="multiple attribute values for single node"): + add_node_attributes(df, simple_graph) + + +def test_mismatch_nodes_raises(simple_graph): + df = pd.DataFrame({"v": ["a", "b", "d"], "x": [1, 2, 3]}) + with pytest.raises(ValueError, match="Mismatch between nodes"): + add_node_attributes(df, simple_graph) + + +def test_missing_v_and_index_raises(simple_graph): + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + with pytest.raises(ValueError, match="must either have `index` or `v` column"): + add_node_attributes(df, simple_graph) + + +def test_add_edge_attributes_basic(simple_graph): + df = pd.DataFrame({"v": ["a", "b", "a"], "w": ["b", "c", "c"], "weight": [1, 3, 2]}) + add_edge_attributes(df, simple_graph) + assert torch.allclose(simple_graph.data["edge_weight"], torch.tensor([1, 2, 3])) + + +def test_add_edge_attributes_with_prefix(simple_graph): + df = pd.DataFrame({"v": ["a", "b", "a"], "w": ["b", "c", "c"], "edge_score": [5, 6, 7]}) + add_edge_attributes(df, simple_graph) + assert torch.equal(simple_graph.data["edge_score"], torch.tensor([5, 7, 6])) + + +def test_add_edge_attributes_missing_edge_raises(simple_graph): + df = pd.DataFrame( + {"v": ["a", "x", "a"], "w": ["b", "c", "c"], "weight": [1.0, 2.0, 3.0]} # "x" does not exist in graph + ) + with pytest.raises(ValueError, match="Please ensure all nodes in the DataFrame are present in the graph."): + add_edge_attributes(df, simple_graph) + df = pd.DataFrame( + {"v": ["a", "b", "a"], "w": ["a", "c", "c"], "weight": [1.0, 2.0, 3.0]} # edge "a -> a" does not exist in graph + ) + with pytest.raises(ValueError, match="does not exist in the graph"): + add_edge_attributes(df, simple_graph) -def test_df_to_temporal_graph(df_temporal_graph, df_temporal_graph_no_header): - g: TemporalGraph = df_to_temporal_graph(df_temporal_graph) + +def test_add_edge_attributes_temporal(simple_temporal_graph): + df = pd.DataFrame({"v": ["a", "b", "c", "c"], "w": ["b", "c", "e", "d"], "t": [1, 5, 9, 9], "weight": [1, 2, 4, 3]}) + add_edge_attributes(df, simple_temporal_graph, time_attr="t") + assert torch.allclose(simple_temporal_graph.data["edge_weight"], torch.tensor([1, 2, 3, 4])) + + +def test_add_edge_attributes_temporal_to_few_edges(simple_temporal_graph): + df = pd.DataFrame({"v": ["a"], "w": ["b"], "t": [99], "weight": [1.0]}) # No such temporal edge + with pytest.raises(ValueError, match="Please ensure the DataFrame matches the number of edges in the graph"): + add_edge_attributes(df, simple_temporal_graph, time_attr="t") + + +def test_add_edge_attributes_temporal_missing_raises(simple_temporal_graph): + df = pd.DataFrame( + { + "v": ["a", "b", "c", "c"], + "w": ["b", "c", "d", "e"], + "t": [1, 5, 9, 10], # Time "10" does not exist in the graph + "weight": [1.0, 2.0, 3.0, 4.0], + } + ) + with pytest.raises(ValueError, match="does not exist at time"): + add_edge_attributes(df, simple_temporal_graph, time_attr="t") + + +def test_df_to_temporal_graph_basic(): + df = pd.DataFrame({"v": ["a", "b", "c"], "w": ["b", "c", "a"], "t": [1, 2, 3]}) + g = df_to_temporal_graph(df) + assert g.n == 3 + assert g.m == 3 + assert torch.equal(g.data.time, torch.tensor([1, 2, 3])) + + +def test_df_to_temporal_graph_with_edge_attr(): + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "t": [20, 10], "weight": [2.0, 1.0]}) + g = df_to_temporal_graph(df) + assert hasattr(g.data, "edge_weight") + # edge weights should be in the same order as edges (sorted by time) + assert torch.allclose(g.data.edge_weight, torch.tensor([1.0, 2.0], dtype=torch.double)) + + +def test_df_to_temporal_graph_multiedges_false_removes_duplicates(): + df = pd.DataFrame({"v": ["a", "a", "b"], "w": ["b", "b", "c"], "t": [1, 1, 2]}) + g = df_to_temporal_graph(df, multiedges=False) + assert g.m == 2 # duplicate (a, b, 1) should be removed + + +def test_df_to_temporal_graph_multiedges_true_keeps_duplicates(): + df = pd.DataFrame({"v": ["a", "a", "b"], "w": ["b", "b", "c"], "t": [1, 1, 2]}) + g = df_to_temporal_graph(df, multiedges=True) + assert g.m == 3 # duplicate (a, b, 1) is kept + + +def test_df_to_temporal_graph_no_header(): + df = pd.DataFrame([["a", "b", 1], ["b", "c", 2], ["c", "a", 3], ["a", "b", 4]]) + g = df_to_temporal_graph(df) + assert g.n == 3 + assert g.m == 4 + + +def test_df_to_temporal_graph_time_rescale(): + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "t": [1000, 2000]}) + g = df_to_temporal_graph(df, time_rescale=1000) + print(g.data.time) + assert torch.equal(g.data.time, torch.tensor([1, 2])) + + +def test_df_to_temporal_graph_with_extra_edge_attrs(): + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "t": [1, 2], "foo": [10, 20], "edge_bar": [0.1, 0.2]}) + g = df_to_temporal_graph(df) + assert hasattr(g.data, "edge_foo") + assert hasattr(g.data, "edge_bar") + assert torch.equal(g.data.edge_foo, torch.tensor([10, 20])) + assert torch.allclose(g.data.edge_bar, torch.tensor([0.1, 0.2], dtype=torch.double)) + + +def test_graph_to_df_basic(simple_graph): + df = graph_to_df(simple_graph) + assert set(df.columns) == {"v", "w"} + assert len(df) == 3 + assert set(df["v"]) == {"a", "b"} + assert set(df["w"]) == {"b", "c"} + + +def test_graph_to_df_with_edge_attr(simple_graph): + simple_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0]) + df = graph_to_df(simple_graph) + assert "edge_weight" in df.columns + assert list(df["edge_weight"]) == [1.0, 2.0, 3.0] + + +def test_graph_to_df_node_indices(simple_graph): + df = graph_to_df(simple_graph, node_indices=True) + assert set(df.columns) == {"v", "w"} + assert set(df["v"]) == {0, 1} + assert set(df["w"]) == {1, 2} + + +def test_graph_to_df_with_multiple_edge_attrs(simple_graph): + simple_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0]) + simple_graph.data.edge_label = torch.tensor([0, 1, 2]) + df = graph_to_df(simple_graph) + assert "edge_weight" in df.columns + assert "edge_label" in df.columns + assert list(df["edge_label"]) == [0, 1, 2] + + +def test_temporal_graph_to_df_basic(simple_temporal_graph): + df = temporal_graph_to_df(simple_temporal_graph) + assert set(df.columns) == {"v", "w", "t"} + assert len(df) == 4 + assert set(df["v"]) == {"a", "b", "c"} + assert set(df["w"]) == {"b", "c", "d", "e"} + assert set(df["t"]) == {1, 5, 9} + + +def test_temporal_graph_to_df_with_edge_attr(simple_temporal_graph): + simple_temporal_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0, 4.0]) + df = temporal_graph_to_df(simple_temporal_graph) + assert "edge_weight" in df.columns + assert list(df["edge_weight"]) == [1.0, 2.0, 3.0, 4.0] + + +def test_temporal_graph_to_df_node_indices(simple_temporal_graph): + df = temporal_graph_to_df(simple_temporal_graph, node_indices=True) + assert set(df.columns) == {"v", "w", "t"} + assert set(df["v"]) == {0, 1, 2} + assert set(df["w"]) == {1, 2, 3, 4} + assert set(df["t"]) == {1, 5, 9} + + +def test_temporal_graph_to_df_with_multiple_edge_attrs(simple_temporal_graph): + simple_temporal_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0, 4.0]) + simple_temporal_graph.data.edge_label = torch.tensor([0, 1, 0, 1]) + df = temporal_graph_to_df(simple_temporal_graph) + assert "edge_weight" in df.columns + assert "edge_label" in df.columns + assert list(df["edge_label"]) == [0, 1, 0, 1] + + +def test_read_csv_graph_basic(tmp_path): + # Create a simple CSV file + csv_path = tmp_path / "graph.csv" + df = pd.DataFrame({"v": ["a", "b", "a"], "w": ["b", "c", "c"]}) + df.to_csv(csv_path, index=False) + g = read_csv_graph(str(csv_path)) + assert isinstance(g, Graph) assert g.n == 3 assert g.m == 3 - assert equal(g.data.time, tensor([1.0, 2.0, 3.0])) + assert set(g.nodes) == {"a", "b", "c"} + + +def test_read_csv_graph_with_edge_attr(tmp_path): + csv_path = tmp_path / "graph_attr.csv" + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "edge_weight": [1.0, 2.0]}) + df.to_csv(csv_path, index=False) + g = read_csv_graph(str(csv_path)) + assert hasattr(g.data, "edge_weight") + assert torch.allclose(g.data.edge_weight, torch.tensor([1.0, 2.0], dtype=torch.double)) - g: TemporalGraph = df_to_temporal_graph(df_temporal_graph_no_header) + +def test_read_csv_graph_no_header(tmp_path): + csv_path = tmp_path / "graph_noheader.csv" + df = pd.DataFrame([["a", "b"], ["b", "c"], ["a", "c"]]) + df.to_csv(csv_path, index=False, header=False) + g = read_csv_graph(str(csv_path), header=False) assert g.n == 3 assert g.m == 3 - assert equal(g.data.time, tensor([1.0, 2.0, 3.0])) + assert set(g.nodes) == {"a", "b", "c"} + + +def test_read_csv_graph_multiedges(tmp_path): + csv_path = tmp_path / "graph_multi.csv" + df = pd.DataFrame({"v": ["a", "a", "b"], "w": ["b", "b", "c"]}) + df.to_csv(csv_path, index=False) + g = read_csv_graph(str(csv_path), multiedges=False) + assert g.m == 2 # duplicate (a, b) should be removed + g2 = read_csv_graph(str(csv_path), multiedges=True) + assert g2.m == 3 # all edges kept + + +def test_read_csv_temporal_graph_basic(tmp_path): + csv_path = tmp_path / "temporal_graph.csv" + df = pd.DataFrame({"v": ["a", "b", "c", "c"], "w": ["b", "c", "d", "e"], "t": [1, 5, 9, 9]}) + df.to_csv(csv_path, index=False) + g = read_csv_temporal_graph(str(csv_path)) + assert isinstance(g, TemporalGraph) + assert g.n == 5 + assert g.m == 4 + assert set(g.nodes) == {"a", "b", "c", "d", "e"} + assert set(g.data.time.tolist()) == {0, 4, 8, 8} or set(g.data.time.tolist()) == {1, 5, 9} + + +def test_read_csv_temporal_graph_with_edge_attr(tmp_path): + csv_path = tmp_path / "temporal_graph_attr.csv" + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "t": [1, 2], "edge_weight": [1.0, 2.0]}) + df.to_csv(csv_path, index=False) + g = read_csv_temporal_graph(str(csv_path)) + assert hasattr(g.data, "edge_weight") + assert torch.allclose(g.data.edge_weight, torch.tensor([1.0, 2.0], dtype=torch.double)) + + +def test_read_csv_temporal_graph_no_header(tmp_path): + csv_path = tmp_path / "temporal_graph_noheader.csv" + df = pd.DataFrame([["a", "b", 1], ["b", "c", 2], ["c", "d", 3]]) + df.to_csv(csv_path, index=False, header=False) + g = read_csv_temporal_graph(str(csv_path), header=False) + assert g.n == 4 + assert g.m == 3 + assert set(g.nodes) == {"a", "b", "c", "d"} + + +def test_read_csv_temporal_graph_time_rescale(tmp_path): + csv_path = tmp_path / "temporal_graph_rescale.csv" + df = pd.DataFrame({"v": ["a", "b"], "w": ["b", "c"], "t": [1000, 2000]}) + df.to_csv(csv_path, index=False) + g = read_csv_temporal_graph(str(csv_path), time_rescale=1000) + assert torch.equal(g.data.time, torch.tensor([1, 2])) + + +def test_write_csv_graph_and_read(tmp_path, simple_graph): + # Create a simple graph + simple_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0]) + csv_path = tmp_path / "graph.csv" + write_csv(simple_graph, path_or_buf=csv_path) + # Read back and check content + df = pd.read_csv(csv_path) + assert set(df.columns) == {"v", "w", "edge_weight"} + assert len(df) == 3 + assert set(df["v"]) == {"a", "b"} + assert set(df["w"]) == {"b", "c"} + assert list(df["edge_weight"]) == [1.0, 2.0, 3.0] + + +def test_write_csv_temporal_graph_and_read(tmp_path, simple_temporal_graph): + simple_temporal_graph.data.edge_weight = torch.tensor([1.0, 2.0, 3.0, 4.0]) + csv_path = tmp_path / "temporal_graph.csv" + write_csv(simple_temporal_graph, path_or_buf=csv_path) + df = pd.read_csv(csv_path) + assert set(df.columns) == {"v", "w", "t", "edge_weight"} + assert len(df) == 4 + assert set(df["v"]) == {"a", "b", "c"} + assert set(df["w"]) == {"b", "c", "d", "e"} + assert set(df["t"]) == {1, 5, 9} + assert list(df["edge_weight"]) == [1.0, 2.0, 3.0, 4.0] + + +def test_write_csv_with_node_indices(tmp_path, simple_graph): + csv_path = tmp_path / "graph_indices.csv" + write_csv(simple_graph, node_indices=True, path_or_buf=csv_path) + df = pd.read_csv(csv_path) + assert set(df.columns) == {"v", "w"} + assert set(df["v"]) == {0, 1} + assert set(df["w"]) == {1, 2}