Source code for pytspl.io.network_reader

"""Module for preprocessing simplicial complex network data.

The network can be read from the following formats:
- TNTP
- CSV
- B1 and B2 incidence matrices

Once the data is read, the SCBuilder object is created to build the
simplicial complex using the nodes, edges and triangles (based on
the user defined condition).

The module also provides functionality to read the coordinates
and flow data.
"""

import os

import numpy as np
import pandas as pd

from pytspl.simplicial_complex.scbuilder import SCBuilder



[docs]
def _extract_nodes_edges(
    df: pd.DataFrame, src_col: str, dest_col: str, start_index_zero: bool
) -> tuple:
    """
    Extract nodes and edges from the network dataframe.

    Edges are deduplicated as undirected pairs: a row is skipped when the
    same pair, in either orientation, has already been seen. The first
    occurrence decides the orientation that is kept.

    Args:
        df (pd.DataFrame): The dataframe containing the edges.
        src_col (str): The name of the column containing the source nodes.
        dest_col (str): The name of the column containing the destination
        nodes.
        start_index_zero (bool): True, if the node ids start from 0. False,
        if the node ids start from 1.

    Returns:
        tuple: (nodes, edges). Nodes is the list 0..max node id, edges is
        the sorted list of unique (from_node, to_node) tuples. Both lists
        are empty when the dataframe has no rows.
    """
    # subtract 1 from 1-indexed ids to make the node ids 0-indexed
    offset = 0 if start_index_zero else 1

    # the set gives O(1) duplicate checks; the list keeps the edge tuples
    seen = set()
    edges = []
    # -1 yields an empty node list when there are no rows at all
    max_node = -1

    for src, dest in zip(df[src_col], df[dest_col]):
        from_node = int(src) - offset
        to_node = int(dest) - offset

        max_node = max(max_node, from_node, to_node)

        if (from_node, to_node) in seen or (to_node, from_node) in seen:
            continue

        seen.add((from_node, to_node))
        edges.append((from_node, to_node))

    nodes = list(range(max_node + 1))
    # order edges
    edges.sort()

    return nodes, edges




[docs]
def read_tntp(
    filename: str,
    src_col: str,
    dest_col: str,
    skip_rows: int,
    delimiter: str = "\t",
    start_index_zero: bool = True,
) -> SCBuilder:
    """Read a tntp file and return a simplicial complex builder.

    Every column other than the source and destination columns is treated
    as an edge feature. When an edge appears in several rows (including
    the reversed direction), the features of its first row are used.

    Args:
        filename (str): The name of the tntp file.
        src_col (str): The name of the column containing the source nodes.
        dest_col (str): The name of the column containing the destination
        nodes.
        skip_rows (int): The number of (metadata) rows to skip in the tntp
        file.
        delimiter (str): The delimiter used in the tntp file. Defaults to
        tab.
        start_index_zero (bool): True, if the node ids start from 0. False,
        if the node ids start from 1.

    Returns:
        SCBuilder: SC builder object to build the simplicial complex.
    """
    # Read the file
    df = pd.read_csv(filename, skiprows=skip_rows, sep=delimiter)
    # trim whitespace around the column names
    df.columns = [s.strip() for s in df.columns]

    # drop the "~" and ";" marker columns (first and last columns)
    df.drop(["~", ";"], axis=1, inplace=True)

    # get the nodes and edges
    nodes, edges = _extract_nodes_edges(
        df=df,
        src_col=src_col,
        dest_col=dest_col,
        start_index_zero=start_index_zero,
    )

    # extract features if any
    feature_cols = [
        col for col in df.columns if col not in [src_col, dest_col]
    ]

    edge_features = {}
    node_features = {}
    if feature_cols:
        # The edge list is deduplicated and sorted, so its positions do not
        # line up with the dataframe rows. Match each row to its edge by
        # the (0-indexed) endpoints instead of by position.
        offset = 0 if start_index_zero else 1
        kept_edges = set(edges)
        for _, row in df.iterrows():
            edge = (int(row[src_col]) - offset, int(row[dest_col]) - offset)
            # only the first row of a kept edge provides its features
            if edge in kept_edges and edge not in edge_features:
                edge_features[edge] = row[feature_cols].to_dict()

    return SCBuilder(
        nodes=nodes,
        edges=edges,
        node_features=node_features,
        edge_features=edge_features,
    )




[docs]
def read_csv(
    filename: str,
    delimiter: str,
    src_col: str,
    dest_col: str,
    feature_cols: list = None,
    start_index_zero: bool = True,
) -> SCBuilder:
    """Read a csv file and return a simplicial complex builder.

    When an edge appears in several rows (including the reversed
    direction), the features of its first row are used.

    Args:
        filename (str): The name of the csv file.
        delimiter (str): The delimiter used in the csv file.
        src_col (str): The name of the column containing the source nodes.
        dest_col (str): The name of the column containing the destination
        nodes.
        feature_cols (list, optional): The names of the feature columns.
        Defaults to None, in which case no edge features are read.
        start_index_zero (bool): True, if the node ids start from 0. False,
        if the node ids start from 1.

    Returns:
       SCBuilder: SC builder object to build the simplicial complex.
    """
    df = pd.read_csv(filename, sep=delimiter)

    # get the nodes and edges
    nodes, edges = _extract_nodes_edges(
        df=df,
        src_col=src_col,
        dest_col=dest_col,
        start_index_zero=start_index_zero,
    )

    # add features if any
    edge_features = {}
    node_features = {}

    # truthiness covers both the None default and an empty list
    if feature_cols:
        # The edge list is deduplicated and sorted, so its positions do not
        # line up with the dataframe rows. Match each row to its edge by
        # the (0-indexed) endpoints instead of by position.
        offset = 0 if start_index_zero else 1
        kept_edges = set(edges)
        for _, row in df.iterrows():
            edge = (int(row[src_col]) - offset, int(row[dest_col]) - offset)
            # only the first row of a kept edge provides its features
            if edge in kept_edges and edge not in edge_features:
                edge_features[edge] = row[feature_cols].to_dict()

    return SCBuilder(
        nodes=nodes,
        edges=edges,
        node_features=node_features,
        edge_features=edge_features,
    )




[docs]
def read_B2(B2_filename: str, edges: np.ndarray) -> list:
    """
    Build the list of triangles described by a B2 incidence matrix file.

    The file is read without a header. Each row of the file is handled as
    one column of B2, i.e. as one triangle: the edges at the non-zero
    positions of that row are looked up and their endpoints are merged.

    Args:
        B2_filename (str): The name of the B2 incidence matrix
        file.
        edges (np.ndarray): The edges of the graph.

    Returns:
        list: One tuple per triangle holding the sorted, unique endpoint
        values of the edges selected by that triangle.
    """
    assert isinstance(edges, np.ndarray), "Edges should be a numpy array."

    # walking the rows of the raw file is the same as walking the columns
    # of the transposed matrix
    file_rows = pd.read_csv(B2_filename, header=None).to_numpy()

    return [
        tuple(sorted(set(edges[np.where(row != 0)[0]].flatten())))
        for row in file_rows
    ]




[docs]
def read_B1_B2(B1_filename: str, B2_filename: str) -> tuple:
    """
    Load a network from its B1 and B2 incidence matrix files.

    Each column of B1 is one edge: the row holding -1 is taken as the
    source node and the row holding 1 as the destination node. The
    triangles are then read from the B2 file using the sorted edge list.

    Args:
        B1_filename (str): The name of the B1 incidence matrix file.
        B2_filename (str): The name of the B2 incidence matrix file.

    Returns:
        SCBuilder: SC builder object to build the simplicial complex.
        list: List of triangles (2-simplices).
    """
    incidence = pd.read_csv(B1_filename, header=None).to_numpy()

    edges = []
    # iterating the transpose visits B1 one column (edge) at a time
    for column in incidence.T:
        tail = np.where(column == -1)[0][0]
        head = np.where(column == 1)[0][0]
        edges.append((tail, head))

    # node ids run from 0 up to the largest endpoint seen
    highest = max(node for edge in edges for node in edge)
    nodes = list(range(highest + 1))
    edges.sort()

    scbuilder = SCBuilder(nodes=nodes, edges=edges)
    triangles = read_B2(B2_filename, np.asarray(edges))

    return scbuilder, triangles




[docs]
def read_coordinates(
    filename: str,
    node_id_col: str,
    x_col: str,
    y_col: str,
    delimiter: str,
    start_index_zero: bool = True,
) -> dict:
    """
    Load node coordinates from a delimited text file.

    Column names are stripped of surrounding whitespace before the
    requested columns are looked up.

    Args:
        filename (str): The name of the file.
        node_id_col (str): The name of the column containing the node ids.
        x_col (str): The name of the column containing the x coordinates.
        y_col (str): The name of the column containing the y coordinates.
        delimiter (str): The delimiter used in the csv file.
        start_index_zero (bool): True, if the node ids start from 0. False,
        if the node ids start from 1.

    Returns:
        dict: A dictionary of coordinates (node_id : (x, y)), or None when
        the file does not exist.
    """
    if not os.path.exists(filename):
        return None

    coords = pd.read_csv(filename, sep=delimiter)
    coords.columns = [name.strip() for name in coords.columns]

    if not start_index_zero:
        # shift 1-indexed node ids so that they start from 0
        coords[node_id_col] = coords[node_id_col] - 1

    # map every node id to its (x, y) pair
    return {
        node_id: (x, y)
        for node_id, x, y in zip(
            coords[node_id_col], coords[x_col], coords[y_col]
        )
    }




[docs]
def read_flow(
    filename: str, sep: str = "\t", header="infer"
) -> pd.DataFrame:
    """
    Read the flow.

    Args:
        filename (str): The name of the flow file.
        sep (str): The delimiter used in the flow file. Defaults to tab.
        header: The header of the flow file, passed to pandas.read_csv.
        Defaults to infer.

    Returns:
        pd.DataFrame: The flow data. An empty dataframe is returned (and a
        warning is printed) when the file does not exist.
    """
    if not os.path.exists(filename):
        print("WARNING: Flow data file not found.")
        return pd.DataFrame()

    return pd.read_csv(filename, sep=sep, header=header)