Source code for pytspl.io.data_loaders.transportation_loader

"""Module for preprocessing and loading transportation network
datasets for analysis.
"""

import os

import pandas as pd
import pkg_resources

from pytspl.io.network_reader import (
    read_B1_B2,
    read_coordinates,
    read_flow,
    read_tntp,
)

# Filesystem path of the directory, shipped inside the ``pytspl`` package,
# that holds the transportation networks. The loaders below expect one
# sub-folder per dataset: ``<DATA_FOLDER>/<dataset>/<dataset>_net.tntp`` etc.
DATA_FOLDER = pkg_resources.resource_filename(
    "pytspl", "data/transportation_networks"
)

# Filesystem path of the Chicago sketch dataset folder. This dataset is read
# by ``load_chicago_sketch`` from CSV files instead of the TNTP files used by
# the other datasets.
CHICAGO_SKETCH_DATA_FOLDER = pkg_resources.resource_filename(
    "pytspl", "data/transportation_networks/chicago-sketch"
)


# Number of leading rows skipped (passed as ``skip_rows``) when reading a
# ``*_net.tntp`` network file -- presumably the metadata header of the file.
METADATA_ROWS = 8



[docs]
def list_transportation_datasets() -> list:
    """List the available transportation datasets.

    Every dataset lives in its own sub-directory of ``DATA_FOLDER``, so only
    directories are reported. Plain files in the folder (``README.md``,
    ``.DS_Store`` and any other stray file) are ignored.

    Returns:
        list: The names of the available transportation datasets, sorted
        alphabetically so the result does not depend on the arbitrary
        order in which the operating system lists the folder.
    """
    return sorted(
        entry
        for entry in os.listdir(DATA_FOLDER)
        if os.path.isdir(os.path.join(DATA_FOLDER, entry))
    )




[docs]
def load_flow_transportation(dataset: str, edges: list) -> dict:
    """Read the flow data of the transportation dataset.

    For every edge ``(source, target)`` the net flow is the volume recorded
    from ``source`` to ``target`` minus the volume recorded from ``target``
    to ``source``. A direction that has no record in the flow file
    contributes zero to the net flow.

    Args:
        dataset (str): The name of the dataset.
        edges (list): The list of edges in the simplicial complex, given as
            zero-indexed ``(source, target)`` pairs.

    Returns:
        dict: Mapping from each zero-indexed edge ``(source, target)`` to
        its net flow. Returns an empty dictionary if the flow data is empty.
    """
    flow_data_path = f"{DATA_FOLDER}/{dataset}/{dataset}_flow.tntp"
    df_flow = read_flow(filename=flow_data_path)
    if df_flow.empty:
        return {}

    # Build the (from, to) -> volume lookup once instead of filtering the
    # whole frame for every edge. ``setdefault`` keeps the first record when
    # a node pair appears more than once in the file.
    volumes = {}
    for origin, destination, volume in zip(
        df_flow["From "].tolist(),
        df_flow["To "].tolist(),
        df_flow["Volume "].to_numpy(),
    ):
        volumes.setdefault((origin, destination), volume)

    flow_dict = {}
    for edge in edges:
        source, target = edge
        # The flow file is one-indexed while the edges are zero-indexed, so
        # shift only for the lookup and keep the zero-indexed pair as key.
        # A repeated edge simply recomputes the same value.
        flow_pos = volumes.get((source + 1, target + 1), 0)
        flow_neg = volumes.get((target + 1, source + 1), 0)
        flow_dict[(source, target)] = flow_pos - flow_neg

    return flow_dict




[docs]
def load_transportation_dataset(dataset: str) -> tuple:
    """
    Build the simplicial complex of a transportation dataset together with
    its node coordinates and edge flows.

    Args:
        dataset (str): The name of the dataset.

    Returns:
        tuple:
            SimplicialComplex: The simplicial complex of the dataset.
            dict: The coordinates of the nodes. When the dataset provides
            no coordinates, they are generated from the simplicial complex
            (spring layout).
            dict: The flow data of the dataset, or an empty dictionary when
            the dataset has no flow data.
    """
    # The Chicago sketch dataset has its own file layout and loader.
    if dataset == "chicago-sketch":
        return load_chicago_sketch()

    # Node ids in the dataset files start at 1, so the readers are told not
    # to treat them as zero-based.
    start_index_zero = False

    dataset_prefix = f"{DATA_FOLDER}/{dataset}/{dataset}"
    net_file = f"{dataset_prefix}_net.tntp"
    node_file = f"{dataset_prefix}_node.tntp"

    # network -> simplicial complex
    builder = read_tntp(
        filename=net_file,
        src_col="init_node",
        dest_col="term_node",
        skip_rows=METADATA_ROWS,
        delimiter="\t",
        start_index_zero=start_index_zero,
    )
    sc = builder.to_simplicial_complex()

    # node coordinates, falling back to generated ones when none are read
    coordinates = read_coordinates(
        filename=node_file,
        node_id_col="node",
        x_col="X",
        y_col="Y",
        delimiter="\t",
        start_index_zero=start_index_zero,
    )
    if coordinates is None:
        coordinates = sc.generate_coordinates()

    # net flow for every edge of the complex
    flows = load_flow_transportation(dataset=dataset, edges=sc.edges)

    return sc, coordinates, flows




[docs]
def load_chicago_sketch() -> tuple:
    """
    Load the Chicago sketch dataset from its CSV files (incidence matrices,
    coordinates and flow).

    Returns:
        tuple:
            SimplicialComplex: The simplicial complex of the dataset.
            dict: The coordinates of the nodes.
            dict: The flow data of the dataset, keyed by edge.
    """
    folder = CHICAGO_SKETCH_DATA_FOLDER

    # simplicial complex built from the B1 / B2 files
    b1_file = f"{folder}/B1_chicago_sketch.csv"
    b2_file = f"{folder}/B2t_chicago_sketch.csv"
    builder, triangles = read_B1_B2(b1_file, b2_file)
    sc = builder.to_simplicial_complex(triangles=triangles)

    # node coordinates
    coordinates = read_coordinates(
        f"{folder}/coordinates_chicago_sketch.csv",
        node_id_col="Id",
        x_col="X",
        y_col="Y",
        delimiter=",",
        start_index_zero=True,
    )

    # flow values: a header-less CSV flattened into one 1-D array
    flow_frame = pd.read_csv(
        f"{folder}/flow_chicago_sketch.csv", delimiter=",", header=None
    )
    flow_values = flow_frame.to_numpy().flatten()

    # the i-th flow value belongs to the i-th edge of the complex
    flow_dict = {}
    for position, edge in enumerate(sc.edges):
        flow_dict[(edge[0], edge[1])] = flow_values[position]

    return sc, coordinates, flow_dict