# Source code for pytspl.io.dataset_loader

"""Module for preprocessing and loading datasets for analysis.
"""

from .data_loaders.forex_loader import load_forex_data
from .data_loaders.lastfm_loader import load_lastfm_1k_artist
from .data_loaders.paper_loader import load_paper_data
from .data_loaders.transportation_loader import (
    list_transportation_datasets,
    load_transportation_dataset,
)
from .data_loaders.wsn_loader import load_wsn_data

# Registry of the non-transportation datasets: maps each dataset name to the
# loader callable that `load_dataset` invokes with no arguments and unpacks
# into (simplicial complex, coordinates, flow). The key order is the order in
# which these names appear at the end of the list from `list_datasets()`.
DATASETS = {
    "paper": load_paper_data,
    "forex": load_forex_data,
    "lastfm-1k-artist": load_lastfm_1k_artist,
    "wsn": load_wsn_data,
}


def list_datasets() -> list:
    """
    Return the names of every dataset that can be loaded.

    Returns:
        list: The transportation dataset names followed by the keys of
            ``DATASETS``.
    """
    registered = list(DATASETS)
    return list_transportation_datasets() + registered


def load_dataset(dataset: str) -> tuple:
    """
    Load the dataset and return the simplicial complex and coordinates.

    Args:
        dataset (str): The name of the dataset.

    Raises:
        ValueError: If the dataset is not found.
        AssertionError: If the loader returns ``None`` for any of its
            three results, or if the number of coordinates differs from
            the number of nodes.

    Returns:
        SimplicialComplex: The simplicial complex of the dataset.
        dict: The coordinates of the nodes. If the coordinates do not
            exist, the coordinates are generated using spring layout.
        dict: The flow data of the dataset. If the flow data does not
            exist, an empty dictionary is returned.
    """
    datasets = list_datasets()
    if dataset not in datasets:
        raise ValueError(
            f"Dataset {dataset} not found. Available datasets: {datasets}"
        )

    if dataset in DATASETS:
        sc, coordinates, flow = DATASETS[dataset]()
    else:
        sc, coordinates, flow = load_transportation_dataset(dataset=dataset)

    # Raise explicitly rather than using `assert`: assert statements are
    # removed when Python runs with -O, which would let a broken loader
    # result through unchecked. AssertionError is kept as the exception type
    # so the failure mode is the same as before.
    loaded = (
        ("simplicial complex", sc),
        ("coordinates", coordinates),
        ("flow", flow),
    )
    for label, value in loaded:
        if value is None:
            raise AssertionError(
                f"Loader for dataset {dataset!r} returned None for {label}."
            )

    # each node should have a coordinate
    if len(sc.nodes) != len(coordinates):
        raise AssertionError(
            f"Dataset {dataset!r} has {len(sc.nodes)} nodes but "
            f"{len(coordinates)} coordinates."
        )

    # print summary of the dataset
    sc.print_summary()
    print(f"Coordinates: {len(coordinates)}")
    print(f"Flow: {len(flow)}")

    return sc, coordinates, flow