from typing import List, Tuple
import numpy as np
import torch
from torch.utils.data import Dataset, Subset
from NeuralTSNE.DatasetLoader import get_datasets
from NeuralTSNE.Utils.Writers.StatWriters import (
save_means_and_vars,
)
from NeuralTSNE.Utils.Writers.LabelWriters import (
save_torch_labels,
)
from NeuralTSNE.Utils.Preprocessing import prepare_data
def load_text_file(
input_file: str,
step: int,
header: bool,
exclude_cols: List[int],
variance_threshold: float,
) -> torch.Tensor:
"""
Load and preprocess data from a text file.
The function reads the data from the specified text file, skips the `header` if present,
and excludes specified columns if the `exclude_cols` list is provided. It then subsamples
the data based on the given `step` size. Finally, it preprocesses the data by applying
a `variance threshold` to perform feature selection and returns the resulting `torch.Tensor`.
Parameters
----------
`input_file` : `str`
The path to the input text file.
`step` : `int`
Step size for subsampling the data.
`header` : `bool`
A boolean indicating whether the file has a header.
`exclude_cols` : `List[int]`
A list of column indices to exclude from the data.
`variance_threshold` : `float`
Threshold for variance-based feature selection.
Returns
-------
`torch.Tensor`
Processed data tensor.
"""
    # Read the raw text data, optionally skipping the header row and
    # excluding the requested columns.
    with open(input_file, "r") as f:
        cols = None
        if header:
            f.readline()
        if exclude_cols:
            # Peek at the first data row to count columns, then rewind.
            last_pos = f.tell()
            ncols = len(f.readline().strip().split())
            f.seek(last_pos)
            cols = np.arange(0, ncols, 1)
            cols = tuple(np.delete(cols, exclude_cols))
        X = np.loadtxt(f, usecols=cols)
    # Subsample every `step`-th row, then apply variance-based feature selection.
    data = np.array(X[::step, :])
    data = prepare_data(variance_threshold, data)
    return data
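

# Illustrative usage sketch (not part of the original module): how
# `load_text_file` might be called. The file path and parameter values
# below are assumptions chosen for the example.
def _example_load_text_file() -> torch.Tensor:
    # Skip the header row, drop the first column, keep every 10th row,
    # and remove features whose variance falls below 1e-2.
    return load_text_file(
        "data/samples.txt",
        step=10,
        header=True,
        exclude_cols=[0],
        variance_threshold=1e-2,
    )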
def load_npy_file(
input_file: str,
step: int,
exclude_cols: List[int],
variance_threshold: float,
) -> torch.Tensor:
"""
Load and preprocess data from a `NumPy` (`.npy`) file.
The function loads data from the specified `NumPy` file, subsamples it based on the given `step` size,
and excludes specified columns if the `exclude_cols` list is provided. It then preprocesses the data
by applying a `variance threshold` to perform feature selection and returns the resulting `torch.Tensor`.
Parameters
----------
`input_file` : `str`
The path to the input `NumPy` file (`.npy`).
`step` : `int`
Step size for subsampling the data.
`exclude_cols` : `List[int]`
A list of column indices to exclude from the data.
`variance_threshold` : `float`
Threshold for variance-based feature selection.
Returns
-------
`torch.Tensor`
Processed data tensor.
"""
    # Load the array, subsample rows, and drop the excluded columns before
    # variance-based feature selection.
    data = np.load(input_file)
    data = data[::step, :]
    if exclude_cols:
        data = np.delete(data, exclude_cols, axis=1)
    data = prepare_data(variance_threshold, data)
    return data
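

# Illustrative usage sketch (not part of the original module): loading a
# `.npy` array with `load_npy_file`. The path and settings are assumptions.
def _example_load_npy_file() -> torch.Tensor:
    # Keep every 5th row, drop columns 0 and 1, and filter out features
    # whose variance falls below 1e-3.
    return load_npy_file(
        "data/samples.npy",
        step=5,
        exclude_cols=[0, 1],
        variance_threshold=1e-3,
    )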
def load_torch_dataset(name: str, step: int, output: str) -> Tuple[Dataset, Dataset]:
"""
Load and preprocess a `torch.Dataset`, returning `training` and `testing` subsets.
The function loads a `torch.Dataset` specified by the `name` parameter, extracts `training` and `testing` subsets,
and preprocesses the `training` subset by saving labels and calculating means and variances.
Parameters
----------
`name` : `str`
The name of the torch dataset to be loaded.
`step` : `int`
The step size for subsampling the training dataset.
`output` : `str`
The output file path for saving labels.
Returns
-------
`Tuple[Dataset, Dataset]`
A tuple containing the training and testing subsets.
Note
----
- The function uses the `name` parameter to load a torch dataset and extract training and testing subsets.
- The training subset is subsampled using the `step` parameter.
- Labels for the testing subset are saved to a file specified by the `output` parameter.
- Means and variances for the training subset are calculated and saved to the `"means_and_vars.txt"` file.
- The function returns a `tuple` containing the training and testing subsets.
"""
    # Load the named dataset, subsample the training split, and persist the
    # test labels plus the per-feature means and variances of the training data.
    train, test = get_datasets.get_dataset(name)
    train = Subset(train, range(0, len(train), step))
    save_torch_labels(output, test)
    train_data = torch.stack([row[0] for row in train])
    save_means_and_vars(train_data)
    return train, test
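

# Illustrative usage sketch (not part of the original module): fetching a
# torch dataset by name. The dataset name and output path are assumptions;
# the available names depend on `NeuralTSNE.DatasetLoader.get_datasets`.
def _example_load_torch_dataset() -> Tuple[Dataset, Dataset]:
    # Keep every 20th training sample and write the test labels to
    # "labels.txt"; training means and variances are saved as a side effect.
    return load_torch_dataset("mnist", step=20, output="labels.txt")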