Coverage for NeuralTSNE/NeuralTSNE/Utils/Loaders/FileLoaders/file_loaders.py: 100%
38 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-18 16:32 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-18 16:32 +0000
1from typing import List, Tuple
2import numpy as np
3import torch
4from torch.utils.data import Dataset, Subset
6from NeuralTSNE.DatasetLoader import get_datasets
7from NeuralTSNE.Utils.Writers.StatWriters import (
8 save_means_and_vars,
9)
10from NeuralTSNE.Utils.Writers.LabelWriters import (
11 save_torch_labels,
12)
13from NeuralTSNE.Utils.Preprocessing import prepare_data
def load_text_file(
    input_file: str,
    step: int,
    header: bool,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Load and preprocess data from a text file.

    The function opens the specified text file, consumes the first line when `header` is set,
    and, when `exclude_cols` is non-empty, restricts parsing to the remaining columns. The parsed
    array is subsampled along its first axis with the given `step` and then handed to
    `prepare_data` together with `variance_threshold`; the result of that call is returned.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input text file.
    `step` : `int`
        Step size for subsampling the rows of the data.
    `header` : `bool`
        A boolean indicating whether the file has a header line to skip.
    `exclude_cols` : `List[int]`
        A list of column indices to exclude from the data. An empty list (or `None`)
        keeps every column.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data`.

    Returns
    -------
    `torch.Tensor`
        Processed data as returned by `prepare_data`.

    Note
    ----
    - The file is opened inside a `with` block, so the handle is released even when
      reading or parsing raises.
    - The parsed data is indexed as a 2-D array; input that `np.loadtxt` parses into
      fewer dimensions (e.g. a single row or a single column) raises `IndexError`.
    """
    cols = None
    # Keep the path and the handle under separate names; the context manager
    # closes the handle on every exit path, including parse errors.
    with open(input_file, "r") as handle:
        if header:
            handle.readline()
        if exclude_cols:
            # Peek at the first data line to count whitespace-separated columns,
            # then rewind so np.loadtxt still sees that line.
            last_pos = handle.tell()
            ncols = len(handle.readline().strip().split())
            handle.seek(last_pos)
            cols = np.arange(0, ncols, 1)
            cols = tuple(np.delete(cols, exclude_cols))

        X = np.loadtxt(handle, usecols=cols)

    # np.array(...) copies the strided view into its own array.
    data = np.array(X[::step, :])
    data = prepare_data(variance_threshold, data)

    return data
def load_npy_file(
    input_file: str,
    step: int,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Load and preprocess data from a `NumPy` (`.npy`) file.

    The array stored in the file is read with `np.load`, every `step`-th row is kept,
    and the columns listed in `exclude_cols` (if any) are removed. The remaining array
    is handed to `prepare_data` together with `variance_threshold`, and the result of
    that call is returned.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input `NumPy` file (`.npy`).
    `step` : `int`
        Step size for subsampling the rows of the data.
    `exclude_cols` : `List[int]`
        A list of column indices to exclude from the data. An empty list (or `None`)
        keeps every column.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data`.

    Returns
    -------
    `torch.Tensor`
        Processed data as returned by `prepare_data`.
    """
    subsampled = np.load(input_file)[::step, :]

    # Nothing to drop: preprocess the subsampled rows as they are.
    if not exclude_cols:
        return prepare_data(variance_threshold, subsampled)

    remaining = np.delete(subsampled, exclude_cols, axis=1)
    return prepare_data(variance_threshold, remaining)
def load_torch_dataset(name: str, step: int, output: str) -> Tuple[Dataset, Dataset]:
    """
    Load a `torch.Dataset` by name and return its `training` and `testing` parts.

    The pair of datasets is obtained from `get_datasets.get_dataset(name)`. The training
    part is reduced to every `step`-th item, the labels of the testing part are written via
    `save_torch_labels`, and the first element of every kept training item is stacked into
    a single tensor that is passed to `save_means_and_vars`.

    Parameters
    ----------
    `name` : `str`
        The name of the torch dataset to be loaded.
    `step` : `int`
        The step size for subsampling the training dataset.
    `output` : `str`
        The output path forwarded to `save_torch_labels`.

    Returns
    -------
    `Tuple[Dataset, Dataset]`
        A tuple containing the subsampled training subset and the testing dataset.

    Note
    ----
    - Only the training dataset is subsampled; the testing dataset is returned as loaded.
    - Labels are saved for the testing dataset, not for the training subset.
    - Where the means and variances end up is decided by `save_means_and_vars`.
    """
    full_train, test_set = get_datasets.get_dataset(name)
    kept_indices = range(0, len(full_train), step)
    train_subset = Subset(full_train, kept_indices)

    save_torch_labels(output, test_set)

    # Each item is indexed at position 0 to pick the sample that gets stacked.
    samples = [item[0] for item in train_subset]
    save_means_and_vars(torch.stack(samples))

    return train_subset, test_set