Coverage for NeuralTSNE/NeuralTSNE/Utils/Loaders/FileLoaders/file_loaders.py

1from typing import List, Tuple

2import numpy as np

3import torch

4from torch.utils.data import Dataset, Subset

6from NeuralTSNE.DatasetLoader import get_datasets

7from NeuralTSNE.Utils.Writers.StatWriters import (

8 save_means_and_vars,

10from NeuralTSNE.Utils.Writers.LabelWriters import (

11 save_torch_labels,

12)

13from NeuralTSNE.Utils.Preprocessing import prepare_data

def load_text_file(
    input_file: str,
    step: int,
    header: bool,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Load and preprocess data from a text file.

    The function reads the data from the specified text file, skips the first line
    when `header` is set, and drops the columns listed in `exclude_cols` if any are given.
    It then keeps every `step`-th row and hands the result, together with
    `variance_threshold`, to `prepare_data`.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input text file.
    `step` : `int`
        Step size for subsampling the rows.
    `header` : `bool`
        A boolean indicating whether the file has a header line to skip.
    `exclude_cols` : `List[int]`
        A list of column indices to exclude from the data. An empty list (or `None`) keeps all columns.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data` for variance-based feature selection.

    Returns
    -------
    `torch.Tensor`
        Processed data tensor as returned by `prepare_data`.

    Note
    ----
    The parsed data must be two-dimensional, because rows are subsampled with `X[::step, :]`.
    """
    cols = None
    # The context manager guarantees the handle is closed even when reading
    # or parsing raises; the path parameter is no longer rebound to a file object.
    with open(input_file, "r") as text_file:
        if header:
            text_file.readline()
        if exclude_cols:
            # Peek at the first data row to count the columns, then rewind so
            # that np.loadtxt still sees that row.
            last_pos = text_file.tell()
            ncols = len(text_file.readline().strip().split())
            text_file.seek(last_pos)
            cols = tuple(np.delete(np.arange(0, ncols, 1), exclude_cols))

        X = np.loadtxt(text_file, usecols=cols)

    data = np.array(X[::step, :])
    data = prepare_data(variance_threshold, data)

    return data

def load_npy_file(
    input_file: str,
    step: int,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Load and preprocess data stored in a `NumPy` (`.npy`) file.

    The array is loaded with `np.load`, every `step`-th row is kept, and the columns
    listed in `exclude_cols` (if any) are removed. The result is then passed,
    together with `variance_threshold`, to `prepare_data`.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input `NumPy` file (`.npy`).
    `step` : `int`
        Step size for subsampling the rows.
    `exclude_cols` : `List[int]`
        A list of column indices to exclude from the data. An empty list (or `None`) keeps all columns.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data` for variance-based feature selection.

    Returns
    -------
    `torch.Tensor`
        Processed data tensor as returned by `prepare_data`.
    """
    subsampled = np.load(input_file)[::step, :]

    # Nothing to exclude: preprocess the subsampled rows as they are.
    if not exclude_cols:
        return prepare_data(variance_threshold, subsampled)

    kept_columns = np.delete(subsampled, exclude_cols, axis=1)
    return prepare_data(variance_threshold, kept_columns)

107

108

def load_torch_dataset(name: str, step: int, output: str) -> Tuple[Dataset, Dataset]:
    """
    Load a `torch.Dataset` by name and return its `training` and `testing` subsets.

    The dataset pair is obtained from `get_datasets.get_dataset(name)`. The training set is
    subsampled by keeping every `step`-th item, the labels of the testing set are written via
    `save_torch_labels`, and the stacked training samples are handed to `save_means_and_vars`.

    Parameters
    ----------
    `name` : `str`
        The name of the torch dataset to be loaded.
    `step` : `int`
        The step size for subsampling the training dataset.
    `output` : `str`
        The output file path passed to `save_torch_labels`.

    Returns
    -------
    `Tuple[Dataset, Dataset]`
        A tuple containing the subsampled training subset and the testing dataset.

    Note
    ----
    - Only the training dataset is subsampled; the testing dataset is returned unchanged.
    - Labels are saved for the testing dataset, not for the training subset.
    - The first element of every training item is stacked into a single tensor and passed to
      `save_means_and_vars` (where that helper writes its output is defined outside this function).
    """
    full_train, test = get_datasets.get_dataset(name)
    kept_indices = range(0, len(full_train), step)
    train = Subset(full_train, kept_indices)

    save_torch_labels(output, test)

    # Each item is indexed as (sample, ...); only the sample part is needed here.
    samples = [item[0] for item in train]
    save_means_and_vars(torch.stack(samples))

    return train, test

Coverage for NeuralTSNE/NeuralTSNE/Utils/Loaders/FileLoaders/file_loaders.py: 100%

38 statements