Coverage for NeuralTSNE/NeuralTSNE/Utils/Loaders/FileLoaders/file_loaders.py: 100%

38 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-18 16:32 +0000

1from typing import List, Tuple 

2import numpy as np 

3import torch 

4from torch.utils.data import Dataset, Subset 

5 

6from NeuralTSNE.DatasetLoader import get_datasets 

7from NeuralTSNE.Utils.Writers.StatWriters import ( 

8 save_means_and_vars, 

9) 

10from NeuralTSNE.Utils.Writers.LabelWriters import ( 

11 save_torch_labels, 

12) 

13from NeuralTSNE.Utils.Preprocessing import prepare_data 

14 

15 

def load_text_file(
    input_file: str,
    step: int,
    header: bool,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Load and preprocess data from a text file.

    The function reads the data from the specified text file, skips the first line
    when `header` is set, and drops the columns listed in `exclude_cols` if any are given.
    It then keeps every `step`-th row and hands the result to `prepare_data` together
    with the `variance_threshold`.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input text file.
    `step` : `int`
        Step size for subsampling the rows.
    `header` : `bool`
        A boolean indicating whether the file has a header line to skip.
    `exclude_cols` : `List[int]`
        A list of column indices to exclude from the data. A falsy value
        (`None` or an empty list) keeps every column.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data`.

    Returns
    -------
    `torch.Tensor`
        The value returned by `prepare_data` for the subsampled data.

    Note
    ----
    - The file is opened in a `with` block, so the handle is released even when parsing fails.
    - The data is always parsed as two-dimensional (`ndmin=2`), so files containing
      a single row or a single remaining column can be row-subsampled as well.
    - The column count used for `exclude_cols` is taken from the first data line,
      split on whitespace.
    """
    cols = None
    # The context manager closes the handle even if reading or parsing raises.
    with open(input_file, "r") as text_file:
        if header:
            text_file.readline()
        if exclude_cols:
            # Peek at the first data row to count the columns, then rewind so
            # that np.loadtxt still sees that row.
            data_start = text_file.tell()
            ncols = len(text_file.readline().strip().split())
            text_file.seek(data_start)
            cols = tuple(np.delete(np.arange(0, ncols, 1), exclude_cols))

        # ndmin=2 keeps single-row / single-column inputs two-dimensional so
        # the row slicing below does not fail with an IndexError.
        X = np.loadtxt(text_file, usecols=cols, ndmin=2)

    data = np.array(X[::step, :])
    data = prepare_data(variance_threshold, data)

    return data

68 

69 

def load_npy_file(
    input_file: str,
    step: int,
    exclude_cols: List[int],
    variance_threshold: float,
) -> torch.Tensor:
    """
    Read a `NumPy` (`.npy`) file and run it through the preprocessing step.

    Rows are subsampled first (every `step`-th row is kept), then the columns listed
    in `exclude_cols` are removed if any are given, and finally the array is passed
    to `prepare_data` together with the `variance_threshold`.

    Parameters
    ----------
    `input_file` : `str`
        The path to the input `NumPy` file (`.npy`).
    `step` : `int`
        Step size for subsampling the rows.
    `exclude_cols` : `List[int]`
        Column indices to drop; a falsy value keeps every column.
    `variance_threshold` : `float`
        Threshold forwarded to `prepare_data`.

    Returns
    -------
    `torch.Tensor`
        The value returned by `prepare_data`.
    """
    loaded = np.load(input_file)
    subsampled = loaded[::step, :]

    if exclude_cols:
        # Column indices refer to the array as stored in the file.
        subsampled = np.delete(subsampled, exclude_cols, axis=1)

    return prepare_data(variance_threshold, subsampled)

107 

108 

def load_torch_dataset(name: str, step: int, output: str) -> Tuple[Dataset, Dataset]:
    """
    Load a named torch dataset and return its `training` and `testing` parts.

    The dataset pair is obtained from `get_datasets.get_dataset(name)`. The training part
    is reduced to every `step`-th sample, the labels of the testing part are written out,
    and the stacked training samples are handed to `save_means_and_vars`.

    Parameters
    ----------
    `name` : `str`
        The name of the torch dataset to be loaded.
    `step` : `int`
        The step size for subsampling the training dataset.
    `output` : `str`
        The output path passed to `save_torch_labels`.

    Returns
    -------
    `Tuple[Dataset, Dataset]`
        The subsampled training subset and the unmodified testing dataset.

    Note
    ----
    - Only the training part is subsampled; the testing part is returned as loaded.
    - Labels are saved for the testing part, not for the training part.
    - The first element of every training sample is stacked into one tensor before
      it is passed to `save_means_and_vars`; where the statistics are written
      is decided by that helper.
    """
    full_train, test = get_datasets.get_dataset(name)
    kept_indices = range(0, len(full_train), step)
    train = Subset(full_train, kept_indices)

    save_torch_labels(output, test)

    # Each sample is indexable; element 0 is the one that gets stacked.
    features = [sample[0] for sample in train]
    save_means_and_vars(torch.stack(features))

    return train, test