Coverage for NeuralTSNE/NeuralTSNE/TSNE/neural_tsne.py: 19%

92 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-18 16:32 +0000

1import io 

2import sys 

3import os 

4from typing import Tuple 

5import argparse 

6from argparse_range import range_action 

7 

8import numpy as np 

9import torch 

10 

11import pytorch_lightning as L 

12from pytorch_lightning.callbacks import EarlyStopping 

13from pytorch_lightning.tuner import Tuner 

14 

15from NeuralTSNE.DatasetLoader import get_datasets 

16from NeuralTSNE.Utils.Validators.FileTypeValidators import ( 

17 FileTypeWithExtensionCheck, 

18 FileTypeWithExtensionCheckWithPredefinedDatasets, 

19) 

20from NeuralTSNE.Utils.Writers.StatWriters import save_results 

21from NeuralTSNE.Utils.Writers.LabelWriters import save_labels_data 

22from NeuralTSNE.Utils.Loaders.LabelLoaders import load_labels 

23from NeuralTSNE.Utils.Loaders.FileLoaders import ( 

24 load_npy_file, 

25 load_text_file, 

26 load_torch_dataset, 

27) 

28 

29from NeuralTSNE.TSNE.ParametricTSNE import ParametricTSNE 

30from NeuralTSNE.TSNE.Modules import DimensionalityReduction 

31 

32 

def run_tsne(
    input_file,
    iter=1000,
    labels=None,
    no_dims=2,
    perplexity=30.0,
    exclude_cols=None,
    step=1,
    exaggeration_iter=0,
    exaggeration_value=12,
    o="result.txt",
    model_save=None,
    model_load=None,
    shuffle=False,
    train_size=None,
    test_size=None,
    jobs=1,
    batch_size=1000,
    header=False,
    net_multipliers=None,
    variance_threshold=None,
    cpu=False,
    early_stopping_delta=1e-5,
    early_stopping_patience=3,
    lr=1e-3,
    auto_lr=False,
):
    """Run the parametric t-SNE pipeline end to end.

    Loads data (a text/.npy file, or a predefined torch dataset by name),
    builds a ``ParametricTSNE`` model wrapped in a Lightning
    ``DimensionalityReduction`` module, then either loads a saved model and
    predicts, or trains (optionally with LR auto-tuning) and predicts on the
    test split. Results and labels are written via the project's writers.

    Parameters mirror the CLI flags produced by :func:`parse_args`; ``iter``
    keeps its name for CLI compatibility even though it shadows the builtin.

    Parameters
    ----------
    input_file : io.TextIOWrapper or str
        Open data file, path, or the name of a predefined dataset.
    iter : int
        Maximum number of training epochs.
    labels : file-like or None
        Optional labels file, passed through ``load_labels``.
    no_dims : int
        Output dimensionality of the embedding.
    perplexity : float
        Perplexity of the Gaussian kernel.
    o : str
        Output filename; also forwarded to the writers.
    model_save, model_load : str or None
        Paths to save/load the trained network.
    train_size, test_size : float or None
        Split fractions forwarded to ``tsne.split_dataset``.
    net_multipliers : list[float] or None
        Per-layer width multipliers; defaults to ``[0.75, 0.75, 0.75]``.
    """
    available_datasets = []
    # Only consult the predefined-dataset registry when its module was imported.
    if "NeuralTSNE.DatasetLoader.get_datasets" in sys.modules:
        available_datasets = get_datasets._get_available_datasets()

    # None sentinel avoids a shared mutable default argument.
    if net_multipliers is None:
        net_multipliers = [0.75, 0.75, 0.75]

    skip_data_splitting = False
    if (
        not isinstance(input_file, io.TextIOWrapper)
        and len(available_datasets) > 0
        and (name := input_file.lower()) in available_datasets
    ):
        # Predefined torch dataset: comes pre-split, so skip manual splitting.
        train, test = load_torch_dataset(name, step, o)
        skip_data_splitting = True
        features = np.prod(train.dataset.data.shape[1:])
    else:
        labels = load_labels(labels)

        # NOTE(review): if input_file is an open TextIOWrapper, .endswith does
        # not exist — presumably the CLI validator yields a str here; confirm.
        if input_file.endswith(".npy"):
            data = load_npy_file(input_file, step, exclude_cols, variance_threshold)
        else:
            data = load_text_file(
                input_file, step, header, exclude_cols, variance_threshold
            )
        features = data.shape[1]

    tsne = ParametricTSNE(
        loss_fn="kl_divergence",
        n_components=no_dims,
        perplexity=perplexity,
        batch_size=batch_size,
        early_exaggeration_epochs=exaggeration_iter,
        early_exaggeration_value=exaggeration_value,
        max_iterations=iter,
        features=features,
        multipliers=net_multipliers,
        n_jobs=jobs,
        force_cpu=cpu,
    )

    early_stopping = EarlyStopping(
        "train_loss_epoch",
        min_delta=early_stopping_delta,
        patience=early_stopping_patience,
    )

    is_gpu = tsne.device == torch.device("cuda:0")

    trainer = L.Trainer(
        accelerator="gpu" if is_gpu else "cpu",
        devices=1 if is_gpu else tsne.n_jobs,
        log_every_n_steps=1,
        max_epochs=tsne.max_iterations,
        callbacks=[early_stopping],
    )

    classifier = DimensionalityReduction(tsne, shuffle, lr=lr)

    Y = None  # predictions; stays None when there is nothing to predict on
    if model_load:
        tsne.read_model(model_load)
        # With a pre-trained model everything goes into the test split.
        train, test = (
            tsne.split_dataset(data, y=labels, test_size=1)
            if not skip_data_splitting
            else tsne.create_dataloaders(train, test)
        )
        if not skip_data_splitting:
            save_labels_data({"o": o}, test)
        Y = trainer.predict(classifier, test)
    else:
        train, test = (
            tsne.split_dataset(
                data, y=labels, train_size=train_size, test_size=test_size
            )
            if not skip_data_splitting
            else tsne.create_dataloaders(train, test)
        )
        if auto_lr:
            tuner = Tuner(trainer)
            tuner.lr_find(classifier, train)
            # lr_find runs partial epochs; reset exaggeration bookkeeping.
            classifier.reset_exaggeration_status()
        if not skip_data_splitting:
            save_labels_data({"o": o}, test)
        trainer.fit(classifier, train, [test])
        if model_save:
            tsne.save_model(model_save)
        if test is not None:
            Y = trainer.predict(classifier, test)

    # BUGFIX: previously save_results was called unconditionally, raising
    # NameError on Y when training produced no test split; skip it instead.
    if Y is not None:
        save_results({"o": o, "step": step}, test, Y)

150 

151 

def parse_args():
    """Build and parse the CLI arguments for the t-SNE tool.

    Returns an ``argparse.Namespace`` whose attribute names match the
    parameters of :func:`run_tsne`, so ``run_tsne(**vars(args))`` works.

    Note
    ----
    ``-no_dims`` and ``-perplexity`` previously declared both
    ``required=True`` and a default; argparse ignores defaults on required
    options, so the defaults were dead. They are now optional with the same
    defaults ``run_tsne`` itself uses (2 and 30.0), which is
    backward-compatible for existing invocations.
    """
    available_datasets = []
    # Only offer predefined dataset names when the loader module was imported.
    if "NeuralTSNE.DatasetLoader.get_datasets" in sys.modules:
        available_datasets = get_datasets._get_available_datasets()
    parser = argparse.ArgumentParser(description="t-SNE Algorithm")
    parser.add_argument(
        "input_file",
        type=FileTypeWithExtensionCheckWithPredefinedDatasets(
            valid_extensions=("txt", "data", "npy"),
            available_datasets=available_datasets,
        ),
        help="Input file",
    )
    parser.add_argument(
        "-iter", type=int, default=1000, help="Number of iterations", required=False
    )
    parser.add_argument(
        "-labels",
        type=FileTypeWithExtensionCheck(valid_extensions=("txt", "data")),
        help="Labels file",
        required=False,
    )
    parser.add_argument(
        "-no_dims", type=int, help="Number of dimensions", required=False, default=2
    )
    parser.add_argument(
        "-perplexity",
        type=float,
        help="Perplexity of the Gaussian kernel",
        required=False,
        default=30.0,
    )
    parser.add_argument(
        "-exclude_cols", type=int, nargs="+", help="Columns to exclude", required=False
    )
    parser.add_argument(
        "-step", type=int, help="Step between samples", required=False, default=1
    )
    parser.add_argument(
        "-exaggeration_iter",
        type=int,
        help="Early exaggeration end",
        required=False,
        default=0,
    )
    parser.add_argument(
        "-exaggeration_value",
        type=float,
        help="Early exaggeration value",
        required=False,
        default=12,
    )
    parser.add_argument(
        "-o", type=str, help="Output filename", required=False, default="result.txt"
    )
    parser.add_argument(
        "-model_save",
        type=str,
        help="Filename to save model to",
        required=False,
    )
    parser.add_argument(
        "-model_load",
        type=str,
        help="Filename to load model from",
        required=False,
    )
    parser.add_argument("-shuffle", action="store_true", help="Shuffle data")
    parser.add_argument(
        "-train_size",
        type=float,
        action=range_action(0, 1),
        help="Train size",
        required=False,
    )
    parser.add_argument(
        "-test_size",
        type=float,
        action=range_action(0, 1),
        help="Test size",
        required=False,
    )
    # NOTE: -jobs is intentionally not exposed; run_tsne's default (1) applies.
    parser.add_argument(
        "-batch_size", type=int, help="Batch size", required=False, default=1000
    )

    parser.add_argument("-header", action="store_true", help="Data has header")
    parser.add_argument(
        "-net_multipliers",
        type=float,
        nargs="+",
        help="Network multipliers",
        default=[0.75, 0.75, 0.75],
    )
    parser.add_argument("-variance_threshold", type=float, help="Variance threshold")
    parser.add_argument("-cpu", action="store_true", help="Use CPU")
    parser.add_argument(
        "-early_stopping_delta", type=float, help="Early stopping delta", default=1e-5
    )
    parser.add_argument(
        "-early_stopping_patience", type=int, help="Early stopping patience", default=3
    )
    parser.add_argument("-lr", type=float, help="Learning rate", default=1e-3)
    parser.add_argument("-auto_lr", action="store_true", help="Auto learning rate")

    return parser.parse_args()

261 

262 

def main():
    """CLI entry point: parse the command line and run the t-SNE pipeline."""
    cli_options = vars(parse_args())
    run_tsne(**cli_options)