From b46cb3d97c79780cf681b09087174000ca545abc Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Tue, 25 Feb 2020 13:05:13 -0500 Subject: [PATCH 1/2] Add all of @ASvyatkovskiy transformer work from late 2019 --- examples/conf.yaml | 1 + examples/slurm.cmd | 10 + examples/transformer_learn.py | 153 +++++++ plasma/models/distributed_torch_runner.py | 464 ++++++++++++++++++++++ plasma/models/loader.py | 33 ++ plasma/transformer/__init__.py | 0 plasma/transformer/runner.py | 362 +++++++++++++++++ 7 files changed, 1023 insertions(+) create mode 100644 examples/transformer_learn.py create mode 100644 plasma/models/distributed_torch_runner.py create mode 100644 plasma/transformer/__init__.py create mode 100644 plasma/transformer/runner.py diff --git a/examples/conf.yaml b/examples/conf.yaml index dfda1145..9e2bea11 100644 --- a/examples/conf.yaml +++ b/examples/conf.yaml @@ -58,6 +58,7 @@ model: loss_scale_factor: 1.0 use_batch_norm: false torch: False + transformer: False # requires torch: True shallow: False shallow_model: num_samples: 1000000 # 1000000 # the number of samples to use for training diff --git a/examples/slurm.cmd b/examples/slurm.cmd index 3dcae884..4db356a3 100644 --- a/examples/slurm.cmd +++ b/examples/slurm.cmd @@ -27,3 +27,13 @@ rm /tigress/$USER/normalization/* export OMPI_MCA_btl="tcp,self,vader" srun python mpi_learn.py + +# single model replica PyTorch implementation of Transformer +# (set one rank, one core, ...) +# +# conda activate Py3 +# module load cudnn/cuda-10.0/7.5.0 +# module load cudatoolkit/10.0 +# module load openmpi/gcc/3.1.3/64 +# export LD_LIBRARY_PATH=/usr/local/cuda-10.0/extras/CUPTI/lib64:$LD_LIBRARY_PATH +# srun python transformer_learn.py diff --git a/examples/transformer_learn.py b/examples/transformer_learn.py new file mode 100644 index 00000000..ae7674d2 --- /dev/null +++ b/examples/transformer_learn.py @@ -0,0 +1,153 @@ +from plasma.models.loader import Loader +from plasma.preprocessor.preprocess import guarantee_preprocessed +from plasma.transformer.runner import train +from plasma.models.torch_runner import make_predictions_and_evaluate_gpu +from plasma.conf import conf + +from pprint import pprint +import numpy as np +import datetime +import logging +import random +import sys +import os + +import matplotlib +matplotlib.use('Agg') + +pprint(conf) + +if conf['data']['normalizer'] == 'minmax': + from plasma.preprocessor.normalize import MinMaxNormalizer as Normalizer +elif conf['data']['normalizer'] == 'meanvar': + from plasma.preprocessor.normalize import MeanVarNormalizer as Normalizer +elif conf['data']['normalizer'] == 'var': + # performs !much better than minmaxnormalizer + from plasma.preprocessor.normalize import VarNormalizer as Normalizer +elif conf['data']['normalizer'] == 'averagevar': + # performs !much better than minmaxnormalizer + from plasma.preprocessor.normalize import ( + AveragingVarNormalizer as Normalizer + ) +else: + print('unkown normalizer. exiting') + exit(1) + +if __name__ == '__main__': + logging.basicConfig( + level=logging.INFO, + format="%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s", + ) + LOGGER = logging.getLogger("transformer_learn") + + shot_list_dir = conf['paths']['shot_list_dir'] + shot_files = conf['paths']['shot_files'] + shot_files_test = conf['paths']['shot_files_test'] + train_frac = conf['training']['train_frac'] + stateful = conf['model']['stateful'] + + # FIXME change seed setting + np.random.seed(0) + random.seed(0) + + only_predict = len(sys.argv) > 1 + custom_path = None + if only_predict: + custom_path = sys.argv[1] + print("predicting using path {}".format(custom_path)) + + ##################################################### + # PREPROCESSING # + ##################################################### + # TODO(KGF): check tuple unpack + (shot_list_train, shot_list_validate, + shot_list_test) = guarantee_preprocessed(conf) + + ##################################################### + # NORMALIZATION # + ##################################################### + + print("normalization", end='') + nn = Normalizer(conf) + nn.train() + loader = Loader(conf, nn) + print("...done") + print('Training on {} shots, testing on {} shots'.format( + len(shot_list_train), len(shot_list_test))) + + + ##################################################### + # TRAINING # + ##################################################### + train(conf, shot_list_train.random_sublist(512), + shot_list_validate.random_sublist(256), loader) + #if not only_predict: + # p = old_mp.Process(target=train, + # args=(conf, shot_list_train, + # shot_list_validate, loader) + # ) + # p.start() + # p.join() + + ##################################################### + # PREDICTING # + ##################################################### + loader.set_inference_mode(True) + + # load last model for testing + print('saving results') + y_prime = [] + y_prime_test = [] + y_prime_train = [] + + y_gold = [] + y_gold_test = [] + y_gold_train = [] + + disruptive = [] + disruptive_train = [] + disruptive_test = [] + + # y_prime_train, y_gold_train, disruptive_train = + # make_predictions(conf, shot_list_train, loader) + # y_prime_test, y_gold_test, disruptive_test = + # make_predictions(conf, shot_list_test, loader) + + # TODO(KGF): check tuple unpack + (y_prime_train, y_gold_train, disruptive_train, roc_train, + loss_train) = make_predictions_and_evaluate_gpu( + conf, shot_list_train, loader, custom_path) + (y_prime_test, y_gold_test, disruptive_test, roc_test, + loss_test) = make_predictions_and_evaluate_gpu( + conf, shot_list_test, loader, custom_path) + print('=========Summary========') + print('Train Loss: {:.3e}'.format(loss_train)) + print('Train ROC: {:.4f}'.format(roc_train)) + print('Test Loss: {:.3e}'.format(loss_test)) + print('Test ROC: {:.4f}'.format(roc_test)) + + + disruptive_train = np.array(disruptive_train) + disruptive_test = np.array(disruptive_test) + + y_gold = y_gold_train + y_gold_test + y_prime = y_prime_train + y_prime_test + disruptive = np.concatenate((disruptive_train, disruptive_test)) + + shot_list_validate.make_light() + shot_list_test.make_light() + shot_list_train.make_light() + + save_str = 'results_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + result_base_path = conf['paths']['results_prepath'] + if not os.path.exists(result_base_path): + os.makedirs(result_base_path) + np.savez(result_base_path+save_str, y_gold=y_gold, y_gold_train=y_gold_train, + y_gold_test=y_gold_test, y_prime=y_prime, y_prime_train=y_prime_train, + y_prime_test=y_prime_test, disruptive=disruptive, + disruptive_train=disruptive_train, disruptive_test=disruptive_test, + shot_list_validate=shot_list_validate, + shot_list_train=shot_list_train, shot_list_test=shot_list_test, + conf=conf) + + print('finished.') diff --git a/plasma/models/distributed_torch_runner.py b/plasma/models/distributed_torch_runner.py new file mode 100644 index 00000000..f6ba678c --- /dev/null +++ b/plasma/models/distributed_torch_runner.py @@ -0,0 +1,464 @@ +from __future__ import print_function +from keras.utils.generic_utils import Progbar +from torch.nn.utils import weight_norm +import torch.optim as opt +from torch.autograd import Variable +import torch.nn as nn +import torch +from plasma.utils.downloading import makedirs_process_safe +from plasma.utils.performance import PerformanceAnalyzer +from plasma.utils.evaluation import get_loss_from_list +from functools import partial +import os +import numpy as np +import horovod.torch as hvd + +model_filename = 'torch_model.pt' + +# Horovod: initialize library. +hvd.init() + +local_rank = hvd.local_rank() +world_size = hvd.size() +rank = hvd.rank() +# Horovod: pin GPU to local rank. +torch.cuda.set_device(local_rank) +# Horovod: limit number of CPU threads to be used per worker. +torch.set_num_threads(1) + + +class FTCN(nn.Module): + def __init__(self, n_scalars, n_profiles, profile_size, + layer_sizes_spatial, kernel_size_spatial, + linear_size, output_size, + num_channels_tcn, kernel_size_temporal, dropout=0.1): + super(FTCN, self).__init__() + self.lin = InputBlock(n_scalars, n_profiles, profile_size, + layer_sizes_spatial, kernel_size_spatial, + linear_size, dropout) + self.input_layer = TimeDistributed(self.lin, batch_first=True) + self.tcn = TCN(linear_size, output_size, num_channels_tcn, + kernel_size_temporal, dropout) + self.model = nn.Sequential(self.input_layer, self.tcn) + + def forward(self, x): + return self.model(x) + + +class InputBlock(nn.Module): + def __init__(self, n_scalars, n_profiles, profile_size, layer_sizes, + kernel_size, linear_size, dropout=0.2): + super(InputBlock, self).__init__() + self.pooling_size = 2 + self.n_scalars = n_scalars + self.n_profiles = n_profiles + self.profile_size = profile_size + self.conv_output_size = profile_size + if self.n_profiles == 0: + self.net = None + self.conv_output_size = 0 + else: + self.layers = [] + for (i, layer_size) in enumerate(layer_sizes): + if i == 0: + input_size = n_profiles + else: + input_size = layer_sizes[i-1] + self.layers.append(weight_norm( + nn.Conv1d(input_size, layer_size, kernel_size))) + self.layers.append(nn.ReLU()) + self.conv_output_size = calculate_conv_output_size( + self.conv_output_size, 0, 1, 1, kernel_size) + self.layers.append(nn.MaxPool1d(kernel_size=self.pooling_size)) + self.conv_output_size = calculate_conv_output_size( + self.conv_output_size, 0, 1, self.pooling_size, + self.pooling_size) + self.layers.append(nn.Dropout2d(dropout)) + self.net = nn.Sequential(*self.layers) + self.conv_output_size = self.conv_output_size*layer_sizes[-1] + self.linear_layers = [] + + print("Final feature size = {}".format(self.n_scalars + + self.conv_output_size)) + self.linear_layers.append( + nn.Linear( + self.conv_output_size + + self.n_scalars, + linear_size)) + self.linear_layers.append(nn.ReLU()) + self.linear_layers.append(nn.Linear(linear_size, linear_size)) + self.linear_layers.append(nn.ReLU()) + print("Final output size = {}".format(linear_size)) + self.linear_net = nn.Sequential(*self.linear_layers) + + def forward(self, x): + if self.n_profiles == 0: + full_features = x # x_scalars + else: + if self.n_scalars == 0: + x_profiles = x + else: + x_scalars = x[:, :self.n_scalars] + x_profiles = x[:, self.n_scalars:] + x_profiles = x_profiles.contiguous().view( + x.size(0), self.n_profiles, self.profile_size) + profile_features = self.net(x_profiles).view(x.size(0), -1) + if self.n_scalars == 0: + full_features = profile_features + else: + full_features = torch.cat([x_scalars, profile_features], dim=1) + + out = self.linear_net(full_features) + return out + + +def calculate_conv_output_size(L_in, padding, dilation, stride, kernel_size): + return int(np.floor( + (L_in + 2*padding - dilation * (kernel_size-1) - 1)*1.0/stride + 1)) + + +class Chomp1d(nn.Module): + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, :-self.chomp_size].contiguous() + + +class TemporalBlock(nn.Module): + def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, + padding, dropout=0.2): + super(TemporalBlock, self).__init__() + self.conv1 = weight_norm(nn.Conv1d( + n_inputs, n_outputs, kernel_size, stride=stride, padding=padding, + dilation=dilation)) + self.chomp1 = Chomp1d(padding) + self.relu1 = nn.ReLU() + self.dropout1 = nn.Dropout2d(dropout) + + self.conv2 = weight_norm(nn.Conv1d( + n_outputs, n_outputs, kernel_size, stride=stride, padding=padding, + dilation=dilation)) + self.chomp2 = Chomp1d(padding) + self.relu2 = nn.ReLU() + self.dropout2 = nn.Dropout2d(dropout) + + self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, + self.dropout1, self.conv2, self.chomp2, + self.relu2, self.dropout2) + self.downsample = (nn.Conv1d(n_inputs, n_outputs, 1) + if n_inputs != n_outputs else None) + self.relu = nn.ReLU() + self.init_weights() + + def init_weights(self): + self.conv1.weight.data.normal_(0, 0.01) + self.conv2.weight.data.normal_(0, 0.01) + if self.downsample is not None: + self.downsample.weight.data.normal_(0, 0.01) + + def forward(self, x): + out = self.net(x) + res = x if self.downsample is None else self.downsample(x) + return self.relu(out + res) + +# dimensions are batch,channels,length + + +class TemporalConvNet(nn.Module): + def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2): + super(TemporalConvNet, self).__init__() + layers = [] + num_levels = len(num_channels) + for i in range(num_levels): + dilation_size = 2 ** i + in_channels = num_inputs if i == 0 else num_channels[i-1] + out_channels = num_channels[i] + layers += [TemporalBlock(in_channels, out_channels, kernel_size, + stride=1, dilation=dilation_size, + padding=(kernel_size-1) * dilation_size, + dropout=dropout)] + + self.network = nn.Sequential(*layers) + + def forward(self, x): + return self.network(x) + + +class TCN(nn.Module): + def __init__(self, input_size, output_size, num_channels, kernel_size, + dropout): + super(TCN, self).__init__() + self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, + dropout=dropout) + self.linear = nn.Linear(num_channels[-1], output_size) +# self.sig = nn.Sigmoid() + + def forward(self, x): + # x needs to have dimension (N, C, L) in order to be passed into CNN + output = self.tcn(x.transpose(1, 2)).transpose(1, 2) + output = self.linear(output) # .transpose(1,2)).transpose(1,2) + return output +# return self.sig(output) + + +class TimeDistributed(nn.Module): + def __init__(self, module, batch_first=False): + super(TimeDistributed, self).__init__() + self.module = module + self.batch_first = batch_first + + def forward(self, x): + if len(x.size()) <= 2: + return self.module(x) + + # Squash samples and timesteps into a single axis + # (samples * timesteps, input_size) + x_reshape = x.contiguous().view(-1, x.size(-1)) + + y = self.module(x_reshape) + + # We have to reshape Y + if self.batch_first: + # (samples, timesteps, output_size) + y = y.contiguous().view(x.size(0), -1, y.size(-1)) + else: + # (timesteps, samples, output_size) + y = y.view(-1, x.size(1), y.size(-1)) + + return y + + +def build_torch_model(conf): + dropout = conf['model']['dropout_prob'] + # dim = 10 + # lin = nn.Linear(input_size,intermediate_dim) + n_scalars, n_profiles, profile_size = get_signal_dimensions(conf) + # dim = n_scalars + n_profiles*profile_size + # input_size = dim + output_size = 1 + # intermediate_dim = 15 + + layer_sizes_spatial = [6, 3, 3] # [40,20,20] + kernel_size_spatial = 3 + linear_size = 5 + + num_channels_tcn = [10, 5, 3, 3] # [3]*5 + kernel_size_temporal = 3 # 3 + model = FTCN(n_scalars, n_profiles, profile_size, layer_sizes_spatial, + kernel_size_spatial, linear_size, output_size, + num_channels_tcn, kernel_size_temporal, dropout) + + return model + + +def get_signal_dimensions(conf): + # make sure all 1D indices are contiguous in the end! + use_signals = conf['paths']['use_signals'] + n_scalars = 0 + n_profiles = 0 + profile_size = 0 + # do we have any 1D indices? + is_1D_region = use_signals[0].num_channels > 1 + for sig in use_signals: + num_channels = sig.num_channels + if num_channels > 1: + profile_size = num_channels + n_profiles += 1 + is_1D_region = True + else: + assert not is_1D_region, ( + "make sure all use_signals are ordered such that ", + "1D signals come last!") + assert num_channels == 1 + n_scalars += 1 + is_1D_region = False + return n_scalars, n_profiles, profile_size + + +def apply_model_to_np(model, x): + # return + # model(Variable(torch.from_numpy(x).float()).unsqueeze(0)).squeeze( + # 0).data.numpy() + return model(Variable(torch.from_numpy(x).float())).data.numpy() + + +def make_predictions(conf, shot_list, loader, custom_path=None): + generator = loader.inference_batch_generator_full_shot(shot_list) + inference_model = build_torch_model(conf) + + if custom_path is None: + model_path = get_model_path(conf) + else: + model_path = custom_path + inference_model.load_state_dict(torch.load(model_path)) + # shot_list = shot_list.random_sublist(10) + + y_prime = [] + y_gold = [] + disruptive = [] + num_shots = len(shot_list) + + pbar = Progbar(num_shots) + while True: + x, y, mask, disr, lengths, num_so_far, num_total = next(generator) + # x, y, mask = Variable(torch.from_numpy(x_).float()), + # Variable(torch.from_numpy(y_).float()), + # Variable(torch.from_numpy(mask_).byte()) + output = apply_model_to_np(inference_model, x) + for batch_idx in range(x.shape[0]): + curr_length = lengths[batch_idx] + y_prime += [output[batch_idx, :curr_length, 0]] + y_gold += [y[batch_idx, :curr_length, 0]] + disruptive += [disr[batch_idx]] + pbar.add(1.0) + if len(disruptive) >= num_shots: + y_prime = y_prime[:num_shots] + y_gold = y_gold[:num_shots] + disruptive = disruptive[:num_shots] + break + return y_prime, y_gold, disruptive + + +def make_predictions_and_evaluate_gpu(conf, shot_list, loader, + custom_path=None): + y_prime, y_gold, disruptive = make_predictions( + conf, shot_list, loader, custom_path) + analyzer = PerformanceAnalyzer(conf=conf) + roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive) + loss = get_loss_from_list(y_prime, y_gold, conf['data']['target']) + return y_prime, y_gold, disruptive, roc_area, loss + + +def get_model_path(conf): + return (conf['paths']['model_save_path'] + 'torch/' + + model_filename) # save_prepath + model_filename + + +def train_epoch(model, data_gen, optimizer, scheduler, loss_fn): + loss = 0 + total_loss = 0 + num_so_far = 0 + x_, y_, mask_, num_so_far_start, num_total = next(data_gen) + num_so_far = num_so_far_start + step = 0 + while True: + # print(y) + x, y, mask = Variable( + torch.from_numpy(x_).float()), Variable( + torch.from_numpy(y_).float()), Variable( + torch.from_numpy(mask_).bool()) + # print(y) + optimizer.zero_grad() + # output = model(x.unsqueeze(0)).squeeze(0) + output = model(x) # .unsqueeze(0)).squeeze(0) + output_masked = torch.masked_select(output, mask) + y_masked = torch.masked_select(y, mask) + # print(y.shape,output.shape) + loss = loss_fn(output_masked, y_masked) + total_loss += loss.item() + # count += output.size(0) + + # if args.clip > 0: + # torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) + loss.backward() + optimizer.step() + scheduler.step() + step += 1 + print("[{}] [{}/{}] loss: {:.3f}, ave_loss: {:.3f}".format( + step, num_so_far - num_so_far_start, num_total, loss.item(), + total_loss/step)) + if num_so_far-num_so_far_start >= num_total: + break + x_, y_, mask_, num_so_far, num_total = next(data_gen) + return step, loss.item(), total_loss, num_so_far, 1.0*num_so_far/num_total + + +def hvd_average_scalars(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def train(conf, shot_list_train, shot_list_validate, loader): + np.random.seed(1) + data_gen = partial( + loader.training_batch_generator_full_shot_partial_reset, + shot_list=shot_list_train)() + print('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), shot_list_validate.num_disruptive())) + print('training: {} shots, {} disruptive'.format( + len(shot_list_train), shot_list_train.num_disruptive())) + + loader.set_inference_mode(False) + + train_model = build_torch_model(conf) + num_epochs = conf['training']['num_epochs'] + patience = conf['callbacks']['patience'] + lr_decay = conf['model']['lr_decay'] + # batch_size = conf['training']['batch_size'] + lr = conf['model']['lr'] + # clipnorm = conf['model']['clipnorm'] + e = 0 + # warmup_steps = conf['model']['warmup_steps'] + # num_batches_minimum = conf['training']['num_batches_minimum'] + + print('{} epochs left to go'.format(num_epochs - 1 - e)) + + if conf['callbacks']['mode'] == 'max': + best_so_far = -np.inf + cmp_fn = max + else: + best_so_far = np.inf + cmp_fn = min + optim = opt.Adam(train_model.parameters(), lr=lr) + scheduler = opt.lr_scheduler.ExponentialLR(optim, lr_decay) + + hvd.broadcast_parameters(train_model.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optim, root_rank=0) + + optimizer_args = {'op': hvd.Average, 'compression': hvd.Compression.fp16, 'named_parameters': train_model.named_parameters()} + optimizer = hvd.DistributedOptimizer(optim, **optimizer_args) + + train_model.train() + not_updated = 0 + # total_loss = 0 + # count = 0 + loss_fn = nn.MSELoss(size_average=True) + model_path = get_model_path(conf) + makedirs_process_safe(os.path.dirname(model_path)) + while e < num_epochs - 1: + print('\nEpoch {}/{}'.format(e, num_epochs)) + (step, ave_loss, curr_loss, num_so_far, + effective_epochs) = train_epoch(train_model, data_gen, optimizer, scheduler, + loss_fn) + e = effective_epochs + loader.verbose = False # True during the first iteration + # if task_index == 0: + # specific_builder.save_model_weights(train_model,int(round(e))) + + if rank == 0: + torch.save(train_model.state_dict(), model_path) + + _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu( + conf, shot_list_validate, loader) + + best_so_far = cmp_fn(roc_area, best_so_far) + + # stop_training = False + print('=========Summary======== for epoch{}'.format(step)) + print('Training Loss numpy: {:.3e}'.format(ave_loss)) + print('Validation Loss: {:.3e}'.format(loss)) + print('Validation ROC: {:.4f}'.format(roc_area)) + + # only save model weights if the quantity we are tracking is improving + if best_so_far != roc_area: + print("No improvement, still saving model") + not_updated += 1 + else: + print("Saving model") + # specific_builder.delete_model_weights(train_model,int(round(e))) + if not_updated > patience: + print("Stopping training due to early stopping") + break diff --git a/plasma/models/loader.py b/plasma/models/loader.py index 39e92d27..52e272ea 100644 --- a/plasma/models/loader.py +++ b/plasma/models/loader.py @@ -835,6 +835,39 @@ def get_batch_size(batch_size, prediction_mode): def get_num_skips(length, skip): return 1 + (length-1)//skip + #FIXME Alexeys + def simple_batch_generator(self, shot_list, max_len=2048, inference=False): + + batch_size = self.conf['training']['batch_size'] + sig, res = self.get_signal_result_from_shot(shot_list.shots[0]) + Xbuff = np.zeros((batch_size, max_len, sig.shape[1])) + Ybuff = np.zeros((batch_size, max_len, res.shape[1])) + + num_total = len(shot_list) + #num_batches = num_total//batch_size + disr = np.zeros(batch_size, dtype=bool) + + while True: + num_so_far = 0 + # the list of all shots + if not inference: + shot_list.shuffle() + + for i in range(num_total): + shot = self.sample_shot_from_list_given_index(shot_list, i) + sig, res = self.get_signal_result_from_shot(shot) + sig = sig[-max_len:,:] + res = res[-max_len:,:] + Xbuff[i%batch_size, -sig.shape[0]:, :] = sig + Ybuff[i%batch_size, -res.shape[0]:, :] = res + disr[i%batch_size] = shot.is_disruptive_shot() + + if i % batch_size == 0: + num_so_far += batch_size + + yield Xbuff, Ybuff, num_so_far, num_total, disr + #Xbuff = np.zeros((batch_size, max_len, sig.shape[1])) + #Ybuff = np.zeros((batch_size, max_len, res.shape[1])) class ProcessGenerator(object): def __init__(self, generator): diff --git a/plasma/transformer/__init__.py b/plasma/transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plasma/transformer/runner.py b/plasma/transformer/runner.py new file mode 100644 index 00000000..5fb89982 --- /dev/null +++ b/plasma/transformer/runner.py @@ -0,0 +1,362 @@ +from torch.nn.utils import weight_norm +import torch.optim as opt +import torch.nn as nn +import torch + +from keras.utils.generic_utils import Progbar +from plasma.utils.downloading import makedirs_process_safe +from plasma.utils.performance import PerformanceAnalyzer +from plasma.utils.evaluation import get_loss_from_list +from plasma.models.torch_runner import ( + #make_predictions_and_evaluate_gpu, + #make_predictions, + get_signal_dimensions, +) + +from functools import partial +import os +import numpy as np +import logging +import random +import tqdm + +model_filename = "torch_model.pt" +LOGGER = logging.getLogger("plasma.transformer.runner") + +global device +device = "cuda:0" if torch.cuda.is_available() else "cpu" + +# if torch.cuda.is_available(): +# torch.cuda.set_device(device_id) +# device = torch.device("cuda", index=device_id) +# else: +# device = torch.device("cpu") + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + os.environ["PYTHONHASHSEED"]="0" + +class TransformerNet(nn.Module): + def __init__( + self, + n_scalars, + n_profiles, + profile_size, + layer_sizes_spatial, + kernel_size_spatial, + linear_size, + dropout=0.1, + ): + super(TransformerNet, self).__init__() + self.input_layer = InputBlock( + n_scalars, + n_profiles, + profile_size, + layer_sizes_spatial, + kernel_size_spatial, + linear_size, + dropout, + ) + self.temporal_encoder = TransformerSequenceEncoder() + self.model = nn.Sequential(self.input_layer, self.temporal_encoder) + + def forward(self, x): + return self.model(x) + + +class InputBlock(nn.Module): + def __init__( + self, + n_scalars, + n_profiles, + profile_size, + layer_sizes, + kernel_size, + linear_size, + dropout=0.2, + ): + super(InputBlock, self).__init__() + self.pooling_size = 2 + self.n_scalars = n_scalars + self.n_profiles = n_profiles + self.profile_size = profile_size + self.conv_output_size = profile_size + if self.n_profiles == 0: + self.net = None + self.conv_output_size = 0 + else: + self.layers = [] + for (i, layer_size) in enumerate(layer_sizes): + if i == 0: + input_size = n_profiles + else: + input_size = layer_sizes[i - 1] + self.layers.append( + weight_norm(nn.Conv1d(input_size, layer_size, kernel_size)) + ) + self.layers.append(nn.ReLU()) + self.conv_output_size = calculate_conv_output_size( + self.conv_output_size, 0, 1, 1, kernel_size + ) + self.layers.append(nn.MaxPool1d(kernel_size=self.pooling_size)) + self.conv_output_size = calculate_conv_output_size( + self.conv_output_size, 0, 1, self.pooling_size, self.pooling_size + ) + self.layers.append(nn.Dropout2d(dropout)) + self.net = nn.Sequential(*self.layers) + self.conv_output_size = self.conv_output_size * layer_sizes[-1] + self.linear_layers = [] + + print("Final feature size = {}".format(self.n_scalars + self.conv_output_size)) + self.linear_layers.append( + nn.Linear(self.conv_output_size + self.n_scalars, linear_size) + ) + self.linear_layers.append(nn.ReLU()) + self.linear_layers.append(nn.Linear(linear_size, linear_size)) + self.linear_layers.append(nn.ReLU()) + print("Final output size = {}".format(linear_size)) + self.linear_net = nn.Sequential(*self.linear_layers) + + def forward(self, x): + if self.n_profiles == 0: + full_features = x # x_scalars + else: + if self.n_scalars == 0: + x_profiles = x + else: + x_scalars = x[:, : self.n_scalars] + x_profiles = x[:, self.n_scalars :] + x_profiles = x_profiles.contiguous().view( + x.size(0), self.n_profiles, self.profile_size + ) + profile_features = self.net(x_profiles).view(x.size(0), -1) + if self.n_scalars == 0: + full_features = profile_features + else: + full_features = torch.cat([x_scalars, profile_features], dim=1) + + # FIXME do not use linear layers + # out = self.linear_net(full_features) + out = full_features + return out + + +class TransformerSequenceEncoder(nn.Module): + def __init__( + self, + max_seq_length=2048, + d_model=11, + num_layers=6, + dim_feedforward=1024, + nhead=11, + dropout=0.1, + ): + super(TransformerSequenceEncoder, self).__init__() + + self.__transformer_encoder = nn.TransformerEncoder( + encoder_layer=nn.TransformerEncoderLayer( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + ), + num_layers=num_layers, + norm=nn.LayerNorm(d_model), + ) + + self.__max_seq_length = max_seq_length + self.__d_model = d_model + # FIXME + self.__positional_encodings = nn.Embedding(max_seq_length, d_model).float() + + def forward(self, x): + """ + Shape of x is sequence, length + """ + # Force-pad + mask = ( + torch.arange(x.shape[1], device=device) + .unsqueeze(0) + .lt(torch.tensor([self.__max_seq_length], device=device).unsqueeze(-1)) + ) + transformer_input = x * mask.unsqueeze(-1).float() # B x max_len x D + + positional_encodings = self.__positional_encodings( + torch.arange(x.shape[1], dtype=torch.int64, device=device) + ).unsqueeze(0) + transformer_input = transformer_input + positional_encodings # B x max_len x D + + out = self.__transformer_encoder( + transformer_input #.transpose(0, 1), src_key_padding_mask=~mask + ) + return out + + +def build_torch_model(conf): + + dropout = conf["model"]["dropout_prob"] + n_scalars, n_profiles, profile_size = get_signal_dimensions(conf) + + output_size = 1 + layer_sizes_spatial = [6, 3, 3] + kernel_size_spatial = 3 + linear_size = 5 #FIXME Alexeys there will be no linear layers + + model = TransformerNet( + n_scalars, + n_profiles, + profile_size, + layer_sizes_spatial, + kernel_size_spatial, + linear_size, + dropout, + ) + model.to(device) + + return model + + +def get_model_path(conf): + return ( + conf["paths"]["model_save_path"] + "torch/" + model_filename + ) # save_prepath + model_filename + + +def train_epoch(model, data_gen, optimizer, scheduler, loss_fn): + + loss = 0 + total_loss = 0 + + step = 0 + while True: + x_, y_, num_so_far, num_total, _ = next(data_gen) + + x = torch.from_numpy(x_).float().to(device) + y = torch.from_numpy(y_).float().to(device) + + optimizer.zero_grad() + output = model(x) + loss = loss_fn(output, y) + total_loss += loss.item() + + loss.backward() + optimizer.step() + scheduler.step() + step += 1 + + LOGGER.info( + f"[{step}] [{num_so_far}/{num_total}] loss: {loss.item()}, ave_loss: {total_loss / step}" + ) + if num_so_far >= num_total: + break + + return step, loss.item(), total_loss, num_so_far, 1.0 * num_so_far / num_total + + +def train(conf, shot_list_train, shot_list_validate, loader): + #set random seed + set_seed(0) + num_epochs = conf["training"]["num_epochs"] + patience = conf["callbacks"]["patience"] + lr_decay = conf["model"]["lr_decay"] + batch_size = conf['training']['batch_size'] + lr = conf["model"]["lr"] + clipnorm = conf['model']['clipnorm'] + e = 0 + + loader.set_inference_mode(False) + train_data_gen = partial( + loader.simple_batch_generator, + shot_list=shot_list_train, + )() + valid_data_generator = partial( + loader.simple_batch_generator, + shot_list=shot_list_validate, + inference=True + )() + LOGGER.info(f"validate: {len(shot_list_validate)} shots, {shot_list_validate.num_disruptive()} disruptive") + LOGGER.info(f"training: {len(shot_list_train)} shots, {shot_list_train.num_disruptive()} disruptive") + + loss_fn = nn.MSELoss(size_average=True) + train_model = build_torch_model(conf) + + optimizer = opt.Adam(train_model.parameters(), lr=lr) + scheduler = opt.lr_scheduler.ExponentialLR(optimizer, lr_decay) + + model_path = get_model_path(conf) + makedirs_process_safe(os.path.dirname(model_path)) + + train_model.train() + LOGGER.info(f"{num_epochs - 1 - e} epochs left to go") + while e < num_epochs - 1: + LOGGER.info(f"Epoch {e}/{num_epochs}") + (step, ave_loss, curr_loss, num_so_far, effective_epochs) = train_epoch( + train_model, train_data_gen, optimizer, scheduler, loss_fn + ) + + e = effective_epochs + torch.save(train_model.state_dict(), model_path) + #FIXME no validation for now as OOM + #_, _, _, roc_area, loss = make_predictions_and_evaluate_gpu( + # conf, shot_list_validate, valid_data_generator + #) + + ## stop_training = False + #print("=========Summary======== for epoch{}".format(step)) + #print("Training Loss numpy: {:.3e}".format(ave_loss)) + #print("Validation Loss: {:.3e}".format(loss)) + #print("Validation ROC: {:.4f}".format(roc_area)) + +def apply_model_to_np(model, x): + return model(torch.from_numpy(x).float()).data.numpy() + +#FIXME Alexeys change +def make_predictions(conf, shot_list, generator, custom_path=None): + #generator = loader.inference_batch_generator_full_shot(shot_list) + inference_model = build_torch_model(conf) + + if custom_path is None: + model_path = get_model_path(conf) + else: + model_path = custom_path + inference_model.load_state_dict(torch.load(model_path)) + # shot_list = shot_list.random_sublist(10) + + y_prime = [] + y_gold = [] + disruptive = [] + num_shots = len(shot_list) + + pbar = Progbar(num_shots) + while True: + x_, y_, num_so_far, num_total, disr = next(generator) + + x = torch.from_numpy(x_).float().to(device) + y = torch.from_numpy(y_).float().to(device) + #output = apply_model_to_np(inference_model, x) + output = inference_model(x) + + for batch_idx in range(x.shape[0]): + #curr_length = lengths[batch_idx] + y_prime += [output[batch_idx, :, 0]] + y_gold += [y[batch_idx, :, 0]] + disruptive += [disr[batch_idx]] + pbar.add(1.0) + if len(disruptive) >= num_shots: + y_prime = y_prime[:num_shots] + y_gold = y_gold[:num_shots] + disruptive = disruptive[:num_shots] + break + return y_prime, y_gold, disruptive + +#FIXME ALexeys change loader --> generator +def make_predictions_and_evaluate_gpu(conf, shot_list, generator, custom_path=None): + y_prime, y_gold, disruptive = make_predictions( + conf, shot_list, generator, custom_path) + analyzer = PerformanceAnalyzer(conf=conf) + roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive) + loss = get_loss_from_list(y_prime, y_gold, conf['data']['target']) + return y_prime, y_gold, disruptive, roc_area, loss \ No newline at end of file From 019c1bdf3ae861059edfb129aca14b994fa4ebdb Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Tue, 25 Feb 2020 13:19:23 -0500 Subject: [PATCH 2/2] Fix style errors in transformer code --- .travis.yml | 1 + examples/transformer_learn.py | 32 +++---- plasma/models/distributed_torch_runner.py | 7 +- plasma/models/loader.py | 21 +++-- plasma/transformer/runner.py | 101 ++++++++++++---------- 5 files changed, 88 insertions(+), 74 deletions(-) diff --git a/.travis.yml b/.travis.yml index 087fdb66..95d4a4ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python branches: only: - master + - transformer os: - linux diff --git a/examples/transformer_learn.py b/examples/transformer_learn.py index ae7674d2..68bbc685 100644 --- a/examples/transformer_learn.py +++ b/examples/transformer_learn.py @@ -75,13 +75,12 @@ print('Training on {} shots, testing on {} shots'.format( len(shot_list_train), len(shot_list_test))) - ##################################################### # TRAINING # ##################################################### train(conf, shot_list_train.random_sublist(512), shot_list_validate.random_sublist(256), loader) - #if not only_predict: + # if not only_predict: # p = old_mp.Process(target=train, # args=(conf, shot_list_train, # shot_list_validate, loader) @@ -115,18 +114,17 @@ # TODO(KGF): check tuple unpack (y_prime_train, y_gold_train, disruptive_train, roc_train, - loss_train) = make_predictions_and_evaluate_gpu( - conf, shot_list_train, loader, custom_path) + loss_train) = make_predictions_and_evaluate_gpu( + conf, shot_list_train, loader, custom_path) (y_prime_test, y_gold_test, disruptive_test, roc_test, - loss_test) = make_predictions_and_evaluate_gpu( - conf, shot_list_test, loader, custom_path) + loss_test) = make_predictions_and_evaluate_gpu( + conf, shot_list_test, loader, custom_path) print('=========Summary========') print('Train Loss: {:.3e}'.format(loss_train)) print('Train ROC: {:.4f}'.format(roc_train)) print('Test Loss: {:.3e}'.format(loss_test)) print('Test ROC: {:.4f}'.format(roc_test)) - disruptive_train = np.array(disruptive_train) disruptive_test = np.array(disruptive_test) @@ -138,16 +136,20 @@ shot_list_test.make_light() shot_list_train.make_light() - save_str = 'results_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + save_str = 'results_' + datetime.datetime.now().strftime( + "%Y-%m-%d-%H-%M-%S") result_base_path = conf['paths']['results_prepath'] if not os.path.exists(result_base_path): os.makedirs(result_base_path) - np.savez(result_base_path+save_str, y_gold=y_gold, y_gold_train=y_gold_train, - y_gold_test=y_gold_test, y_prime=y_prime, y_prime_train=y_prime_train, - y_prime_test=y_prime_test, disruptive=disruptive, - disruptive_train=disruptive_train, disruptive_test=disruptive_test, - shot_list_validate=shot_list_validate, - shot_list_train=shot_list_train, shot_list_test=shot_list_test, - conf=conf) + np.savez(result_base_path+save_str, y_gold=y_gold, + y_gold_train=y_gold_train, + y_gold_test=y_gold_test, + y_prime=y_prime, y_prime_train=y_prime_train, + y_prime_test=y_prime_test, disruptive=disruptive, + disruptive_train=disruptive_train, + disruptive_test=disruptive_test, + shot_list_validate=shot_list_validate, + shot_list_train=shot_list_train, shot_list_test=shot_list_test, + conf=conf) print('finished.') diff --git a/plasma/models/distributed_torch_runner.py b/plasma/models/distributed_torch_runner.py index f6ba678c..e5b1b483 100644 --- a/plasma/models/distributed_torch_runner.py +++ b/plasma/models/distributed_torch_runner.py @@ -418,7 +418,8 @@ def train(conf, shot_list_train, shot_list_validate, loader): hvd.broadcast_parameters(train_model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optim, root_rank=0) - optimizer_args = {'op': hvd.Average, 'compression': hvd.Compression.fp16, 'named_parameters': train_model.named_parameters()} + optimizer_args = {'op': hvd.Average, 'compression': hvd.Compression.fp16, + 'named_parameters': train_model.named_parameters()} optimizer = hvd.DistributedOptimizer(optim, **optimizer_args) train_model.train() @@ -431,8 +432,8 @@ def train(conf, shot_list_train, shot_list_validate, loader): while e < num_epochs - 1: print('\nEpoch {}/{}'.format(e, num_epochs)) (step, ave_loss, curr_loss, num_so_far, - effective_epochs) = train_epoch(train_model, data_gen, optimizer, scheduler, - loss_fn) + effective_epochs) = train_epoch(train_model, data_gen, optimizer, + scheduler, loss_fn) e = effective_epochs loader.verbose = False # True during the first iteration # if task_index == 0: diff --git a/plasma/models/loader.py b/plasma/models/loader.py index deebddec..b23d9524 100644 --- a/plasma/models/loader.py +++ b/plasma/models/loader.py @@ -825,16 +825,15 @@ def get_batch_size(batch_size, prediction_mode): def get_num_skips(length, skip): return 1 + (length-1)//skip - #FIXME Alexeys + # FIXME Alexeys def simple_batch_generator(self, shot_list, max_len=2048, inference=False): - batch_size = self.conf['training']['batch_size'] sig, res = self.get_signal_result_from_shot(shot_list.shots[0]) Xbuff = np.zeros((batch_size, max_len, sig.shape[1])) Ybuff = np.zeros((batch_size, max_len, res.shape[1])) num_total = len(shot_list) - #num_batches = num_total//batch_size + # num_batches = num_total//batch_size disr = np.zeros(batch_size, dtype=bool) while True: @@ -846,18 +845,18 @@ def simple_batch_generator(self, shot_list, max_len=2048, inference=False): for i in range(num_total): shot = self.sample_shot_from_list_given_index(shot_list, i) sig, res = self.get_signal_result_from_shot(shot) - sig = sig[-max_len:,:] - res = res[-max_len:,:] - Xbuff[i%batch_size, -sig.shape[0]:, :] = sig - Ybuff[i%batch_size, -res.shape[0]:, :] = res - disr[i%batch_size] = shot.is_disruptive_shot() + sig = sig[-max_len:, :] + res = res[-max_len:, :] + Xbuff[i % batch_size, -sig.shape[0]:, :] = sig + Ybuff[i % batch_size, -res.shape[0]:, :] = res + disr[i % batch_size] = shot.is_disruptive_shot() if i % batch_size == 0: num_so_far += batch_size - yield Xbuff, Ybuff, num_so_far, num_total, disr - #Xbuff = np.zeros((batch_size, max_len, sig.shape[1])) - #Ybuff = np.zeros((batch_size, max_len, res.shape[1])) + # Xbuff = np.zeros((batch_size, max_len, sig.shape[1])) + # Ybuff = np.zeros((batch_size, max_len, res.shape[1])) + class ProcessGenerator(object): def __init__(self, generator): diff --git a/plasma/transformer/runner.py b/plasma/transformer/runner.py index 5fb89982..2357947d 100644 --- a/plasma/transformer/runner.py +++ b/plasma/transformer/runner.py @@ -8,9 +8,10 @@ from plasma.utils.performance import PerformanceAnalyzer from plasma.utils.evaluation import get_loss_from_list from plasma.models.torch_runner import ( - #make_predictions_and_evaluate_gpu, - #make_predictions, + # make_predictions_and_evaluate_gpu, + # make_predictions, get_signal_dimensions, + calculate_conv_output_size, ) from functools import partial @@ -18,7 +19,7 @@ import numpy as np import logging import random -import tqdm +# import tqdm model_filename = "torch_model.pt" LOGGER = logging.getLogger("plasma.transformer.runner") @@ -32,12 +33,14 @@ # else: # device = torch.device("cpu") + def set_seed(seed): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) - os.environ["PYTHONHASHSEED"]="0" + os.environ["PYTHONHASHSEED"] = "0" + class TransformerNet(nn.Module): def __init__( @@ -103,14 +106,16 @@ def __init__( ) self.layers.append(nn.MaxPool1d(kernel_size=self.pooling_size)) self.conv_output_size = calculate_conv_output_size( - self.conv_output_size, 0, 1, self.pooling_size, self.pooling_size + self.conv_output_size, 0, 1, self.pooling_size, + self.pooling_size ) self.layers.append(nn.Dropout2d(dropout)) self.net = nn.Sequential(*self.layers) self.conv_output_size = self.conv_output_size * layer_sizes[-1] self.linear_layers = [] - print("Final feature size = {}".format(self.n_scalars + self.conv_output_size)) + print("Final feature size = {}".format(self.n_scalars + + self.conv_output_size)) self.linear_layers.append( nn.Linear(self.conv_output_size + self.n_scalars, linear_size) ) @@ -128,7 +133,7 @@ def forward(self, x): x_profiles = x else: x_scalars = x[:, : self.n_scalars] - x_profiles = x[:, self.n_scalars :] + x_profiles = x[:, self.n_scalars:] x_profiles = x_profiles.contiguous().view( x.size(0), self.n_profiles, self.profile_size ) @@ -170,7 +175,8 @@ def __init__( self.__max_seq_length = max_seq_length self.__d_model = d_model # FIXME - self.__positional_encodings = nn.Embedding(max_seq_length, d_model).float() + self.__positional_encodings = nn.Embedding( + max_seq_length, d_model).float() def forward(self, x): """ @@ -180,17 +186,19 @@ def forward(self, x): mask = ( torch.arange(x.shape[1], device=device) .unsqueeze(0) - .lt(torch.tensor([self.__max_seq_length], device=device).unsqueeze(-1)) + .lt(torch.tensor([self.__max_seq_length], + device=device).unsqueeze(-1)) ) transformer_input = x * mask.unsqueeze(-1).float() # B x max_len x D positional_encodings = self.__positional_encodings( torch.arange(x.shape[1], dtype=torch.int64, device=device) ).unsqueeze(0) - transformer_input = transformer_input + positional_encodings # B x max_len x D + transformer_input = (transformer_input + + positional_encodings) # B x max_len x D out = self.__transformer_encoder( - transformer_input #.transpose(0, 1), src_key_padding_mask=~mask + transformer_input # .transpose(0, 1), src_key_padding_mask=~mask ) return out @@ -199,11 +207,10 @@ def build_torch_model(conf): dropout = conf["model"]["dropout_prob"] n_scalars, n_profiles, profile_size = get_signal_dimensions(conf) - - output_size = 1 - layer_sizes_spatial = [6, 3, 3] + # output_size = 1 + layer_sizes_spatial = [6, 3, 3] kernel_size_spatial = 3 - linear_size = 5 #FIXME Alexeys there will be no linear layers + linear_size = 5 # FIXME Alexeys there will be no linear layers model = TransformerNet( n_scalars, @@ -233,7 +240,7 @@ def train_epoch(model, data_gen, optimizer, scheduler, loss_fn): step = 0 while True: x_, y_, num_so_far, num_total, _ = next(data_gen) - + x = torch.from_numpy(x_).float().to(device) y = torch.from_numpy(y_).float().to(device) @@ -247,24 +254,23 @@ def train_epoch(model, data_gen, optimizer, scheduler, loss_fn): scheduler.step() step += 1 - LOGGER.info( - f"[{step}] [{num_so_far}/{num_total}] loss: {loss.item()}, ave_loss: {total_loss / step}" - ) + LOGGER.info(f"[{step}] [{num_so_far}/{num_total}] loss: {loss.item()}, ave_loss: {total_loss / step}") # noqa if num_so_far >= num_total: break - return step, loss.item(), total_loss, num_so_far, 1.0 * num_so_far / num_total + return (step, loss.item(), total_loss, num_so_far, + 1.0 * num_so_far / num_total) def train(conf, shot_list_train, shot_list_validate, loader): - #set random seed + # set random seed set_seed(0) num_epochs = conf["training"]["num_epochs"] - patience = conf["callbacks"]["patience"] + # patience = conf["callbacks"]["patience"] lr_decay = conf["model"]["lr_decay"] - batch_size = conf['training']['batch_size'] + # batch_size = conf['training']['batch_size'] lr = conf["model"]["lr"] - clipnorm = conf['model']['clipnorm'] + # clipnorm = conf['model']['clipnorm'] e = 0 loader.set_inference_mode(False) @@ -272,20 +278,20 @@ def train(conf, shot_list_train, shot_list_validate, loader): loader.simple_batch_generator, shot_list=shot_list_train, )() - valid_data_generator = partial( + valid_data_generator = partial( # noqa loader.simple_batch_generator, shot_list=shot_list_validate, inference=True )() - LOGGER.info(f"validate: {len(shot_list_validate)} shots, {shot_list_validate.num_disruptive()} disruptive") - LOGGER.info(f"training: {len(shot_list_train)} shots, {shot_list_train.num_disruptive()} disruptive") + LOGGER.info(f"validate: {len(shot_list_validate)} shots, {shot_list_validate.num_disruptive()} disruptive") # noqa + LOGGER.info(f"training: {len(shot_list_train)} shots, {shot_list_train.num_disruptive()} disruptive") # noqa loss_fn = nn.MSELoss(size_average=True) train_model = build_torch_model(conf) optimizer = opt.Adam(train_model.parameters(), lr=lr) scheduler = opt.lr_scheduler.ExponentialLR(optimizer, lr_decay) - + model_path = get_model_path(conf) makedirs_process_safe(os.path.dirname(model_path)) @@ -293,29 +299,32 @@ def train(conf, shot_list_train, shot_list_validate, loader): LOGGER.info(f"{num_epochs - 1 - e} epochs left to go") while e < num_epochs - 1: LOGGER.info(f"Epoch {e}/{num_epochs}") - (step, ave_loss, curr_loss, num_so_far, effective_epochs) = train_epoch( + (step, ave_loss, curr_loss, num_so_far, + effective_epochs) = train_epoch( train_model, train_data_gen, optimizer, scheduler, loss_fn ) - + e = effective_epochs torch.save(train_model.state_dict(), model_path) - #FIXME no validation for now as OOM - #_, _, _, roc_area, loss = make_predictions_and_evaluate_gpu( + # FIXME no validation for now as OOM + # _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu( # conf, shot_list_validate, valid_data_generator - #) + # ) + + # # stop_training = False + # print("=========Summary======== for epoch{}".format(step)) + # print("Training Loss numpy: {:.3e}".format(ave_loss)) + # print("Validation Loss: {:.3e}".format(loss)) + # print("Validation ROC: {:.4f}".format(roc_area)) - ## stop_training = False - #print("=========Summary======== for epoch{}".format(step)) - #print("Training Loss numpy: {:.3e}".format(ave_loss)) - #print("Validation Loss: {:.3e}".format(loss)) - #print("Validation ROC: {:.4f}".format(roc_area)) def apply_model_to_np(model, x): return model(torch.from_numpy(x).float()).data.numpy() -#FIXME Alexeys change + +# FIXME Alexeys change def make_predictions(conf, shot_list, generator, custom_path=None): - #generator = loader.inference_batch_generator_full_shot(shot_list) + # generator = loader.inference_batch_generator_full_shot(shot_list) inference_model = build_torch_model(conf) if custom_path is None: @@ -336,11 +345,11 @@ def make_predictions(conf, shot_list, generator, custom_path=None): x = torch.from_numpy(x_).float().to(device) y = torch.from_numpy(y_).float().to(device) - #output = apply_model_to_np(inference_model, x) + # output = apply_model_to_np(inference_model, x) output = inference_model(x) for batch_idx in range(x.shape[0]): - #curr_length = lengths[batch_idx] + # curr_length = lengths[batch_idx] y_prime += [output[batch_idx, :, 0]] y_gold += [y[batch_idx, :, 0]] disruptive += [disr[batch_idx]] @@ -352,11 +361,13 @@ def make_predictions(conf, shot_list, generator, custom_path=None): break return y_prime, y_gold, disruptive -#FIXME ALexeys change loader --> generator -def make_predictions_and_evaluate_gpu(conf, shot_list, generator, custom_path=None): + +# FIXME ALexeys change loader --> generator +def make_predictions_and_evaluate_gpu(conf, shot_list, generator, + custom_path=None): y_prime, y_gold, disruptive = make_predictions( conf, shot_list, generator, custom_path) analyzer = PerformanceAnalyzer(conf=conf) roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive) loss = get_loss_from_list(y_prime, y_gold, conf['data']['target']) - return y_prime, y_gold, disruptive, roc_area, loss \ No newline at end of file + return y_prime, y_gold, disruptive, roc_area, loss