From 2561aaa21fb7f75b3cba39dae500e9d6698650f3 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Wed, 17 Mar 2021 00:37:43 +0000
Subject: [PATCH 01/50] Weird unicode error on ThetaGPU (encountered
 2021-03-05)

Don't enable kludge by default

Seen with Python 3.8.5 in the /home/felker/tf_frnn-2021-03-02 conda env;
is this a change in behavior relative to Python 3.7.x?
---
 plasma/utils/hashing.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/plasma/utils/hashing.py b/plasma/utils/hashing.py
index b5c5a1d4..2de47661 100644
--- a/plasma/utils/hashing.py
+++ b/plasma/utils/hashing.py
@@ -65,9 +65,14 @@ def myhash_obj(x):
     Serialize a generic Python object using dill, decode the bytes obj,
     then pass the Unicode string to the particular hash function.
     '''
+    # print(dill.dumps(x)[448:450])
+    # try:
+    #     hashable = dill.dumps(x).decode('unicode_escape')
+    # except UnicodeDecodeError:
+    #     hashable = '8a9sd09vu8a9sdaf0sdsf09diufa'
+    # return myhash(hashable)
     return myhash(dill.dumps(x).decode('unicode_escape'))
 
-
 def myhash_signals(signals):
     '''
     Given a List of Signal class instances, sort by their str representations

From b1f001ed035f296804d09dc40ccbf9a931d948fb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Wed, 17 Mar 2021 00:45:31 +0000
Subject: [PATCH 02/50] TF 2 compatibility reached on ThetaGPU

---
 plasma/models/builder.py    |  4 +-
 plasma/models/mpi_runner.py | 76 +++++++++++++++++++------------------
 2 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 61b3199c..eae5648f 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -10,10 +10,12 @@
     Input,
     Dense, Activation, Dropout, Lambda,
     Reshape, Flatten, Permute,  # RepeatVector
-    LSTM, CuDNNLSTM, SimpleRNN, BatchNormalization,
+    LSTM, ####CuDNNLSTM,
+    SimpleRNN, BatchNormalization,
     Convolution1D, MaxPooling1D, TimeDistributed,
     Concatenate
     )
+CuDNNLSTM = LSTM
 from tensorflow.keras.callbacks import Callback
 from tensorflow.keras.regularizers import l2  # l1, l1_l2
 
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index aa7ee518..709a405c 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -71,15 +71,15 @@
     # TODO(KGF): above, builder.py (bug workaround), mpi_launch_tensorflow.py,
     # and runner.py are the only files that import tensorflow directly
 
-    from tensorflow.keras.backend import set_session
+    # from tensorflow.keras.backend import set_session
     # KGF: next 3 lines dump many TensorFlow diagnostics to stderr.
     # All MPI ranks first "Successfully opened dynamic library libcuda"
     # then, one by one: ID GPU, libcudart, libcublas, libcufft, ...
     # Finally, "Device interconnect StreamExecutor with strength 1 edge matrix"
     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95,
                                 allow_growth=True)
-    config = tf.ConfigProto(gpu_options=gpu_options)
-    set_session(tf.Session(config=config))
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # set_session(tf.Session(config=config))
     g.flush_all_inorder()
 else:
     sys.exit('Invalid Keras backend specified')
@@ -1041,40 +1041,41 @@ def __init__(self, log_dir='./logs', histogram_freq=0, validation_steps=0,
         self.write_graph = write_graph
         self.write_grads = write_grads
         self.validation_steps = validation_steps
-        self.sess = None
+        # self.sess = None
         self.model = None
 
     def set_model(self, model):
-        self.model = model
-        self.sess = K.get_session()
-
-        if self.histogram_freq and self.merged is None:
-            for layer in self.model.layers:
-                for weight in layer.weights:
-                    mapped_weight_name = weight.name.replace(':', '_')
-                    tf.summary.histogram(mapped_weight_name, weight)
-                    if self.write_grads:
-                        grads = self.model.optimizer.get_gradients(
-                            self.model.total_loss, weight)
-
-                        def is_indexed_slices(grad):
-                            return type(grad).__name__ == 'IndexedSlices'
-                        grads = [grad.values if is_indexed_slices(grad) else
-                                 grad for grad in grads]
-                        for grad in grads:
-                            tf.summary.histogram(
-                                '{}_grad'.format(mapped_weight_name), grad)
-
-                if hasattr(layer, 'output'):
-                    tf.summary.histogram('{}_out'.format(layer.name),
-                                         layer.output)
-        self.merged = tf.summary.merge_all()
-
-        if self.write_graph:
-            self.writer = tf.summary.FileWriter(self.log_dir,
-                                                self.sess.graph)
-        else:
-            self.writer = tf.summary.FileWriter(self.log_dir)
+        pass
+        # self.model = model
+        # # self.sess = K.get_session()
+
+        # if self.histogram_freq and self.merged is None:
+        #     for layer in self.model.layers:
+        #         for weight in layer.weights:
+        #             mapped_weight_name = weight.name.replace(':', '_')
+        #             tf.summary.histogram(mapped_weight_name, weight)
+        #             if self.write_grads:
+        #                 grads = self.model.optimizer.get_gradients(
+        #                     self.model.total_loss, weight)
+
+        #                 def is_indexed_slices(grad):
+        #                     return type(grad).__name__ == 'IndexedSlices'
+        #                 grads = [grad.values if is_indexed_slices(grad) else
+        #                          grad for grad in grads]
+        #                 for grad in grads:
+        #                     tf.summary.histogram(
+        #                         '{}_grad'.format(mapped_weight_name), grad)
+
+        #         if hasattr(layer, 'output'):
+        #             tf.summary.histogram('{}_out'.format(layer.name),
+        #                                  layer.output)
+        # self.merged = tf.contrib.summary()  # tf.summary.merge_all()
+
+        # if self.write_graph:
+        #     self.writer = tf.summary.FileWriter(self.log_dir,
+        #                                         self.sess.graph)
+        # else:
+        #     self.writer = tf.summary.FileWriter(self.log_dir)
 
     def on_epoch_end(self, val_generator, val_steps, epoch, logs=None):
         logs = logs or {}
@@ -1116,7 +1117,7 @@ def on_epoch_end(self, val_generator, val_steps, epoch, logs=None):
         # this TensorBoard evaluation of the validation set accuracy
         tensors += [K.learning_phase()]
 
-        self.sess = K.get_session()
+        # self.sess = K.get_session()
 
         for val_data in val_generator:
             batch_val = []
@@ -1134,8 +1135,9 @@ def on_epoch_end(self, val_generator, val_steps, epoch, logs=None):
             # Things may break if there is no layer in model that uses this flg
             # E.g. if all Dropout, BatchNorm layers are missing
 
-            feed_dict = dict(zip(tensors, batch_val))
-            result = self.sess.run([self.merged], feed_dict=feed_dict)
+            # feed_dict = dict(zip(tensors, batch_val))
+            # result = self.sess.run([self.merged], feed_dict=feed_dict)
+            result = self.merged(batch_val)
             summary_str = result[0]
             self.writer.add_summary(summary_str, int(round(epoch)))
             val_steps -= 1

From 3f067431f0a768cefdf7a7d7ab5f6a9be41da995 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 23 Mar 2021 18:07:27 -0500
Subject: [PATCH 03/50] Pickle compatibility with Python 3.8 change to
 protocol=4

---
 plasma/utils/hashing.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/plasma/utils/hashing.py b/plasma/utils/hashing.py
index 2de47661..9d2e81a5 100644
--- a/plasma/utils/hashing.py
+++ b/plasma/utils/hashing.py
@@ -2,6 +2,8 @@
 import dill
 import hashlib
 import copy
+# import pickle
+# dill.settings['protocol'] = 3
 
 
 def general_object_hash(o):
@@ -65,13 +67,24 @@ def myhash_obj(x):
     Serialize a generic Python object using dill, decode the bytes obj,
     then pass the Unicode string to the particular hash function.
     '''
-    # print(dill.dumps(x)[448:450])
-    # try:
-    #     hashable = dill.dumps(x).decode('unicode_escape')
-    # except UnicodeDecodeError:
-    #     hashable = '8a9sd09vu8a9sdaf0sdsf09diufa'
-    # return myhash(hashable)
-    return myhash(dill.dumps(x).decode('unicode_escape'))
+
+    # KGF: Python 3.8 made Pickle serialization protocol version 4 the default
+    # Dill (v0.3.3) wraps Pickle, and Pickle now returns an invalid utf-8
+    # escape code when serializing the conf dictionary and nested objs
+    # Works totally fine in Python 3.7 with protocol=3
+    # See PEP 3154
+
+    # protocol=0 in ANSI readable, and I suspect that protocol=3 produces valid utf-8,
+    # but I can't find any documentation of that.
+    # https://stackoverflow.com/questions/30469575/how-to-pickle-and-unpickle-to-portable-string-in-python-3
+    # "pickle.dumps() produces a bytes object. Expecting these arbitrary bytes to be valid
+    # UTF-8 text (the assumption you are making by trying to decode it to a string from
+    # UTF-8) is pretty optimistic."
+
+    # return myhash(pickle.dumps(x).decode('unicode_escape'))
+    # return myhash(dill.dumps(x).decode('raw_unicode_escape'))
+    return myhash(dill.dumps(x, protocol=3).decode('unicode_escape'))
+
 
 def myhash_signals(signals):
     '''

From d5346d8da33895637f2cd642a9244f9978fed1a4 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 23 Mar 2021 21:52:03 -0500
Subject: [PATCH 04/50] Update SavedModel dump to TF 2.x API

---
 plasma/models/builder.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index f03e9f5d..443168a9 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -374,17 +374,12 @@ def save_model_weights(self, model, epoch):
                    signatures=None,  # applicable to 'tf' SavedModel format only
                    )
         # TensorFlow SavedModel format (full directory)
-        full_moodel_save_dir = full_model_save_path.rsplit('.',1)[0]
+        full_model_save_dir = full_model_save_path.rsplit('.', 1)[0]
         # TODO(KGF): model.save(..., save_format='tf') disabled in r1.15
         # Same with tf.keras.models.save_model(..., save_format="tf").
         # Need to use experimental API until r2.x
-        # model.save(full_model_save_dir, overwrite=True, save_format='tf')
-        tf.keras.experimental.export_saved_model(model, full_moodel_save_dir,
-                                                 custom_objects=None,
-                                                 as_text=False,
-                                                 input_signature=None,
-                                                 serving_only=False
-                                                 )
+        model.save(full_model_save_dir, overwrite=True, save_format='tf')
+
         # try:
         if _has_onnx:
             save_path = self.get_save_path(epoch, ext='onnx')

From 619939e9033e6d75157401c6d84cabd49c12a862 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 26 Mar 2021 10:55:48 -0500
Subject: [PATCH 05/50] Ignore json files; uncomment TensorBoard writer

Skip writing layer output histograms in TF 2.x, for now
---
 .gitignore                  |  5 +++-
 plasma/models/mpi_runner.py | 60 ++++++++++++++++++-------------------
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3b24d3a7..7754ecae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -116,4 +116,7 @@ ENV/
 # *.e*
 
 # Etc
-*.local
\ No newline at end of file
+*.local
+
+# TF Profiler
+*.json
\ No newline at end of file
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 7074df9e..b775dd37 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -1080,37 +1080,35 @@ def __init__(self, log_dir='./logs', histogram_freq=0, validation_steps=0,
         self.model = None
 
     def set_model(self, model):
-        pass
-        # self.model = model
-        # # self.sess = K.get_session()
-
-        # if self.histogram_freq and self.merged is None:
-        #     for layer in self.model.layers:
-        #         for weight in layer.weights:
-        #             mapped_weight_name = weight.name.replace(':', '_')
-        #             tf.summary.histogram(mapped_weight_name, weight)
-        #             if self.write_grads:
-        #                 grads = self.model.optimizer.get_gradients(
-        #                     self.model.total_loss, weight)
-
-        #                 def is_indexed_slices(grad):
-        #                     return type(grad).__name__ == 'IndexedSlices'
-        #                 grads = [grad.values if is_indexed_slices(grad) else
-        #                          grad for grad in grads]
-        #                 for grad in grads:
-        #                     tf.summary.histogram(
-        #                         '{}_grad'.format(mapped_weight_name), grad)
-
-        #         if hasattr(layer, 'output'):
-        #             tf.summary.histogram('{}_out'.format(layer.name),
-        #                                  layer.output)
-        # self.merged = tf.contrib.summary()  # tf.summary.merge_all()
-
-        # if self.write_graph:
-        #     self.writer = tf.summary.FileWriter(self.log_dir,
-        #                                         self.sess.graph)
-        # else:
-        #     self.writer = tf.summary.FileWriter(self.log_dir)
+        self.model = model
+
+        if self.histogram_freq and self.merged is None:
+            for layer in self.model.layers:
+                for weight in layer.weights:
+                    mapped_weight_name = weight.name.replace(':', '_')
+                    tf.summary.histogram(mapped_weight_name, weight)
+                    if self.write_grads:
+                        grads = self.model.optimizer.get_gradients(
+                            self.model.total_loss, weight)
+
+                        def is_indexed_slices(grad):
+                            return type(grad).__name__ == 'IndexedSlices'
+                        grads = [grad.values if is_indexed_slices(grad) else
+                                 grad for grad in grads]
+                        for grad in grads:
+                            tf.summary.histogram(
+                                '{}_grad'.format(mapped_weight_name), grad)
+
+                # if hasattr(layer, 'output'):
+                #     tf.summary.histogram('{}_out'.format(layer.name),
+                #                          layer.output)
+        self.merged = tf.contrib.summary()  # tf.summary.merge_all()
+
+        if self.write_graph:
+            self.writer = tf.summary.FileWriter(self.log_dir,
+                                                self.sess.graph)
+        else:
+            self.writer = tf.summary.FileWriter(self.log_dir)
 
     def on_epoch_end(self, val_generator, val_steps, epoch, logs=None):
         logs = logs or {}

From b9ed251cf4ed34f5192b41d451b74911fde8ab52 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 26 Mar 2021 13:29:05 -0500
Subject: [PATCH 06/50] Remove theano, do not import tf.compat.v1

Remove validation set summary info from on_epoch_end()
---
 plasma/models/mpi_runner.py | 112 ++++++++----------------------------
 1 file changed, 25 insertions(+), 87 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index b775dd37..121d2393 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -65,10 +65,10 @@
     # But many TF deprecation warnings in 1.14.0, e.g.:
     # "The name tf.GPUOptions is deprecated. Please use tf.compat.v1.GPUOptions
     # instead". See tf_export.py
-    if g.tf_ver >= parse_version('1.14.0'):
-        import tensorflow.compat.v1 as tf
-    else:
-        import tensorflow as tf
+    # if g.tf_ver >= parse_version('1.14.0'):
+    #     import tensorflow.compat.v1 as tf
+    # else:
+    import tensorflow as tf
     # TODO(KGF): above, builder.py (bug workaround), mpi_launch_tensorflow.py,
     # and runner.py are the only files that import tensorflow directly
 
@@ -77,8 +77,8 @@
     # All MPI ranks first "Successfully opened dynamic library libcuda"
     # then, one by one: ID GPU, libcudart, libcublas, libcufft, ...
     # Finally, "Device interconnect StreamExecutor with strength 1 edge matrix"
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95,
-                                allow_growth=True)
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95,
+    #                             allow_growth=True)
     # config = tf.ConfigProto(gpu_options=gpu_options)
     # set_session(tf.Session(config=config))
     g.flush_all_inorder()
@@ -907,11 +907,11 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
     mpi_model.compile(conf['model']['optimizer'], clipnorm,
                       conf['data']['target'].loss)
     tensorboard = None
-    if g.backend != "theano" and g.task_index == 0:
+    if g.task_index == 0:
         tensorboard_save_path = conf['paths']['tensorboard_save_path']
         write_grads = conf['callbacks']['write_grads']
         tensorboard = TensorBoard(log_dir=tensorboard_save_path,
-                                  histogram_freq=1, write_graph=True,
+                                  histogram_freq=1, # write_graph=True,
                                   write_grads=write_grads)
         tensorboard.set_model(mpi_model.model)
         # TODO(KGF): check addition of TF model summary write added from fork
@@ -1031,12 +1031,11 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
                         train_model, int(round(e)))
 
             # tensorboard
-            if g.backend != 'theano':
-                val_generator = partial(loader.training_batch_generator,
-                                        shot_list=shot_list_validate)()
-                val_steps = 1
-                tensorboard.on_epoch_end(val_generator, val_steps,
-                                         int(round(e)), epoch_logs)
+            val_generator = partial(loader.training_batch_generator,
+                                    shot_list=shot_list_validate)()
+            val_steps = 1
+            tensorboard.on_epoch_end(val_generator, val_steps,
+                                     int(round(e)), epoch_logs)
         stop_training = g.comm.bcast(stop_training, root=0)
         g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
             e, num_epochs))
@@ -1065,24 +1064,24 @@ def get_stop_training(callbacks):
 
 class TensorBoard(object):
     def __init__(self, log_dir='./logs', histogram_freq=0, validation_steps=0,
-                 write_graph=True, write_grads=False):
+                 # write_graph=True,
+                 write_grads=False):
         if K.backend() != 'tensorflow':
             raise RuntimeError('TensorBoard callback only works '
                                'with the TensorFlow backend.')
         self.log_dir = log_dir
         self.histogram_freq = histogram_freq
-        self.merged = None
         self.writer = None
-        self.write_graph = write_graph
+        # self.write_graph = write_graph
         self.write_grads = write_grads
         self.validation_steps = validation_steps
-        # self.sess = None
         self.model = None
 
     def set_model(self, model):
         self.model = model
 
-        if self.histogram_freq and self.merged is None:
+        # TODO(KGF): check removal of cond "&& self.merged is None"
+        if self.histogram_freq:
             for layer in self.model.layers:
                 for weight in layer.weights:
                     mapped_weight_name = weight.name.replace(':', '_')
@@ -1099,83 +1098,22 @@ def is_indexed_slices(grad):
                             tf.summary.histogram(
                                 '{}_grad'.format(mapped_weight_name), grad)
 
-                # if hasattr(layer, 'output'):
-                #     tf.summary.histogram('{}_out'.format(layer.name),
-                #                          layer.output)
-        self.merged = tf.contrib.summary()  # tf.summary.merge_all()
+                # KGF: Skip writing layer output histograms in TF 2.x, for now?
+                if hasattr(layer, 'output'):
+                    tf.summary.histogram('{}_out'.format(layer.name),
+                                         layer.output)
 
-        if self.write_graph:
-            self.writer = tf.summary.FileWriter(self.log_dir,
-                                                self.sess.graph)
-        else:
-            self.writer = tf.summary.FileWriter(self.log_dir)
+        self.writer = tf.summary.create_file_writer(self.log_dir)
 
     def on_epoch_end(self, val_generator, val_steps, epoch, logs=None):
         logs = logs or {}
 
+        # KGF: val_roc, val_loss, train_loss
         for name, value in logs.items():
             if name in ['batch', 'size']:
                 continue
-            summary = tf.Summary()
-            summary_value = summary.value.add()
-            summary_value.simple_value = value.item()
-            summary_value.tag = name
-            self.writer.add_summary(summary, epoch)
+            tf.summary.scalar(name, value, step=epoch)
             self.writer.flush()
 
-        # KGF: targets attribute of Model class moved to private in tf.keras
-        tensors = (self.model.inputs + self.model._targets
-                   )  # + self.model.sample_weights)
-        # KGF: tf.keras results in sample_weights = None. Dont pass it
-        # since we use equal weights, anyway
-
-        # KGF: former external Keras API returns the following
-        # print(type(self.model.uses_learning_phase))
-        # <class 'bool'>
-        # print(self.model.uses_learning_phase)
-        # True
-        # "True if the layer has a different behavior in training mode and
-        # test mode"
-
-        # No longer necessary to check backend-dependent flag (eliminated)
-        # https://stackoverflow.com/questions/52295852/what-is-uses-learning-phase-in-keras
-        # if self.model.uses_learning_phase:
-            # print(type(K.learning_phase()))
-            # <class 'tensorflow.python.framework.ops.Tensor'>
-            # print(K.learning_phase())
-            # Tensor("keras_learning_phase:0", shape=(), dtype=bool)
-        # KGF: This indicates that TensorFlow is set to training at this point,
-        # but we zip K.learning_phase() as the key with '1' as the value below
-        # when building our feed_dict, which indicates testing (appropriate for
-        # this TensorBoard evaluation of the validation set accuracy
-        tensors += [K.learning_phase()]
-
-        # self.sess = K.get_session()
-
-        for val_data in val_generator:
-            batch_val = []
-            # sh = val_data[0].shape[0]
-            #
-            # 3x numpy arrays matching input, targets, sample_weights tensors
-            # + 1x bool flag if any layer in model takes a train vs. test flag
-            batch_val.append(val_data[0])
-            batch_val.append(val_data[1])
-            # batch_val.append(np.ones(sh))  # equal weights
-
-            # TODO(KGF): confirm that this flag check can be skipped. See above
-            # if self.model.uses_learning_phase:
-            batch_val.append(1)
-            # Things may break if there is no layer in model that uses this flg
-            # E.g. if all Dropout, BatchNorm layers are missing
-
-            # feed_dict = dict(zip(tensors, batch_val))
-            # result = self.sess.run([self.merged], feed_dict=feed_dict)
-            result = self.merged(batch_val)
-            summary_str = result[0]
-            self.writer.add_summary(summary_str, int(round(epoch)))
-            val_steps -= 1
-            if val_steps <= 0:
-                break
-
     def on_train_end(self):
         self.writer.close()

From 97e2a045b0042ea9e93ede72bfe256217aa8913a Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 26 Mar 2021 15:11:42 -0500
Subject: [PATCH 07/50] Split base_path to a separate configurable output_path

---
 examples/conf.yaml          | 14 +++++++++++---
 plasma/conf_parser.py       | 16 +++++++---------
 plasma/models/mpi_runner.py |  5 ++++-
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 0b996b79..a4992811 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -2,11 +2,19 @@
 # note, the YAML parser will NOT evaluate expressions in the value fields.
 # e.g. "validation_frac: 1.0/3.0" will result in str value "1.0/3.0"
 
-# will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
+# will read and write (normalization, etc.) shot data
+# in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
+# (username is automatically added as first subdir if user_subdir==True)
 
-fs_path: '/tigress'
+# will output csvlog, trained model checkpoints, etc.
+# in fs_path_output / [username] / results | csv_logs | model_checkpoints | Graph, etc.
+
+fs_path: '/Users/'
+user_subdir: True
+fs_path_output: '/Users/'
+user_subdir_output: True
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
-num_gpus: 4  # per node
+num_gpus: 1  # per node
 paths:
   signal_prepath: '/signal_data/' # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index 29b96ece..2da7a489 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -25,13 +25,11 @@ def parameters(input_file):
         params = yaml.load(yaml_file, Loader=yaml.SafeLoader)
         params['user_name'] = getpass.getuser()
         base_path = params['fs_path']
-        output_path = os.path.join(base_path, params['user_name'])
-        # TODO(KGF): this next line should be deleted at some pt, breaking BC
-        base_path = output_path
-        print(output_path)
-        # TODO(KGF): allow for completely indpendent save/output_path vs. base_path
-        # configured in conf.yaml. don't assume username subdirectory, or pwd
-        # save_path = os.environ.get("PWD")
+        if params['user_subdir']:
+            base_path = os.path.join(base_path, params['user_name'])
+        output_path = params['fs_path_output']
+        if params['user_subdir_output']:
+            output_path = os.path.join(output_path, params['user_name'])
 
         params['paths']['base_path'] = base_path
         params['paths']['output_path'] = output_path
@@ -64,7 +62,7 @@ def parameters(input_file):
             h = myhash_signals(sig.all_signals.values())
 
         params['paths']['global_normalizer_path'] = (
-            output_path
+            base_path
             + '/normalization/normalization_signal_group_{}.npz'.format(h))
         if params['training']['hyperparam_tuning']:
             # params['paths']['saved_shotlist_path'] =
@@ -90,7 +88,7 @@ def parameters(input_file):
             + params['paths']['data']
             + '/shot_lists_signal_group_{}.npz'.format(h))
         params['paths']['processed_prepath'] = (
-            output_path + '/processed_shots/' + 'signal_group_{}/'.format(h))
+            base_path + '/processed_shots/' + 'signal_group_{}/'.format(h))
         # ensure shallow model has +1 -1 target.
         if params['model']['shallow'] or params['target'] == 'hinge':
             params['data']['target'] = HingeTarget
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 121d2393..e3397b2d 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -57,7 +57,10 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(g.MY_GPU)
         # ,mode=NanGuardMode'
     os.environ['KERAS_BACKEND'] = 'tensorflow'  # default setting
-    g.tf_ver = parse_version(get_distribution('tensorflow').version)
+    try:
+        g.tf_ver = parse_version(get_distribution('tensorflow').version)
+    except DistributionNotFound:
+        g.tf_ver = parse_version(get_distribution('tensorflow-gpu').version)
     # compat/compat.py first committed on 2018-06-29 for Py 2 vs 3
     # (around, but not present in, the release of v1.9.0)
     # v2 compatiblity code added, then moved from compat.py in Nov and Dec 2018

From 10d1981cdcbfecb789d82c82c0fa532df1e2c8f9 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 2 Apr 2021 20:04:40 -0500
Subject: [PATCH 08/50] d3d_signals* dicts in data/signals.py unused in
 conf_parser.py

---
 data/signals.py | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/data/signals.py b/data/signals.py
index ff069552..61f513cc 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -17,7 +17,7 @@ def create_missing_value_filler():
 def get_tree_and_tag(path):
     if '/' not in path:
         return None, '\\' + path
-    
+
     spl = path.split('/')
     tree = spl[0]
     tag = '\\' + spl[1]
@@ -27,7 +27,7 @@ def get_tree_and_tag(path):
 def get_tree_and_tag_no_backslash(path):
     if '/' not in path:
         return None, path
-    
+
     spl = path.split('/')
     tree = spl[0]
     tag = spl[1]
@@ -455,22 +455,6 @@ def fetch_nstx_data(signal_path, shot_num, c):
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
         sig.is_defined_on_machines(all_machines) and sig.num_channels > 1)
 }
-d3d_signals = {
-    sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
-        sig.is_defined_on_machine(d3d))
-}
-d3d_signals_max_tol = {
-    sig_name: sig for (sig_name, sig) in all_signals_max_tol.items() if (
-        sig.is_defined_on_machine(d3d))
-}
-d3d_signals_0D = {
-    sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
-        (sig.is_defined_on_machine(d3d) and sig.num_channels == 1))
-}
-d3d_signals_1D = {
-    sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
-        (sig.is_defined_on_machine(d3d) and sig.num_channels > 1))
-}
 
 jet_signals = {
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (

From 56d72de26d3af2d2278e5446a5daa58e80f492ad Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Thu, 29 Apr 2021 19:20:54 -0500
Subject: [PATCH 09/50] Apply Subrata's patch

---
 examples/mpi_learn.py       |  6 ++----
 plasma/global_vars.py       | 15 ++++++++++++++-
 plasma/models/mpi_runner.py | 34 +++++++++++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/examples/mpi_learn.py b/examples/mpi_learn.py
index e06a64ad..d00eae75 100644
--- a/examples/mpi_learn.py
+++ b/examples/mpi_learn.py
@@ -1,8 +1,8 @@
 import plasma.global_vars as g
-g.init_MPI()
-
+import argparse
 import os.path
 
+g.init_MPI()
 
 # TODO(KGF): replace this workaround for the "from plasma.conf import conf"
 def is_valid_file(parser, arg):
@@ -11,7 +11,6 @@ def is_valid_file(parser, arg):
     else:
         return arg
 
-import argparse
 parser = argparse.ArgumentParser(prog='mpi_learn', description='FusionDL TensorFlow 1.x + mpi4py')
 parser.add_argument("--input_file", "-i",   # type=str,
                     required=False, dest="conf_file",
@@ -23,7 +22,6 @@ def is_valid_file(parser, arg):
 
 
 from plasma.conf import conf
-
 from plasma.models.mpi_runner import (
     mpi_train, mpi_make_predictions_and_evaluate
     )
diff --git a/plasma/global_vars.py b/plasma/global_vars.py
index 5f7e1275..fea93eba 100644
--- a/plasma/global_vars.py
+++ b/plasma/global_vars.py
@@ -9,6 +9,8 @@
 MY_GPU = 0
 # TODO(KGF): remove this (and all?) references to Keras backend
 backend = ''
+backendpackage = ''
+bfloat16= ''
 tf_ver = None
 conf_file = None
 
@@ -22,10 +24,21 @@ def init_MPI():
 
 
 def init_GPU_backend(conf):
-    global NUM_GPUS, MY_GPU, backend
+    global NUM_GPUS, MY_GPU, backend, backendpackage, bfloat16
     NUM_GPUS = conf['num_gpus']
     MY_GPU = task_index % NUM_GPUS
     backend = conf['model']['backend']
+    try:
+        backendpackage = conf['model']['backendpackage']
+    except KeyError as ex:
+        backendpackage = backend
+    print( "backendpackage", backendpackage)
+
+    try:
+        bfloat16 = conf['model']['bfloat16']
+    except KeyError as ex:
+        bfloat16 = ''
+    print( "bfloat16", bfloat16)
 
 
 def pprint_unique(obj):
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index e3397b2d..a84bf65c 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -12,7 +12,7 @@
 # Keras "Using TensorFlow backend" stderr messages do not interfere in stdout
 from plasma.conf import conf
 from mpi4py import MPI
-from pkg_resources import parse_version, get_distribution
+from pkg_resources import parse_version, get_distribution, DistributionNotFound
 import random
 '''
 #########################################################
@@ -58,9 +58,9 @@
         # ,mode=NanGuardMode'
     os.environ['KERAS_BACKEND'] = 'tensorflow'  # default setting
     try:
-        g.tf_ver = parse_version(get_distribution('tensorflow').version)
+        g.tf_ver = parse_version(get_distribution(g.backendpackage).version)
     except DistributionNotFound:
-        g.tf_ver = parse_version(get_distribution('tensorflow-gpu').version)
+        g.tf_ver = parse_version(get_distribution('tensorflow').version)
     # compat/compat.py first committed on 2018-06-29 for Py 2 vs 3
     # (around, but not present in, the release of v1.9.0)
     # v2 compatiblity code added, then moved from compat.py in Nov and Dec 2018
@@ -86,12 +86,36 @@
     # set_session(tf.Session(config=config))
     g.flush_all_inorder()
 else:
-    sys.exit('Invalid Keras backend specified')
+    sys.exit('Invalid Keras backend specified !! {}'.format(g.backend))
 for i in range(g.num_workers):
     g.comm.Barrier()
     if i == g.task_index:
         print('[{}] importing Keras'.format(g.task_index))
-        import tensorflow.keras.backend as K
+        print("g.bfloat16:", g.bfloat16)
+        print("g.backendpackage", g.backendpackage)
+        # bf16
+        if g.bfloat16 == 'keras':
+            print('Running in BFloat16 Via Keras')
+            from tensorflow.keras.mixed_precision import experimental as mixed_precision
+            policy = mixed_precision.Policy('mixed_bfloat16')
+            mixed_precision.set_policy(policy)
+        elif g.bfloat16 == 'amp':
+            print('Running in BFloat16 via AutoMixedPrecisionMkl')
+            import tensorflow as tf
+            from tensorflow.core.protobuf import rewriter_config_pb2
+            from tensorflow.python.keras import backend as K
+
+            graph_options = tf.compat.v1.GraphOptions(
+                        rewrite_options=rewriter_config_pb2.RewriterConfig(
+                        auto_mixed_precision_mkl=rewriter_config_pb2.RewriterConfig.ON))
+
+            session_conf = tf.compat.v1.ConfigProto(graph_options = graph_options)
+            sess = tf.compat.v1.Session(config=session_conf)
+            K.set_session(sess)
+        else:
+            import tensorflow.keras.backend as K
+
+
         from tensorflow.keras.utils import Progbar
         # TODO(KGF): instead of tensorflow.keras.callbacks.CallbackList()
         # until API added in tf-nightly in v2.2.0

From ec8bb97af5fff856cdb550f853074bff2366ebb7 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 19 May 2021 10:35:45 -0500
Subject: [PATCH 10/50] Update TensorFlow timeline profiler to work with TF 2.x

---
 plasma/models/mpi_runner.py | 29 ++++++-----------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index a84bf65c..dc6cce5d 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -32,7 +32,6 @@
 import datetime
 import numpy as np
 
-from tensorflow.python.client import timeline
 from functools import partial
 from copy import deepcopy
 # import socket
@@ -306,15 +305,7 @@ def compile(self, optimizer, clipnorm, loss='mse'):
             print("Optimizer not implemented yet")
             exit(1)
 
-        # Timeline profiler
-        if (self.conf is not None
-                and conf['training']['timeline_prof']):
-            self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
-            self.run_metadata= tf.RunMetadata()
-            self.model.compile(optimizer=optimizer_class, loss=loss,
-                               options=self.run_options, run_metadata=self.run_metadata)
-        else:
-            self.model.compile(optimizer=optimizer_class, loss=loss)
+        self.model.compile(optimizer=optimizer_class, loss=loss)
 
         self.ensure_equal_weights()
 
@@ -554,10 +545,6 @@ def train_epoch(self):
         loss_averager = Averager()
         t_start = time.time()
 
-        timeline_prof = False
-        if (self.conf is not None
-                and conf['training']['timeline_prof']):
-            timeline_prof = True
         step_limit = 0
         if (self.conf is not None
                 and conf['training']['step_limit'] > 0):
@@ -642,15 +629,6 @@ def train_epoch(self):
                     + 'walltime: {:.4f} | '.format(
                         time.time() - self.start_time))
                 g.write_unique(write_str + write_str_0)
-
-                if timeline_prof:
-                    # dump profile
-                    tl = timeline.Timeline(self.run_metadata.step_stats)
-                    ctf = tl.generate_chrome_trace_format()
-                    # dump file per iteration
-                    with open('timeline_%s.json' % step, 'w') as f:
-                        f.write(ctf)
-
                 step += 1
             else:
                 g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
@@ -965,6 +943,9 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
         best_so_far = np.inf
         cmp_fn = min
 
+    if conf['training']['timeline_prof']:
+        tf.profiler.experimental.start('./logs')
+
     while e < num_epochs:
         g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
             e, num_epochs))
@@ -1071,6 +1052,8 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
         if stop_training:
             g.write_unique("Stopping training due to early stopping")
             break
+    if conf['training']['timeline_prof']:
+        tf.profiler.experimental.stop()
 
     if g.task_index == 0:
         callbacks.on_train_end()

From 5b24ee5f2d5a64c7c5254a03580ef512d12a35d2 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 25 May 2021 14:04:29 -0500
Subject: [PATCH 11/50] Second attempt to get TensorBoard profiler to work on
 CPU

Added manual instrumentation according to
https://www.tensorflow.org/guide/profiler for Keras custom loops to show
step-time stats and avoid the profiler error quoted at the end of this
message. Might be fine on A100?

The TensorBoard callback with the profile_batch option was even worse than
the tf.profiler function API: it did not show kernel stats, timeline, etc.

No step marker observed and hence the step time is unknown. This may
happen if (1) training steps are not instrumented (e.g., if you are not
using Keras) or (2) the profiling duration is shorter than the step
time. For (1), you need to add step instrumentation; for (2), you may
try to profile longer.
---
 plasma/models/mpi_runner.py | 149 +++++++++++++++++++-----------------
 1 file changed, 78 insertions(+), 71 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index dc6cce5d..6a28f675 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -492,6 +492,11 @@ def build_callbacks(self, conf, callbacks_list):
                 patience=patience, monitor=monitor, mode=mode)]
         if "lr_scheduler" in callbacks_list:
             pass
+        # if conf['training']['timeline_prof']:
+        #     tb_callback = tf.keras.callbacks.TensorBoard(
+        #         log_dir="./logs", profile_batch=(10, 15),
+        #         update_freq=1,)
+        #     callbacks += [tb_callback]
 
         return cbks.CallbackList(callbacks)
 
@@ -560,79 +565,81 @@ def train_epoch(self):
 
         while ((self.num_so_far - self.epoch * num_total) < num_total
                or step < self.num_batches_minimum):
-            if step_limit > 0 and step > step_limit:
-                print('reached step limit')
-                break
-            try:
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator_func)
-            except StopIteration:
-                g.print_unique("Resetting batch iterator.")
-                self.num_so_far_accum = self.num_so_far_indiv
-                self.set_batch_iterator_func()
-                batch_iterator_func = self.batch_iterator_func
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator_func)
-            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
-
-            # if batches_to_reset:
-            # self.model.reset_states(batches_to_reset)
-
-            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
-            num_replicas = 1 if warmup_phase else self.num_replicas
-
-            self.num_so_far = self.mpi_sum_scalars(
-                self.num_so_far_indiv, num_replicas)
-
-            # run the model once to force compilation. Don't actually use these
-            # values.
-            if first_run:
-                first_run = False
-                t0_comp = time.time()
-                #   print('input_dimension:',batch_xs.shape)
-                #   print('output_dimension:',batch_ys.shape)
-                _, _ = self.train_on_batch_and_get_deltas(
+            # TODO(KGF): this is still not correctly tracing the steps on CPU
+            with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
+                if step_limit > 0 and step > step_limit:
+                    print('reached step limit')
+                    break
+                try:
+                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                     num_total, is_warmup_period) = next(batch_iterator_func)
+                except StopIteration:
+                    g.print_unique("Resetting batch iterator.")
+                    self.num_so_far_accum = self.num_so_far_indiv
+                    self.set_batch_iterator_func()
+                    batch_iterator_func = self.batch_iterator_func
+                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                     num_total, is_warmup_period) = next(batch_iterator_func)
+                self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
+
+                # if batches_to_reset:
+                # self.model.reset_states(batches_to_reset)
+
+                warmup_phase = (step < self.warmup_steps and self.epoch == 0)
+                num_replicas = 1 if warmup_phase else self.num_replicas
+
+                self.num_so_far = self.mpi_sum_scalars(
+                    self.num_so_far_indiv, num_replicas)
+
+                # run the model once to force compilation. Don't actually use these
+                # values.
+                if first_run:
+                    first_run = False
+                    t0_comp = time.time()
+                    #   print('input_dimension:',batch_xs.shape)
+                    #   print('output_dimension:',batch_ys.shape)
+                    _, _ = self.train_on_batch_and_get_deltas(
+                        batch_xs, batch_ys, verbose)
+                    self.comm.Barrier()
+                    sys.stdout.flush()
+                    # TODO(KGF): check line feed/carriage returns around this
+                    g.print_unique('\nCompilation finished in {:.2f}s'.format(
+                        time.time() - t0_comp))
+                    t_start = time.time()
+                    sys.stdout.flush()
+
+                if np.any(batches_to_reset):
+                    reset_states(self.model, batches_to_reset)
+                if ('noise' in self.conf['training'].keys()
+                        and self.conf['training']['noise'] is not False):
+                    batch_xs = self.add_noise(batch_xs)
+                t0 = time.time()
+                deltas, loss = self.train_on_batch_and_get_deltas(
                     batch_xs, batch_ys, verbose)
-                self.comm.Barrier()
-                sys.stdout.flush()
-                # TODO(KGF): check line feed/carriage returns around this
-                g.print_unique('\nCompilation finished in {:.2f}s'.format(
-                    time.time() - t0_comp))
-                t_start = time.time()
-                sys.stdout.flush()
-
-            if np.any(batches_to_reset):
-                reset_states(self.model, batches_to_reset)
-            if ('noise' in self.conf['training'].keys()
-                    and self.conf['training']['noise'] is not False):
-                batch_xs = self.add_noise(batch_xs)
-            t0 = time.time()
-            deltas, loss = self.train_on_batch_and_get_deltas(
-                batch_xs, batch_ys, verbose)
-            t1 = time.time()
-            if not is_warmup_period:
-                self.set_new_weights(deltas, num_replicas)
-                t2 = time.time()
-                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
-                curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
-                # g.print_unique(self.model.get_weights()[0][0][:4])
-                loss_averager.add_val(curr_loss)
-                ave_loss = loss_averager.get_ave()
-                eta = self.estimate_remaining_time(
-                    t0 - t_start, self.num_so_far - self.epoch*num_total,
-                    num_total)
-                write_str = (
-                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
-                        self.task_index, step, eta, 1.0*self.num_so_far,
+                t1 = time.time()
+                if not is_warmup_period:
+                    self.set_new_weights(deltas, num_replicas)
+                    t2 = time.time()
+                    write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
+                    curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
+                    # g.print_unique(self.model.get_weights()[0][0][:4])
+                    loss_averager.add_val(curr_loss)
+                    ave_loss = loss_averager.get_ave()
+                    eta = self.estimate_remaining_time(
+                        t0 - t_start, self.num_so_far - self.epoch*num_total,
                         num_total)
-                    + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
-                    + 'walltime: {:.4f} | '.format(
-                        time.time() - self.start_time))
-                g.write_unique(write_str + write_str_0)
-                step += 1
-            else:
-                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
-                    self.task_index, self.num_so_far))
+                    write_str = (
+                        '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
+                            self.task_index, step, eta, 1.0*self.num_so_far,
+                            num_total)
+                        + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
+                        + 'walltime: {:.4f} | '.format(
+                            time.time() - self.start_time))
+                    g.write_unique(write_str + write_str_0)
+                    step += 1
+                else:
+                    g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
+                        self.task_index, self.num_so_far))
 
         effective_epochs = 1.0*self.num_so_far/num_total
         epoch_previous = self.epoch

From 53e281b8854b662d783f4e31d85495c46c796ce1 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 25 May 2021 18:47:35 -0500
Subject: [PATCH 12/50] Try using tf2onnx instead of keras2onnx

---
 plasma/models/builder.py | 47 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 443168a9..d782e758 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -32,11 +32,27 @@
     import keras2onnx
     import onnx
 except ImportError:  # as e:
-    _has_onnx = False
+    _has_keras2onnx = False
     # onnx = None
     # keras2onnx = None
 else:
-    _has_onnx = True
+    _has_keras2onnx = True
+
+try:
+    import tf2onnx
+    import onnx
+    # CLI: python -m tf2onnx.convert --saved-model model.97765202633820900403308121179367157713._epoch_.0 --output frnn-1D.onnx
+except ImportError:  # as e:
+    _has_tf2onnx = False
+    # onnx = None
+    # keras2onnx = None
+else:
+    _has_tf2onnx = True
+
+# TODO(KGF): both conversion tools not working with current network and TF version
+_has_tf2onnx = False
+_has_keras2onnx = False
+
 
 # Synchronize 2x stderr msg from TensorFlow initialization via Keras backend
 # "Succesfully opened dynamic library... libcudart" "Using TensorFlow backend."
@@ -288,10 +304,8 @@ def slicer_output_shape(input_shape, indices):
         #     pre_rnn_model.summary()
         #     sys.stdout = ori
         #     fr.close()
-        # pre_rnn_model.summary()
+        pre_rnn_model.summary()
         x_input = Input(batch_shape=batch_input_shape)
-        # TODO(KGF): Ge moved this inside a new conditional in Dec 2019. check
-        # x_in = TimeDistributed(pre_rnn_model)(x_input)
         if (num_1D > 0 or (
                 'extra_dense_input' in model_conf.keys()
                 and model_conf['extra_dense_input'])):
@@ -381,8 +395,29 @@ def save_model_weights(self, model, epoch):
         model.save(full_model_save_dir, overwrite=True, save_format='tf')
 
         # try:
-        if _has_onnx:
+        if _has_tf2onnx:
             save_path = self.get_save_path(epoch, ext='onnx')
+            # TODO(KGF): eliminate this repeated def of x_input shape from build_model()
+            model_conf = self.conf['model']
+            length = model_conf['length']
+            batch_size = self.conf['training']['batch_size']
+            use_signals = self.conf['paths']['use_signals']
+            num_signals = sum([sig.num_channels for sig in use_signals])
+            batch_input_shape = (batch_size, length, num_signals)
+            print(f"batch_input_shape = {batch_input_shape}")
+            # ValueError: Input 0 of node model_1/lstm/AssignVariableOp was passed float
+            # from model_1/lstm/lstm_cell/ones_like_1/ReadVariableOp/resource:0
+            # incompatible with expected resource.
+            model_proto, external_tensor_storage = tf2onnx.convert.from_keras(
+                model, input_signature=[tf.TensorSpec(batch_input_shape)],
+                opset=10, output_path=save_path)
+            # KGF: error likely due to the splitting of pre_rnn_model and rnn_model, since
+            # the latter expects a trivially-wrapped TimeDistributed(Input()) for 0D model
+        if _has_keras2onnx:
+            save_path = self.get_save_path(epoch, ext='onnx')
+            # TODO(KGF): keras2onnx broken in TF >=2.4
+            # https://github.com/onnx/keras-onnx/issues/651
+
             onnx_model = keras2onnx.convert_keras(model, model.name,
                                                   target_opset=10)
             onnx.save_model(onnx_model, save_path)

From df7ffca28ae8f33bb09569d8cafdfd48e9fe0c57 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 11 Jun 2021 10:56:18 -0500
Subject: [PATCH 13/50] Bump TF version in Conda YAML

---
 envs/requirements-cpu.yaml          | 2 +-
 envs/requirements-linux-64-gpu.yaml | 2 +-
 envs/requirements-traverse.yaml     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/envs/requirements-cpu.yaml b/envs/requirements-cpu.yaml
index 4687a03a..c54f63af 100644
--- a/envs/requirements-cpu.yaml
+++ b/envs/requirements-cpu.yaml
@@ -5,4 +5,4 @@ h5py
 pyparsing
 pyyaml
 pytorch>1.3
-tensorflow>=1.3,<2.0.0
+tensorflow>2.1.0
diff --git a/envs/requirements-linux-64-gpu.yaml b/envs/requirements-linux-64-gpu.yaml
index 4f106b9b..204eaeb2 100644
--- a/envs/requirements-linux-64-gpu.yaml
+++ b/envs/requirements-linux-64-gpu.yaml
@@ -13,7 +13,7 @@ dependencies:
   - pyparsing
   - pyyaml
   - pytorch>1.3
-  - tensorflow-gpu>=1.3,<2.0.0
+  - tensorflow>2.1.0
   - pip:
       - pathos
       - matplotlib>=2.0.2
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 09a136a0..fecedfa3 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -15,7 +15,7 @@ dependencies:
   - pyparsing
   - pyyaml
   - pytorch>1.3
-  - tensorflow-gpu>=1.3,<2.0.0
+  - tensorflow>2.1.0
   - pip:
       - pathos
       - matplotlib>=2.0.2

From 3a82a56b8be974e35a6c2556e289d69a1257f6e2 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 6 Jul 2021 00:36:49 -0400
Subject: [PATCH 14/50] Don't print out bfloat16, backendpackage

---
 plasma/global_vars.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/plasma/global_vars.py b/plasma/global_vars.py
index fea93eba..db0967ee 100644
--- a/plasma/global_vars.py
+++ b/plasma/global_vars.py
@@ -28,17 +28,18 @@ def init_GPU_backend(conf):
     NUM_GPUS = conf['num_gpus']
     MY_GPU = task_index % NUM_GPUS
     backend = conf['model']['backend']
+
+    # KGF: added via Subrata patch in April 2021 specific to tf2 branch
+    # (neither of the following options are in the default conf.yaml)
     try:
         backendpackage = conf['model']['backendpackage']
     except KeyError as ex:
         backendpackage = backend
-    print( "backendpackage", backendpackage)
 
     try:
         bfloat16 = conf['model']['bfloat16']
     except KeyError as ex:
         bfloat16 = ''
-    print( "bfloat16", bfloat16)
 
 
 def pprint_unique(obj):

From ad8171a036d792b32cf373bd22a0b101f013f07c Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 6 Jul 2021 00:38:27 -0400
Subject: [PATCH 15/50] Only main rank should output pre_rnn summary

---
 plasma/models/builder.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index d782e758..6d44c97e 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -304,7 +304,8 @@ def slicer_output_shape(input_shape, indices):
         #     pre_rnn_model.summary()
         #     sys.stdout = ori
         #     fr.close()
-        pre_rnn_model.summary()
+        if g.task_index == 0:
+            pre_rnn_model.summary()
         x_input = Input(batch_shape=batch_input_shape)
         if (num_1D > 0 or (
                 'extra_dense_input' in model_conf.keys()

From 7eaed6e8d8e49d8b223edb1cada45bbc9a7ea1e7 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 5 Jul 2021 23:46:43 -0500
Subject: [PATCH 16/50] Remove print statements

---
 plasma/models/mpi_runner.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 6a28f675..3df9c328 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -90,8 +90,6 @@
     g.comm.Barrier()
     if i == g.task_index:
         print('[{}] importing Keras'.format(g.task_index))
-        print("g.bfloat16:", g.bfloat16)
-        print("g.backendpackage", g.backendpackage)
         # bf16
         if g.bfloat16 == 'keras':
             print('Running in BFloat16 Via Keras')

From fc02d4d440ed109739e1a2ab5fee86652c465f55 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 6 Jul 2021 20:51:06 -0400
Subject: [PATCH 17/50] Bump modules and delete some conda/pip deps on Traverse

Downgrade h5py for Keras model loading in TF 2.1.x
https://github.com/tensorflow/tensorflow/issues/44467
---
 envs/requirements-traverse.yaml | 17 ++++++++++-------
 envs/traverse.cmd               |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index fecedfa3..aad550db 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -11,16 +11,19 @@ dependencies:
   - scipy
   - pandas
   - flake8
-  - h5py
+  - h5py<3.0.0
   - pyparsing
   - pyyaml
-  - pytorch>1.3
-  - tensorflow>2.1.0
+  #- pytorch>1.3
+  - tensorflow>2.1.0  # limited to 2.1.3 on IBM WMLCE 1.7.0 (2020-02-12) as of mid-2021
+# WML CE 1.7.0 is built for CUDA 10.2 and requires version 440 of the NVIDIA GPU driver.
+# Which GPU device driver does Traverse have?
   - pip:
       - pathos
       - matplotlib>=2.0.2
-      - hyperopt  # TODO(KGF): remove
+      - h5py<3.0.0
+      # - hyperopt  # TODO(KGF): remove
       # - mpi4py   # must reload MPI library modules before installing via pip
-      - xgboost
-      - scikit-learn
-      - joblib
+      # - xgboost
+      # - scikit-learn
+      # - joblib
diff --git a/envs/traverse.cmd b/envs/traverse.cmd
index 7708dbc0..e472969d 100644
--- a/envs/traverse.cmd
+++ b/envs/traverse.cmd
@@ -3,8 +3,8 @@
 module load anaconda3
 conda activate frnn
 
-module load cudatoolkit/11.0
-module load cudnn/cuda-10.1/7.6.1
+module load cudatoolkit/11.3
+module load cudnn/cuda-11.x/8.2.0
 
 # after RHEL 8 upgrade
 module load openmpi/gcc/4.0.4/64

From 3efc49069b4c489f1919d69afd25865feb23ddaf Mon Sep 17 00:00:00 2001
From: Ian DesJardin <imdesjardin@gmail.com>
Date: Thu, 15 Jul 2021 10:01:46 -0400
Subject: [PATCH 18/50] Added buffer to buffer MPI call for model output

---
 plasma/models/mpi_runner.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 3df9c328..d0acde39 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -784,13 +784,36 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_prime += y_p
             y_gold += y
             disruptive += disr
+
+            # Create numpy block from y list which is used in MPI
+            # Pads y_prime and y_gold with zeros to make it all fit
+            max_length = g.comm.allreduce(max([max(nparray.shape) for nparray in y_prime]), MPI.MAX)
+            shpz = [y.shape for y in y_prime]
+            y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
+            y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
             # print_all('\nFinished with i = {}'.format(i))
 
         if (i % g.num_workers == g.num_workers - 1
                 or i == len(shot_sublists) - 1):
             g.comm.Barrier()
-            y_prime_global += concatenate_sublists(g.comm.allgather(y_prime))
-            y_gold_global += concatenate_sublists(g.comm.allgather(y_gold))
+
+            # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
+            shp = y_prime_numpy.shape
+            shpzg = g.comm.allgather(shpz)
+            y_primeg = np.zeros((g.num_workers*shp[0],)+shp[1:], dtype=y_prime_numpy.dtype)
+            y_goldg  = np.zeros((g.num_workers*shp[0],)+shp[1:], dtype=y_prime_numpy.dtype)
+            y_primeg = g.comm.Allgather([y_prime_numpy, y_prime_numpy.dtype],
+                                        [y_primeg, y_primeg.dtype])
+            y_primeg = g.comm.Allgather([y_gold_numpy, y_gold_numpy.dtype],
+                                        [y_goldg, y_goldg.dtype])
+            y_primeg_list = []
+            y_goldg_list = []
+            # Unpad
+            for idx, s in enumerate(shpzg):
+                y_primeg_list.append(y_primeg[idx,0:max(s),:])
+                y_goldg_list.append(y_goldg[idx,0:max(s),:])
+            y_prime_global += concatenate_sublists(y_primeg_list)
+            y_gold_global += concatenate_sublists(y_goldg_list)
             disruptive_global += concatenate_sublists(
                 g.comm.allgather(disruptive))
             g.comm.Barrier()

From 50b3c66d58182f02df403ccdc82ce6cfa609dcc9 Mon Sep 17 00:00:00 2001
From: Ian Desjardin <imdesjardin@gmail.com>
Date: Fri, 16 Jul 2021 15:23:36 -0400
Subject: [PATCH 19/50] Seems to make it through rewritten section, breaks
 afterwards

---
 plasma/models/mpi_runner.py | 88 ++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 12 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index d0acde39..094ae3c2 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -767,9 +767,21 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     if g.task_index != 0:
         loader.verbose = False
 
+    g.write_unique('num workers= {}\nlen(shot_sublists)={}, num_shots = {}\n'.format(g.num_workers, len(shot_sublists), len(shot_list)))
+    freeme = False
     for (i, shot_sublist) in enumerate(shot_sublists):
+        shpz = []
+        max_length = -1 # So non shot predictive workers don't have a real length
+        #g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
+            #g.write_all('Creating new comm\n')
+            color = 1
+            temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
+            freeme = True
+            # Create new MPI comm to pass around rank
+            #g.write_all('Starting to load and predict subroutine\n')
             X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
+            g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
 
             # load data and fit on data
             y_p = model.predict(X, batch_size=conf['model']['pred_batch_size'])
@@ -777,6 +789,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_p = loader.batch_output_to_array(y_p)
             y = loader.batch_output_to_array(y)
 
+            #g.write_all('Finished le prediction\n')
+
             # cut arrays back
             y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
             y = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y)]
@@ -787,31 +801,77 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
 
             # Create numpy block from y list which is used in MPI
             # Pads y_prime and y_gold with zeros to make it all fit
-            max_length = g.comm.allreduce(max([max(nparray.shape) for nparray in y_prime]), MPI.MAX)
             shpz = [y.shape for y in y_prime]
+            max_length = max([max(y.shape) for y in y_p])
+            #g.write_all(' max length = {}\n'.format(max_length))
+            max_length = temp_predictor_only_comm.allreduce(max_length, MPI.MAX) 
+            #g.write_all('Calculated shpz\n')
             y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
             y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
-            # print_all('\nFinished with i = {}'.format(i))
+            #g.write_all('First Barrier\n')
+            g.comm.Barrier()
+        elif g.task_index < len(shot_sublists):
+            pass
+        else:
+            if i == 0:
+                color = 2
+                temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
+                freeme = True
+                g.write_all('First Barrier (other threads)\n')
+                g.comm.Barrier()
+                g.write_all('Past First Barrier (other threads)\n')
+            
 
         if (i % g.num_workers == g.num_workers - 1
                 or i == len(shot_sublists) - 1):
-            g.comm.Barrier()
 
+            g.write_all('Entered second area\n')
+            g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
-            shp = y_prime_numpy.shape
+            g.write_all('getting shapez\n')
             shpzg = g.comm.allgather(shpz)
-            y_primeg = np.zeros((g.num_workers*shp[0],)+shp[1:], dtype=y_prime_numpy.dtype)
-            y_goldg  = np.zeros((g.num_workers*shp[0],)+shp[1:], dtype=y_prime_numpy.dtype)
-            y_primeg = g.comm.Allgather([y_prime_numpy, y_prime_numpy.dtype],
-                                        [y_primeg, y_primeg.dtype])
-            y_primeg = g.comm.Allgather([y_gold_numpy, y_gold_numpy.dtype],
-                                        [y_goldg, y_goldg.dtype])
+            shpzg = [s for s in shpzg if s != [] and s != [0]]
+            shpzg = shpzg[0]
+            shpzg = [s[0] for s in shpzg]
+            max_length = g.comm.allreduce(max_length, MPI.MAX) 
+            g.write_unique(str(shpzg)+'\n')
+            g.write_all('gotting shapez\n')
+            # Todo: Figure out if empty shots are added to fit batch length
+            y_primeg = np.zeros((9*128,max_length,1), dtype=conf['data']['floatx'])
+            y_goldg  = np.zeros((9*128,max_length,1), dtype=conf['data']['floatx'])
+            y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
+            y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
+            g.write_all('initialized golbals\n')
+            if conf['data']['floatx'] == 'float32':
+                dtype_mpi = MPI.FLOAT
+            # TODO (IMD) Support more floating point types
+            # ValueError: message: cannot infer count, number of entries 10652818 is not a multiple of required number of blocks 9
+            # Need to send an unequal sized array I think
+            if color == 1:
+                g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
+                g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
+                g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
+                # Todo send flattened and then unflatten
+                temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
+                temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
+            # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
+            # not involved in calculating predictions so they can each create their own 
+            # y_prime_global and y_gold_global
+            g.comm.Barrier()
+            g.write_all('Broadcasting y_primeg and y_goldg to every\n')
+            g.comm.Bcast(y_primeg_flattend, root=0)
+            g.comm.Bcast(y_goldg_flattend, root=0) 
+            g.write_all('All gathered initialized golbals\n')
+            y_primeg = y_primeg_flattend.reshape(y_primeg.shape)
+            y_goldg  = y_goldg_flattend.reshape(y_goldg.shape)
             y_primeg_list = []
             y_goldg_list = []
             # Unpad
+            g.write_all('unpadding\n'.format(len(shpzg),shpzg[0]))
+            # need to have subgroups gather a broadcast y_prmeg (maybe do above)
             for idx, s in enumerate(shpzg):
-                y_primeg_list.append(y_primeg[idx,0:max(s),:])
-                y_goldg_list.append(y_goldg[idx,0:max(s),:])
+                y_primeg_list.append(y_primeg[idx,0:int(s),:].squeeze())
+                y_goldg_list.append(y_goldg[idx,0:int(s),:].squeeze())
             y_prime_global += concatenate_sublists(y_primeg_list)
             y_gold_global += concatenate_sublists(y_goldg_list)
             disruptive_global += concatenate_sublists(
@@ -824,6 +884,10 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))
 
+        if freeme and (g.task_index > len(shot_sublists) and i == 0):
+            temp_predictor_only_comm.Free()
+            freeme = False
+
     y_prime_global = y_prime_global[:len(shot_list)]
     y_gold_global = y_gold_global[:len(shot_list)]
     disruptive_global = disruptive_global[:len(shot_list)]

From 0175053868e7530f8dde8f59755df1dcd8c3ff8b Mon Sep 17 00:00:00 2001
From: Ian Desjardin <imdesjardin@gmail.com>
Date: Mon, 19 Jul 2021 15:41:50 -0400
Subject: [PATCH 20/50] Seems to run ok, seeing possible long term breaks

---
 plasma/models/mpi_runner.py | 51 +++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 094ae3c2..ce333f16 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -772,14 +772,14 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     for (i, shot_sublist) in enumerate(shot_sublists):
         shpz = []
         max_length = -1 # So non shot predictive workers don't have a real length
-        #g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
+        g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
-            #g.write_all('Creating new comm\n')
+            g.write_all('Creating new comm\n')
             color = 1
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
             freeme = True
             # Create new MPI comm to pass around rank
-            #g.write_all('Starting to load and predict subroutine\n')
+            g.write_all('Starting to load and predict subroutine\n')
             X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
             g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
 
@@ -789,7 +789,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_p = loader.batch_output_to_array(y_p)
             y = loader.batch_output_to_array(y)
 
-            #g.write_all('Finished le prediction\n')
+            g.write_all('Finished le prediction\n')
 
             # cut arrays back
             y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
@@ -803,12 +803,12 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             # Pads y_prime and y_gold with zeros to make it all fit
             shpz = [y.shape for y in y_prime]
             max_length = max([max(y.shape) for y in y_p])
-            #g.write_all(' max length = {}\n'.format(max_length))
+            g.write_all(' max length = {}\n'.format(max_length))
             max_length = temp_predictor_only_comm.allreduce(max_length, MPI.MAX) 
-            #g.write_all('Calculated shpz\n')
+            g.write_all('Calculated shpz\n')
             y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
             y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
-            #g.write_all('First Barrier\n')
+            g.write_all('First Barrier\n')
             g.comm.Barrier()
         elif g.task_index < len(shot_sublists):
             pass
@@ -851,7 +851,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
                 g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
                 g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
-                # Todo send flattened and then unflatten
+                # Todo send flattened and then unflatten - DOES FLATTENING, GATHERING, AND THEN RESHAPING SCREW WITH ORDERING?
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
             # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
@@ -861,19 +861,25 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.write_all('Broadcasting y_primeg and y_goldg to every\n')
             g.comm.Bcast(y_primeg_flattend, root=0)
             g.comm.Bcast(y_goldg_flattend, root=0) 
-            g.write_all('All gathered initialized golbals\n')
-            y_primeg = y_primeg_flattend.reshape(y_primeg.shape)
-            y_goldg  = y_goldg_flattend.reshape(y_goldg.shape)
+            g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
+            y_primeg_flattend = np.split(y_primeg_flattend, len(shot_sublists))
+            y_goldg_flattend = np.split(y_goldg_flattend, len(shot_sublists))
+            y_primeg = [y.reshape((128, max_length, 1)) for y in y_primeg_flattend]
+            y_goldg = [y.reshape((128, max_length, 1)) for y in y_goldg_flattend]
+            g.write_all('primeg length: {}\n'.format(len(y_primeg)))
+            y_primeg = np.concatenate(y_primeg, axis=0)
+            g.write_all('primeg shape after stack: {}\n'.format(y_primeg.shape))
+            y_goldg  = np.concatenate(y_goldg, axis=0)
             y_primeg_list = []
             y_goldg_list = []
             # Unpad
-            g.write_all('unpadding\n'.format(len(shpzg),shpzg[0]))
+            g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
             # need to have subgroups gather a broadcast y_prmeg (maybe do above)
             for idx, s in enumerate(shpzg):
-                y_primeg_list.append(y_primeg[idx,0:int(s),:].squeeze())
-                y_goldg_list.append(y_goldg[idx,0:int(s),:].squeeze())
-            y_prime_global += concatenate_sublists(y_primeg_list)
-            y_gold_global += concatenate_sublists(y_goldg_list)
+                trim = lambda nparry, s: nparry#(0:int(s),:)
+                y_primeg[idx] = trim(y_primeg[idx],s)
+                y_goldg[idx] = trim(y_goldg[idx], s)
+
             disruptive_global += concatenate_sublists(
                 g.comm.allgather(disruptive))
             g.comm.Barrier()
@@ -888,12 +894,14 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             temp_predictor_only_comm.Free()
             freeme = False
 
-    y_prime_global = y_prime_global[:len(shot_list)]
-    y_gold_global = y_gold_global[:len(shot_list)]
+    #y_prime_global = y_prime_global[:len(shot_list)]
+    y_primeg = y_primeg[:len(shot_list)]
+    #y_gold_global = y_gold_global[:len(shot_list)]
+    y_goldg = y_goldg[:len(shot_list)]
     disruptive_global = disruptive_global[:len(shot_list)]
     loader.set_inference_mode(False)
 
-    return y_prime_global, y_gold_global, disruptive_global
+    return y_primeg, y_goldg, disruptive_global
 
 
 def mpi_make_predictions_and_evaluate(conf, shot_list, loader,
@@ -901,6 +909,11 @@ def mpi_make_predictions_and_evaluate(conf, shot_list, loader,
     y_prime, y_gold, disruptive = mpi_make_predictions(
         conf, shot_list, loader, custom_path)
     analyzer = PerformanceAnalyzer(conf=conf)
+    g.write_all('Afterwards y_prime length = {}\n'.format(len(y_prime)))
+    g.write_all('Afterwards y_prime[0] shape = {}\n'.format(y_prime[0].shape))
+    g.write_all('Afterwards y_gold length = {}\n'.format(len(y_gold)))
+    g.write_all('Afterwards y_gold[0] shape = {}\n'.format(y_gold[0].shape))
+    g.write_all('Afterwards distruptive length = {}\n'.format(len(disruptive)))
     roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive)
     shot_list.set_weights(
         analyzer.get_shot_difficulty(y_prime, y_gold, disruptive))

From 1f79dcac07578b88e628c1851a974052066607dc Mon Sep 17 00:00:00 2001
From: Ian Desjardin <imdesjardin@gmail.com>
Date: Thu, 22 Jul 2021 11:24:46 -0400
Subject: [PATCH 21/50] Added assert and print statements for debugging

---
 plasma/models/mpi_runner.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index ce333f16..60f7b2f9 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -849,9 +849,15 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             # Need to send an unequal sized array I think
             if color == 1:
                 g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
+                g.write_all('y_prime_numpy_flattend.shape = {}\n'.format(y_prime_numpy.flatten().shape))
                 g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
+                g.write_all('y_gold_numpy_flattend.shape = {}\n'.format(y_gold_numpy.flatten().shape))
                 g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
-                # Todo send flattened and then unflatten - DOES FLATTENING, GATHERING, AND THEN RESHAPING SCREW WITH ORDERING?
+                g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
+                g.write_all('y_goldg_flattend.shape = {}\n'.format(y_goldg_flattend.shape))
+                # Ensure that numpy arrays have correct dimensions before gathering them
+                assert len(shot_sublists)*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
+                assert len(shot_sublists)*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
             # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones

From 8844d54400c686bbe1a0631263bb0c53477335ad Mon Sep 17 00:00:00 2001
From: Ian Desjardin <imdesjardin@gmail.com>
Date: Thu, 22 Jul 2021 13:43:50 -0400
Subject: [PATCH 22/50] Fixed assert calls

---
 plasma/models/mpi_runner.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 60f7b2f9..e5c54f7b 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -837,13 +837,11 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.write_unique(str(shpzg)+'\n')
             g.write_all('gotting shapez\n')
             # Todo: Figure out if empty shots are added to fit batch length
-            y_primeg = np.zeros((9*128,max_length,1), dtype=conf['data']['floatx'])
-            y_goldg  = np.zeros((9*128,max_length,1), dtype=conf['data']['floatx'])
+            y_primeg = np.zeros((len(shot_sublists)*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
+            y_goldg  = np.zeros((len(shot_sublists)*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
             y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
             y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
             g.write_all('initialized golbals\n')
-            if conf['data']['floatx'] == 'float32':
-                dtype_mpi = MPI.FLOAT
             # TODO (IMD) Support more floating point types
             # ValueError: message: cannot infer count, number of entries 10652818 is not a multiple of required number of blocks 9
             # Need to send an unequal sized array I think

From 678028f6caf3dbec2fb2f7be309f919bc90a1ca9 Mon Sep 17 00:00:00 2001
From: Ian Desjardin <id9930@traverse.princeton.edu>
Date: Wed, 28 Jul 2021 11:54:03 -0400
Subject: [PATCH 23/50] Works single threaded again on Traverse (might be other
 fixes mixed in there)

---
 plasma/models/mpi_runner.py | 207 ++++++++++++++++++++----------------
 1 file changed, 118 insertions(+), 89 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index e5c54f7b..82ad49ec 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -564,80 +564,80 @@ def train_epoch(self):
         while ((self.num_so_far - self.epoch * num_total) < num_total
                or step < self.num_batches_minimum):
             # TODO(KGF): this is still not correctly tracing the steps on CPU
-            with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
-                if step_limit > 0 and step > step_limit:
-                    print('reached step limit')
-                    break
-                try:
-                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                     num_total, is_warmup_period) = next(batch_iterator_func)
-                except StopIteration:
-                    g.print_unique("Resetting batch iterator.")
-                    self.num_so_far_accum = self.num_so_far_indiv
-                    self.set_batch_iterator_func()
-                    batch_iterator_func = self.batch_iterator_func
-                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                     num_total, is_warmup_period) = next(batch_iterator_func)
-                self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
-
-                # if batches_to_reset:
-                # self.model.reset_states(batches_to_reset)
-
-                warmup_phase = (step < self.warmup_steps and self.epoch == 0)
-                num_replicas = 1 if warmup_phase else self.num_replicas
-
-                self.num_so_far = self.mpi_sum_scalars(
-                    self.num_so_far_indiv, num_replicas)
-
-                # run the model once to force compilation. Don't actually use these
-                # values.
-                if first_run:
-                    first_run = False
-                    t0_comp = time.time()
-                    #   print('input_dimension:',batch_xs.shape)
-                    #   print('output_dimension:',batch_ys.shape)
-                    _, _ = self.train_on_batch_and_get_deltas(
-                        batch_xs, batch_ys, verbose)
-                    self.comm.Barrier()
-                    sys.stdout.flush()
-                    # TODO(KGF): check line feed/carriage returns around this
-                    g.print_unique('\nCompilation finished in {:.2f}s'.format(
-                        time.time() - t0_comp))
-                    t_start = time.time()
-                    sys.stdout.flush()
-
-                if np.any(batches_to_reset):
-                    reset_states(self.model, batches_to_reset)
-                if ('noise' in self.conf['training'].keys()
-                        and self.conf['training']['noise'] is not False):
-                    batch_xs = self.add_noise(batch_xs)
-                t0 = time.time()
-                deltas, loss = self.train_on_batch_and_get_deltas(
+            #with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
+            if step_limit > 0 and step > step_limit:
+                print('reached step limit')
+                break
+            try:
+                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                 num_total, is_warmup_period) = next(batch_iterator_func)
+            except StopIteration:
+                g.print_unique("Resetting batch iterator.")
+                self.num_so_far_accum = self.num_so_far_indiv
+                self.set_batch_iterator_func()
+                batch_iterator_func = self.batch_iterator_func
+                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                 num_total, is_warmup_period) = next(batch_iterator_func)
+            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
+
+            # if batches_to_reset:
+            # self.model.reset_states(batches_to_reset)
+
+            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
+            num_replicas = 1 if warmup_phase else self.num_replicas
+
+            self.num_so_far = self.mpi_sum_scalars(
+                self.num_so_far_indiv, num_replicas)
+
+            # run the model once to force compilation. Don't actually use these
+            # values.
+            if first_run:
+                first_run = False
+                t0_comp = time.time()
+                #   print('input_dimension:',batch_xs.shape)
+                #   print('output_dimension:',batch_ys.shape)
+                _, _ = self.train_on_batch_and_get_deltas(
                     batch_xs, batch_ys, verbose)
-                t1 = time.time()
-                if not is_warmup_period:
-                    self.set_new_weights(deltas, num_replicas)
-                    t2 = time.time()
-                    write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
-                    curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
-                    # g.print_unique(self.model.get_weights()[0][0][:4])
-                    loss_averager.add_val(curr_loss)
-                    ave_loss = loss_averager.get_ave()
-                    eta = self.estimate_remaining_time(
-                        t0 - t_start, self.num_so_far - self.epoch*num_total,
+                self.comm.Barrier()
+                sys.stdout.flush()
+                # TODO(KGF): check line feed/carriage returns around this
+                g.print_unique('\nCompilation finished in {:.2f}s'.format(
+                    time.time() - t0_comp))
+                t_start = time.time()
+                sys.stdout.flush()
+
+            if np.any(batches_to_reset):
+                reset_states(self.model, batches_to_reset)
+            if ('noise' in self.conf['training'].keys()
+                    and self.conf['training']['noise'] is not False):
+                batch_xs = self.add_noise(batch_xs)
+            t0 = time.time()
+            deltas, loss = self.train_on_batch_and_get_deltas(
+                batch_xs, batch_ys, verbose)
+            t1 = time.time()
+            if not is_warmup_period:
+                self.set_new_weights(deltas, num_replicas)
+                t2 = time.time()
+                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
+                curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
+                # g.print_unique(self.model.get_weights()[0][0][:4])
+                loss_averager.add_val(curr_loss)
+                ave_loss = loss_averager.get_ave()
+                eta = self.estimate_remaining_time(
+                    t0 - t_start, self.num_so_far - self.epoch*num_total,
+                    num_total)
+                write_str = (
+                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
+                        self.task_index, step, eta, 1.0*self.num_so_far,
                         num_total)
-                    write_str = (
-                        '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
-                            self.task_index, step, eta, 1.0*self.num_so_far,
-                            num_total)
-                        + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
-                        + 'walltime: {:.4f} | '.format(
-                            time.time() - self.start_time))
-                    g.write_unique(write_str + write_str_0)
-                    step += 1
-                else:
-                    g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
-                        self.task_index, self.num_so_far))
+                    + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
+                    + 'walltime: {:.4f} | '.format(
+                        time.time() - self.start_time))
+                g.write_unique(write_str + write_str_0)
+                step += 1
+            else:
+                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
+                    self.task_index, self.num_so_far))
 
         effective_epochs = 1.0*self.num_so_far/num_total
         epoch_previous = self.epoch
@@ -769,17 +769,36 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
 
     g.write_unique('num workers= {}\nlen(shot_sublists)={}, num_shots = {}\n'.format(g.num_workers, len(shot_sublists), len(shot_list)))
     freeme = False
+
+    # MPI loop works by predicting in batches of the 
+    # largest possible multiple of len(shot_sublists) < num_workers
+    # i.e. if there are 9 shot_sublists and 4 workers,
+    #      worker 0 will predict shot_sublist 0, 4, and 8
+    #      workers 1-3 will predict shot_sublist 1-3 and 5-7 respectively
+    # each makes their prediction on the ith iteration of the loop
+    #      (i.e. worker 0 predicts shot_sublist 0 on loop iteration i=0)
+    #      and then skips through loop iterations unless it has to predict again (i=4)
+    #      or aggregate predictions with other workers, after each worker has made a prediction
+    #      which happens every num_workers iterations in the for loop
+    #      (i.e. worker 0 will aggregate predictions with workers 1-3 at the end of i=3)
+    if g.task_index < len(shot_sublists) % g.num_workers:
+        times_i_will_predict = len(shot_sublists)//g.num_workers + 1
+    else:
+        times_i_will_predict = len(shot_sublists)//g.num_workers
+    times_predicted = 0
+    g.write_all('I shall predict {} times\n'.format(times_i_will_predict))
     for (i, shot_sublist) in enumerate(shot_sublists):
         shpz = []
         max_length = -1 # So non shot predictive workers don't have a real length
         g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
-            g.write_all('Creating new comm\n')
+            g.write_all('Creating new comm at i={}\n'.format(i))
             color = 1
+            g.comm.Barrier()
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-            freeme = True
+            #freeme = True
             # Create new MPI comm to pass around rank
-            g.write_all('Starting to load and predict subroutine\n')
+            g.write_all('Starting to load and predict subroutine, i={}\n'.format(i))
             X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
             g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
 
@@ -788,7 +807,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             model.reset_states()
             y_p = loader.batch_output_to_array(y_p)
             y = loader.batch_output_to_array(y)
-
+            times_predicted += 1
             g.write_all('Finished le prediction\n')
 
             # cut arrays back
@@ -808,23 +827,24 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.write_all('Calculated shpz\n')
             y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
             y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
-            g.write_all('First Barrier\n')
-            g.comm.Barrier()
-        elif g.task_index < len(shot_sublists):
+        elif times_predicted < times_i_will_predict or \
+             (times_predicted==times_i_will_predict and \
+              i < len(shot_sublists) - (len(shot_sublists) % g.num_workers)):
+            g.write_all('skipping on i={}, I only predicted {}/{} times\n'.format(i, times_predicted, times_i_will_predict))
             pass
         else:
-            if i == 0:
-                color = 2
-                temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-                freeme = True
-                g.write_all('First Barrier (other threads)\n')
-                g.comm.Barrier()
-                g.write_all('Past First Barrier (other threads)\n')
+            color = 2
+            g.write_all('New comm Barrier (other threads), i={}\n'.format(i))
+            g.comm.Barrier()
+            temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
+            #freeme = True
+            g.write_all('Past new comm Barrier (other threads), i={}\n'.format(i))
             
 
         if (i % g.num_workers == g.num_workers - 1
                 or i == len(shot_sublists) - 1):
 
+            freeme = True
             g.write_all('Entered second area\n')
             g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
@@ -841,7 +861,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_goldg  = np.zeros((len(shot_sublists)*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
             y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
             y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
-            g.write_all('initialized golbals\n')
+            g.write_all('initialized golbals, i = {}\n'.format(i))
             # TODO (IMD) Support more floating point types
             # ValueError: message: cannot infer count, number of entries 10652818 is not a multiple of required number of blocks 9
             # Need to send an unequal sized array I think
@@ -852,19 +872,25 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.write_all('y_gold_numpy_flattend.shape = {}\n'.format(y_gold_numpy.flatten().shape))
                 g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
                 g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
-                g.write_all('y_goldg_flattend.shape = {}\n'.format(y_goldg_flattend.shape))
+                g.write_all('y_goldg_flattend.shape = {}, i={}\n'.format(y_goldg_flattend.shape,i))
                 # Ensure that numpy arrays have correct dimensions before gathering them
                 assert len(shot_sublists)*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
                 assert len(shot_sublists)*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
+                g.write_all('Passed asserts for i = {}\n'.format(i))
+                temp_predictor_only_comm.Barrier()
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
+                temp_predictor_only_comm.Barrier()
+                g.write_all('Passed Allgather color=1 for i = {}\n'.format(i))
             # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
             # not involved in calculating predictions so they can each create their own 
             # y_prime_global and y_gold_global
+            g.write_all('Waiting on Barrier before broadcast at i={}\n'.format(i))
             g.comm.Barrier()
             g.write_all('Broadcasting y_primeg and y_goldg to every\n')
             g.comm.Bcast(y_primeg_flattend, root=0)
             g.comm.Bcast(y_goldg_flattend, root=0) 
+            g.comm.Barrier()
             g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
             y_primeg_flattend = np.split(y_primeg_flattend, len(shot_sublists))
             y_goldg_flattend = np.split(y_goldg_flattend, len(shot_sublists))
@@ -879,6 +905,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             # Unpad
             g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
             # need to have subgroups gather a broadcast y_prmeg (maybe do above)
+            # TODO
             for idx, s in enumerate(shpzg):
                 trim = lambda nparry, s: nparry#(0:int(s),:)
                 y_primeg[idx] = trim(y_primeg[idx],s)
@@ -894,9 +921,11 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))
 
-        if freeme and (g.task_index > len(shot_sublists) and i == 0):
+        if freeme:
+            g.write_all('Freeing extra comm at i={}\n'.format(i))
             temp_predictor_only_comm.Free()
             freeme = False
+            color = 2
 
     #y_prime_global = y_prime_global[:len(shot_list)]
     y_primeg = y_primeg[:len(shot_list)]

From 2d625cd8b5a445bd966c0d5a62fa7d556cae8bef Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Wed, 28 Jul 2021 15:34:50 -0400
Subject: [PATCH 24/50] Tested to work for N=1 and N=4

---
 plasma/models/mpi_runner.py | 47 +++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 82ad49ec..f6f377da 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -786,16 +786,17 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     else:
         times_i_will_predict = len(shot_sublists)//g.num_workers
     times_predicted = 0
+    color = 0
     g.write_all('I shall predict {} times\n'.format(times_i_will_predict))
     for (i, shot_sublist) in enumerate(shot_sublists):
         shpz = []
         max_length = -1 # So non shot predictive workers don't have a real length
         g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
-            g.write_all('Creating new comm at i={}\n'.format(i))
+            #g.write_all('Creating new comm at i={}\n'.format(i))
             color = 1
-            g.comm.Barrier()
-            temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
+            #g.comm.Barrier()
+            #temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
             #freeme = True
             # Create new MPI comm to pass around rank
             g.write_all('Starting to load and predict subroutine, i={}\n'.format(i))
@@ -818,34 +819,27 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_gold += y
             disruptive += disr
 
+        if (i % g.num_workers == g.num_workers - 1
+                or i == len(shot_sublists) - 1):
             # Create numpy block from y list which is used in MPI
             # Pads y_prime and y_gold with zeros to make it all fit
-            shpz = [y.shape for y in y_prime]
-            max_length = max([max(y.shape) for y in y_p])
-            g.write_all(' max length = {}\n'.format(max_length))
-            max_length = temp_predictor_only_comm.allreduce(max_length, MPI.MAX) 
+            g.write_all('In second area, my color is {}, i={}\n'.format(color,i))
+            g.comm.Barrier()
+            if color ==1:
+                shpz = [y.shape for y in y_prime]
+                max_length = max([max(y.shape) for y in y_p])
+                g.write_all(' max length = {}\n'.format(max_length))
+            g.comm.Barrier()
+            max_length = g.comm.allreduce(max_length, MPI.MAX) 
             g.write_all('Calculated shpz\n')
-            y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
-            y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
-        elif times_predicted < times_i_will_predict or \
-             (times_predicted==times_i_will_predict and \
-              i < len(shot_sublists) - (len(shot_sublists) % g.num_workers)):
-            g.write_all('skipping on i={}, I only predicted {}/{} times\n'.format(i, times_predicted, times_i_will_predict))
-            pass
-        else:
-            color = 2
-            g.write_all('New comm Barrier (other threads), i={}\n'.format(i))
+            if color == 1:
+                y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
+                y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
+            
+            g.write_all('Entered second area\n')
             g.comm.Barrier()
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-            #freeme = True
-            g.write_all('Past new comm Barrier (other threads), i={}\n'.format(i))
-            
-
-        if (i % g.num_workers == g.num_workers - 1
-                or i == len(shot_sublists) - 1):
-
             freeme = True
-            g.write_all('Entered second area\n')
             g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
             g.write_all('getting shapez\n')
@@ -916,7 +910,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.comm.Barrier()
             y_prime = []
             y_gold = []
-            disruptive = []
+            disruptive = [] 
+            color = 0
 
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))

From 14f274b8063d17fdd3c70929dc81b9c735f20a24 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Thu, 29 Jul 2021 13:12:56 -0400
Subject: [PATCH 25/50] Return y_prime and y_gold as lists, as intended

---
 plasma/models/mpi_runner.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index f6f377da..3f73b129 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -826,7 +826,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.write_all('In second area, my color is {}, i={}\n'.format(color,i))
             g.comm.Barrier()
             if color ==1:
-                shpz = [y.shape for y in y_prime]
+                shpz = [max(y.shape) for y in y_prime]
                 max_length = max([max(y.shape) for y in y_p])
                 g.write_all(' max length = {}\n'.format(max_length))
             g.comm.Barrier()
@@ -842,11 +842,14 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             freeme = True
             g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
-            g.write_all('getting shapez\n')
+            g.write_all('getting shapez, i contribute {} shpz\n'.format(len(shpz)))
             shpzg = g.comm.allgather(shpz)
-            shpzg = [s for s in shpzg if s != [] and s != [0]]
-            shpzg = shpzg[0]
-            shpzg = [s[0] for s in shpzg]
+            import itertools
+            shpzg = list(itertools.chain(*shpzg))
+            g.comm.Barrier()
+            g.write_all('shpzg shape before preproc={}\n'.format(len(shpzg)))
+            shpzg = [s for s in shpzg if s != []]
+            g.write_all('shpzg shape after preproc={}\n'.format(len(shpzg)))
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
             g.write_unique(str(shpzg)+'\n')
             g.write_all('gotting shapez\n')
@@ -894,16 +897,14 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_primeg = np.concatenate(y_primeg, axis=0)
             g.write_all('primeg shape after stack: {}\n'.format(y_primeg.shape))
             y_goldg  = np.concatenate(y_goldg, axis=0)
-            y_primeg_list = []
-            y_goldg_list = []
             # Unpad
             g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
             # need to have subgroups gather a broadcast y_prmeg (maybe do above)
             # TODO
             for idx, s in enumerate(shpzg):
-                trim = lambda nparry, s: nparry#(0:int(s),:)
-                y_primeg[idx] = trim(y_primeg[idx],s)
-                y_goldg[idx] = trim(y_goldg[idx], s)
+                trim = lambda nparry, s: nparry[0:int(s),:]
+                y_prime_global.append(trim(y_primeg[idx],s))
+                y_gold_global.append(trim(y_goldg[idx], s))
 
             disruptive_global += concatenate_sublists(
                 g.comm.allgather(disruptive))
@@ -912,6 +913,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_gold = []
             disruptive = [] 
             color = 0
+            g.write_all('y_prime_global len ={}\n'.format(len(y_prime_global)))
 
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))
@@ -922,14 +924,12 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             freeme = False
             color = 2
 
-    #y_prime_global = y_prime_global[:len(shot_list)]
-    y_primeg = y_primeg[:len(shot_list)]
-    #y_gold_global = y_gold_global[:len(shot_list)]
-    y_goldg = y_goldg[:len(shot_list)]
+    y_prime_global = y_prime_global[:len(shot_list)]
+    y_gold_global = y_gold_global[:len(shot_list)]
     disruptive_global = disruptive_global[:len(shot_list)]
     loader.set_inference_mode(False)
 
-    return y_primeg, y_goldg, disruptive_global
+    return y_prime_global, y_gold_global, disruptive_global
 
 
 def mpi_make_predictions_and_evaluate(conf, shot_list, loader,

From 158b8d16b88440f244ae6e94ad8c42d1d7291b72 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Thu, 29 Jul 2021 17:55:42 -0400
Subject: [PATCH 26/50] Size gather buffers by num_pred instead of len(shot_sublists)

---
 plasma/models/mpi_runner.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 3f73b129..1c7683cd 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -854,8 +854,12 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.write_unique(str(shpzg)+'\n')
             g.write_all('gotting shapez\n')
             # Todo: Figure out if empty shots are added to fit batch length
-            y_primeg = np.zeros((len(shot_sublists)*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
-            y_goldg  = np.zeros((len(shot_sublists)*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
+            if color == 1:
+                num_pred = temp_predictor_only_comm.size
+            else:
+                num_pred = g.comm.size - temp_predictor_only_comm.size
+            y_primeg = np.zeros((num_pred*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
+            y_goldg  = np.zeros((num_pred*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
             y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
             y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
             g.write_all('initialized golbals, i = {}\n'.format(i))
@@ -871,8 +875,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
                 g.write_all('y_goldg_flattend.shape = {}, i={}\n'.format(y_goldg_flattend.shape,i))
                 # Ensure that numpy arrays have correct dimensions before gathering them
-                assert len(shot_sublists)*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
-                assert len(shot_sublists)*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
+                assert num_pred*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
+                assert num_pred*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
                 g.write_all('Passed asserts for i = {}\n'.format(i))
                 temp_predictor_only_comm.Barrier()
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
@@ -889,8 +893,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.comm.Bcast(y_goldg_flattend, root=0) 
             g.comm.Barrier()
             g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
-            y_primeg_flattend = np.split(y_primeg_flattend, len(shot_sublists))
-            y_goldg_flattend = np.split(y_goldg_flattend, len(shot_sublists))
+            y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
+            y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
             y_primeg = [y.reshape((128, max_length, 1)) for y in y_primeg_flattend]
             y_goldg = [y.reshape((128, max_length, 1)) for y in y_goldg_flattend]
             g.write_all('primeg length: {}\n'.format(len(y_primeg)))

From d6eb6dcd604b739efab8d707ffff2c6a6cb7ceba Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Fri, 30 Jul 2021 13:49:16 -0400
Subject: [PATCH 27/50] Commented out debugging statements

---
 plasma/models/mpi_runner.py | 84 +++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 45 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 1c7683cd..23195203 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -14,6 +14,7 @@
 from mpi4py import MPI
 from pkg_resources import parse_version, get_distribution, DistributionNotFound
 import random
+import itertools
 '''
 #########################################################
 This file trains a deep learning model to predict
@@ -767,7 +768,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     if g.task_index != 0:
         loader.verbose = False
 
-    g.write_unique('num workers= {}\nlen(shot_sublists)={}, num_shots = {}\n'.format(g.num_workers, len(shot_sublists), len(shot_list)))
+    #g.write_unique('num workers= {}\nlen(shot_sublists)={}, num_shots = {}\n'.format(g.num_workers, len(shot_sublists), len(shot_list)))
     freeme = False
 
     # MPI loop works by predicting in batches of the 
@@ -781,17 +782,12 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     #      or aggregate predictions with other workers, after each worker has made a prediction
     #      which happens every num_workers iterations in the for loop
     #      (i.e. worker 0 will aggregate predictions with workers 1-3 at the end of i=3)
-    if g.task_index < len(shot_sublists) % g.num_workers:
-        times_i_will_predict = len(shot_sublists)//g.num_workers + 1
-    else:
-        times_i_will_predict = len(shot_sublists)//g.num_workers
-    times_predicted = 0
-    color = 0
-    g.write_all('I shall predict {} times\n'.format(times_i_will_predict))
+    color = 2
+    #g.write_all('I shall predict {} times\n'.format(times_i_will_predict))
     for (i, shot_sublist) in enumerate(shot_sublists):
         shpz = []
         max_length = -1 # So non shot predictive workers don't have a real length
-        g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
+        #g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
             #g.write_all('Creating new comm at i={}\n'.format(i))
             color = 1
@@ -799,17 +795,16 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             #temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
             #freeme = True
             # Create new MPI comm to pass around rank
-            g.write_all('Starting to load and predict subroutine, i={}\n'.format(i))
+            #g.write_all('Starting to load and predict subroutine, i={}\n'.format(i))
             X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
-            g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
+            #g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
 
             # load data and fit on data
             y_p = model.predict(X, batch_size=conf['model']['pred_batch_size'])
             model.reset_states()
             y_p = loader.batch_output_to_array(y_p)
             y = loader.batch_output_to_array(y)
-            times_predicted += 1
-            g.write_all('Finished le prediction\n')
+            #g.write_all('Finished le prediction\n')
 
             # cut arrays back
             y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
@@ -823,7 +818,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 or i == len(shot_sublists) - 1):
             # Create numpy block from y list which is used in MPI
             # Pads y_prime and y_gold with zeros to make it all fit
-            g.write_all('In second area, my color is {}, i={}\n'.format(color,i))
+            #g.write_all('In second area, my color is {}, i={}\n'.format(color,i))
             g.comm.Barrier()
             if color ==1:
                 shpz = [max(y.shape) for y in y_prime]
@@ -831,28 +826,27 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.write_all(' max length = {}\n'.format(max_length))
             g.comm.Barrier()
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
-            g.write_all('Calculated shpz\n')
+            #g.write_all('Calculated shpz\n')
             if color == 1:
                 y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
                 y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
             
-            g.write_all('Entered second area\n')
+            #g.write_all('Entered second area\n')
             g.comm.Barrier()
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
             freeme = True
             g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
-            g.write_all('getting shapez, i contribute {} shpz\n'.format(len(shpz)))
+            #g.write_all('getting shapez, i contribute {} shpz\n'.format(len(shpz)))
             shpzg = g.comm.allgather(shpz)
-            import itertools
             shpzg = list(itertools.chain(*shpzg))
             g.comm.Barrier()
-            g.write_all('shpzg shape before preproc={}\n'.format(len(shpzg)))
+            #g.write_all('shpzg shape before preproc={}\n'.format(len(shpzg)))
             shpzg = [s for s in shpzg if s != []]
-            g.write_all('shpzg shape after preproc={}\n'.format(len(shpzg)))
+            #g.write_all('shpzg shape after preproc={}\n'.format(len(shpzg)))
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
-            g.write_unique(str(shpzg)+'\n')
-            g.write_all('gotting shapez\n')
+            #g.write_unique(str(shpzg)+'\n')
+            #g.write_all('gotting shapez\n')
             # Todo: Figure out if empty shots are added to fit batch length
             if color == 1:
                 num_pred = temp_predictor_only_comm.size
@@ -862,47 +856,47 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_goldg  = np.zeros((num_pred*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
             y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
             y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
-            g.write_all('initialized golbals, i = {}\n'.format(i))
+            #g.write_all('initialized golbals, i = {}\n'.format(i))
             # TODO (IMD) Support more floating point types
             # ValueError: message: cannot infer count, number of entries 10652818 is not a multiple of required number of blocks 9
             # Need to send an unequal sized array I think
             if color == 1:
-                g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
-                g.write_all('y_prime_numpy_flattend.shape = {}\n'.format(y_prime_numpy.flatten().shape))
-                g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
-                g.write_all('y_gold_numpy_flattend.shape = {}\n'.format(y_gold_numpy.flatten().shape))
-                g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
-                g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
-                g.write_all('y_goldg_flattend.shape = {}, i={}\n'.format(y_goldg_flattend.shape,i))
+                #g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
+                #g.write_all('y_prime_numpy_flattend.shape = {}\n'.format(y_prime_numpy.flatten().shape))
+                #g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
+                #g.write_all('y_gold_numpy_flattend.shape = {}\n'.format(y_gold_numpy.flatten().shape))
+                #g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
+                #g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
+                #g.write_all('y_goldg_flattend.shape = {}, i={}\n'.format(y_goldg_flattend.shape,i))
                 # Ensure that numpy arrays have correct dimensions before gathering them
                 assert num_pred*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
                 assert num_pred*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
-                g.write_all('Passed asserts for i = {}\n'.format(i))
+                #g.write_all('Passed asserts for i = {}\n'.format(i))
                 temp_predictor_only_comm.Barrier()
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
                 temp_predictor_only_comm.Barrier()
-                g.write_all('Passed Allgather color=1 for i = {}\n'.format(i))
+                #g.write_all('Passed Allgather color=1 for i = {}\n'.format(i))
             # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
             # not involved in calculating predictions so they can each create their own 
             # y_prime_global and y_gold_global
-            g.write_all('Waiting on Barrier before broadcast at i={}\n'.format(i))
+            #g.write_all('Waiting on Barrier before broadcast at i={}\n'.format(i))
             g.comm.Barrier()
-            g.write_all('Broadcasting y_primeg and y_goldg to every\n')
+            #g.write_all('Broadcasting y_primeg and y_goldg to every\n')
             g.comm.Bcast(y_primeg_flattend, root=0)
             g.comm.Bcast(y_goldg_flattend, root=0) 
             g.comm.Barrier()
-            g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
+            #g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
             y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
             y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
             y_primeg = [y.reshape((128, max_length, 1)) for y in y_primeg_flattend]
             y_goldg = [y.reshape((128, max_length, 1)) for y in y_goldg_flattend]
-            g.write_all('primeg length: {}\n'.format(len(y_primeg)))
+            #g.write_all('primeg length: {}\n'.format(len(y_primeg)))
             y_primeg = np.concatenate(y_primeg, axis=0)
-            g.write_all('primeg shape after stack: {}\n'.format(y_primeg.shape))
+            #g.write_all('primeg shape after stack: {}\n'.format(y_primeg.shape))
             y_goldg  = np.concatenate(y_goldg, axis=0)
             # Unpad
-            g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
+            #g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
             # need to have subgroups gather a broadcast y_prmeg (maybe do above)
             # TODO
             for idx, s in enumerate(shpzg):
@@ -917,13 +911,13 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_gold = []
             disruptive = [] 
             color = 0
-            g.write_all('y_prime_global len ={}\n'.format(len(y_prime_global)))
+            #g.write_all('y_prime_global len ={}\n'.format(len(y_prime_global)))
 
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))
 
         if freeme:
-            g.write_all('Freeing extra comm at i={}\n'.format(i))
+            #g.write_all('Freeing extra comm at i={}\n'.format(i))
             temp_predictor_only_comm.Free()
             freeme = False
             color = 2
@@ -941,11 +935,11 @@ def mpi_make_predictions_and_evaluate(conf, shot_list, loader,
     y_prime, y_gold, disruptive = mpi_make_predictions(
         conf, shot_list, loader, custom_path)
     analyzer = PerformanceAnalyzer(conf=conf)
-    g.write_all('Afterwards y_prime length = {}\n'.format(len(y_prime)))
-    g.write_all('Afterwards y_prime[0] shape = {}\n'.format(y_prime[0].shape))
-    g.write_all('Afterwards y_gold length = {}\n'.format(len(y_gold)))
-    g.write_all('Afterwards y_gold[0] shape = {}\n'.format(y_gold[0].shape))
-    g.write_all('Afterwards distruptive length = {}\n'.format(len(disruptive)))
+    #g.write_all('Afterwards y_prime length = {}\n'.format(len(y_prime)))
+    #g.write_all('Afterwards y_prime[0] shape = {}\n'.format(y_prime[0].shape))
+    #g.write_all('Afterwards y_gold length = {}\n'.format(len(y_gold)))
+    #g.write_all('Afterwards y_gold[0] shape = {}\n'.format(y_gold[0].shape))
+    #g.write_all('Afterwards distruptive length = {}\n'.format(len(disruptive)))
     roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive)
     shot_list.set_weights(
         analyzer.get_shot_difficulty(y_prime, y_gold, disruptive))

From 81d6508b34f5920e3bfcce4911c8551b1327bac1 Mon Sep 17 00:00:00 2001
From: "Jesse A. Rodriguez" <jessear@traverse.princeton.edu>
Date: Fri, 30 Jul 2021 17:38:47 -0400
Subject: [PATCH 28/50] Made necessary changes to get software up & running on
 Traverse.

---
 envs/requirements-traverse.yaml |   2 +-
 examples/conf.yaml              |  12 +-
 examples/slurm.cmd              |  25 ++--
 plasma/models/builder.py        |   2 +-
 plasma/models/mpi_runner.py     | 150 +++++++++++-----------
 plasma/utils/CallbackList.py    | 219 ++++++++++++++++++++++++++++++++
 6 files changed, 315 insertions(+), 95 deletions(-)
 create mode 100644 plasma/utils/CallbackList.py

diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index aad550db..30422586 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -5,7 +5,7 @@ channels:
   - defaults
 # channel_priority: strict   # set in .condarc
 dependencies:
-  - python>=3.6.8
+  - python=3.6.8
   - cython
   - pip
   - scipy
diff --git a/examples/conf.yaml b/examples/conf.yaml
index a4992811..f67bc89f 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -9,12 +9,12 @@
 # will output csvlog, trained model checkpoints, etc.
 # in fs_path_output / [username] / results | csv_logs | model_checkpoints | Graph, etc.
 
-fs_path: '/Users/'
+fs_path: '/tigress/'
 user_subdir: True
-fs_path_output: '/Users/'
+fs_path_output: '/tigress/'
 user_subdir_output: True
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
-num_gpus: 1  # per node
+num_gpus: 4  # per node
 paths:
   signal_prepath: '/signal_data/' # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
@@ -94,7 +94,7 @@ model:
   # TODO(KGF): optimize size of RNN layers
   # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100.
   # Prediction is much better with size 100, size 20 cannot capture the data.
-  rnn_size: 200
+  rnn_size: 100
   rnn_type: 'LSTM'
   # TODO(KGF): optimize number of RNN layers
   rnn_layers: 2
@@ -144,8 +144,8 @@ training:
   num_batches_minimum: 20 # minimum number of batches per epoch
   ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
   timeline_prof: False
-  step_limit: 50
-  no_validation: True
+  step_limit: 1000
+  no_validation: False
 callbacks:
   list: ['earlystop']
   metrics: ['val_loss','val_roc','train_loss']
diff --git a/examples/slurm.cmd b/examples/slurm.cmd
index 3dcae884..af1aabda 100644
--- a/examples/slurm.cmd
+++ b/examples/slurm.cmd
@@ -1,22 +1,24 @@
 #!/bin/bash
+#SBATCH --job-name=FRNNTest
 #SBATCH -t 01:00:00
-#SBATCH -N 4
+#SBATCH -N 2
 #SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
 #SBATCH --ntasks-per-socket=2
 #SBATCH --gres=gpu:4
 #SBATCH -c 4
 #SBATCH --mem-per-cpu=0
+#SBATCH --reservation test
+#SBATCH --mail-user=jrodrig@stanford.edu
+#SBATCH --mail-type=ALL
 
-# Example Slurm configuration for TigerGPU nodes (4 nodes, 16 GPUs total)
-# Each node = 2.4 GHz Xeon Broadwell E5-2680 v4 + 4x 1328 MHz P100 GPU
-
-module load anaconda3
-conda activate my_env
-module load cudatoolkit
-module load cudnn
-module load openmpi/cuda-8.0/intel-17.0/3.0.0/64
-module load intel/19.0/64/19.0.3.199
-module load hdf5/intel-17.0/intel-mpi/1.10.0
+# Load modules
+module load anaconda3/2020.7
+conda activate FRNN
+module load cudatoolkit/11.3
+module load cudnn/cuda-11.x/8.2.0
+module load openmpi/cuda-11.0/gcc/4.0.4/64
+module load hdf5/gcc/openmpi-4.0.4/1.10.6
 
 # remove checkpoints for a benchmark run
 rm /tigress/$USER/model_checkpoints/*
@@ -25,5 +27,4 @@ rm /tigress/$USER/csv_logs/*
 rm /tigress/$USER/Graph/*
 rm /tigress/$USER/normalization/*
 
-export OMPI_MCA_btl="tcp,self,vader"
 srun python mpi_learn.py
diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 6d44c97e..e4a41d99 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -393,7 +393,7 @@ def save_model_weights(self, model, epoch):
         # TODO(KGF): model.save(..., save_format='tf') disabled in r1.15
         # Same with tf.keras.models.save_model(..., save_format="tf").
         # Need to use experimental API until r2.x
-        model.save(full_model_save_dir, overwrite=True, save_format='tf')
+        # model.save(full_model_save_dir, overwrite=True, save_format='tf')
 
         # try:
         if _has_tf2onnx:
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 3df9c328..081ac263 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -112,11 +112,11 @@
         else:
             import tensorflow.keras.backend as K
 
-
         from tensorflow.keras.utils import Progbar
         # TODO(KGF): instead of tensorflow.keras.callbacks.CallbackList()
         # until API added in tf-nightly in v2.2.0
-        import tensorflow.python.keras.callbacks as cbks
+        import tensorflow.keras.callbacks as cbks
+        from plasma.utils.CallbackList import CallbackList
 
 g.flush_all_inorder()
 g.pprint_unique(conf)
@@ -496,7 +496,7 @@ def build_callbacks(self, conf, callbacks_list):
         #         update_freq=1,)
         #     callbacks += [tb_callback]
 
-        return cbks.CallbackList(callbacks)
+        return CallbackList(callbacks)
 
     def add_noise(self, X):
         if self.conf['training']['noise'] is True:
@@ -564,80 +564,80 @@ def train_epoch(self):
         while ((self.num_so_far - self.epoch * num_total) < num_total
                or step < self.num_batches_minimum):
             # TODO(KGF): this is still not correctly tracing the steps on CPU
-            with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
-                if step_limit > 0 and step > step_limit:
-                    print('reached step limit')
-                    break
-                try:
-                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                     num_total, is_warmup_period) = next(batch_iterator_func)
-                except StopIteration:
-                    g.print_unique("Resetting batch iterator.")
-                    self.num_so_far_accum = self.num_so_far_indiv
-                    self.set_batch_iterator_func()
-                    batch_iterator_func = self.batch_iterator_func
-                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                     num_total, is_warmup_period) = next(batch_iterator_func)
-                self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
-
-                # if batches_to_reset:
-                # self.model.reset_states(batches_to_reset)
-
-                warmup_phase = (step < self.warmup_steps and self.epoch == 0)
-                num_replicas = 1 if warmup_phase else self.num_replicas
-
-                self.num_so_far = self.mpi_sum_scalars(
-                    self.num_so_far_indiv, num_replicas)
-
-                # run the model once to force compilation. Don't actually use these
-                # values.
-                if first_run:
-                    first_run = False
-                    t0_comp = time.time()
-                    #   print('input_dimension:',batch_xs.shape)
-                    #   print('output_dimension:',batch_ys.shape)
-                    _, _ = self.train_on_batch_and_get_deltas(
+            #with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
+            if step_limit > 0 and step > step_limit:
+                print('reached step limit')
+                break
+            try:
+                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                 num_total, is_warmup_period) = next(batch_iterator_func)
+            except StopIteration:
+                g.print_unique("Resetting batch iterator.")
+                self.num_so_far_accum = self.num_so_far_indiv
+                self.set_batch_iterator_func()
+                batch_iterator_func = self.batch_iterator_func
+                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                 num_total, is_warmup_period) = next(batch_iterator_func)
+            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
+
+            # if batches_to_reset:
+            # self.model.reset_states(batches_to_reset)
+
+            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
+            num_replicas = 1 if warmup_phase else self.num_replicas
+
+            self.num_so_far = self.mpi_sum_scalars(
+                self.num_so_far_indiv, num_replicas)
+
+            # run the model once to force compilation. Don't actually use these
+            # values.
+            if first_run:
+                first_run = False
+                t0_comp = time.time()
+                #   print('input_dimension:',batch_xs.shape)
+                #   print('output_dimension:',batch_ys.shape)
+                _, _ = self.train_on_batch_and_get_deltas(
                         batch_xs, batch_ys, verbose)
-                    self.comm.Barrier()
-                    sys.stdout.flush()
-                    # TODO(KGF): check line feed/carriage returns around this
-                    g.print_unique('\nCompilation finished in {:.2f}s'.format(
-                        time.time() - t0_comp))
-                    t_start = time.time()
-                    sys.stdout.flush()
-
-                if np.any(batches_to_reset):
-                    reset_states(self.model, batches_to_reset)
-                if ('noise' in self.conf['training'].keys()
-                        and self.conf['training']['noise'] is not False):
-                    batch_xs = self.add_noise(batch_xs)
-                t0 = time.time()
-                deltas, loss = self.train_on_batch_and_get_deltas(
-                    batch_xs, batch_ys, verbose)
-                t1 = time.time()
-                if not is_warmup_period:
-                    self.set_new_weights(deltas, num_replicas)
-                    t2 = time.time()
-                    write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
-                    curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
-                    # g.print_unique(self.model.get_weights()[0][0][:4])
-                    loss_averager.add_val(curr_loss)
-                    ave_loss = loss_averager.get_ave()
-                    eta = self.estimate_remaining_time(
-                        t0 - t_start, self.num_so_far - self.epoch*num_total,
+                self.comm.Barrier()
+                sys.stdout.flush()
+                # TODO(KGF): check line feed/carriage returns around this
+                g.print_unique('\nCompilation finished in {:.2f}s'.format(
+                    time.time() - t0_comp))
+                t_start = time.time()
+                sys.stdout.flush()
+
+            if np.any(batches_to_reset):
+                reset_states(self.model, batches_to_reset)
+            if ('noise' in self.conf['training'].keys()
+                    and self.conf['training']['noise'] is not False):
+                batch_xs = self.add_noise(batch_xs)
+            t0 = time.time()
+            deltas, loss = self.train_on_batch_and_get_deltas(
+                batch_xs, batch_ys, verbose)
+            t1 = time.time()
+            if not is_warmup_period:
+                self.set_new_weights(deltas, num_replicas)
+                t2 = time.time()
+                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
+                curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
+                # g.print_unique(self.model.get_weights()[0][0][:4])
+                loss_averager.add_val(curr_loss)
+                ave_loss = loss_averager.get_ave()
+                eta = self.estimate_remaining_time(
+                    t0 - t_start, self.num_so_far - self.epoch*num_total,
+                    num_total)
+                write_str = (
+                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
+                        self.task_index, step, eta, 1.0*self.num_so_far,
                         num_total)
-                    write_str = (
-                        '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
-                            self.task_index, step, eta, 1.0*self.num_so_far,
-                            num_total)
-                        + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
-                        + 'walltime: {:.4f} | '.format(
-                            time.time() - self.start_time))
-                    g.write_unique(write_str + write_str_0)
-                    step += 1
-                else:
-                    g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
-                        self.task_index, self.num_so_far))
+                    + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
+                    + 'walltime: {:.4f} | '.format(
+                        time.time() - self.start_time))
+                g.write_unique(write_str + write_str_0)
+                step += 1
+            else:
+                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
+                    self.task_index, self.num_so_far))
 
         effective_epochs = 1.0*self.num_so_far/num_total
         epoch_previous = self.epoch
diff --git a/plasma/utils/CallbackList.py b/plasma/utils/CallbackList.py
new file mode 100644
index 00000000..78e8ff15
--- /dev/null
+++ b/plasma/utils/CallbackList.py
@@ -0,0 +1,219 @@
+import collections
+
+class CallbackList(object):
+  """Container abstracting a list of callbacks.
+  Arguments:
+      callbacks: List of `Callback` instances.
+      queue_length: Queue length for keeping
+          running statistics over callback execution time.
+  """
+
+  def __init__(self, callbacks=None, queue_length=10):
+    callbacks = callbacks or []
+    self.callbacks = [c for c in callbacks]
+    self.queue_length = queue_length
+    self.params = {}
+    self.model = None
+    self._reset_batch_timing()
+
+  def _reset_batch_timing(self):
+    self._delta_t_batch = 0.
+    self._delta_ts = collections.defaultdict(
+        lambda: collections.deque([], maxlen=self.queue_length))
+
+  def append(self, callback):
+    self.callbacks.append(callback)
+
+  def set_params(self, params):
+    self.params = params
+    for callback in self.callbacks:
+      callback.set_params(params)
+
+  def set_model(self, model):
+    self.model = model
+    for callback in self.callbacks:
+      callback.set_model(model)
+
+  def _call_batch_hook(self, mode, hook, batch, logs=None):
+    """Helper function for all batch_{begin | end} methods."""
+    if not self.callbacks:
+      return
+    hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
+    if hook == 'begin':
+      self._t_enter_batch = time.time()
+    if hook == 'end':
+      # Batch is ending, calculate batch time.
+      self._delta_t_batch = time.time() - self._t_enter_batch
+
+    logs = logs or {}
+    t_before_callbacks = time.time()
+    for callback in self.callbacks:
+      batch_hook = getattr(callback, hook_name)
+      batch_hook(batch, logs)
+    self._delta_ts[hook_name].append(time.time() - t_before_callbacks)
+
+    delta_t_median = np.median(self._delta_ts[hook_name])
+    if (self._delta_t_batch > 0. and
+        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
+      logging.warning(
+          'Method (%s) is slow compared '
+          'to the batch update (%f). Check your callbacks.', hook_name,
+          delta_t_median)
+
+  def _call_begin_hook(self, mode):
+    """Helper function for on_{train|test|predict}_begin methods."""
+    if mode == ModeKeys.TRAIN:
+      self.on_train_begin()
+    elif mode == ModeKeys.TEST:
+      self.on_test_begin()
+    else:
+      self.on_predict_begin()
+
+  def _call_end_hook(self, mode):
+    """Helper function for on_{train|test|predict}_end methods."""
+    if mode == ModeKeys.TRAIN:
+      self.on_train_end()
+    elif mode == ModeKeys.TEST:
+      self.on_test_end()
+    else:
+      self.on_predict_end()
+
+  def on_batch_begin(self, batch, logs=None):
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
+
+  def on_batch_end(self, batch, logs=None):
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Calls the `on_epoch_begin` methods of its callbacks.
+    This function should only be called during TRAIN mode.
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_begin(epoch, logs)
+    self._reset_batch_timing()
+
+  def on_epoch_end(self, epoch, logs=None):
+    """Calls the `on_epoch_end` methods of its callbacks.
+    This function should only be called during TRAIN mode.
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict, metric results for this training epoch, and for the
+          validation epoch if validation is performed. Validation result keys
+          are prefixed with `val_`.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_end(epoch, logs)
+
+  def on_train_batch_begin(self, batch, logs=None):
+    """Calls the `on_train_batch_begin` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
+
+  def on_train_batch_end(self, batch, logs=None):
+    """Calls the `on_train_batch_end` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
+
+  def on_test_batch_begin(self, batch, logs=None):
+    """Calls the `on_test_batch_begin` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs)
+
+  def on_test_batch_end(self, batch, logs=None):
+    """Calls the `on_test_batch_end` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.TEST, 'end', batch, logs=logs)
+
+  def on_predict_batch_begin(self, batch, logs=None):
+    """Calls the `on_predict_batch_begin` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs)
+
+  def on_predict_batch_end(self, batch, logs=None):
+    """Calls the `on_predict_batch_end` methods of its callbacks.
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs)
+
+  def on_train_begin(self, logs=None):
+    """Calls the `on_train_begin` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_train_begin(logs)
+
+  def on_train_end(self, logs=None):
+    """Calls the `on_train_end` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_train_end(logs)
+
+  def on_test_begin(self, logs=None):
+    """Calls the `on_test_begin` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_begin(logs)
+
+  def on_test_end(self, logs=None):
+    """Calls the `on_test_end` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_end(logs)
+
+  def on_predict_begin(self, logs=None):
+    """Calls the `on_predict_begin` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_begin(logs)
+
+  def on_predict_end(self, logs=None):
+    """Calls the `on_predict_end` methods of its callbacks.
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_end(logs)
+
+  def __iter__(self):
+    return iter(self.callbacks)
\ No newline at end of file

From 87ab057f54caa4d82c44764b0a800d6c01cf9be6 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 30 Jul 2021 19:44:11 -0400
Subject: [PATCH 29/50] Disable Profiler and SavedModel export for TF 2.1.3 on
 Traverse

---
 plasma/models/builder.py    |  6 +++++-
 plasma/models/mpi_runner.py | 18 +++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 6d44c97e..7f27a319 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -24,6 +24,7 @@
 import sys
 import numpy as np
 from copy import deepcopy
+from packaging import version
 from plasma.utils.downloading import makedirs_process_safe
 from plasma.utils.hashing import general_object_hash
 from plasma.models.tcn import TCN
@@ -393,7 +394,10 @@ def save_model_weights(self, model, epoch):
         # TODO(KGF): model.save(..., save_format='tf') disabled in r1.15
         # Same with tf.keras.models.save_model(..., save_format="tf").
         # Need to use experimental API until r2.x
-        model.save(full_model_save_dir, overwrite=True, save_format='tf')
+        if (version.parse(g.tf_ver) > version.parse('2.1.0')
+                and version.parse(g.tf_ver).minor != 1):
+            # errors out in TF 2.1.3 on Traverse (latest version on IBM WMLCE 1.7.0)
+            model.save(full_model_save_dir, overwrite=True, save_format='tf')
 
         # try:
         if _has_tf2onnx:
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 3df9c328..eb6de9a3 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -14,6 +14,9 @@
 from mpi4py import MPI
 from pkg_resources import parse_version, get_distribution, DistributionNotFound
 import random
+from packaging import version
+import contextlib
+
 '''
 #########################################################
 This file trains a deep learning model to predict
@@ -564,7 +567,12 @@ def train_epoch(self):
         while ((self.num_so_far - self.epoch * num_total) < num_total
                or step < self.num_batches_minimum):
             # TODO(KGF): this is still not correctly tracing the steps on CPU
-            with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
+            if version.parse(g.tf_ver) >= version.parse('2.2.0'):
+                # TensorFlow profiler added in April 2020, TF 2.2.0
+                cm = tf.profiler.experimental.Trace('train', step_num=step, _r=1)
+            else:
+                cm = contextlib.nullcontext()
+            with cm:
                 if step_limit > 0 and step > step_limit:
                     print('reached step limit')
                     break
@@ -948,8 +956,8 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
         best_so_far = np.inf
         cmp_fn = min
 
-    if conf['training']['timeline_prof']:
-        tf.profiler.experimental.start('./logs')
+    # if conf['training']['timeline_prof']:
+    #     tf.profiler.experimental.start('./logs')
 
     while e < num_epochs:
         g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
@@ -1057,8 +1065,8 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
         if stop_training:
             g.write_unique("Stopping training due to early stopping")
             break
-    if conf['training']['timeline_prof']:
-        tf.profiler.experimental.stop()
+    # if conf['training']['timeline_prof']:
+    #     tf.profiler.experimental.stop()
 
     if g.task_index == 0:
         callbacks.on_train_end()

From 74c60e4a5d6eb94021a0c2ae88897cee1d1056d9 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Tue, 3 Aug 2021 17:54:38 -0400
Subject: [PATCH 30/50] Migrate version introspection to packaging module

---
 plasma/models/mpi_runner.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index eb6de9a3..906a7165 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -12,7 +12,6 @@
 # Keras "Using TensorFlow backend" stderr messages do not interfere in stdout
 from plasma.conf import conf
 from mpi4py import MPI
-from pkg_resources import parse_version, get_distribution, DistributionNotFound
 import random
 from packaging import version
 import contextlib
@@ -59,10 +58,6 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(g.MY_GPU)
         # ,mode=NanGuardMode'
     os.environ['KERAS_BACKEND'] = 'tensorflow'  # default setting
-    try:
-        g.tf_ver = parse_version(get_distribution(g.backendpackage).version)
-    except DistributionNotFound:
-        g.tf_ver = parse_version(get_distribution('tensorflow').version)
     # compat/compat.py first committed on 2018-06-29 for Py 2 vs 3
     # (around, but not present in, the release of v1.9.0)
     # v2 compatiblity code added, then moved from compat.py in Nov and Dec 2018
@@ -74,6 +69,7 @@
     #     import tensorflow.compat.v1 as tf
     # else:
     import tensorflow as tf
+    g.tf_ver = tf.__version__ # setting g.tf_ver moved after the import Summer 2021
     # TODO(KGF): above, builder.py (bug workaround), mpi_launch_tensorflow.py,
     # and runner.py are the only files that import tensorflow directly
 
@@ -861,7 +857,7 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
     conf['num_workers'] = g.comm.Get_size()
 
     specific_builder = builder.ModelBuilder(conf)
-    if g.tf_ver >= parse_version('1.14.0'):
+    if version.parse(g.tf_ver) >= version.parse('1.14.0'):
         # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
         try:
             from tensorflow.python.util import module_wrapper as depr

From 9933063cdb9177300037442fb61b53576835e6f8 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <imdesjardin@gmail.com>
Date: Wed, 4 Aug 2021 10:02:45 -0400
Subject: [PATCH 31/50] Cleaned up comments, removed debugging statements

---
 plasma/models/mpi_runner.py | 62 +++++--------------------------------
 1 file changed, 8 insertions(+), 54 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index bc7ad478..8df75950 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -791,9 +791,6 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     if g.task_index != 0:
         loader.verbose = False
 
-    #g.write_unique('num workers= {}\nlen(shot_sublists)={}, num_shots = {}\n'.format(g.num_workers, len(shot_sublists), len(shot_list)))
-    freeme = False
-
     # MPI loop works by predicting in batches of the 
     # largest possible multiple of len(shot_sublists) < num_workers
     # i.e. if there are 9 shot_sublists and 4 workers,
@@ -805,29 +802,23 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     #      or aggregate predictions with other workers, after each worker has made a prediction
     #      which happens every num_workers iterations in the for loop
     #      (i.e. worker 0 will aggregate predictions with workers 1-3 at the end of i=3)
+    # During the aggregation step, each worker uses its color (which denotes whether it was
+    # predicting or not predicting during the last few runs of the for loop) to split the main comm.
+    # The predictors (color = 1) first share their predictions with each other, and then with everyone;
+    # the nonpredictors (color = 2) only receive the global predictions from the predictors.
     color = 2
-    #g.write_all('I shall predict {} times\n'.format(times_i_will_predict))
     for (i, shot_sublist) in enumerate(shot_sublists):
         shpz = []
         max_length = -1 # So non shot predictive workers don't have a real length
-        #g.write_all('My task index = {}, i mod num_workers = {}\n'.format(g.task_index, i%g.num_workers))
         if i % g.num_workers == g.task_index:
-            #g.write_all('Creating new comm at i={}\n'.format(i))
             color = 1
-            #g.comm.Barrier()
-            #temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-            #freeme = True
-            # Create new MPI comm to pass around rank
-            #g.write_all('Starting to load and predict subroutine, i={}\n'.format(i))
             X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
-            #g.write_all('X, y, lengths, disr loaded, shot_lengths shape: {} \n'.format(len(shot_lengths)))
 
             # load data and fit on data
             y_p = model.predict(X, batch_size=conf['model']['pred_batch_size'])
             model.reset_states()
             y_p = loader.batch_output_to_array(y_p)
             y = loader.batch_output_to_array(y)
-            #g.write_all('Finished le prediction\n')
 
             # cut arrays back
             y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
@@ -840,8 +831,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
         if (i % g.num_workers == g.num_workers - 1
                 or i == len(shot_sublists) - 1):
             # Create numpy block from y list which is used in MPI
-            # Pads y_prime and y_gold with zeros to make it all fit
-            #g.write_all('In second area, my color is {}, i={}\n'.format(color,i))
+            # Pads y_prime and y_gold with zeros to maximum shot length within block being transferred
             g.comm.Barrier()
             if color ==1:
                 shpz = [max(y.shape) for y in y_prime]
@@ -849,28 +839,19 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.write_all(' max length = {}\n'.format(max_length))
             g.comm.Barrier()
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
-            #g.write_all('Calculated shpz\n')
             if color == 1:
                 y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
                 y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
             
-            #g.write_all('Entered second area\n')
             g.comm.Barrier()
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-            freeme = True
             g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
-            #g.write_all('getting shapez, i contribute {} shpz\n'.format(len(shpz)))
             shpzg = g.comm.allgather(shpz)
             shpzg = list(itertools.chain(*shpzg))
             g.comm.Barrier()
-            #g.write_all('shpzg shape before preproc={}\n'.format(len(shpzg)))
             shpzg = [s for s in shpzg if s != []]
-            #g.write_all('shpzg shape after preproc={}\n'.format(len(shpzg)))
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
-            #g.write_unique(str(shpzg)+'\n')
-            #g.write_all('gotting shapez\n')
-            # Todo: Figure out if empty shots are added to fit batch length
             if color == 1:
                 num_pred = temp_predictor_only_comm.size
             else:
@@ -879,49 +860,28 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_goldg  = np.zeros((num_pred*conf['model']['pred_batch_size'],max_length,1), dtype=conf['data']['floatx'])
             y_primeg_flattend = np.zeros(y_primeg.flatten().shape)
             y_goldg_flattend  = np.zeros(y_goldg.flatten().shape)
-            #g.write_all('initialized golbals, i = {}\n'.format(i))
-            # TODO (IMD) Support more floating point types
-            # ValueError: message: cannot infer count, number of entries 10652818 is not a multiple of required number of blocks 9
-            # Need to send an unequal sized array I think
             if color == 1:
-                #g.write_all('y_prime_numpy.shape = {}\n'.format(y_prime_numpy.shape))
-                #g.write_all('y_prime_numpy_flattend.shape = {}\n'.format(y_prime_numpy.flatten().shape))
-                #g.write_all('y_gold_numpy.shape = {}\n'.format(y_gold_numpy.shape))
-                #g.write_all('y_gold_numpy_flattend.shape = {}\n'.format(y_gold_numpy.flatten().shape))
-                #g.write_all('y_prime_g.shape = {}\n'.format(y_primeg.shape))
-                #g.write_all('y_primeg_flattend.shape = {}\n'.format(y_primeg_flattend.shape))
-                #g.write_all('y_goldg_flattend.shape = {}, i={}\n'.format(y_goldg_flattend.shape,i))
                 # Ensure that numpy arrays have correct dimensions before gathering them
                 assert num_pred*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
                 assert num_pred*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
-                #g.write_all('Passed asserts for i = {}\n'.format(i))
                 temp_predictor_only_comm.Barrier()
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
                 temp_predictor_only_comm.Barrier()
-                #g.write_all('Passed Allgather color=1 for i = {}\n'.format(i))
             # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
             # not involved in calculating predictions so they can each create their own 
             # y_prime_global and y_gold_global
-            #g.write_all('Waiting on Barrier before broadcast at i={}\n'.format(i))
             g.comm.Barrier()
-            #g.write_all('Broadcasting y_primeg and y_goldg to every\n')
             g.comm.Bcast(y_primeg_flattend, root=0)
             g.comm.Bcast(y_goldg_flattend, root=0) 
             g.comm.Barrier()
-            #g.write_all('All gathered initialized golbals len1={}, len2={}\n'.format(len(y_primeg_flattend), len(y_goldg_flattend)))
             y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
             y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
             y_primeg = [y.reshape((128, max_length, 1)) for y in y_primeg_flattend]
             y_goldg = [y.reshape((128, max_length, 1)) for y in y_goldg_flattend]
-            #g.write_all('primeg length: {}\n'.format(len(y_primeg)))
             y_primeg = np.concatenate(y_primeg, axis=0)
-            #g.write_all('primeg shape after stack: {}\n'.format(y_primeg.shape))
             y_goldg  = np.concatenate(y_goldg, axis=0)
-            # Unpad
-            #g.write_all('unpadding len(shpzg)={}, shpzg[0]={}\n'.format(len(shpzg),shpzg[0]))
-            # need to have subgroups gather a broadcast y_prmeg (maybe do above)
-            # TODO
+            # Unpad each shot to its true length
             for idx, s in enumerate(shpzg):
                 trim = lambda nparry, s: nparry[0:int(s),:]
                 y_prime_global.append(trim(y_primeg[idx],s))
@@ -933,18 +893,12 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             y_prime = []
             y_gold = []
             disruptive = [] 
-            color = 0
-            #g.write_all('y_prime_global len ={}\n'.format(len(y_prime_global)))
+            color = 2
+            temp_predictor_only_comm.Free()
 
         if g.task_index == 0:
             pbar.add(1.0*len(shot_sublist))
 
-        if freeme:
-            #g.write_all('Freeing extra comm at i={}\n'.format(i))
-            temp_predictor_only_comm.Free()
-            freeme = False
-            color = 2
-
     y_prime_global = y_prime_global[:len(shot_list)]
     y_gold_global = y_gold_global[:len(shot_list)]
     disruptive_global = disruptive_global[:len(shot_list)]

From 2066cd79b59898ca97d6fe4bd01a82e13182521f Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Mon, 9 Aug 2021 13:40:53 -0400
Subject: [PATCH 32/50] Removed debug statement and typo

---
 plasma/models/mpi_runner.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 8df75950..d3febd31 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -836,7 +836,6 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             if color ==1:
                 shpz = [max(y.shape) for y in y_prime]
                 max_length = max([max(y.shape) for y in y_p])
-                g.write_all(' max length = {}\n'.format(max_length))
             g.comm.Barrier()
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
             if color == 1:
@@ -868,7 +867,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
                 temp_predictor_only_comm.Barrier()
-            # Process 0 broadcast y_primeg adn y_goldg to all processors, including ones
+            # Process 0 broadcast y_primeg and y_goldg to all processors, including ones
             # not involved in calculating predictions so they can each create their own 
             # y_prime_global and y_gold_global
             g.comm.Barrier()
@@ -912,11 +911,6 @@ def mpi_make_predictions_and_evaluate(conf, shot_list, loader,
     y_prime, y_gold, disruptive = mpi_make_predictions(
         conf, shot_list, loader, custom_path)
     analyzer = PerformanceAnalyzer(conf=conf)
-    #g.write_all('Afterwards y_prime length = {}\n'.format(len(y_prime)))
-    #g.write_all('Afterwards y_prime[0] shape = {}\n'.format(y_prime[0].shape))
-    #g.write_all('Afterwards y_gold length = {}\n'.format(len(y_gold)))
-    #g.write_all('Afterwards y_gold[0] shape = {}\n'.format(y_gold[0].shape))
-    #g.write_all('Afterwards distruptive length = {}\n'.format(len(disruptive)))
     roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive)
     shot_list.set_weights(
         analyzer.get_shot_difficulty(y_prime, y_gold, disruptive))

From 603029f1e13e42c06b9e7271f9850a853f02ff62 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Mon, 9 Aug 2021 14:02:27 -0400
Subject: [PATCH 33/50] Un-hardcoding batch size

---
 plasma/models/mpi_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index d3febd31..74c08f1b 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -876,8 +876,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.comm.Barrier()
             y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
             y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
-            y_primeg = [y.reshape((128, max_length, 1)) for y in y_primeg_flattend]
-            y_goldg = [y.reshape((128, max_length, 1)) for y in y_goldg_flattend]
+            y_primeg = [y.reshape((conf['model']['pred_batch_size'], max_length, 1)) for y in y_primeg_flattend]
+            y_goldg = [y.reshape((conf['model']['pred_batch_size'], max_length, 1)) for y in y_goldg_flattend]
             y_primeg = np.concatenate(y_primeg, axis=0)
             y_goldg  = np.concatenate(y_goldg, axis=0)
             # Unpad each shot to its true length

From e53832dfeeb266b56a66d403a164caeab24c759d Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Tue, 10 Aug 2021 15:19:21 -0400
Subject: [PATCH 34/50] Removed extra MPI_Barrier calls

---
 plasma/models/mpi_runner.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index a05d85df..c82f78f5 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -828,23 +828,18 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 or i == len(shot_sublists) - 1):
             # Create numpy block from y list which is used in MPI
             # Pads y_prime and y_gold with zeros to maximum shot length within block being transferred
-            g.comm.Barrier()
             if color ==1:
                 shpz = [max(y.shape) for y in y_prime]
                 max_length = max([max(y.shape) for y in y_p])
-            g.comm.Barrier()
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
             if color == 1:
                 y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
                 y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
             
-            g.comm.Barrier()
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
-            g.comm.Barrier()
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
             shpzg = g.comm.allgather(shpz)
             shpzg = list(itertools.chain(*shpzg))
-            g.comm.Barrier()
             shpzg = [s for s in shpzg if s != []]
             max_length = g.comm.allreduce(max_length, MPI.MAX) 
             if color == 1:
@@ -869,7 +864,6 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             g.comm.Barrier()
             g.comm.Bcast(y_primeg_flattend, root=0)
             g.comm.Bcast(y_goldg_flattend, root=0) 
-            g.comm.Barrier()
             y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
             y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
             y_primeg = [y.reshape((conf['model']['pred_batch_size'], max_length, 1)) for y in y_primeg_flattend]

From 40b0b10418b2ebe9208bc00bffd40ec8bffcb540 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <idesjard@umd.edu>
Date: Tue, 10 Aug 2021 15:54:52 -0400
Subject: [PATCH 35/50] Removed more barrier statements

---
 plasma/models/mpi_runner.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index c82f78f5..f0e235a5 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -854,10 +854,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 # Ensure that numpy arrays have correct dimensions before gathering them
                 assert num_pred*max(y_prime_numpy.flatten().shape) == max(y_primeg_flattend.shape)
                 assert num_pred*max(y_gold_numpy.flatten().shape) == max(y_goldg_flattend.shape)
-                temp_predictor_only_comm.Barrier()
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
-                temp_predictor_only_comm.Barrier()
             # Process 0 broadcast y_primeg and y_goldg to all processors, including ones
             # not involved in calculating predictions so they can each create their own 
             # y_prime_global and y_gold_global
@@ -878,7 +876,6 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
 
             disruptive_global += concatenate_sublists(
                 g.comm.allgather(disruptive))
-            g.comm.Barrier()
             y_prime = []
             y_gold = []
             disruptive = [] 

From ccbe60f61dff23044eff483d21516f2d68c988a1 Mon Sep 17 00:00:00 2001
From: Ian DesJardin <imdesjardin@gmail.com>
Date: Thu, 12 Aug 2021 14:29:31 -0400
Subject: [PATCH 36/50] Removed scikit dependency

---
 envs/requirements-linux-64-gpu.yaml | 1 -
 envs/requirements-traverse.yaml     | 1 -
 setup.py                            | 1 -
 3 files changed, 3 deletions(-)

diff --git a/envs/requirements-linux-64-gpu.yaml b/envs/requirements-linux-64-gpu.yaml
index 204eaeb2..e7511f1d 100644
--- a/envs/requirements-linux-64-gpu.yaml
+++ b/envs/requirements-linux-64-gpu.yaml
@@ -20,5 +20,4 @@ dependencies:
       - hyperopt  # TODO(KGF): remove
       # - mpi4py   # must reload MPI library modules before installing via pip
       - xgboost
-      - scikit-learn
       - joblib
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index aad550db..ad15182f 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -25,5 +25,4 @@ dependencies:
       # - hyperopt  # TODO(KGF): remove
       # - mpi4py   # must reload MPI library modules before installing via pip
       # - xgboost
-      # - scikit-learn
       # - joblib
diff --git a/setup.py b/setup.py
index d49b350f..2ba00552 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,6 @@
           'hyperopt',
           'mpi4py',
           'xgboost',
-          'scikit-learn',
           'joblib',
           ],
       # TODO(KGF): add optional feature specs for [deephyper,balsam,

From 0a1f680fa758feaf4ac9a2d2b0de32fbee6e119f Mon Sep 17 00:00:00 2001
From: Ian DesJardin <imdesjardin@gmail.com>
Date: Fri, 13 Aug 2021 14:07:37 -0400
Subject: [PATCH 37/50] Put scikit-learn back in

---
 envs/requirements-linux-64-gpu.yaml | 1 +
 envs/requirements-traverse.yaml     | 1 +
 setup.py                            | 1 +
 3 files changed, 3 insertions(+)

diff --git a/envs/requirements-linux-64-gpu.yaml b/envs/requirements-linux-64-gpu.yaml
index e7511f1d..204eaeb2 100644
--- a/envs/requirements-linux-64-gpu.yaml
+++ b/envs/requirements-linux-64-gpu.yaml
@@ -20,4 +20,5 @@ dependencies:
       - hyperopt  # TODO(KGF): remove
       # - mpi4py   # must reload MPI library modules before installing via pip
       - xgboost
+      - scikit-learn
       - joblib
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index ad15182f..aad550db 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -25,4 +25,5 @@ dependencies:
       # - hyperopt  # TODO(KGF): remove
       # - mpi4py   # must reload MPI library modules before installing via pip
       # - xgboost
+      # - scikit-learn
       # - joblib
diff --git a/setup.py b/setup.py
index 2ba00552..d49b350f 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@
           'hyperopt',
           'mpi4py',
           'xgboost',
+          'scikit-learn',
           'joblib',
           ],
       # TODO(KGF): add optional feature specs for [deephyper,balsam,

From 8d71b12cdfb5ec3790c34e1d959007fd53f855fa Mon Sep 17 00:00:00 2001
From: "Jesse A. Rodriguez" <jessear@traverse.princeton.edu>
Date: Wed, 25 Aug 2021 18:43:38 -0400
Subject: [PATCH 38/50] Added Signal2D class to data.py for 2D data processing
 (specifically ECEI), as yet untested. Added ECEI.py to utils which contains
 tools for handling and acquiring ECEI data.

---
 plasma/primitives/data.py |  246 ++++++++-
 plasma/utils/ECEI.py      | 1039 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1283 insertions(+), 2 deletions(-)
 create mode 100644 plasma/utils/ECEI.py

diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 91a2aed0..99cfbe35 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -3,11 +3,13 @@
 import sys
 import os
 import re
+import h5py
 
 from scipy.interpolate import UnivariateSpline
 from plasma.utils.processing import get_individual_shot_file
 from plasma.utils.downloading import get_missing_value_array
 from plasma.utils.hashing import myhash
+from plasma.utils.ECEI import ECEI
 
 # class SignalCollection:
 #   """GA Data Obj"""
@@ -22,7 +24,33 @@
     pass
 
 
+###############################################################################
+# Parent Signal Class
+###############################################################################
 class Signal(object):
+    """
+    A Signal object is a wrapper for a single signal. It is used to fetch data
+    remotely and load locally stored data, as well as to return useful
+    information about the data which is used during processing and model
+    training.
+
+    Attributes:
+        description: str, full name of signal, e.g. "Plasma density"
+        paths: list of str, MDSplus point-names, must correspond in index to
+               the Machine objects in self.machines
+        machines: list of Machine objects that the signal is defined on.
+        causal_shifts: list of floats; the causal shift needed to be sure the
+                       signal is not utilizing future data.
+        is_ip: bool, True if signal is plasma current
+        num_channels: int, number of data collection channels. Used in profile
+                      signals and two dimensional signals
+        normalize: bool, True if signal is to be normalized (?)
+        data_avail_tolerances: list of floats, value in s of the maximum 
+                               allowable time between cessation of data 
+                               collection and t_disrupt for each machine
+        is_strictly_positive: bool, True if signal is strictly positive
+        mapping_paths: list of str, MDSplus mapping paths
+    """
     def __init__(self, description, paths, machines, tex_label=None,
                  causal_shifts=None, is_ip=False, normalize=True,
                  data_avail_tolerances=None, is_strictly_positive=False,
@@ -30,10 +58,10 @@ def __init__(self, description, paths, machines, tex_label=None,
         assert len(paths) == len(machines)
         self.description = description
         self.paths = paths
-        self.machines = machines  # on which machines is the signal defined
+        self.machines = machines
         if causal_shifts is None:
             causal_shifts = [0 for m in machines]
-        self.causal_shifts = causal_shifts  # causal shift in ms
+        self.causal_shifts = causal_shifts  # causal shift in ms -> (JAR) the causal shifts appear to be supplied in s in signals.py, NOT ms
         self.is_ip = is_ip
         self.num_channels = 1
         self.normalize = normalize
@@ -213,6 +241,9 @@ def __repr__(self):
         return self.description
 
 
+###############################################################################
+# Profile (1D) Signal Class
+###############################################################################
 class ProfileSignal(Signal):
     def __init__(self, description, paths, machines, tex_label=None,
                  causal_shifts=None, mapping_range=(0, 1), num_channels=32,
@@ -309,6 +340,9 @@ def fetch_data(self, machine, shot_num, c):
         return time, data, mapping, success
 
 
+###############################################################################
+# Channel Signal Class
+###############################################################################
 class ChannelSignal(Signal):
     def __init__(self, description, paths, machines, tex_label=None,
                  causal_shifts=None, data_avail_tolerances=None,
@@ -369,6 +403,214 @@ def get_file_path(self, prepath, machine, shot_number):
                                         raw_signal=True)
 
 
+###############################################################################
+# 2-Dimensional Signal Class
+###############################################################################
+class Signal2D(Signal):
+    """
+    Signal2D is a signal class specifically tailored for two-dimensional
+    signals, such as ECEi data
+
+    Non-inherited Attributes:
+        dims: tuple of ints, dimensions of 2d signal; ((20, 8) for ECEi)
+        is_ecei: bool, True if data is ECEi data
+        miss_chan_threshold: int, maximum number of channels that can be
+                             missing for a shot to still be included
+    """
+    def __init__(self, description, paths, machines, dims, is_ecei = False, 
+                 miss_chan_threshold = 80, tex_label=None, causal_shifts=None,
+                 is_ip=False, normalize=True, data_avail_tolerances=None,
+                 is_strictly_positive=False, mapping_paths=None):
+        super(Signal2D, self).__init__(
+            description, paths, machines,
+            tex_label=tex_label, causal_shifts=causal_shifts,
+            is_ip=False, normalize=normalize,
+            data_avail_tolerances=data_avail_tolerances,
+            is_strictly_positive=is_strictly_positive,
+            mapping_paths=mapping_paths)
+        self.dims = dims
+        self.num_channels = dims[0]*dims[1]
+        self.is_ecei = is_ecei
+
+
+    def get_file_path(self, prepath, machine, shot_number):
+        """
+        Returns file path.
+
+        Args:
+            prepath: str, file prepath
+            machine: Machine object, machine that signal is defined on
+            shot_number: int, shot number
+        """
+        if self.is_ecei:
+            return prepath+'/'+str(shot_number)+'.hdf5'
+        signal_dirname = self.get_path(machine)
+        dirname = os.path.join(prepath, machine.name, signal_dirname)
+        return get_individual_shot_file(dirname, machine.name, shot_number,
+                                        raw_signal=True)
+
+
+    def load_data_from_hdf5_safe(self, prepath, shot):
+        """
+        Loads 2D data from hdf5 file where each dataset in the file contains
+        data from a single channel. Stacks data into 2D numpy array with shape
+        (time_steps, num_channels+1) (where time series is first column) and 
+        returns it. Pads missing channel data with 0's up to the missing channel
+        threshold.
+
+        Args:
+            prepath: str, location of data
+            shot: Shot object for shot number of interest
+        """
+        file_path = self.get_file_path(prepath, shot.machine, shot.number)
+        if not self.is_saved(prepath, shot):
+            print('Signal {}, shot {} was never downloaded [omit]'.format(
+                self.description, shot.number))
+            return None, False
+
+        if os.path.getsize(file_path) == 0:
+            print('Signal {}, shot {} '.format(self.description, shot.number),
+                  'was downloaded incorrectly (empty file) [omit]')
+            os.remove(file_path)
+            return None, False
+
+        if self.is_ecei:
+            try:
+                E = ECEI()
+                f = h5py.File(file_path, 'r')
+                miss_count = 0
+                missing = []
+                for key in f.keys():
+                    if key.startswith('missing'):
+                        miss_count += 1
+                        missing.append(key)
+                if miss_count == 160:
+                    print('Signal {}, shot {} contains no data [omit]'.format(
+                          self.description, shot.number))
+                    return None, False
+                if miss_count > self.miss_chan_threshold:
+                    print('Signal {}, shot {} is missing too many channels \
+                           [omit]'.format(self.description, shot.number))
+                    return None, False
+
+                no_time_series = True
+                idx = 0
+                while no_time_series:
+                    chan = E.ecei_channels[idx]
+                    if chan not in missing:
+                        data = np.asarray(f.get(chan))[:,0]
+                        data = data.reshape((data.shape[0],1))
+                        no_time_series = False
+                    idx += 1
+
+                for channel in E.ecei_channels:
+                    if channel in missing:
+                        chan = np.zeros((data.shape[0],1))
+                        data = np.append(data, chan, axis = 1)
+                    else:
+                        chan = np.asarray(f.get(channel))
+                        data = np.append(data, chan[:,1].reshape((chan.shape[0],1)),\
+                                     axis = 1)
+            except Exception as e:
+                print(e)
+                print('Cannot load signal {} shot {} [omit]'.format(
+                      file_path, shot.number))
+                os.remove(file_path)
+                return None, False
+            assert data.shape[1] == 161
+
+        else:
+            print('Non-ECEi 2D hdf5 data not yet supported.')
+            return None, False
+
+        return data, True
+
+
+    def load_data_from_txt_safe(self, prepath, shot, dtype='float32'):
+        file_path = self.get_file_path(prepath, shot.machine, shot.number)
+        if not self.is_saved(prepath, shot):
+            print('Signal {}, shot {} was never downloaded [omit]'.format(
+                self.description, shot.number))
+            return None, False
+
+        if os.path.getsize(file_path) == 0:
+            print('Signal {}, shot {} '.format(self.description, shot.number),
+                  'was downloaded incorrectly (empty file) [omit]')
+            os.remove(file_path)
+            return None, False
+        try:
+            data = np.loadtxt(file_path, dtype=dtype)
+            if np.all(data == get_missing_value_array()):
+                print('Signal {}, shot {} contains no data [omit]'.format(
+                    self.description, shot.number))
+                return None, False
+        except Exception as e:
+            print(e)
+            print('Cannot load signal {} shot {} [omit]'.format(
+                file_path, shot.number))
+            os.remove(file_path)
+            return None, False
+
+        return data, True
+
+    def load_data(self, prepath, shot, dtype='float32'):
+        if self.is_ecei:
+            data, succ = self.load_data_from_hdf5_safe(prepath, shot)
+        else:
+            data, succ = self.load_data_from_txt_safe(prepath, shot)
+
+        if not succ:
+            return None, None, False
+
+        if np.ndim(data) == 1:
+            data = np.expand_dims(data, axis=0)
+
+        t = data[:, 0]
+        sig = data[:, 1:]
+
+        # make sure shot is not garbage data
+        if len(t) <= 1 or (np.max(sig) == 0.0 and np.min(sig) == 0.0):
+            if self.is_ip:
+                print('Shot {} has no current [omit]'.format(shot.number))
+            else:
+                print('Signal {}, shot {} contains no data [omit]'.format(
+                    self.description, shot.number))
+            return None, sig.shape, False
+
+        # make sure data doesn't contain NaN values
+        if np.any(np.isnan(t)) or np.any(np.isnan(sig)):
+            print('Signal {}, shot {} contains NaN [omit]'.format(
+                self.description, shot.number))
+            return None, sig.shape, False
+
+        return t, sig, True
+
+    def fetch_data_basic(self, machine, shot_num, c, path=None):
+        success = False
+        if self.is_ecei:
+            E = ECEI()
+            time, data, mapping, success = E.Fetch_Shot(shot_num)
+        else:
+            if path is None:
+                path = self.get_path(machine)
+            mapping = None
+            try:
+                time, data, mapping, success = machine.fetch_data_fn(
+                    path, shot_num, c)
+            except Exception as e:
+                print(e)
+                sys.stdout.flush()
+
+        if not success:
+            return None, None, None, False
+
+        time = np.array(time) + 1e-3*self.get_causal_shift(machine)
+        return time, np.array(data), mapping, success
+
+    def fetch_data(self, machine, shot_num, c):
+        return self.fetch_data_basic(machine, shot_num, c)
+
+
 class Machine(object):
     def __init__(self, name, server, fetch_data_fn, max_cores=8,
                  current_threshold=0):
diff --git a/plasma/utils/ECEI.py b/plasma/utils/ECEI.py
new file mode 100644
index 00000000..b9d23887
--- /dev/null
+++ b/plasma/utils/ECEI.py
@@ -0,0 +1,1039 @@
+"""
+The module composed in this file is designed to handle the processing/handling
+and incorporation of electron cyclotron emission imaging data into the FRNN
+disruption prediction software suite. It contains snippets from the rest of
+the FRNN codebase, and therefore is partially redundant.
+Jesse A Rodriguez, 06/28/2021
+"""
+
+import numpy as np
+import matplotlib as mpl
+#mpl.rcParams['figure.dpi']=10
+import matplotlib.pyplot as plt
+#plt.rc('font', family='tahoma')
+#font = 1
+#plt.rc('xtick', labelsize=font)
+#plt.rc('ytick', labelsize=font)
+import time
+import sys
+import os
+import multiprocessing as mp
+from functools import partial
+import h5py
+import scipy.signal
+import math
+try:
+    import MDSplus as MDS
+except ImportError:
+    pass
+
+################################################################################
+## Utility Functions and Globals
+################################################################################
+def Fetch_ECEI_d3d(channel_path, shot_number, c = None, verbose = False):
+    """
+    Basic fetch ecei data function, uses MDSplus Connection objects and looks
+    for data in all the locations we know of.
+
+    Args:
+        channel_path: str, channel name used in the MDSplus expressions
+                      (format "LFSxxxx")
+        shot_number: int, DIII-D shot number
+        c: MDSplus.Connection object. None by default
+        verbose: bool, if True, print the exception from each failed lookup
+    """
+    channel = channel_path
+    shot = str(int(shot_number))
+    mds_fail_pd = False
+    mds_fail_pd2 = False
+    mds_fail_p = False
+    mds_fail_t = False
+
+    #ptdata2 method (seems to be most reliable)
+    try:
+        x_pd2 = c.get('dim_of(_s = ptdata2('+channel+','+shot+'))')
+        y_pd2 = c.get('_s = ptdata2('+channel+','+shot+')')
+    except Exception as e:
+        if verbose:
+            print(e)
+        mds_fail_pd2 = True
+        pass
+    if not mds_fail_pd2:
+        if x_pd2.shape[0] > 1:
+            print('Data exists for shot '+shot+' in channel '+channel[-5:-1]+'.')
+            return x_pd2, y_pd2, None, True
+    
+    #psuedo method
+    try:
+        x_p = c.get('dim_of(_s = psuedo('+channel+','+shot+'))')
+        y_p = c.get('_s = psuedo('+channel+','+shot+')')
+    except Exception as e:
+        if verbose:
+            print(e)
+        mds_fail_p = True
+        pass
+    if not mds_fail_p:
+        if x_p.shape[0] > 1:
+            print('Data exists for shot '+shot+' in channel '+channel[-5:-1]+'.')
+            return x_p, y_p, None, True
+            
+    #ptdata method
+    try:
+        x_pd = c.get('dim_of(_s = ptdata('+channel+','+shot+'))')
+        y_pd = c.get('_s = ptdata('+channel+','+shot+')')
+    except Exception as e:
+        if verbose:
+            print(e)
+        mds_fail_pd = True
+        pass
+    if not mds_fail_pd:
+        if x_pd.shape[0] > 1:
+            print('Data exists for shot '+shot+' in channel '+channel[-5:-1]+'.')
+            return x_pd, y_pd, None, True
+
+    #tree method
+    try:
+        c.openTree(channel, shot)
+        x_t = c.get('dim_of(_s = '+shot+')').data()
+        y_t = c.get('_s = '+shot).data()
+    except Exception as e:
+        if verbose:
+            print(e)
+        mds_fail_t = True
+        pass
+    if not mds_fail_t:
+        if x_t.shape[0] > 1:
+            print('Data exists for shot '+shot+' in channel '+channel[-5:-1]+'.')
+            return x_t, y_t, None, True
+
+    print('Data DOES NOT exist for shot '+shot+' in channel '+channel[-5:-1]+'.')
+    return None, None, None, False
+
+
+def Download_Shot(shot_num_queue, c, n_shots, n_procs, channel_paths,\
+                  sentinel = -1, verbose = False, d_sample = 1,\
+                  try_again = False):
+    """
+    Takes shot numbers off a multiprocessing queue one at a time and
+    downloads/saves each shot's data until the sentinel value is read.
+
+    Args:
+        shot_num_queue: multiprocessing queue object containing shot numbers
+        c: MDSplus.Connection object
+        n_shots: int, total number of shots to be processed
+        n_procs: int, number of processes
+        channel_paths: list containing savepaths to channel folders
+        sentinel: sentinel value; -1 by default. Serves as the mechanism for
+                  terminating the parallel program.
+        verbose: bool, forwarded to Fetch_ECEI_d3d (prints MDSplus exceptions)
+        d_sample: int, downsample factor, MUST BE IN FORM 10^y
+        try_again: bool, tells script to try and download signals that were
+                   found to be missing in a prior run.
+    """
+    missing_shots = 0
+    while True:
+        shot_num = shot_num_queue.get()
+        shots_left = shot_num_queue.qsize() - n_procs
+        shots_progress = 100*(n_shots - shots_left)/n_shots
+        shots_progress_next = 100*(n_shots + 1 - shots_left)/n_shots
+        if shot_num == sentinel:
+            break
+        shot_complete = True
+        chan_done = 0
+        for channel_path in channel_paths:
+            save_path = channel_path[:-9]+'{}.hdf5'.format(int(shot_num))
+            channel = channel_path[-9:]
+
+            success = False
+            if os.path.isfile(save_path):
+                if os.path.getsize(save_path) > 0:
+                    f = h5py.File(save_path, 'r')
+                    for key in f.keys():
+                        if key == channel:
+                            success = True
+                        if key.startswith('missing') and key.endswith(channel)\
+                           and not try_again:
+                            success = True
+                    f.close()
+                else:
+                    print('Shot {} '.format(int(shot_num)),'was downloaded \
+                           incorrectly (empty file). Redownloading.')
+
+            else:
+                f = h5py.File(save_path, 'w')
+                f.close()
+
+            if not success:
+                try:
+                    try:
+                        time, data, mapping, success = Fetch_ECEI_d3d(\
+                                                channel_path[-9:], shot_num, c,\
+                                                verbose)
+                    except Exception as e:
+                        print(e)
+                        sys.stdout.flush()
+                        print('Channel {}, shot {} missing, all mds commands \
+                               failed.'.format(channel_path[-5:-1], shot_num))
+                        success = False
+
+                    if success:
+                        data_two_column = np.vstack((time, data)).transpose()
+                        if d_sample > 1:
+                            n = int(math.log10(d_sample))
+                            for _ in range(n):
+                                data_two_column = scipy.signal.decimate(\
+                                                  data_two_column, 10, axis = 0)
+                        f = h5py.File(save_path, 'r+')
+                        for key in f.keys():
+                            if key.startswith('missing'):
+                                if key[8:] == channel:
+                                    del f[key]
+                        dset = f.create_dataset(channel, data = data_two_column)
+                        f.close()
+                    else:
+                        f = h5py.File(save_path, 'r+')
+                        dsetk = 'missing_'+channel
+                        already = False
+                        for key in f.keys():
+                            if key == dsetk:
+                                f.close()
+                                already = True
+                        if not already:
+                            dset = f.create_dataset(dsetk,\
+                                                    data = np.array([-1.0]))
+                            f.close()
+
+                except BaseException:
+                    print('Could not save channel {}, shot {}.'.format(\
+                           channel_path[-5:-1], shot_num))
+                    print('Warning: Incomplete!!!')
+                    raise
+            else:
+                print('Channel {}, shot {} '.format(channel_path[-5:-1],\
+                       int(shot_num)),'has already been downloaded.')
+            sys.stdout.flush()
+            if not success:
+                missing_shots += 1
+            chan_done += 1
+            shot_prog = chan_done/160
+            overall_prog = shots_progress +\
+                           (shots_progress_next-shots_progress)*shot_prog
+            print('Approximate download progress: {:.2f}%.'\
+                  .format(overall_prog))
+
+    print('Finished with {} channel signals missing.'.format(missing_shots))
+    return
+                         
+
+def Download_Shot_List(shot_numbers, channel_paths, max_cores = 8,\
+                       server = 'atlas.gat.com', verbose = False,\
+                       d_sample = 1, try_again = False):
+    """
+    Accepts list of shots and downloads them in parallel worker processes.
+
+    Args:
+        shot_numbers: list of integer shot numbers (must be fewer than 32000)
+        channel_paths: list of channel save path folders
+        max_cores: int, max number of cores for parallelization
+        server: MDSplus server, str. D3D server by default
+        verbose: bool, passed through unchanged to Download_Shot
+        d_sample: int, downsample factor, MUST BE IN FORM 10^y
+        try_again: bool, tells script to try and download signals that were
+                   found to be missing in a prior run.
+    """
+    sentinel = -1
+    num_cores = min(mp.cpu_count(), max_cores)
+    fn = partial(Download_Shot, n_shots = len(shot_numbers),\
+                 n_procs = num_cores, channel_paths = channel_paths,\
+                 sentinel = sentinel, verbose = verbose,\
+                 d_sample = d_sample, try_again = try_again)
+    queue = mp.Queue()
+    assert len(shot_numbers) < 32000
+    for shot_num in shot_numbers:
+        queue.put(shot_num)
+    for i in range(num_cores):
+        queue.put(sentinel)  # one sentinel per worker, after all shot numbers
+    # One MDSplus connection is opened per worker process.
+    connections = [MDS.Connection(server) for _ in range(num_cores)]
+    processes = [mp.Process(target=fn, args = (queue, connections[i]))\
+                 for i in range(num_cores)]
+    print('Running in parallel on {} processes.'.format(num_cores))
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+
+
+def Count_Missing(shot_list, shot_path, missing_path):
+    """
+    Accepts a shot list and a path to the shot files, writes an up-to-date
+    report of all missing data into missing_path (called automatically after
+    a download), and returns (missing signals, total signals = 160 per shot).
+
+    Args:
+        shot_list: 1-D numpy array of DIII-D shot numbers
+        shot_path: path to folder containing shot files
+        missing_path: folder for missing shot reports
+    """
+    min_shot = np.argmin(shot_list)
+    max_shot = np.argmax(shot_list)
+    num_shots = np.shape(shot_list)[0]*160
+    num_shots_miss = 0
+    print("Generating report for {} shots between shots {} and {}".format(\
+           np.shape(shot_list)[0], int(shot_list[min_shot]),\
+           int(shot_list[max_shot])))
+    report = open(missing_path+'/missing_report_'+str(int(shot_list[min_shot]))\
+                  +'-'+str(int(shot_list[max_shot]))+'.txt', mode = 'w',\
+                  encoding='utf-8')
+    report.write('Missing channel signals for download from shot {} to shot '\
+                 '{}:\n'.format(int(shot_list[min_shot]),\
+                                int(shot_list[max_shot])))
+    for filename in os.listdir(shot_path):
+        if filename.endswith('hdf5'):
+            if int(filename[:-5]) >= shot_list[min_shot]\
+            and int(filename[:-5]) <= shot_list[max_shot]:
+                count = 0
+                # The context manager closes each shot file once it is scanned.
+                with h5py.File(shot_path+'/'+filename, 'r') as f:
+                    for key in f.keys():
+                        if key.startswith('missing'):
+                            count += 1
+                            report.write('Channel '+key[-5:-1]+', shot #'+\
+                                         filename[:-5]+'\n')
+                            num_shots_miss +=1
+                if count > 160:
+                    print('Shot #'+filename[:-5]+' has more than 160 channels '\
+                          'missing!')
+    report.write('Missing channel signals for {} out of {} signals.'.\
+                  format(num_shots_miss, num_shots))
+    report.close()
+
+    return (num_shots_miss, num_shots)
+
+
+###############################################################################
+## ECEI Class
+###############################################################################
+class ECEI:
+    def __init__(self):
+        """
+        Initialize ECEI object by creating an internal list of channel keys.
+
+        Keys are '"LFSXXYY"' (quotes included), XX = 03..22 and YY = 01..08.
+        """
+        self.ecei_channels = []
+        for i in range(20):
+            for j in range(8):
+                self.ecei_channels.append('"LFS{:02d}{:02d}"'.format(i+3,j+1))
+
+    ###########################################################################
+    ## Data Processing
+    ###########################################################################
+    def Generate_Missing_Report(self, shots, shot_1, clear_file, disrupt_file,\
+                                save_path = os.getcwd()):
+        """
+        Accept a start shot and a number of clear shots and generate a verbose
+        missing shot report for all shots in that range of the shot list files.
+        The report is produced by Count_Missing and placed in the
+        'missing_shot_info' folder under save_path (created if absent).
+
+        Args:
+            shots: int, number of non-disruptive shots to include
+            shot_1: int, the shot number you want to start with
+            clear_file: The path to the clear shot list
+            disrupt_file: The path to the disruptive shot list
+            save_path: location where the shot hdf5 files are stored,
+                       current directory by default
+        """
+        clear_shots = np.loadtxt(clear_file)
+        disrupt_shots = np.loadtxt(disrupt_file)
+
+        first_c = False
+        first_d = False
+        i = 0
+        while not first_c:
+            if clear_shots[i,0] >= shot_1:
+                start_c = i
+                first_c = True
+            i += 1
+        i = 0
+        while not first_d:
+            if disrupt_shots[i,0] >= shot_1:
+                start_d = i
+                first_d = True
+            i += 1
+
+        if start_c + shots > clear_shots.shape[0]-1:
+            shots = clear_shots.shape[0] - start_c - 1
+
+        shot_list = np.array([clear_shots[start_c,0]])
+        for i in range(shots-1):
+            shot_list = np.append(shot_list, [clear_shots[i+start_c+1,0]])
+
+        last = False
+        no_disrupt = False
+        i = start_d
+        while not last:
+            if disrupt_shots[i,0] >= clear_shots[start_c+shots-1,0]:
+                end_d = i
+                last = True
+            i += 1
+            # Flag "no disruptive shots" only when the scan ran off the end
+            # without a match; a match on the final row must still be used.
+            if not last and i >= disrupt_shots.shape[0]:
+                no_disrupt = True
+                last = True
+
+        if not no_disrupt:
+            for i in range(end_d - start_d + 1):
+                shot_list = np.append(shot_list, [disrupt_shots[i+start_d,0]])
+
+        #Missing shots directory
+        missing_path = os.path.join(save_path, 'missing_shot_info')
+        if not os.path.exists(missing_path):
+            os.mkdir(missing_path)
+
+        Count_Missing(shot_list, save_path, missing_path)
+
+        return
+
+
+    def Clean_Signals(self, save_path = os.getcwd()):
+        """
+        Removes every file ending in 'hdf5' in save_path once 'yes' is typed.
+        """
+        check = input("WARNING: this function will delete ALL signal files in "
+                      "the designated save path. Type 'yes' to continue, "
+                      "anything else to cancel.\n")
+        if check == 'yes':
+            for filename in os.listdir(save_path):
+                if filename.endswith('hdf5'):
+                    shot = os.path.join(save_path, filename)
+                    os.remove(shot)
+
+
+    def Clean_Missing_Signals(self, missing_path, save_path = os.getcwd()):
+        """
+        Removes save_path signal files that hold 160 keys starting with
+        'missing' and are at most 342289 bytes, after a typed 'yes'.
+        """
+        check = input("WARNING: this function will delete ALL signal files in "\
+                      "the designated save path which have all channel signals"\
+                      " missing. Type 'yes' to continue, anything else to "\
+                      "cancel.\n")
+        report = open(missing_path+'/AllChannelsMissing.txt', mode = 'w',\
+                  encoding='utf-8')  # opened ('w') before the answer is checked
+        if check == 'yes':
+            for filename in os.listdir(save_path):
+                if filename.endswith('hdf5'):
+                    shot = os.path.join(save_path, filename)
+                    f = h5py.File(shot, 'r')
+                    count = 0
+                    for key in f.keys():
+                        if key.startswith('missing'):
+                            count += 1
+                    if count == 160:
+                        f.close()
+                        # NOTE(review): the size cutoff is unexplained - confirm
+                        if os.path.getsize(shot) <= 342289:
+                            report.write(filename[:-5]+"\n")
+                            os.remove(shot)
+                    else:
+                        f.close()
+        report.close()
+
+
+
+    def Clean_Missing_Signal(self, shot_file):
+        """
+        Removes one shot file (after a 'yes') if 160 keys start with 'missing'.
+        """
+        shot = os.path.join(os.getcwd(), shot_file)  # joined onto the cwd
+        f = h5py.File(shot, 'r')
+        count = 0
+        for key in f.keys():
+            if key.startswith('missing'):
+                count += 1
+        if count == 160:
+            f.close()  # closed before prompting and removing the file
+            check = input("You are about to delete "+shot+". Are "+\
+                          "you sure about that?\n")
+            if check == 'yes':
+                os.remove(shot)
+        else:
+            f.close()
+
+
+    def Clean_Missing_Signal_List(self, shots):
+        """
+        For each shot, removes <shot>.hdf5 if 160 keys start with 'missing'.
+        """
+        for s in shots:
+            shot_file = str(s)+".hdf5"
+            shot = os.path.join(os.getcwd(), shot_file)  # joined onto the cwd
+            f = h5py.File(shot, 'r')
+            count = 0
+            for key in f.keys():
+                if key.startswith('missing'):
+                    count += 1
+            if count == 160:
+                f.close()  # closed before prompting and removing the file
+                check = input("You are about to delete "+shot+". Are "+\
+                              "you sure about that?\n")
+                if check == 'yes':
+                    os.remove(shot)
+            else:
+                f.close()
+
+
+    def Generate_Missing_Report_Concise(self, todays_date,\
+            data_path = os.getcwd(), output_path = os.getcwd()):
+        """
+        Creates a report of missing data in a more readable format. Also saves
+        all/some/no_channels_missing_list.txt shot lists in output_path.
+
+        Args:
+            todays_date: str, todays date in a readable, filename-friendly
+                         format, like "MM-DD-YYYY"
+            data_path: str, path where data files are stored
+            output_path: str, path where the report will go
+        """
+        # Collect necessary information.
+        shot_count = 0
+        none_missing = 0
+        all_missing = 0
+        one_missing = 0
+        eight_missing = 0
+        sixteen_missing = 0
+        sixteen_to_all_missing = 0
+        one_to_sixteen_missing = 0
+        missing_by_chan = {}
+        all_missing_list = []
+        some_missing_list = []
+        full_shot_list = []
+        file_list = os.listdir(data_path)
+        num_shots = len(file_list)
+        print("Generating concise report for the {} shots in "\
+              .format(int(num_shots))+data_path)
+        t_b = time.time()
+        for filename in file_list:
+            if filename.endswith('hdf5'):
+                f = h5py.File(data_path+'/'+filename, 'r')
+                miss_count = 0
+                for key in f.keys():
+                    if key[-9:] not in missing_by_chan:
+                        missing_by_chan[key[-9:]] = 0
+                    if key.startswith('missing'):
+                        miss_count += 1
+                        missing_by_chan[key[-9:]] += 1
+                if miss_count == 160:
+                    all_missing += 1
+                    for key in f.keys():
+                        missing_by_chan[key[-9:]] -= 1
+                    all_missing_list.append(int(filename[:-5]))
+                elif miss_count == 1:
+                    one_missing += 1
+                    some_missing_list.append(int(filename[:-5]))
+                elif miss_count == 8:
+                    eight_missing += 1
+                    some_missing_list.append(int(filename[:-5]))
+                elif miss_count == 16:
+                    sixteen_missing += 1
+                    some_missing_list.append(int(filename[:-5]))
+                elif miss_count > 0 and miss_count <= 16:
+                    one_to_sixteen_missing += 1
+                    some_missing_list.append(int(filename[:-5]))
+                elif miss_count > 16 and miss_count < 160:
+                    sixteen_to_all_missing += 1
+                    some_missing_list.append(int(filename[:-5]))
+                elif miss_count == 0:
+                    none_missing += 1
+                    full_shot_list.append(int(filename[:-5]))
+                shot_count += 1
+                f.close()
+                if shot_count%10 == 0:
+                    print("{:.2f}% of the way through collecting missing shot "\
+                          "info.".format(shot_count/num_shots*100))
+
+        t_e = time.time()
+        T = t_e-t_b
+
+        print("Finished collecting info in {} seconds.".format(T))
+
+        # Write report
+        report = open(output_path+'/missing_signal_report_'+todays_date+'.txt',
+                      'w', encoding='utf-8')
+        report.write('This missing shot report was generated using the '
+                     'contents of '+data_path+' on '+todays_date+'.\n\n')
+        report.write('Number of shots with NO channels missing: {}\n'.format(\
+                     int(none_missing)))
+        report.write('Number of shots with ALL channels missing: {}\n'.format(\
+                     int(all_missing)))
+        report.write('Number of shots with just one channel missing: {}\n'\
+                     .format(int(one_missing)))
+        report.write('Number of shots with 8 channels missing: {}\n'.format(\
+                     int(eight_missing)))
+        report.write('Number of shots with 16 channels missing: {}\n'.format(\
+                     int(sixteen_missing)))
+        report.write('Number of shots with 2 to 15 channels missing: {}\n'\
+                     .format(int(one_to_sixteen_missing)))
+        report.write('Number of shots with 17 to 159 channels missing: {}\n\n'\
+                     .format(int(sixteen_to_all_missing)))
+        report.write('Missing signal distribution by channel in shots with '+\
+                     'fewer than 160 channels missing:\n')
+        most_miss = max(missing_by_chan.values(), default=0)
+
+        for i in range(20):
+            for j in range(8):
+                key = '"LFS{:02d}{:02d}"'.format(i+3, j+1)
+                # Channels never seen count as 0; no bar if nothing is missing.
+                n_miss = missing_by_chan.get(key, 0)
+                bar = '█'*(int(n_miss/most_miss*50) if most_miss else 0)
+                report.write('Channel {:02d}{:02d}: '.format(i+3, j+1)+\
+                        str(int(n_miss))+' | '+bar+'\n')
+
+        report.close()
+
+        all_missing_list = np.sort(all_missing_list)
+        some_missing_list = np.sort(some_missing_list)
+        full_shot_list = np.sort(full_shot_list)
+
+        np.savetxt(output_path+'/all_channels_missing_list.txt',\
+                   all_missing_list, fmt='%i')
+        np.savetxt(output_path+'/some_channels_missing_list.txt',\
+                   some_missing_list, fmt='%i')
+        np.savetxt(output_path+'/no_channels_missing_list.txt', full_shot_list,\
+                   fmt='%i')
+
+
+    def Generate_Quality_Report(self, todays_date, data_path, disrupt_list,\
+                                shots_of_interest, shotlist_name, output_path =\
+                                os.getcwd()):
+        """
+        Create a report that checks shots in shots_of_interest for NaNs, as
+        well as for cases in which the data collection ceases before t_disrupt.
+        If the shots are missing channels, the report will give the number of
+        channels missed per shot.
+
+        Args:
+            todays_date: str, todays date in a readable, filename-friendly
+                         format, like "MM-DD-YYYY"
+            data_path: str, path where data files are stored.
+            disrupt_list: numpy array, shot list that contains the disruptive
+                          shots of interest.
+            shots_of_interest: numpy array, shot list that contains the shots
+                               you would like to check.
+            shotlist_name: str, name describing the shotlist of interest.
+            output_path: str, path where the report will go.
+        """
+        print("Generating a data quality report for {} shots of interest in "\
+              .format(int(len(shots_of_interest)))+data_path)
+        t_b = time.time()
+
+        contains_NaN = {}
+        ends_before_t_disrupt = {}
+        missing_chans = {}
+        low_std_dev = {}
+
+        count = 0
+        files = os.listdir(data_path)
+        num_files = len(files)
+        for filename in files:
+            if filename.endswith('hdf5'):
+                count += 1
+                shot_no = int(filename[:-5])
+                if shot_no in shots_of_interest:
+                    f = h5py.File(data_path+'/'+filename, 'r')
+                    keys = f.keys()
+                    # First we check for NaNs
+                    for key in keys:
+                        data = np.asarray(f.get(key))
+                        if np.any(np.isnan(data)):
+                            if shot_no not in contains_NaN:
+                                contains_NaN[shot_no] = []
+                            contains_NaN[shot_no].append(key[-5:-1])
+                        # Check to make sure 'something' happens
+                        if not key.startswith('missing'):
+                            sig = np.sqrt(np.var(data[:,1]))
+                            if sig < 0.001:
+                                if shot_no not in low_std_dev:
+                                    low_std_dev[shot_no] = []
+                                low_std_dev[shot_no].append(key[-5:-1])
+                        # Next, missing channels
+                        if key.startswith('missing'):
+                            if shot_no not in missing_chans:
+                                missing_chans[shot_no] = []
+                            missing_chans[shot_no].append(key[-5:-1])
+                        # Now we check if data is collected up to t_disrupt
+                        if shot_no in disrupt_list[:,0] and\
+                           not key.startswith('missing'):
+                            i_disrupt=np.where(disrupt_list[:,0]==shot_no)[0][0]
+                            t_max = np.max(data[:,0])
+                            t_disrupt = disrupt_list[i_disrupt,1]
+                            if t_max < t_disrupt:
+                                if shot_no not in ends_before_t_disrupt:
+                                    ends_before_t_disrupt[shot_no] = []
+                                ends_before_t_disrupt[shot_no].append(key[-5:-1])
+                    print("{:.2f}% of the way through shot files"\
+                          .format(count/num_files*100))
+                    f.close()
+
+        t_e = time.time()
+        T = t_e-t_b
+
+        print("Finished collecting info in {} seconds.".format(T))
+
+        # Write report
+        report = open(output_path+'/data_quality_report_'+todays_date+'.txt',\
+                      'w')
+        report.write('This data quality report was generated using the '\
+                     'contents of '+data_path+'\non '+todays_date+', using '\
+                     'a shotlist named "'+shotlist_name+'".\n\n')
+
+        report.write('Number of shots with NaNs present: {}\n'.format(\
+                     int(len(contains_NaN))))
+        if len(contains_NaN) > 0:
+            for shot in contains_NaN:
+                report.write('Shot {} Contains NaNs in the following '\
+                             'channels:\n'.format(shot))
+                count = 0
+                for i in range(len(contains_NaN[shot])):
+                    count += 1
+                    if count%10 == 0:
+                        report.write(contains_NaN[shot][i]+',\n')
+                    else:
+                        report.write(contains_NaN[shot][i]+', ')
+                report.write('\n')
+
+        report.write('_'*80)
+        report.write('\n\n')
+        report.write('Number of shots that cease data collection before '\
+                     't_disrupt: {}\n'.format(int(len(ends_before_t_disrupt))))
+        if len(ends_before_t_disrupt) > 0:
+            for shot in ends_before_t_disrupt:
+                report.write('Shot {} stops short of t_disrupt in the '\
+                             'following channels:\n'.format(shot))
+                count = 0
+                for i in range(len(ends_before_t_disrupt[shot])):
+                    count += 1
+                    if count%10 == 0:
+                        report.write(ends_before_t_disrupt[shot][i]+',\n')
+                    else:
+                        report.write(ends_before_t_disrupt[shot][i]+', ')
+                report.write('\n')
+
+        report.write('_'*80)
+        report.write('\n\n')
+        report.write('Number of shots that have a standard deviation which is '\
+                     'smaller than 1 mV: {}\n'.format(int(len(low_std_dev))))
+        if len(low_std_dev) > 0:
+            for shot in low_std_dev:
+                report.write('Shot {} has a std. dev less than 1 mV in the '\
+                             'following channels:\n'.format(shot))
+                count = 0
+                for i in range(len(low_std_dev[shot])):
+                    count += 1
+                    if count%10 == 0:
+                        report.write(low_std_dev[shot][i]+',\n')
+                    else:
+                        report.write(low_std_dev[shot][i]+', ')
+                report.write('\n')
+
+        report.write('_'*80)
+        report.write('\n\n')
+        report.write('Number of shots with missing channels: {}\n'.format(\
+                     int(len(missing_chans))))
+        if len(missing_chans) > 0:
+            for shot in missing_chans:
+                report.write('Shot {} is missing data in the following '\
+                             'channels:\n'.format(shot))
+                count = 0
+                for i in range(len(missing_chans[shot])):
+                    count += 1
+                    if count%10 == 0:
+                        report.write(missing_chans[shot][i]+',\n')
+                    else:
+                        report.write(missing_chans[shot][i]+', ')
+                report.write('\n')
+
+        report.close()
+
+
+    ###########################################################################
+    ## Visualization
+    ###########################################################################
+    def Single_Shot_Plot(self, shot, data_path, save_dir = os.getcwd(),\
+                         show = False):
+        """
+        Plot voltage traces for a single shot, saves plot as a .pdf
+
+        Args:
+            shot: int, shot number
+            data_path: str, path to ECEI data
+            save_dir: str, directory for output plot image
+            show: bool, determines whether output is shown right away
+        """
+        shot_file = data_path+'/'+str(int(shot))+'.hdf5'
+        f = h5py.File(shot_file, 'r')
+        fig = plt.figure()
+        gs = fig.add_gridspec(4, 5, hspace=0.35, wspace=0)
+        ax = gs.subplots(sharex='col')
+        count = 0
+        plot_no = 0
+        for channel in self.ecei_channels:
+            count += 1
+            row = plot_no//5
+            col = plot_no%5
+            if channel in f.keys():
+                data = f.get(channel)
+                ax[row,col].plot(data[:,0], data[:,1],\
+                                 label = 'YY = '+channel[-3:-1],\
+                                 linewidth = 0.4, alpha = 0.8)
+            if count%8 == 0:
+                plot_no += 1
+                XX = count//8 + 2
+                title = 'XX = {:02d}'.format(XX)
+                ax[row,col].set_title(title, fontsize = 5)
+                #ax[row,col].legend(prop={'size': 2.75})
+                ax[row,col].tick_params(width = 0.3)
+        f.close()  # every trace has been read and drawn; release the file
+        fig.suptitle('Shot #{}'.format(int(shot)), fontsize = 10)
+        for axs in ax.flat:
+            axs.set_xlabel('Time (ms)', fontsize = 5)
+            axs.set_ylabel('ECEi Voltage (V)', fontsize = 5)
+
+        # Hide x labels and tick labels for top plots and y ticks for right plots.
+        for axs in ax.flat:
+            axs.label_outer()
+            axs.tick_params(axis='x', labelsize = 5)
+            axs.tick_params(axis='y', labelsize = 5)
+
+        labels = []
+        for i in range(8):
+            labels.append('YY = {:2d}'.format(i+1))
+        fig.legend(labels=labels, loc="lower center", ncol=8, prop={'size': 5.5})
+
+        if show:
+            fig.show()
+
+        fig.savefig(save_dir+'/Shot_{}.pdf'.format(int(shot)))
+
+
+    def Generate_Txt(self, shot, channel, save_dir = os.getcwd()):
+        """
+        Save the dataset of a single channel of one shot as a readable text
+        file named <shot>_chan<channel>.txt (written with np.savetxt).
+
+        Args:
+            shot: int, shot number
+            channel: str, format "XXYY", 03<=XX<=22, 01<=YY<=08, designates
+                     channel
+            save_dir: str, directory of the shot files; the .txt is saved here
+        """
+        shot_s = str(shot)
+        f = h5py.File(save_dir+'/'+shot_s+'.hdf5', 'r')
+        data = np.asarray(f.get('"LFS'+channel+'"'))
+        np.savetxt(save_dir+'/'+shot_s+'_chan'+channel+'.txt', data)
+        f.close()
+
+        return
+
+    
+    def Generate_Txt_Interactive(self, save_dir = os.getcwd()):
+        """
+        Prompt on the command line for a shot number and a channel, then
+        write that channel to a .txt file by calling Generate_Txt.
+
+        Args:
+            save_dir: str, directory where shot files are stored
+        """
+        shot = int(input("Which shot? Enter an integer.\n"))
+        channel = input("Which channel? format 'XXYY', 03<=XX<=22, 01<=YY<=08"\
+                        ".\n")
+
+        self.Generate_Txt(shot, channel, save_dir)
+
+        return
+
+
+    ###########################################################################
+    ## Data Acquisition
+    ###########################################################################
+    def Acquire_Shots_D3D(self, shot_numbers, save_path = os.getcwd(),\
+                          max_cores = 8, verbose = False, chan_lowlim = 3,\
+                          chan_uplim = 22, d_sample = 1, try_again = False):
+        """
+        Accepts a list of shot numbers, downloads the data (Download_Shot_List)
+        and tallies what is missing (Count_Missing). Returns False if the test
+        connection to atlas.gat.com fails, otherwise nothing. Shots are saved in
+        hdf5 format, downsampling is done BEFORE saving. Each channel is its own
+        dataset labelled with the channel name/MDS point name, e.g. '"LFSXXYY"'.
+        If data is not found, labels are 'missing_"LFSXXYY"' with [-1.0] as data.
+
+        Args:
+            shot_numbers: 1-D numpy array of integers, DIII-D shot numbers
+            save_path: location where the channel folders will be stored,
+                       current directory by default
+            max_cores: int, max # of cores to carry out download tasks
+            verbose: bool, suppress most print statements
+            chan_lowlim: int, lower limit of subset of channels to download
+            chan_uplim: int, upper limit of subset of channels to download
+            d_sample: int, downsample factor, MUST BE IN FORM 10^y
+            try_again: bool, tells script to try and download signals that were
+                       found to be missing in a prior run.
+        """
+        t_b = time.time()
+        # Construct channel save paths and create them if needed.
+        channel_paths = []
+        for i in range(len(self.ecei_channels)):
+            XX = int(self.ecei_channels[i][-5:-3])
+            if XX >= chan_lowlim and XX <= chan_uplim:
+                channel_path = os.path.join(save_path, self.ecei_channels[i])
+                channel_paths.append(channel_path)
+        #Missing shots directory
+        missing_path = os.path.join(save_path, 'missing_shot_info')
+        if not os.path.exists(missing_path):
+            os.mkdir(missing_path)
+
+        try:
+            c = MDS.Connection('atlas.gat.com')  # c is not used after this
+        except Exception as e:
+            print(e)
+            return False
+
+        Download_Shot_List(shot_numbers, channel_paths, max_cores = max_cores,\
+                           server = 'atlas.gat.com', verbose = verbose,\
+                           d_sample = d_sample, try_again = try_again)
+
+        missed = Count_Missing(shot_numbers, save_path, missing_path)
+
+        t_e = time.time()
+        T = t_e-t_b
+
+        print("Downloaded {} out of {} signals in {} seconds."\
+              .format(missed[1]-missed[0], missed[1], T))
+
+        return
+
+
+    def Acquire_Shot_Sequence_D3D(self, shots, shot_1, clear_file,\
+                                  disrupt_file, save_path = None,\
+                                  max_cores = 8, verbose = False,\
+                                  chan_lowlim = 3, chan_uplim = 22,\
+                                  d_sample = 1, try_again = False):
+        """
+        Accepts a desired number of non-disruptive shots, then downloads all
+        shots in our labelled database up to the last non-disruptive shot.
+        Returns nothing. Shots are saved in hdf5 format, downsampling is done
+        BEFORE saving. Each channel is labelled within its own dataset in the
+        hdf5 file, where the label is the channel name/MDS point name, e.g.
+        '"LFSXXYY"'. If data is not found, labels are 'missing_"LFSXXYY"' with
+        [-1.0] as the dataset.
+
+        Args:
+            shots: int, number of non-disruptive shots you want to download
+                   (capped at the number available from shot_1 onward)
+            shot_1: int, the shot number you want to start with
+            clear_file: The path to the clear shot list
+            disrupt_file: The path to the disruptive shot list
+            save_path: location where the channel folders will be stored,
+                       current directory (at call time) when None
+            max_cores: int, max # of cores to carry out download tasks
+            verbose: bool, suppress some exception info
+            chan_lowlim: int, lower limit of subset of channels to download
+            chan_uplim: int, upper limit of subset of channels to download
+            d_sample: int, downsample factor, MUST BE IN FORM 10^y
+            try_again: bool, tells script to try and download signals that were
+                       found to be missing in a prior run.
+
+        Raises:
+            ValueError: if no clear shot is numbered shot_1 or later.
+        """
+        # Resolve the default here rather than in the signature, where it would
+        # be frozen to the working directory at import time.
+        if save_path is None:
+            save_path = os.getcwd()
+
+        # ndmin=2 keeps [row, column] indexing valid for one-line shot lists.
+        clear_col = np.loadtxt(clear_file, ndmin = 2)[:,0]
+        disrupt_col = np.loadtxt(disrupt_file, ndmin = 2)[:,0]
+
+        # Index of the first clear shot at or after shot_1.
+        clear_hits = np.flatnonzero(clear_col >= shot_1)
+        if clear_hits.size == 0:
+            raise ValueError("No clear shots at or after shot {} in {}"\
+                             .format(shot_1, clear_file))
+        start_c = clear_hits[0]
+
+        # Cap the request at the clear shots available from start_c (the old
+        # clamp dropped the last one); always keep at least the first shot.
+        shots = max(1, min(shots, clear_col.shape[0] - start_c))
+        shot_list = np.array(clear_col[start_c:start_c+shots])
+        last_clear = shot_list[-1]
+
+        # Disruptive shots from shot_1 through the first one at or beyond the
+        # last clear shot (inclusive). If none reaches the last clear shot,
+        # every remaining disruptive shot lies before it and is kept, as the
+        # docstring promises. No disruptive shot >= shot_1 means none to add.
+        disrupt_hits = np.flatnonzero(disrupt_col >= shot_1)
+        if disrupt_hits.size > 0:
+            start_d = disrupt_hits[0]
+            past = np.flatnonzero(disrupt_col[start_d:] >= last_clear)
+            if past.size > 0:
+                end_d = start_d + past[0] + 1
+            else:
+                end_d = disrupt_col.shape[0]
+            shot_list = np.append(shot_list, disrupt_col[start_d:end_d])
+
+        # Hand the combined clear + disruptive list to the downloader.
+        self.Acquire_Shots_D3D(shot_list, save_path, max_cores, verbose,\
+                               chan_lowlim, chan_uplim, d_sample, try_again)
+
+        return
+
+
+    def Fetch_Shot(self, shot_number, verbose = False):
+        """
+        Fetch shot data from MDSplus server directly as numpy arrays. Returns
+        a 1D time array and a 2D data array with one column per entry of
+        self.ecei_channels (shape (t_steps, n_channels)), along with None in
+        place of a mapping and True to indicate success. Missing channels are
+        padded with zeros. If no channel can be fetched at all, returns
+        (None, None, None, False).
+
+        Args:
+            shot_number: int, shot number
+            verbose: bool, determines if MDS exceptions are printed
+        """
+        # Fetch every channel exactly once; None marks a missing channel.
+        t_axis = None
+        fetched = []
+        for channel in self.ecei_channels:
+            t, d, mapping, success = Fetch_ECEI_d3d(channel, shot_number,\
+                                                    verbose = verbose)
+            if success:
+                if t_axis is None:
+                    # The first channel found supplies the time base.
+                    t_axis = np.asarray(t)
+                fetched.append(np.asarray(d))
+            else:
+                fetched.append(None)
+
+        # No channel could be fetched, so there is no time base to build on.
+        if t_axis is None:
+            return None, None, None, False
+
+        # Assemble the columns in one pass, forcing each to (t_steps, 1) so
+        # that 1D channel arrays stack along axis 1 without a shape error.
+        n_t = t_axis.shape[0]
+        columns = []
+        for d in fetched:
+            if d is None:
+                columns.append(np.zeros((n_t, 1)))
+            else:
+                columns.append(d.reshape((n_t, 1)))
+        data = np.concatenate(columns, axis = 1)
+
+        return t_axis, data, None, True
+
+

From 86fc985e999ebf74755795811d912520146712a7 Mon Sep 17 00:00:00 2001
From: "Jesse A. Rodriguez" <jessear@traverse.princeton.edu>
Date: Wed, 1 Sep 2021 17:49:41 -0400
Subject: [PATCH 39/50] ECEi successfully integrated into data processing
 stack.

---
 data/signals.py                   | 14 +++++++++++++-
 examples/conf.yaml                |  4 ++--
 plasma/conf_parser.py             | 28 ++++++++++++++++++++++++++++
 plasma/preprocessor/preprocess.py |  2 ++
 plasma/primitives/data.py         | 17 ++++++++++-------
 plasma/primitives/shots.py        |  4 ++--
 6 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/data/signals.py b/data/signals.py
index 61f513cc..793cb132 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -4,7 +4,7 @@
 import sys
 
 from plasma.primitives.data import (
-    Signal, ProfileSignal, ChannelSignal, Machine
+    Signal, ProfileSignal, ChannelSignal, Signal2D, Machine
     )
 
 
@@ -391,6 +391,9 @@ def fetch_nstx_data(signal_path, shot_num, c):
 ipdirectt = Signal("plasma current direction tol", ["iptdirect"], [d3d],
                    data_avail_tolerances=[0.029])
 
+ecei = Signal2D("ECEi", ['ECEI_kHz'], [d3d], (20,8),
+                is_ecei = True, miss_chan_threshold = 80)
+
 # for downloading, modify this to preprocess shots with only a subset of
 # signals. This may produce more shots
 # since only those shots that contain all_signals contained here are used.
@@ -420,6 +423,7 @@ def fetch_nstx_data(signal_path, shot_num, c):
     # 'neut_profile':neut_profile, 'q_profile':q_profile,
     # 'bootstrap_current_profile':bootstrap_current_profile,
     # 'q_psi_profile':q_psi_profile}
+    'ecei': ecei,
 }
 
 all_signals_max_tol = {
@@ -431,6 +435,11 @@ def fetch_nstx_data(signal_path, shot_num, c):
     'ipdirectt': ipdirectt, 'iptargett': iptargett,
     'iperrt': iperrt,
     'etemp_profilet': etemp_profilet, 'edens_profilet': edens_profilet,
+    'ecei': ecei,
+}
+
+ecei_test = {
+    'ecei': ecei,
 }
 
 # for actual data analysis, use:
@@ -451,6 +460,9 @@ def fetch_nstx_data(signal_path, shot_num, c):
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
         sig.is_defined_on_machines(all_machines) and sig.num_channels == 1)
 }
+# NOTE(JAR): The check sig.num_channels > 1 to determine if a signal is 1D will
+# now include 2D signals. Need to add something like sig.num_channels < 160 to
+# exclude ecei, for example.
 fully_defined_signals_1D = {
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
         sig.is_defined_on_machines(all_machines) and sig.num_channels > 1)
diff --git a/examples/conf.yaml b/examples/conf.yaml
index f67bc89f..da1aaacc 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -16,10 +16,10 @@ user_subdir_output: True
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 4  # per node
 paths:
-  signal_prepath: '/signal_data/' # /signal_data/jet/
+  signal_prepath: ['/signal_data/', '/signal_data_new_REAL_TIME/'] # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
   tensorboard_save_path: '/Graph/'
-  data: d3d_0D
+  data: 'd3d_2019_all_dims'
   # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
   specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
   executable: "mpi_learn.py"
diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index 2da7a489..e9c4f53b 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -318,6 +318,28 @@ def parameters(input_file):
                 'etemp_profile': sig.etemp_profile,
                 'edens_profile': sig.edens_profile,
             }
+        elif params['paths']['data'] == 'd3d_2019_all_dims':
+            params['paths']['shot_files'] = [d3d_full_2019]
+            params['paths']['shot_files_test'] = []
+            params['paths']['use_signals_dict'] = {
+                'q95': sig.q95,
+                'li': sig.li,
+                'ip': sig.ip,
+                'lm': sig.lm,
+                'betan': sig.betan,
+                'energy': sig.energy,
+                'dens': sig.dens,
+                'pradcore': sig.pradcore,
+                'pradedge': sig.pradedge,
+                'pin': sig.pin,
+                'torquein': sig.torquein,
+                'ipdirect': sig.ipdirect,
+                'iptarget': sig.iptarget,
+                'iperr': sig.iperr,
+                'etemp_profile': sig.etemp_profile,
+                'edens_profile': sig.edens_profile,
+                'ecei': sig.ecei,
+            }
         elif params['paths']['data'] == 'd3d_1D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
@@ -326,6 +348,12 @@ def parameters(input_file):
                 'etemp_profile': sig.etemp_profile,
                 'edens_profile': sig.edens_profile,
             }
+        elif params['paths']['data'] == 'd3d_2D':
+            params['paths']['shot_files'] = [d3d_full_2019]
+            params['paths']['shot_files_test'] = []
+            params['paths']['use_signals_dict'] = {
+                'ecei': sig.ecei,
+            }
         elif params['paths']['data'] == 'd3d_all_profiles':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 111b7fa4..5bc5e950 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -14,6 +14,7 @@
 import time
 import sys
 import os
+from pprint import pprint
 
 import numpy as np
 import pathos.multiprocessing as mp
@@ -77,6 +78,7 @@ def preprocess_all(self):
     def preprocess_from_files(self, shot_files, use_shots):
         # all shots, including invalid ones
         all_signals = self.conf['paths']['all_signals']
+        pprint(all_signals)
         shot_list = ShotList()
         shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
         shot_list_picked = shot_list.random_sublist(use_shots)
diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 99cfbe35..9ecb921e 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -431,6 +431,7 @@ def __init__(self, description, paths, machines, dims, is_ecei = False,
         self.dims = dims
         self.num_channels = dims[0]*dims[1]
         self.is_ecei = is_ecei
+        self.miss_chan_threshold = miss_chan_threshold
 
 
     def get_file_path(self, prepath, machine, shot_number):
@@ -442,12 +443,14 @@ def get_file_path(self, prepath, machine, shot_number):
             machine: Machine object, machine that signal is defined on
             shot_number: int, shot number
         """
-        if self.is_ecei:
-            return prepath+'/'+str(shot_number)+'.hdf5'
         signal_dirname = self.get_path(machine)
         dirname = os.path.join(prepath, machine.name, signal_dirname)
-        return get_individual_shot_file(dirname, machine.name, shot_number,
-                                        raw_signal=True)
+        if self.is_ecei:
+            return get_individual_shot_file(dirname, machine.name, shot_number,\
+                                            raw_signal=True, ext = '.hdf5')
+        else:
+            return get_individual_shot_file(dirname, machine.name, shot_number,\
+                                            raw_signal=True)
 
 
     def load_data_from_hdf5_safe(self, prepath, shot):
@@ -464,8 +467,8 @@ def load_data_from_hdf5_safe(self, prepath, shot):
         """
         file_path = self.get_file_path(prepath, shot.machine, shot.number)
         if not self.is_saved(prepath, shot):
-            print('Signal {}, shot {} was never downloaded [omit]'.format(
-                self.description, shot.number))
+            print('Signal {}, shot {} was never downloaded at {} [omit]'.format(
+                self.description, shot.number, file_path))
             return None, False
 
         if os.path.getsize(file_path) == 0:
@@ -498,7 +501,7 @@ def load_data_from_hdf5_safe(self, prepath, shot):
                 while no_time_series:
                     chan = E.ecei_channels[idx]
                     if chan not in missing:
-                        data = np.asarray(f.get(chan))[:,0]
+                        data = (np.asarray(f.get(chan))[:,0])/1000 #units of raw data are ms
                         data = data.reshape((data.shape[0],1))
                         no_time_series = False
                     idx += 1
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 221e7d72..2a9bcba5 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -408,8 +408,8 @@ def get_signals_and_times_from_file(self, conf):
                 for prepath in signal_prepath:
                     t, sig, valid_signal = signal.load_data(
                         prepath, self, conf['data']['floatx'])
-                if valid_signal:
-                    break
+                    if valid_signal:
+                        break
             else:
                 t, sig, valid_signal = signal.load_data(
                     signal_prepath, self, conf['data']['floatx'])

From 239be8527ad6b4e79b949ceaf9dd847735bd3872 Mon Sep 17 00:00:00 2001
From: "Jesse A. Rodriguez" <jessear@traverse.princeton.edu>
Date: Wed, 1 Sep 2021 19:19:18 -0400
Subject: [PATCH 40/50] Rolled back changes that should not have been pushed,
 to simplify pull request.

---
 data/signals.py                   |   8 +-
 envs/requirements-traverse.yaml   |   2 +-
 examples/conf.yaml                |  16 +--
 examples/slurm.cmd                |  25 ++--
 plasma/models/builder.py          |   2 +-
 plasma/models/mpi_runner.py       | 150 ++++++++++----------
 plasma/preprocessor/preprocess.py |   2 -
 plasma/primitives/data.py         |   4 +-
 plasma/utils/CallbackList.py      | 219 ------------------------------
 plasma/utils/ECEI.py              |   3 +-
 10 files changed, 108 insertions(+), 323 deletions(-)
 delete mode 100644 plasma/utils/CallbackList.py

diff --git a/data/signals.py b/data/signals.py
index 793cb132..483a056c 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -394,6 +394,9 @@ def fetch_nstx_data(signal_path, shot_num, c):
 ecei = Signal2D("ECEi", ['ECEI_kHz'], [d3d], (20,8),
                 is_ecei = True, miss_chan_threshold = 80)
 
+eceit = Signal2D("ECEi", ['ECEI_kHz'], [d3d], (20,8),
+                is_ecei = True, miss_chan_threshold = 159)
+
 # for downloading, modify this to preprocess shots with only a subset of
 # signals. This may produce more shots
 # since only those shots that contain all_signals contained here are used.
@@ -435,7 +438,7 @@ def fetch_nstx_data(signal_path, shot_num, c):
     'ipdirectt': ipdirectt, 'iptargett': iptargett,
     'iperrt': iperrt,
     'etemp_profilet': etemp_profilet, 'edens_profilet': edens_profilet,
-    'ecei': ecei,
+    'ecei': eceit,
 }
 
 ecei_test = {
@@ -462,7 +465,8 @@ def fetch_nstx_data(signal_path, shot_num, c):
 }
 # NOTE(JAR): The check sig.num_channels > 1 to determine if a signal is 1D will
 # now include 2D signals. Need to add something like sig.num_channels < 160 to
-# exclude ecei, for example.
+# exclude ecei, for example, or come up with another Signal attribute that can
+# be used as a reliable way to distiguish between 0D, 1D, and 2D signals
 fully_defined_signals_1D = {
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
         sig.is_defined_on_machines(all_machines) and sig.num_channels > 1)
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 30422586..aad550db 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -5,7 +5,7 @@ channels:
   - defaults
 # channel_priority: strict   # set in .condarc
 dependencies:
-  - python=3.6.8
+  - python>=3.6.8
   - cython
   - pip
   - scipy
diff --git a/examples/conf.yaml b/examples/conf.yaml
index da1aaacc..a4992811 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -9,17 +9,17 @@
 # will output csvlog, trained model checkpoints, etc.
 # in fs_path_output / [username] / results | csv_logs | model_checkpoints | Graph, etc.
 
-fs_path: '/tigress/'
+fs_path: '/Users/'
 user_subdir: True
-fs_path_output: '/tigress/'
+fs_path_output: '/Users/'
 user_subdir_output: True
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
-num_gpus: 4  # per node
+num_gpus: 1  # per node
 paths:
-  signal_prepath: ['/signal_data/', '/signal_data_new_REAL_TIME/'] # /signal_data/jet/
+  signal_prepath: '/signal_data/' # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
   tensorboard_save_path: '/Graph/'
-  data: 'd3d_2019_all_dims'
+  data: d3d_0D
   # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
   specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
   executable: "mpi_learn.py"
@@ -94,7 +94,7 @@ model:
   # TODO(KGF): optimize size of RNN layers
   # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100.
   # Prediction is much better with size 100, size 20 cannot capture the data.
-  rnn_size: 100
+  rnn_size: 200
   rnn_type: 'LSTM'
   # TODO(KGF): optimize number of RNN layers
   rnn_layers: 2
@@ -144,8 +144,8 @@ training:
   num_batches_minimum: 20 # minimum number of batches per epoch
   ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
   timeline_prof: False
-  step_limit: 1000
-  no_validation: False
+  step_limit: 50
+  no_validation: True
 callbacks:
   list: ['earlystop']
   metrics: ['val_loss','val_roc','train_loss']
diff --git a/examples/slurm.cmd b/examples/slurm.cmd
index af1aabda..3dcae884 100644
--- a/examples/slurm.cmd
+++ b/examples/slurm.cmd
@@ -1,24 +1,22 @@
 #!/bin/bash
-#SBATCH --job-name=FRNNTest
 #SBATCH -t 01:00:00
-#SBATCH -N 2
+#SBATCH -N 4
 #SBATCH --ntasks-per-node=4
-#SBATCH --gpus-per-node=4
 #SBATCH --ntasks-per-socket=2
 #SBATCH --gres=gpu:4
 #SBATCH -c 4
 #SBATCH --mem-per-cpu=0
-#SBATCH --reservation test
-#SBATCH --mail-user=jrodrig@stanford.edu
-#SBATCH --mail-type=ALL
 
-# Load modules
-module load anaconda3/2020.7
-conda activate FRNN
-module load cudatoolkit/11.3
-module load cudnn/cuda-11.x/8.2.0
-module load openmpi/cuda-11.0/gcc/4.0.4/64
-module load hdf5/gcc/openmpi-4.0.4/1.10.6
+# Example Slurm configuration for TigerGPU nodes (4 nodes, 16 GPUs total)
+# Each node = 2.4 GHz Xeon Broadwell E5-2680 v4 + 4x 1328 MHz P100 GPU
+
+module load anaconda3
+conda activate my_env
+module load cudatoolkit
+module load cudnn
+module load openmpi/cuda-8.0/intel-17.0/3.0.0/64
+module load intel/19.0/64/19.0.3.199
+module load hdf5/intel-17.0/intel-mpi/1.10.0
 
 # remove checkpoints for a benchmark run
 rm /tigress/$USER/model_checkpoints/*
@@ -27,4 +25,5 @@ rm /tigress/$USER/csv_logs/*
 rm /tigress/$USER/Graph/*
 rm /tigress/$USER/normalization/*
 
+export OMPI_MCA_btl="tcp,self,vader"
 srun python mpi_learn.py
diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index e4a41d99..6d44c97e 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -393,7 +393,7 @@ def save_model_weights(self, model, epoch):
         # TODO(KGF): model.save(..., save_format='tf') disabled in r1.15
         # Same with tf.keras.models.save_model(..., save_format="tf").
         # Need to use experimental API until r2.x
-        # model.save(full_model_save_dir, overwrite=True, save_format='tf')
+        model.save(full_model_save_dir, overwrite=True, save_format='tf')
 
         # try:
         if _has_tf2onnx:
diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 081ac263..3df9c328 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -112,11 +112,11 @@
         else:
             import tensorflow.keras.backend as K
 
+
         from tensorflow.keras.utils import Progbar
         # TODO(KGF): instead of tensorflow.keras.callbacks.CallbackList()
         # until API added in tf-nightly in v2.2.0
-        import tensorflow.keras.callbacks as cbks
-        from plasma.utils.CallbackList import CallbackList
+        import tensorflow.python.keras.callbacks as cbks
 
 g.flush_all_inorder()
 g.pprint_unique(conf)
@@ -496,7 +496,7 @@ def build_callbacks(self, conf, callbacks_list):
         #         update_freq=1,)
         #     callbacks += [tb_callback]
 
-        return CallbackList(callbacks)
+        return cbks.CallbackList(callbacks)
 
     def add_noise(self, X):
         if self.conf['training']['noise'] is True:
@@ -564,80 +564,80 @@ def train_epoch(self):
         while ((self.num_so_far - self.epoch * num_total) < num_total
                or step < self.num_batches_minimum):
             # TODO(KGF): this is still not correctly tracing the steps on CPU
-            #with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
-            if step_limit > 0 and step > step_limit:
-                print('reached step limit')
-                break
-            try:
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator_func)
-            except StopIteration:
-                g.print_unique("Resetting batch iterator.")
-                self.num_so_far_accum = self.num_so_far_indiv
-                self.set_batch_iterator_func()
-                batch_iterator_func = self.batch_iterator_func
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator_func)
-            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
-
-            # if batches_to_reset:
-            # self.model.reset_states(batches_to_reset)
-
-            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
-            num_replicas = 1 if warmup_phase else self.num_replicas
-
-            self.num_so_far = self.mpi_sum_scalars(
-                self.num_so_far_indiv, num_replicas)
-
-            # run the model once to force compilation. Don't actually use these
-            # values.
-            if first_run:
-                first_run = False
-                t0_comp = time.time()
-                #   print('input_dimension:',batch_xs.shape)
-                #   print('output_dimension:',batch_ys.shape)
-                _, _ = self.train_on_batch_and_get_deltas(
+            with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
+                if step_limit > 0 and step > step_limit:
+                    print('reached step limit')
+                    break
+                try:
+                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                     num_total, is_warmup_period) = next(batch_iterator_func)
+                except StopIteration:
+                    g.print_unique("Resetting batch iterator.")
+                    self.num_so_far_accum = self.num_so_far_indiv
+                    self.set_batch_iterator_func()
+                    batch_iterator_func = self.batch_iterator_func
+                    (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
+                     num_total, is_warmup_period) = next(batch_iterator_func)
+                self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr
+
+                # if batches_to_reset:
+                # self.model.reset_states(batches_to_reset)
+
+                warmup_phase = (step < self.warmup_steps and self.epoch == 0)
+                num_replicas = 1 if warmup_phase else self.num_replicas
+
+                self.num_so_far = self.mpi_sum_scalars(
+                    self.num_so_far_indiv, num_replicas)
+
+                # run the model once to force compilation. Don't actually use these
+                # values.
+                if first_run:
+                    first_run = False
+                    t0_comp = time.time()
+                    #   print('input_dimension:',batch_xs.shape)
+                    #   print('output_dimension:',batch_ys.shape)
+                    _, _ = self.train_on_batch_and_get_deltas(
                         batch_xs, batch_ys, verbose)
-                self.comm.Barrier()
-                sys.stdout.flush()
-                # TODO(KGF): check line feed/carriage returns around this
-                g.print_unique('\nCompilation finished in {:.2f}s'.format(
-                    time.time() - t0_comp))
-                t_start = time.time()
-                sys.stdout.flush()
-
-            if np.any(batches_to_reset):
-                reset_states(self.model, batches_to_reset)
-            if ('noise' in self.conf['training'].keys()
-                    and self.conf['training']['noise'] is not False):
-                batch_xs = self.add_noise(batch_xs)
-            t0 = time.time()
-            deltas, loss = self.train_on_batch_and_get_deltas(
-                batch_xs, batch_ys, verbose)
-            t1 = time.time()
-            if not is_warmup_period:
-                self.set_new_weights(deltas, num_replicas)
-                t2 = time.time()
-                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
-                curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
-                # g.print_unique(self.model.get_weights()[0][0][:4])
-                loss_averager.add_val(curr_loss)
-                ave_loss = loss_averager.get_ave()
-                eta = self.estimate_remaining_time(
-                    t0 - t_start, self.num_so_far - self.epoch*num_total,
-                    num_total)
-                write_str = (
-                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
-                        self.task_index, step, eta, 1.0*self.num_so_far,
+                    self.comm.Barrier()
+                    sys.stdout.flush()
+                    # TODO(KGF): check line feed/carriage returns around this
+                    g.print_unique('\nCompilation finished in {:.2f}s'.format(
+                        time.time() - t0_comp))
+                    t_start = time.time()
+                    sys.stdout.flush()
+
+                if np.any(batches_to_reset):
+                    reset_states(self.model, batches_to_reset)
+                if ('noise' in self.conf['training'].keys()
+                        and self.conf['training']['noise'] is not False):
+                    batch_xs = self.add_noise(batch_xs)
+                t0 = time.time()
+                deltas, loss = self.train_on_batch_and_get_deltas(
+                    batch_xs, batch_ys, verbose)
+                t1 = time.time()
+                if not is_warmup_period:
+                    self.set_new_weights(deltas, num_replicas)
+                    t2 = time.time()
+                    write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
+                    curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
+                    # g.print_unique(self.model.get_weights()[0][0][:4])
+                    loss_averager.add_val(curr_loss)
+                    ave_loss = loss_averager.get_ave()
+                    eta = self.estimate_remaining_time(
+                        t0 - t_start, self.num_so_far - self.epoch*num_total,
                         num_total)
-                    + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
-                    + 'walltime: {:.4f} | '.format(
-                        time.time() - self.start_time))
-                g.write_unique(write_str + write_str_0)
-                step += 1
-            else:
-                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
-                    self.task_index, self.num_so_far))
+                    write_str = (
+                        '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
+                            self.task_index, step, eta, 1.0*self.num_so_far,
+                            num_total)
+                        + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
+                        + 'walltime: {:.4f} | '.format(
+                            time.time() - self.start_time))
+                    g.write_unique(write_str + write_str_0)
+                    step += 1
+                else:
+                    g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
+                        self.task_index, self.num_so_far))
 
         effective_epochs = 1.0*self.num_so_far/num_total
         epoch_previous = self.epoch
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 5bc5e950..111b7fa4 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -14,7 +14,6 @@
 import time
 import sys
 import os
-from pprint import pprint
 
 import numpy as np
 import pathos.multiprocessing as mp
@@ -78,7 +77,6 @@ def preprocess_all(self):
     def preprocess_from_files(self, shot_files, use_shots):
         # all shots, including invalid ones
         all_signals = self.conf['paths']['all_signals']
-        pprint(all_signals)
         shot_list = ShotList()
         shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
         shot_list_picked = shot_list.random_sublist(use_shots)
diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 9ecb921e..ec80eb72 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -61,7 +61,9 @@ def __init__(self, description, paths, machines, tex_label=None,
         self.machines = machines
         if causal_shifts is None:
             causal_shifts = [0 for m in machines]
-        self.causal_shifts = causal_shifts  # causal shift in ms -> (JAR) the causal shifts appear to be supplied in s in signals.py, NOT ms
+        self.causal_shifts = causal_shifts  # causal shift in ms -> NOTE(JAR): 
+                                            # the causal shifts appear to be supplied 
+                                            # in s in signals.py, NOT ms
         self.is_ip = is_ip
         self.num_channels = 1
         self.normalize = normalize
diff --git a/plasma/utils/CallbackList.py b/plasma/utils/CallbackList.py
deleted file mode 100644
index 78e8ff15..00000000
--- a/plasma/utils/CallbackList.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import collections
-
-class CallbackList(object):
-  """Container abstracting a list of callbacks.
-  Arguments:
-      callbacks: List of `Callback` instances.
-      queue_length: Queue length for keeping
-          running statistics over callback execution time.
-  """
-
-  def __init__(self, callbacks=None, queue_length=10):
-    callbacks = callbacks or []
-    self.callbacks = [c for c in callbacks]
-    self.queue_length = queue_length
-    self.params = {}
-    self.model = None
-    self._reset_batch_timing()
-
-  def _reset_batch_timing(self):
-    self._delta_t_batch = 0.
-    self._delta_ts = collections.defaultdict(
-        lambda: collections.deque([], maxlen=self.queue_length))
-
-  def append(self, callback):
-    self.callbacks.append(callback)
-
-  def set_params(self, params):
-    self.params = params
-    for callback in self.callbacks:
-      callback.set_params(params)
-
-  def set_model(self, model):
-    self.model = model
-    for callback in self.callbacks:
-      callback.set_model(model)
-
-  def _call_batch_hook(self, mode, hook, batch, logs=None):
-    """Helper function for all batch_{begin | end} methods."""
-    if not self.callbacks:
-      return
-    hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
-    if hook == 'begin':
-      self._t_enter_batch = time.time()
-    if hook == 'end':
-      # Batch is ending, calculate batch time.
-      self._delta_t_batch = time.time() - self._t_enter_batch
-
-    logs = logs or {}
-    t_before_callbacks = time.time()
-    for callback in self.callbacks:
-      batch_hook = getattr(callback, hook_name)
-      batch_hook(batch, logs)
-    self._delta_ts[hook_name].append(time.time() - t_before_callbacks)
-
-    delta_t_median = np.median(self._delta_ts[hook_name])
-    if (self._delta_t_batch > 0. and
-        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning(
-          'Method (%s) is slow compared '
-          'to the batch update (%f). Check your callbacks.', hook_name,
-          delta_t_median)
-
-  def _call_begin_hook(self, mode):
-    """Helper function for on_{train|test|predict}_begin methods."""
-    if mode == ModeKeys.TRAIN:
-      self.on_train_begin()
-    elif mode == ModeKeys.TEST:
-      self.on_test_begin()
-    else:
-      self.on_predict_begin()
-
-  def _call_end_hook(self, mode):
-    """Helper function for on_{train|test|predict}_end methods."""
-    if mode == ModeKeys.TRAIN:
-      self.on_train_end()
-    elif mode == ModeKeys.TEST:
-      self.on_test_end()
-    else:
-      self.on_predict_end()
-
-  def on_batch_begin(self, batch, logs=None):
-    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
-
-  def on_batch_end(self, batch, logs=None):
-    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
-
-  def on_epoch_begin(self, epoch, logs=None):
-    """Calls the `on_epoch_begin` methods of its callbacks.
-    This function should only be called during TRAIN mode.
-    Arguments:
-        epoch: integer, index of epoch.
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_begin(epoch, logs)
-    self._reset_batch_timing()
-
-  def on_epoch_end(self, epoch, logs=None):
-    """Calls the `on_epoch_end` methods of its callbacks.
-    This function should only be called during TRAIN mode.
-    Arguments:
-        epoch: integer, index of epoch.
-        logs: dict, metric results for this training epoch, and for the
-          validation epoch if validation is performed. Validation result keys
-          are prefixed with `val_`.
-    """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_end(epoch, logs)
-
-  def on_train_batch_begin(self, batch, logs=None):
-    """Calls the `on_train_batch_begin` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Has keys `batch` and `size` representing the current batch
-          number and the size of the batch.
-    """
-    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
-
-  def on_train_batch_end(self, batch, logs=None):
-    """Calls the `on_train_batch_end` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Metric results for this batch.
-    """
-    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
-
-  def on_test_batch_begin(self, batch, logs=None):
-    """Calls the `on_test_batch_begin` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Has keys `batch` and `size` representing the current batch
-          number and the size of the batch.
-    """
-    self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs)
-
-  def on_test_batch_end(self, batch, logs=None):
-    """Calls the `on_test_batch_end` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Metric results for this batch.
-    """
-    self._call_batch_hook(ModeKeys.TEST, 'end', batch, logs=logs)
-
-  def on_predict_batch_begin(self, batch, logs=None):
-    """Calls the `on_predict_batch_begin` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Has keys `batch` and `size` representing the current batch
-          number and the size of the batch.
-    """
-    self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs)
-
-  def on_predict_batch_end(self, batch, logs=None):
-    """Calls the `on_predict_batch_end` methods of its callbacks.
-    Arguments:
-        batch: integer, index of batch within the current epoch.
-        logs: dict. Metric results for this batch.
-    """
-    self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs)
-
-  def on_train_begin(self, logs=None):
-    """Calls the `on_train_begin` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_train_begin(logs)
-
-  def on_train_end(self, logs=None):
-    """Calls the `on_train_end` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_train_end(logs)
-
-  def on_test_begin(self, logs=None):
-    """Calls the `on_test_begin` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_test_begin(logs)
-
-  def on_test_end(self, logs=None):
-    """Calls the `on_test_end` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_test_end(logs)
-
-  def on_predict_begin(self, logs=None):
-    """Calls the 'on_predict_begin` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_predict_begin(logs)
-
-  def on_predict_end(self, logs=None):
-    """Calls the `on_predict_end` methods of its callbacks.
-    Arguments:
-        logs: dict. Currently no data is passed to this argument for this method
-          but that may change in the future.
-    """
-    for callback in self.callbacks:
-      callback.on_predict_end(logs)
-
-  def __iter__(self):
-    return iter(self.callbacks)
\ No newline at end of file
diff --git a/plasma/utils/ECEI.py b/plasma/utils/ECEI.py
index b9d23887..5916646f 100644
--- a/plasma/utils/ECEI.py
+++ b/plasma/utils/ECEI.py
@@ -2,7 +2,8 @@
 The module composed in this file is designed to handle the processing/handling
 and incorporation of electron cyclotron emission imaging data into the FRNN
 disruption prediction software suite. It contains snippets from the rest of
-the FRNN codebase, and therefore is partially redundant.
+the FRNN codebase, and therefore is partially redundant, particularly in the
+utility functions at the top of the file.
 Jesse A Rodriguez, 06/28/2021
 """
 

From 642c0ac6ae14b66080ddfee97b5a191775a19f8e Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 15 Nov 2021 18:00:43 -0600
Subject: [PATCH 41/50] Clarify cuDNN restriction for dropout

---
 plasma/models/builder.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 7f27a319..8e42bbbb 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -350,9 +350,10 @@ def slicer_output_shape(input_shape, indices):
                                 bias_regularizer=l2(regularization),
                                 )
             if rnn_type != 'CuDNNLSTM':
-                # Dropout is unsupported in CuDNN library
-                model_kwargs['dropout'] = dropout_prob
-                model_kwargs['recurrent_dropout'] = dropout_prob
+                # Dropout (on linear transformation of recurrent state) is unsupported
+                # in cuDNN library
+                model_kwargs['recurrent_dropout'] = dropout_prob  # recurrent states
+            model_kwargs['dropout'] = dropout_prob  # input states
             for _ in range(model_conf['rnn_layers']):
                 x_in = rnn_model(rnn_size, **model_kwargs)(x_in)
                 x_in = Dropout(dropout_prob)(x_in)

From 1d91803cea8eb743b98941ba00480f420c1e5189 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 15 Dec 2021 17:43:52 -0600
Subject: [PATCH 42/50] Exclude ECEi from all_signals dict by default, for now

---
 data/signals.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/data/signals.py b/data/signals.py
index 483a056c..695a71d4 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -426,7 +426,9 @@ def fetch_nstx_data(signal_path, shot_num, c):
     # 'neut_profile':neut_profile, 'q_profile':q_profile,
     # 'bootstrap_current_profile':bootstrap_current_profile,
     # 'q_psi_profile':q_psi_profile}
-    'ecei': ecei,
+
+    # KGF(2021-12-15): exclude ecei by default, for now:
+    # 'ecei': ecei,
 }
 
 all_signals_max_tol = {

From db6c1a375ac37a053acb8ab63131e3b75bfe5dc2 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 12 Jan 2022 15:00:20 -0600
Subject: [PATCH 43/50] lr --> learning_rate for Keras optimizers

The `lr` argument was replaced by `learning_rate` all the way back in Sept 2019:
https://github.com/keras-team/keras/releases/tag/2.3.0

`lr` must have recently started emitting deprecation warnings in TF ~2.7.0
---
 plasma/models/mpi_runner.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index f0e235a5..ac15592f 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -279,9 +279,9 @@ def compile(self, optimizer, clipnorm, loss='mse'):
             SGD, Adam, RMSprop, Nadam
             )
         if optimizer == 'sgd':
-            optimizer_class = SGD(lr=self.DUMMY_LR, clipnorm=clipnorm)
+            optimizer_class = SGD(learning_rate=self.DUMMY_LR, clipnorm=clipnorm)
         elif optimizer == 'momentum_sgd':
-            optimizer_class = SGD(lr=self.DUMMY_LR, clipnorm=clipnorm,
+            optimizer_class = SGD(learning_rate=self.DUMMY_LR, clipnorm=clipnorm,
                                   decay=1e-6, momentum=0.9)
         elif optimizer == 'tf_momentum_sgd':
             # TODO(KGF): removed TFOptimizer wrapper from here and below
@@ -290,14 +290,14 @@ def compile(self, optimizer, clipnorm, loss='mse'):
             optimizer_class = tf.train.MomentumOptimizer(
                 learning_rate=self.DUMMY_LR, momentum=0.9)
         elif optimizer == 'adam':
-            optimizer_class = Adam(lr=self.DUMMY_LR, clipnorm=clipnorm)
+            optimizer_class = Adam(learning_rate=self.DUMMY_LR, clipnorm=clipnorm)
         elif optimizer == 'tf_adam':
             optimizer_class = tf.train.AdamOptimizer(
                 learning_rate=self.DUMMY_LR)
         elif optimizer == 'rmsprop':
-            optimizer_class = RMSprop(lr=self.DUMMY_LR, clipnorm=clipnorm)
+            optimizer_class = RMSprop(learning_rate=self.DUMMY_LR, clipnorm=clipnorm)
         elif optimizer == 'nadam':
-            optimizer_class = Nadam(lr=self.DUMMY_LR, clipnorm=clipnorm)
+            optimizer_class = Nadam(learning_rate=self.DUMMY_LR, clipnorm=clipnorm)
         else:
             print("Optimizer not implemented yet")
             exit(1)
@@ -787,7 +787,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     if g.task_index != 0:
         loader.verbose = False
 
-    # MPI loop works by predicting in batches of the 
+    # MPI loop works by predicting in batches of the
     # largest possible multiple of len(shot_sublists) < num_workers
     # i.e. if there are 9 shot_sublists and 4 workers,
     #      worker 0 will predict shot_sublist 0, 4, and 8
@@ -831,17 +831,17 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
             if color ==1:
                 shpz = [max(y.shape) for y in y_prime]
                 max_length = max([max(y.shape) for y in y_p])
-            max_length = g.comm.allreduce(max_length, MPI.MAX) 
+            max_length = g.comm.allreduce(max_length, MPI.MAX)
             if color == 1:
                 y_prime_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_prime])
                 y_gold_numpy = np.stack([np.pad(sublist, pad_width=((0,max_length-max(sublist.shape)),(0,0))) for sublist in y_gold])
-            
+
             temp_predictor_only_comm = MPI.Comm.Split(g.comm, color, i)
             # Create numpy array to store all processors output, then aggregate and unpad using MPI gathered shape list
             shpzg = g.comm.allgather(shpz)
             shpzg = list(itertools.chain(*shpzg))
             shpzg = [s for s in shpzg if s != []]
-            max_length = g.comm.allreduce(max_length, MPI.MAX) 
+            max_length = g.comm.allreduce(max_length, MPI.MAX)
             if color == 1:
                 num_pred = temp_predictor_only_comm.size
             else:
@@ -857,11 +857,11 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 temp_predictor_only_comm.Allgather(y_prime_numpy.flatten(), y_primeg_flattend)
                 temp_predictor_only_comm.Allgather(y_gold_numpy.flatten(), y_goldg_flattend)
             # Process 0 broadcast y_primeg and y_goldg to all processors, including ones
-            # not involved in calculating predictions so they can each create their own 
+            # not involved in calculating predictions so they can each create their own
             # y_prime_global and y_gold_global
             g.comm.Barrier()
             g.comm.Bcast(y_primeg_flattend, root=0)
-            g.comm.Bcast(y_goldg_flattend, root=0) 
+            g.comm.Bcast(y_goldg_flattend, root=0)
             y_primeg_flattend = np.split(y_primeg_flattend, num_pred)
             y_goldg_flattend = np.split(y_goldg_flattend, num_pred)
             y_primeg = [y.reshape((conf['model']['pred_batch_size'], max_length, 1)) for y in y_primeg_flattend]
@@ -878,7 +878,7 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
                 g.comm.allgather(disruptive))
             y_prime = []
             y_gold = []
-            disruptive = [] 
+            disruptive = []
             color = 2
             temp_predictor_only_comm.Free()
 

From 93badd2155979c18faae78cfa715fd7fe6304ecb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 28 Mar 2022 23:05:43 +0000
Subject: [PATCH 44/50] Port changes from Ge's 1+yr old copy of the code w/ RT
 signals

---
 data/signals.py       | 71 ++++++++++++++++++++++++++++++++++++++++---
 plasma/conf_parser.py | 13 +++++---
 2 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/data/signals.py b/data/signals.py
index 695a71d4..386c3659 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -298,6 +298,16 @@ def fetch_nstx_data(signal_path, shot_num, c):
     "q95 safety factor", ['ppf/efit/q95', "EFIT01/RESULTS.AEQDSK.Q95"],
     [jet, d3d], causal_shifts=[15, 10], normalize=False,
     data_avail_tolerances=[0.03, 0.02])
+
+q95_EFITRT1 = Signal(
+    "q95 safety factor in real time", ['ppf/efit/q95', "EFITRT1/RESULTS.AEQDSK.Q95"],
+    [jet, d3d], causal_shifts=[0, 0], normalize=False,
+    data_avail_tolerances=[0.03, 0.029])
+vd = Signal(
+    "vertical displacement change", [ "d3d/vpsdfz"],
+    [d3d], causal_shifts=[ 0], normalize=False,
+    data_avail_tolerances=[ 0.029])
+
 q95t = Signal(
     "q95 safety factor tol", ['ppf/efit/q95', "EFIT01/RESULTS.AEQDSK.Q95"],
     [jet, d3d], causal_shifts=[15, 10], normalize=False,
@@ -406,11 +416,24 @@ def fetch_nstx_data(signal_path, shot_num, c):
 # dataset.
 
 all_signals = {
-    'q95': q95, 'li': li, 'ip': ip, 'betan': betan, 'energy': energy, 'lm': lm,
-    'dens': dens, 'pradcore': pradcore,
-    'pradedge': pradedge, 'pradtot': pradtot, 'pin': pin,
+    'q95_EFITRT1': q95_EFITRT1,
+    #'q95': q95,
+    'li': li,
+    'ip': ip,
+    'betan': betan,
+    'energy': energy,
+    'lm': lm,
+    'dens': dens,
+    # KGF: Ge leaves these uncommented in all_signals_real_time_0D
+    #'pradcore': pradcore,
+    #'pradedge': pradedge,
+    #'pradtot': pradtot,
+    #'energydt': energydt,
+    # 'vd': vd,
+    'pin': pin,
     'torquein': torquein,
-    'energydt': energydt, 'ipdirect': ipdirect, 'iptarget': iptarget,
+    'ipdirect': ipdirect,
+    'iptarget': iptarget,
     'iperr': iperr,
     # 'tmamp1':tmamp1, 'tmamp2':tmamp2, 'tmfreq1':tmfreq1, 'tmfreq2':tmfreq2,
     # 'pechin':pechin,
@@ -420,7 +443,7 @@ def fetch_nstx_data(signal_path, shot_num, c):
     # IMPORTANT: must comment-out the following line when preprocessing for
     # training on JET CW and testing on JET ILW (FRNN 0D).
     # Otherwise 1K+ CW shots are excluded due to missing profile data
-    'etemp_profile': etemp_profile, 'edens_profile': edens_profile,
+    #'etemp_profile': etemp_profile, 'edens_profile': edens_profile,
     # 'itemp_profile':itemp_profile, 'zdens_profile':zdens_profile,
     # 'trot_profile':trot_profile, 'pthm_profile':pthm_profile,
     # 'neut_profile':neut_profile, 'q_profile':q_profile,
@@ -431,6 +454,44 @@ def fetch_nstx_data(signal_path, shot_num, c):
     # 'ecei': ecei,
 }
 
+# ---------------------
+# KGF: from Ge's 2019-2020 code
+all_signals_real_time={
+    'q95_EFITRT1': q95_EFITRT1, 'li': li, 'ip': ip, 'betan': betan, 'energy': energy, 'lm': lm,
+    'dens': dens, 'pradcore': pradcore,
+    'pradedge': pradedge, 'pradtot': pradtot, 'pin': pin,
+    'torquein': torquein,
+    'energydt': energydt, 'ipdirect': ipdirect, 'iptarget': iptarget,
+    'iperr': iperr,
+    # 'tmamp1':tmamp1, 'tmamp2':tmamp2, 'tmfreq1':tmfreq1, 'tmfreq2':tmfreq2,
+    # 'pechin':pechin,
+    # 'rho_profile_spatial':rho_profile_spatial, 'etemp':etemp,
+    'etemp_profile': etemp_profile, 'edens_profile': edens_profile,
+}
+all_signals_real_time_0D={
+    'q95_EFITRT1': q95_EFITRT1,
+    'li': li,
+    'ip': ip,
+    'betan': betan,
+    'energy': energy,
+    'lm': lm,
+    'dens': dens,
+    'pradcore': pradcore,
+    'pradedge': pradedge,
+    'pradtot': pradtot,
+    'pin': pin,
+    'torquein': torquein,
+    'vd': vd,
+    'energydt': energydt,
+    'iperr':iperr,
+    'ipdirect': ipdirect,
+    'iptarget': iptarget
+    # 'tmamp1':tmamp1, 'tmamp2':tmamp2, 'tmfreq1':tmfreq1, 'tmfreq2':tmfreq2,
+    # 'pechin':pechin,
+    # 'rho_profile_spatial':rho_profile_spatial, 'etemp':etemp,
+}
+# ---------------------
+
 all_signals_max_tol = {
     'q95t': q95t, 'lit': lit, 'ipt': ipt, 'betant': betant,
     'energyt': energyt, 'lmt': lmt,
diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index e9c4f53b..cd86386f 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -371,19 +371,24 @@ def parameters(input_file):
                 'q_psi_profile': sig.q_psi_profile,
             }
         elif params['paths']['data'] == 'd3d_0D':
-            params['paths']['shot_files'] = [d3d_full]
+            #params['paths']['shot_files'] = [d3d_full]
+            params['paths']['shot_files'] = [d3d_full_2019]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
-                'q95': sig.q95,
+                # KGF: edits for real-time signals
+                #'q95': sig.q95,
+                'q95_EFITRT1': sig.q95_EFITRT1,
                 'li': sig.li,
                 'ip': sig.ip,
                 'lm': sig.lm,
                 'betan': sig.betan,
                 'energy': sig.energy,
                 'dens': sig.dens,
-                'pradcore': sig.pradcore,
-                'pradedge': sig.pradedge,
+                # 'pradcore': sig.pradcore,
+                # 'pradedge': sig.pradedge,
                 'pin': sig.pin,
+                # KGF: added but commented out in Ge's version
+                # 'vd': sig.vd,
                 'torquein': sig.torquein,
                 'ipdirect': sig.ipdirect,
                 'iptarget': sig.iptarget,

From c7ae355580b6d0508c68257f6d0651cf46688b27 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 28 Mar 2022 21:21:30 -0500
Subject: [PATCH 45/50] Rip out keras2onnx; deprecated in favor of tf2onnx

Doesn't even work in TF >= 2.4
---
 plasma/models/builder.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 8e42bbbb..a38dff5d 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -29,32 +29,15 @@
 from plasma.utils.hashing import general_object_hash
 from plasma.models.tcn import TCN
 # TODO(KGF): consider using importlib.util.find_spec() instead (Py>3.4)
-try:
-    import keras2onnx
-    import onnx
-except ImportError:  # as e:
-    _has_keras2onnx = False
-    # onnx = None
-    # keras2onnx = None
-else:
-    _has_keras2onnx = True
-
 try:
     import tf2onnx
     import onnx
     # CLI: python -m tf2onnx.convert --saved-model model.97765202633820900403308121179367157713._epoch_.0 --output frnn-1D.onnx
 except ImportError:  # as e:
     _has_tf2onnx = False
-    # onnx = None
-    # keras2onnx = None
 else:
     _has_tf2onnx = True
 
-# TODO(KGF): both conversion tools not working with current network and TF version
-_has_tf2onnx = False
-_has_keras2onnx = False
-
-
 # Synchronize 2x stderr msg from TensorFlow initialization via Keras backend
 # "Succesfully opened dynamic library... libcudart" "Using TensorFlow backend."
 if g.comm is not None:
@@ -419,14 +402,6 @@ def save_model_weights(self, model, epoch):
                 opset=10, output_path=save_path)
             # KGF: error likely due to the splitting of pre_rnn_model and rnn_model, since
             # the latter expects a trivially-wrapped TimeDistributed(Input()) for 0D model
-        if _has_keras2onnx:
-            save_path = self.get_save_path(epoch, ext='onnx')
-            # TODO(KGF): keras2onnx broken in TF >=2.4
-            # https://github.com/onnx/keras-onnx/issues/651
-
-            onnx_model = keras2onnx.convert_keras(model, model.name,
-                                                  target_opset=10)
-            onnx.save_model(onnx_model, save_path)
         # except Exception as e:
         #     print(e)
         return

From 947f849c3cf005c398c0c1bb17998ca35bc16693 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 29 Mar 2022 02:33:59 +0000
Subject: [PATCH 46/50] Don't reset states if stateful=False

---
 plasma/models/mpi_runner.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index ac15592f..08ee2516 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -610,7 +610,8 @@ def train_epoch(self):
                     t_start = time.time()
                     sys.stdout.flush()
 
-                if np.any(batches_to_reset):
+                if np.any(batches_to_reset) and self.conf['model']['stateful']:
+                    print(f"KGF batches_to_reset = {batches_to_reset}")
                     reset_states(self.model, batches_to_reset)
                 if ('noise' in self.conf['training'].keys()
                         and self.conf['training']['noise'] is not False):
@@ -626,7 +627,7 @@ def train_epoch(self):
                 t_start = time.time()
                 sys.stdout.flush()
 
-            if np.any(batches_to_reset):
+            if np.any(batches_to_reset) and self.conf['model']['stateful']:
                 reset_states(self.model, batches_to_reset)
             if ('noise' in self.conf['training'].keys()
                     and self.conf['training']['noise'] is not False):

From e6b1f728f16a820dad225ac3db769a3a783c4fb9 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 29 Mar 2022 02:41:49 +0000
Subject: [PATCH 47/50] Reset batch size dim to unspec during tf2onnx export

---
 plasma/models/builder.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index a38dff5d..d62355f4 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -147,7 +147,18 @@ def build_model(self, predict, custom_batch_size=None):
             print('Unkown Model Type, exiting.')
             exit(1)
 
+        # KGF: key line
         batch_input_shape = (batch_size, length, num_signals)
+
+        # TODO(KGF): need a more substantial redesign of conf to support a variable
+        # training batch size when stateful=False, since even then we need a fixed
+        # conf['training']['batch_size'] to form the inputs to the variable-batch model
+
+        # For now, just reset batch_input_shape=(None, ...) at tf2onnx export time below
+        #batch_input_shape = (None, length, num_signals)
+
+
+
         # batch_shape_non_temporal = (batch_size, num_signals)
 
         indices_0d, indices_1d, num_0D, num_1D = self.get_0D_1D_indices()
@@ -288,8 +299,9 @@ def slicer_output_shape(input_shape, indices):
         #     pre_rnn_model.summary()
         #     sys.stdout = ori
         #     fr.close()
-        if g.task_index == 0:
-            pre_rnn_model.summary()
+
+        # if g.task_index == 0:
+        #     pre_rnn_model.summary()
         x_input = Input(batch_shape=batch_input_shape)
         if (num_1D > 0 or (
                 'extra_dense_input' in model_conf.keys()
@@ -392,8 +404,8 @@ def save_model_weights(self, model, epoch):
             batch_size = self.conf['training']['batch_size']
             use_signals = self.conf['paths']['use_signals']
             num_signals = sum([sig.num_channels for sig in use_signals])
-            batch_input_shape = (batch_size, length, num_signals)
-            print(f"batch_input_shape = {batch_input_shape}")
+            batch_input_shape = (None, length, num_signals)
+            ########print(f"batch_input_shape = {batch_input_shape}")
             # ValueError: Input 0 of node model_1/lstm/AssignVariableOp was passed float
             # from model_1/lstm/lstm_cell/ones_like_1/ReadVariableOp/resource:0
             # incompatible with expected resource.

From fd499cd1a6545ae3821f6c479154e2931d258191 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 29 Mar 2022 02:42:49 +0000
Subject: [PATCH 48/50] Change default conf.yaml to something suitable to
 latency bench

Training a wide variety of dummy models only 1 epoch each
on ThetaGPU
---
 examples/conf.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index a4992811..fbeb1066 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -9,14 +9,14 @@
 # will output csvlog, trained model checkpoints, etc.
 # in fs_path_output / [username] / results | csv_logs | model_checkpoints | Graph, etc.
 
-fs_path: '/Users/'
+fs_path: '/lus/theta-fs0/projects/fusiondl_aesp/'
 user_subdir: True
-fs_path_output: '/Users/'
+fs_path_output: '/lus/theta-fs0/projects/fusiondl_aesp/'
 user_subdir_output: True
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 1  # per node
 paths:
-  signal_prepath: '/signal_data/' # /signal_data/jet/
+  signal_prepath: ['/signal_data/', '/signal_data_new_nov2019/'] # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
   tensorboard_save_path: '/Graph/'
   data: d3d_0D
@@ -95,7 +95,7 @@ model:
   # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100.
   # Prediction is much better with size 100, size 20 cannot capture the data.
   rnn_size: 200
-  rnn_type: 'LSTM'
+  rnn_type: 'CuDNNLSTM'
   # TODO(KGF): optimize number of RNN layers
   rnn_layers: 2
   num_conv_filters: 128
@@ -114,7 +114,7 @@ model:
   # lr=1e-4 also works well if we decay a lot (i.e ~0.7 or more)
   lr: 0.00002 # 0.00001 # 0.0005 # for adam plots 0.0000001
   lr_decay: 0.97 # 0.98 # 0.9
-  stateful: True
+  stateful: False
   return_sequences: True
   dropout_prob: 0.1
   # only relevant if we want to do MPI training. The number of steps with a single replica
@@ -128,14 +128,14 @@ training:
   # used iff 1) test & 2) (train U validate) are both sampled from the same distribution/source lists of shots:
   train_frac: 0.75
   validation_frac: 0.3333333333333333
-  batch_size: 128 # 256
+  batch_size: 1024
   # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
   max_patch_length: 100000
   # How many shots are we loading at once?
   num_shots_at_once: 200
   # large number = maximum number of epochs.
   # Early stopping will occur if loss does not decrease, after some patience # of epochs
-  num_epochs: 1000
+  num_epochs: 1
   use_mock_data: False
   data_parallel: False
   hyperparam_tuning: False
@@ -144,7 +144,7 @@ training:
   num_batches_minimum: 20 # minimum number of batches per epoch
   ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
   timeline_prof: False
-  step_limit: 50
+  step_limit: 0
   no_validation: True
 callbacks:
   list: ['earlystop']

From f50e5b74c8fa437607078f777a969bf4568fded2 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Tue, 29 Mar 2022 02:43:39 +0000
Subject: [PATCH 49/50] Add Bash script for training hundreds of frnn_0d onnx
 models

---
 examples/generate_onnx.sh | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100755 examples/generate_onnx.sh

diff --git a/examples/generate_onnx.sh b/examples/generate_onnx.sh
new file mode 100755
index 00000000..1510302e
--- /dev/null
+++ b/examples/generate_onnx.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/bash -l
+
+module load conda/2021-11-30; conda activate
+
+length=(32)
+rnn_size=(300)
+rnn_layers=(6)
+
+for l in ${length[*]}
+do
+    for size in ${rnn_size[*]}
+    do
+	for nlayer in ${rnn_layers[*]}
+	do
+	    cd /home/felker/plasma-python/examples
+	    echo "STARTING bs_dynamic_layers${nlayer}_length${l}_size${size}.onnx"
+	    sed -i "91 c\  length: ${l}" conf.yaml
+	    sed -i "97 c\  rnn_size: ${size}" conf.yaml
+	    sed -i "100 c\  rnn_layers: ${nlayer}" conf.yaml
+	    python mpi_learn.py
+	    mv /lus/theta-fs0/projects/fusiondl_aesp/felker/model_checkpoints/*.onnx ~/bs_dynamic_layers${nlayer}_length${l}_size${size}.onnx
+	    rm -rfd /lus/theta-fs0/projects/fusiondl_aesp/felker/model_checkpoints/*
+	    echo "FINISHED bs_dynamic_layers${nlayer}_length${l}_size${size}.onnx"
+	done
+    done
+done

From 258967851014d6bea8fed98e085a9e8a5b710e53 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 23 May 2022 16:01:34 -0500
Subject: [PATCH 50/50] Remove duplicate model compilation step

Move main sub-epoch training loop under context manager
---
 plasma/models/mpi_runner.py | 62 ++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 08ee2516..4b01b296 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -619,46 +619,30 @@ def train_epoch(self):
                 t0 = time.time()
                 deltas, loss = self.train_on_batch_and_get_deltas(
                     batch_xs, batch_ys, verbose)
-                self.comm.Barrier()
-                sys.stdout.flush()
-                # TODO(KGF): check line feed/carriage returns around this
-                g.print_unique('\nCompilation finished in {:.2f}s'.format(
-                    time.time() - t0_comp))
-                t_start = time.time()
-                sys.stdout.flush()
-
-            if np.any(batches_to_reset) and self.conf['model']['stateful']:
-                reset_states(self.model, batches_to_reset)
-            if ('noise' in self.conf['training'].keys()
-                    and self.conf['training']['noise'] is not False):
-                batch_xs = self.add_noise(batch_xs)
-            t0 = time.time()
-            deltas, loss = self.train_on_batch_and_get_deltas(
-                batch_xs, batch_ys, verbose)
-            t1 = time.time()
-            if not is_warmup_period:
-                self.set_new_weights(deltas, num_replicas)
-                t2 = time.time()
-                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
-                curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
-                # g.print_unique(self.model.get_weights()[0][0][:4])
-                loss_averager.add_val(curr_loss)
-                ave_loss = loss_averager.get_ave()
-                eta = self.estimate_remaining_time(
-                    t0 - t_start, self.num_so_far - self.epoch*num_total,
-                    num_total)
-                write_str = (
-                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
-                        self.task_index, step, eta, 1.0*self.num_so_far,
+                t1 = time.time()
+                if not is_warmup_period:
+                    self.set_new_weights(deltas, num_replicas)
+                    t2 = time.time()
+                    write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
+                    curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
+                    # g.print_unique(self.model.get_weights()[0][0][:4])
+                    loss_averager.add_val(curr_loss)
+                    ave_loss = loss_averager.get_ave()
+                    eta = self.estimate_remaining_time(
+                        t0 - t_start, self.num_so_far - self.epoch*num_total,
                         num_total)
-                    + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
-                    + 'walltime: {:.4f} | '.format(
-                        time.time() - self.start_time))
-                g.write_unique(write_str + write_str_0)
-                step += 1
-            else:
-                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
-                    self.task_index, self.num_so_far))
+                    write_str = (
+                        '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
+                            self.task_index, step, eta, 1.0*self.num_so_far,
+                            num_total)
+                        + 'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss)
+                        + 'walltime: {:.4f} | '.format(
+                            time.time() - self.start_time))
+                    g.write_unique(write_str + write_str_0)
+                    step += 1
+                else:
+                    g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
+                        self.task_index, self.num_so_far))
 
         effective_epochs = 1.0*self.num_so_far/num_total
         epoch_previous = self.epoch