From b00ae12250c365a60fc28492e9bb7b9c54b9d491 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Tue, 27 Sep 2022 15:13:36 +1300
Subject: [PATCH 1/5] cluster docs stuff

---
 docs/cluster/PYMECluster.rst       |  11 +++
 docs/cluster/clusterFilesystem.rst |   2 +-
 docs/cluster/cluster_compute.rst   | 131 +++++++++++++++++++++++++++++
 docs/index.rst                     |   1 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 docs/cluster/PYMECluster.rst
 create mode 100644 docs/cluster/cluster_compute.rst
diff --git a/docs/cluster/PYMECluster.rst b/docs/cluster/PYMECluster.rst
new file mode 100644
index 000000000..7761937cc
--- /dev/null
+++ b/docs/cluster/PYMECluster.rst
@@ -0,0 +1,11 @@
+The PYME Storage and compute cluster architecture
+*************************************************
+
+
+.. toctree::
+   :maxdepth: 1
+
+   cluster_install
+   clusterFilesystem
+   cluster_compute
+   cluster_ui
\ No newline at end of file
diff --git a/docs/cluster/clusterFilesystem.rst b/docs/cluster/clusterFilesystem.rst
index 2761870b1..49c8ee940 100644
--- a/docs/cluster/clusterFilesystem.rst
+++ b/docs/cluster/clusterFilesystem.rst
@@ -44,7 +44,7 @@ Startup / installation
 
 - install PYME as a python package
 - run :py:mod:`PYMEDataServer <PYME.cluster.HTTPDataServer>` (either using :program:`PYMEClusterOfOne`, or launching individually on each
-  cluster node after following the `configuration instructions <_clusterinstall>`_ :ref:`clusterinstall_` )
+  cluster node after following the `configuration instructions <cluster_install.html>`_ )
 
 Data streaming
 ==============
diff --git a/docs/cluster/cluster_compute.rst b/docs/cluster/cluster_compute.rst
new file mode 100644
index 000000000..8dfff5976
--- /dev/null
+++ b/docs/cluster/cluster_compute.rst
@@ -0,0 +1,131 @@
+Distributed computation on the PYME Cluster
+*******************************************
+
+Compute tasks
+=============
+
+Localisation tasks
+------------------
+
+
+"Recipe" tasks
+--------------
+
+
+Rule-based task scheduler
+=========================
+
+PYME takes a different approach to task scheduling than many 
+common cluster schedulers. Rather than centrally definining individual tasks 
+and distribututing these across cluster nodes from a central server, 
+we define so-called "**Rules**" which decribe how to generate tasks.
+Each cluster node contacts the central server for a copy of the currently
+active rules, generates candidate tasks based on the rules, scores these
+based on node capability and whether the required data is available
+locally on the that node, and then submits a bid to the central server 
+specifying which tasks it want's to perform and at what cost. The central 
+server then compares the bids and replies with a set of tasks indices for
+which the node has won the bid.
+
+This architecture solves a number of problems faced when trying to distribute
+a very large number (thousands per second) of small tasks. It has 3 main 
+advantages: 
+
+- It dramatically reduces the network traffic required for task distribution, 
+with only the rule and integer task IDs transmitted between nodes.
+- It distributes the work of assigning the tasks across the entire cluster. 
+Most notably, it allows data locality checking to be performed locally. 
+*"Do I have this file?"* turns out out to be a **much, much** cheaper question than 
+*"Which node has this file?"*. 
+- It reduces memory usage in storing task descriptions, with task-descriptions
+only generated for enough tasks to full each nodes input queue. This is 
+critically important when, e.g. re-analysing streamed data, where a 
+conventional task allocation strategy would lead to the generation of 200,000,000 
+tasks for a day's worth of streamed image frames. 
+
+
+High-level rule API
+-------------------
+
+Recipe Rules
+''''''''''''
+
+Recipe rules can be used apply a single :ref:`recipe <recipes>` to multiple 
+files in parallel. A ``RecipeRule`` expects recipe text, an output directory
+path that is relative to the PYME cluster's root data directory and at least
+one input. The recipe text can alternatively be passed via a cluster URI that
+points to recipe text.
+
+For example, we can create a recipe that will simulate random points and save
+them in an output directory of our choice.
+
+::
+    recipe_text= '''
+    - simulation.RandomPoints:
+        output: points
+    - output.HDFOutput:
+        filePattern: '{output_dir}/test.hdf'
+        inputVariables:
+        points: points
+        scheme: pyme-cluster://
+    '''
+
+.. note::
+
+    ``scheme: pyme-cluster://`` puts the output into the cluster file system so 
+    that it is accessible to subsequent recipe runs etc ... This is typically 
+    ``~/PYMEData`` in a :ref:`cluster-of-one <localisationanalysis>` scenario 
+    unless otherwise specified through `PYME.config <api/PYME.config>` It also 
+    routes all IO through the dataserver, avoiding IO conflicts.
+
+    To concatenate the results from multiple operations into a single table, 
+    use ``scheme: pyme-cluster:// - aggregate``.
+
+Recipe tasks are generated for each input in the rule - for simulation we use 
+the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an 
+arbitrary string, and is propagated to the ``sim_tag`` context variable which 
+can be used in the output `filePattern`.
+
+::
+    from PYME.cluster import rules
+
+    r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']})
+    r.push()
+
+Inputs are specified as a dictionary, where the key is passed in as the name of 
+the input datasource, for direct use within the recipe, and the value must be a 
+*list* of data sources.  ``__sim`` is a special case where no input is used in 
+the recipe. If, instead, we wanted to create a recipe that read in a data set, 
+created a surface, and then saved this surface to file, we would execute the 
+following.
+
+::
+    recipe_text = '''
+    - pointcloud.Octree:
+        input_localizations: shape_shape
+        output_octree: octree
+    - surface_fitting.DualMarchingCubes:
+        input: octree
+        output: mesh
+        remesh: true
+    - output.STLOutput:
+        filePattern: '{{output_dir}}/my_surface.stl'
+        inputName: membrane
+        scheme: pyme-cluster://
+    '''
+
+    rule = RecipeRule(recipe=recipe_text, output_dir=output_dir, 
+                      inputs={'shape': [f'pyme-cluster:///{output_dir}/shape.hdf']})
+
+    rule.push()
+
+Note that in this case, an HDF file is passed as input. This is opened in the
+pipeline as a data source with name ``<input_name>_<table_name>``. In this 
+case, the table name is also named ``shape`` and so ``input_localizations`` in 
+the recipe is named ``shape_shape``.
+
+
+
+Low level API
+-------------
+
diff --git a/docs/index.rst b/docs/index.rst
index f6f0bdee0..aa129d8f3 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -30,6 +30,7 @@ particular could use a lot of work. Hopefully it's enough to get you started.
    PYMEVis
    component_programs
    LocalisationAnalysis
+   cluster/PYMECluster
 
 
 

From 977bf3b438f73017d94f4fe1990492d70221e935 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Tue, 27 Sep 2022 15:14:27 +1300
Subject: [PATCH 2/5] removing rules.rst (contents merged into cluster_compute)

---
 docs/cluster/rules.rst | 88 ------------------------------------------
 1 file changed, 88 deletions(-)
 delete mode 100644 docs/cluster/rules.rst

diff --git a/docs/cluster/rules.rst b/docs/cluster/rules.rst
deleted file mode 100644
index 47ba5a375..000000000
--- a/docs/cluster/rules.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-Using PYME's cluster rules
-***************************
-
-**Rules** are essentially a json template which defines how tasks may be 
-generated (by substitution into the template on the client side). Generation of
-individual tasks on the client has the dual benefits of a) reducing the CPU 
-load and memory requirements on the server and b) dramatically decreasing the 
-network bandwidth used for task distribution.
-
-The rules API, defined in ``PYME.cluster.rules.py``, can be used to distribute
-tasks programatically on the PYME cluster from third-party software.
-
-Recipe Rules
-============
-
-Recipe rules can be used apply a single :ref:`recipe <recipes>` to multiple 
-files in parallel. A ``RecipeRule`` expects recipe text, an output directory
-path that is relative to the PYME cluster's root data directory and at least
-one input. The recipe text can alternatively be passed via a cluster URI that
-points to recipe text.
-
-For example, we can create a recipe that will simulate random points and save
-them in an output directory of our choice.
-
-::
-    recipe_text= '''
-    - simulation.RandomPoints:
-        output: points
-    - output.HDFOutput:
-        filePattern: '{output_dir}/test.hdf'
-        inputVariables:
-        points: points
-        scheme: pyme-cluster://
-    '''
-
-.. note::
-
-    ``scheme: pyme-cluster://`` puts the output into the cluster file system so 
-    that it is accessible to subsequent recipe runs etc ... This is typically 
-    ``~/PYMEData`` in a :ref:`cluster-of-one <localisationanalysis>` scenario 
-    unless otherwise specified through `PYME.config <api/PYME.config>` It also 
-    routes all IO through the dataserver, avoiding IO conflicts.
-
-    To concatenate the results from multiple operations into a single table, 
-    use ``scheme: pyme-cluster:// - aggregate``.
-
-Recipe tasks are generated for each input in the rule - for simulation we use 
-the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an 
-arbitrary string, and is propagated to the ``sim_tag`` context variable which 
-can be used in the output `filePattern`.
-
-::
-    from PYME.cluster import rules
-
-    r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']})
-    r.push()
-
-Inputs are specified as a dictionary, where the key is passed in as the name of 
-the input datasource, for direct use within the recipe, and the value must be a 
-*list* of data sources.  ``__sim`` is a special case where no input is used in 
-the recipe. If, instead, we wanted to create a recipe that read in a data set, 
-created a surface, and then saved this surface to file, we would execute the 
-following.
-
-::
-    recipe_text = '''
-    - pointcloud.Octree:
-        input_localizations: shape_shape
-        output_octree: octree
-    - surface_fitting.DualMarchingCubes:
-        input: octree
-        output: mesh
-        remesh: true
-    - output.STLOutput:
-        filePattern: '{{output_dir}}/my_surface.stl'
-        inputName: membrane
-        scheme: pyme-cluster://
-    '''
-
-    rule = RecipeRule(recipe=recipe_text, output_dir=output_dir, 
-                      inputs={'shape': [f'pyme-cluster:///{output_dir}/shape.hdf']})
-
-    rule.push()
-
-Note that in this case, an HDF file is passed as input. This is opened in the
-pipeline as a data source with name ``<input_name>_<table_name>``. In this 
-case, the table name is also named ``shape`` and so ``input_localizations`` in 
-the recipe is named ``shape_shape``.

From 882f9ae7b3598052a2203516453b035d4059ad6b Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Thu, 29 Sep 2022 09:49:59 +1300
Subject: [PATCH 3/5] more doc fixes

---
 docs/cluster/clusterFilesystem.rst |  2 +-
 docs/cluster/cluster_compute.rst   | 42 ++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/docs/cluster/clusterFilesystem.rst b/docs/cluster/clusterFilesystem.rst
index 49c8ee940..298e42633 100644
--- a/docs/cluster/clusterFilesystem.rst
+++ b/docs/cluster/clusterFilesystem.rst
@@ -44,7 +44,7 @@ Startup / installation
 
 - install PYME as a python package
 - run :py:mod:`PYMEDataServer <PYME.cluster.HTTPDataServer>` (either using :program:`PYMEClusterOfOne`, or launching individually on each
-  cluster node after following the `configuration instructions <cluster_install.html>`_ )
+  cluster node after following the :ref:`configuration instructions <clusterinstall>` )
 
 Data streaming
 ==============
diff --git a/docs/cluster/cluster_compute.rst b/docs/cluster/cluster_compute.rst
index 8dfff5976..7554d358a 100644
--- a/docs/cluster/cluster_compute.rst
+++ b/docs/cluster/cluster_compute.rst
@@ -1,16 +1,33 @@
 Distributed computation on the PYME Cluster
 *******************************************
 
+The PYME cluster architecture contains tools for distributed analysis of 
+imaging data in a "data-local" manner where computation is performed on
+the nodes housing the data, avoiding IO overhead which would otherwise limit
+performance.
+
 Compute tasks
 =============
 
+We currently support two types of computational tasks (below), although the
+framework is designed to allow additional task types to easily be added in 
+the future. 
+
 Localisation tasks
 ------------------
 
+These are a localisation-microscopy specific task type optimised for
+performing single-molecule localisation on a single frame of an image series.
+A wide range of different localisation algorithms are supported (for more details see :ref:`localisationanalysis`)
 
 "Recipe" tasks
 --------------
 
+These are a more generic task type which supports a range of different image
+and data-processing functions specified by combining analysis building blocks
+into :ref:`recipes <recipes>`. Recipes are specified using a .yaml text format,
+and can either be created manually or using the recipe editor.
+
 
 Rule-based task scheduler
 =========================
@@ -32,16 +49,16 @@ a very large number (thousands per second) of small tasks. It has 3 main
 advantages: 
 
 - It dramatically reduces the network traffic required for task distribution, 
-with only the rule and integer task IDs transmitted between nodes.
+  with only the rule and integer task IDs transmitted between nodes.
 - It distributes the work of assigning the tasks across the entire cluster. 
-Most notably, it allows data locality checking to be performed locally. 
-*"Do I have this file?"* turns out out to be a **much, much** cheaper question than 
-*"Which node has this file?"*. 
+  Most notably, it allows data locality checking to be performed locally. 
+  *"Do I have this file?"* turns out out to be a **much, much** cheaper question than 
+  *"Which node has this file?"*. 
 - It reduces memory usage in storing task descriptions, with task-descriptions
-only generated for enough tasks to full each nodes input queue. This is 
-critically important when, e.g. re-analysing streamed data, where a 
-conventional task allocation strategy would lead to the generation of 200,000,000 
-tasks for a day's worth of streamed image frames. 
+  only generated for enough tasks to full each nodes input queue. This is 
+  critically important when, e.g. re-analysing streamed data, where a 
+  conventional task allocation strategy would lead to the generation of 200,000,000 
+  tasks for a day's worth of streamed image frames. 
 
 
 High-level rule API
@@ -59,7 +76,8 @@ points to recipe text.
 For example, we can create a recipe that will simulate random points and save
 them in an output directory of our choice.
 
-::
+.. code-block:: python
+
     recipe_text= '''
     - simulation.RandomPoints:
         output: points
@@ -86,7 +104,8 @@ the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an
 arbitrary string, and is propagated to the ``sim_tag`` context variable which 
 can be used in the output `filePattern`.
 
-::
+.. code-block:: python
+
     from PYME.cluster import rules
 
     r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']})
@@ -99,7 +118,8 @@ the recipe. If, instead, we wanted to create a recipe that read in a data set,
 created a surface, and then saved this surface to file, we would execute the 
 following.
 
-::
+.. code-block:: python
+
     recipe_text = '''
     - pointcloud.Octree:
         input_localizations: shape_shape

From 444d2d4c27bcf986fa4737f32db2cacd4a26d60d Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Fri, 30 Sep 2022 10:31:02 +1300
Subject: [PATCH 4/5] fix sat threshold regression

---
 PYME/Acquire/Hardware/Camera.py            | 7 ++++++-
 PYME/Acquire/Hardware/Simulator/fakeCam.py | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/PYME/Acquire/Hardware/Camera.py b/PYME/Acquire/Hardware/Camera.py
index 46a82f0cf..9396e8bcf 100644
--- a/PYME/Acquire/Hardware/Camera.py
+++ b/PYME/Acquire/Hardware/Camera.py
@@ -138,6 +138,8 @@ def __init__(self, *args, **kwargs):
 
         self.active = True  # Should the camera write its metadata?
 
+        self._saturation_threshold = (2**16) - 1 # default saturation threshold, if not provided in noise_properties
+
         # Register as a provider of metadata (record camera settings)
         # this is important so that the camera settings get recorded
         MetaDataHandler.provideStartMetadata.append(self.GenStartMetadata)
@@ -827,7 +829,10 @@ def SaturationThreshold(self):
         int
             the full well capacity (in ADU), typically 2^bitdepth - 1
         """
-        return self.noise_properties['SaturationThreshold']
+        try: 
+            return self.noise_properties['SaturationThreshold']
+        except (KeyError, RuntimeError):
+            return self._saturation_threshold
             
 
     def Shutdown(self):
diff --git a/PYME/Acquire/Hardware/Simulator/fakeCam.py b/PYME/Acquire/Hardware/Simulator/fakeCam.py
index 18cd07d1c..be791dd0d 100755
--- a/PYME/Acquire/Hardware/Simulator/fakeCam.py
+++ b/PYME/Acquire/Hardware/Simulator/fakeCam.py
@@ -383,7 +383,7 @@ def __init__(self, XVals, YVals, noiseMaker, zPiezo, zOffset=50.0, fluors=None,
         self._objects=None
         self.noiseMaker=noiseMaker
 
-        self.SaturationThreshold = (2**16) - 1
+        self._saturation_threshold = (2**16) - 1
         self.DefaultEMGain = 150
         self.preampGain = 1
         

From 00b4482a6ce06617a69678b1a54d4d91d672a083 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Fri, 30 Sep 2022 10:31:22 +1300
Subject: [PATCH 5/5] keep_pairs function

---
 PYME/Analysis/keep_pairs.py | 69 +++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 PYME/Analysis/keep_pairs.py

diff --git a/PYME/Analysis/keep_pairs.py b/PYME/Analysis/keep_pairs.py
new file mode 100644
index 000000000..91b299d4f
--- /dev/null
+++ b/PYME/Analysis/keep_pairs.py
@@ -0,0 +1,69 @@
+"""
+A simple utility function to look for (and keep) pairs in 2 sets of localisations.
+
+Useful for comparing different analysis and/or detection modes on the same data. 
+Could also be used on, e.g. beads in different colour channels.
+
+David Baddeley 2022
+"""
+
+import numpy as np
+
+def _idx(t):
+    idx = np.concatenate([[0,], np.argwhere(np.diff(t) > 0).squeeze() + 1])
+    idr_r = {t[i]:i for i in idx}
+    idx_e = {t[i]:i+1 for i in idx[1:]-1}
+    idx_e [t[-1]] = len(t) - 1
+
+    return idx, idr_r, idx_e
+
+def keep_pairs(x0, y0, t0, x1, y1, t1):
+    """
+    Find all localisations in (x0,y0,t0) which have a corresponding localisation in (x1,y1,t1) and return
+    the matching pairs. Uses a simple distance metric to find pairs within a frame.
+
+    TODO:
+    - Return indices so that additional columns etc ... can be examined
+    - Potentially change to finding nearest match **without** replacement. As coded it is theoretically possible
+      for 2 localisations in set 0 to map to a single localisation in set 1 (and vice versa), although this is
+      expected to be rare in practive.
+    """
+    idx0, f0, e0, = _idx(t0)
+    idx1, f1, e1 = _idx(t1)
+
+    o0 = []
+    o1 = []
+    do = []
+
+    idxs = np.array(list(f0.keys()), dtype='i')
+    for i in idxs:
+        if i in f1:
+            #skip frames which are completely missing from t1
+
+            x0_i, y0_i, t0_i = x0[f0[i]:e0[i]], y0[f0[i]:e0[i]], t0[f0[i]:e0[i]]
+            x1_i, y1_i, t1_i = x1[f1[i]:e1[i]], y1[f1[i]:e1[i]], t1[f1[i]:e1[i]]
+
+            d = (x0_i[:,None] - x1_i[None,:])**2 + (y0_i[:,None] - y1_i[None,:])**2
+
+            if len(x0_i) > len(x1_i):
+                mx = np.atleast_1d(d.argmin(0).squeeze())
+                dm = d[mx, np.arange(len(mx), dtype='i')].squeeze()
+                o1.append(np.vstack([x1_i, y1_i, t1_i]))
+                o0.append(np.vstack([x0_i[mx], y0_i[mx], t0_i[mx]]))
+                do.append(dm)
+            else:
+                mx = np.atleast_1d(d.argmin(1).squeeze())
+                dm = d[np.arange(len(mx), dtype='i'),mx].squeeze()
+                o1.append(np.vstack([x1_i[mx], y1_i[mx], t1_i[mx]]))
+                o0.append(np.vstack([x0_i, y0_i, t0_i]))
+                do.append(dm)
+
+            #if i < 50:
+            #    print(d.shape, dm.shape, mx.shape)
+            #    print(dm)
+
+    #print(o0[0].shape, len(o0), o0[1].shape, do[0].shape)
+
+    do = np.hstack(do)
+    return np.concatenate(o0, 1), np.concatenate(o1, 1), do
+