From b00ae12250c365a60fc28492e9bb7b9c54b9d491 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Tue, 27 Sep 2022 15:13:36 +1300 Subject: [PATCH 1/5] cluster docs stuff --- docs/cluster/PYMECluster.rst | 11 +++ docs/cluster/clusterFilesystem.rst | 2 +- docs/cluster/cluster_compute.rst | 131 +++++++++++++++++++++++++++++ docs/index.rst | 1 + 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 docs/cluster/PYMECluster.rst create mode 100644 docs/cluster/cluster_compute.rst diff --git a/docs/cluster/PYMECluster.rst b/docs/cluster/PYMECluster.rst new file mode 100644 index 000000000..7761937cc --- /dev/null +++ b/docs/cluster/PYMECluster.rst @@ -0,0 +1,11 @@ +The PYME Storage and compute cluster architecture +************************************************* + + +.. toctree:: + :maxdepth: 1 + + cluster_install + clusterFilesystem + cluster_compute + cluster_ui \ No newline at end of file diff --git a/docs/cluster/clusterFilesystem.rst b/docs/cluster/clusterFilesystem.rst index 2761870b1..49c8ee940 100644 --- a/docs/cluster/clusterFilesystem.rst +++ b/docs/cluster/clusterFilesystem.rst @@ -44,7 +44,7 @@ Startup / installation - install PYME as a python package - run :py:mod:`PYMEDataServer ` (either using :program:`PYMEClusterOfOne`, or launching individually on each - cluster node after following the `configuration instructions <_clusterinstall>`_ :ref:`clusterinstall_` ) + cluster node after following the `configuration instructions `_ ) Data streaming ============== diff --git a/docs/cluster/cluster_compute.rst b/docs/cluster/cluster_compute.rst new file mode 100644 index 000000000..8dfff5976 --- /dev/null +++ b/docs/cluster/cluster_compute.rst @@ -0,0 +1,131 @@ +Distributed computation on the PYME Cluster +******************************************* + +Compute tasks +============= + +Localisation tasks +------------------ + + +"Recipe" tasks +-------------- + + +Rule-based task scheduler +========================= + +PYME takes a different approach to task scheduling than many +common cluster schedulers. Rather than centrally definining individual tasks +and distribututing these across cluster nodes from a central server, +we define so-called "**Rules**" which decribe how to generate tasks. +Each cluster node contacts the central server for a copy of the currently +active rules, generates candidate tasks based on the rules, scores these +based on node capability and whether the required data is available +locally on the that node, and then submits a bid to the central server +specifying which tasks it want's to perform and at what cost. The central +server then compares the bids and replies with a set of tasks indices for +which the node has won the bid. + +This architecture solves a number of problems faced when trying to distribute +a very large number (thousands per second) of small tasks. It has 3 main +advantages: + +- It dramatically reduces the network traffic required for task distribution, +with only the rule and integer task IDs transmitted between nodes. +- It distributes the work of assigning the tasks across the entire cluster. +Most notably, it allows data locality checking to be performed locally. +*"Do I have this file?"* turns out out to be a **much, much** cheaper question than +*"Which node has this file?"*. +- It reduces memory usage in storing task descriptions, with task-descriptions +only generated for enough tasks to full each nodes input queue. This is +critically important when, e.g. re-analysing streamed data, where a +conventional task allocation strategy would lead to the generation of 200,000,000 +tasks for a day's worth of streamed image frames. + + +High-level rule API +------------------- + +Recipe Rules +'''''''''''' + +Recipe rules can be used apply a single :ref:`recipe ` to multiple +files in parallel. A ``RecipeRule`` expects recipe text, an output directory +path that is relative to the PYME cluster's root data directory and at least +one input. The recipe text can alternatively be passed via a cluster URI that +points to recipe text. + +For example, we can create a recipe that will simulate random points and save +them in an output directory of our choice. + +:: + recipe_text= ''' + - simulation.RandomPoints: + output: points + - output.HDFOutput: + filePattern: '{output_dir}/test.hdf' + inputVariables: + points: points + scheme: pyme-cluster:// + ''' + +.. note:: + + ``scheme: pyme-cluster://`` puts the output into the cluster file system so + that it is accessible to subsequent recipe runs etc ... This is typically + ``~/PYMEData`` in a :ref:`cluster-of-one ` scenario + unless otherwise specified through `PYME.config ` It also + routes all IO through the dataserver, avoiding IO conflicts. + + To concatenate the results from multiple operations into a single table, + use ``scheme: pyme-cluster:// - aggregate``. + +Recipe tasks are generated for each input in the rule - for simulation we use +the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an +arbitrary string, and is propagated to the ``sim_tag`` context variable which +can be used in the output `filePattern`. + +:: + from PYME.cluster import rules + + r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']}) + r.push() + +Inputs are specified as a dictionary, where the key is passed in as the name of +the input datasource, for direct use within the recipe, and the value must be a +*list* of data sources. ``__sim`` is a special case where no input is used in +the recipe. If, instead, we wanted to create a recipe that read in a data set, +created a surface, and then saved this surface to file, we would execute the +following. + +:: + recipe_text = ''' + - pointcloud.Octree: + input_localizations: shape_shape + output_octree: octree + - surface_fitting.DualMarchingCubes: + input: octree + output: mesh + remesh: true + - output.STLOutput: + filePattern: '{{output_dir}}/my_surface.stl' + inputName: membrane + scheme: pyme-cluster:// + ''' + + rule = RecipeRule(recipe=recipe_text, output_dir=output_dir, + inputs={'shape': [f'pyme-cluster:///{output_dir}/shape.hdf']}) + + rule.push() + +Note that in this case, an HDF file is passed as input. This is opened in the +pipeline as a data source with name ``_``. In this +case, the table name is also named ``shape`` and so ``input_localizations`` in +the recipe is named ``shape_shape``. + + + +Low level API +------------- + diff --git a/docs/index.rst b/docs/index.rst index f6f0bdee0..aa129d8f3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ particular could use a lot of work. Hopefully it's enough to get you started. PYMEVis component_programs LocalisationAnalysis + cluster/PYMECluster From 977bf3b438f73017d94f4fe1990492d70221e935 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Tue, 27 Sep 2022 15:14:27 +1300 Subject: [PATCH 2/5] removing rules.rst (contents merged into cluster_compute) --- docs/cluster/rules.rst | 88 ------------------------------------------ 1 file changed, 88 deletions(-) delete mode 100644 docs/cluster/rules.rst diff --git a/docs/cluster/rules.rst b/docs/cluster/rules.rst deleted file mode 100644 index 47ba5a375..000000000 --- a/docs/cluster/rules.rst +++ /dev/null @@ -1,88 +0,0 @@ -Using PYME's cluster rules -*************************** - -**Rules** are essentially a json template which defines how tasks may be -generated (by substitution into the template on the client side). Generation of -individual tasks on the client has the dual benefits of a) reducing the CPU -load and memory requirements on the server and b) dramatically decreasing the -network bandwidth used for task distribution. - -The rules API, defined in ``PYME.cluster.rules.py``, can be used to distribute -tasks programatically on the PYME cluster from third-party software. - -Recipe Rules -============ - -Recipe rules can be used apply a single :ref:`recipe ` to multiple -files in parallel. A ``RecipeRule`` expects recipe text, an output directory -path that is relative to the PYME cluster's root data directory and at least -one input. The recipe text can alternatively be passed via a cluster URI that -points to recipe text. - -For example, we can create a recipe that will simulate random points and save -them in an output directory of our choice. - -:: - recipe_text= ''' - - simulation.RandomPoints: - output: points - - output.HDFOutput: - filePattern: '{output_dir}/test.hdf' - inputVariables: - points: points - scheme: pyme-cluster:// - ''' - -.. note:: - - ``scheme: pyme-cluster://`` puts the output into the cluster file system so - that it is accessible to subsequent recipe runs etc ... This is typically - ``~/PYMEData`` in a :ref:`cluster-of-one ` scenario - unless otherwise specified through `PYME.config ` It also - routes all IO through the dataserver, avoiding IO conflicts. - - To concatenate the results from multiple operations into a single table, - use ``scheme: pyme-cluster:// - aggregate``. - -Recipe tasks are generated for each input in the rule - for simulation we use -the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an -arbitrary string, and is propagated to the ``sim_tag`` context variable which -can be used in the output `filePattern`. - -:: - from PYME.cluster import rules - - r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']}) - r.push() - -Inputs are specified as a dictionary, where the key is passed in as the name of -the input datasource, for direct use within the recipe, and the value must be a -*list* of data sources. ``__sim`` is a special case where no input is used in -the recipe. If, instead, we wanted to create a recipe that read in a data set, -created a surface, and then saved this surface to file, we would execute the -following. - -:: - recipe_text = ''' - - pointcloud.Octree: - input_localizations: shape_shape - output_octree: octree - - surface_fitting.DualMarchingCubes: - input: octree - output: mesh - remesh: true - - output.STLOutput: - filePattern: '{{output_dir}}/my_surface.stl' - inputName: membrane - scheme: pyme-cluster:// - ''' - - rule = RecipeRule(recipe=recipe_text, output_dir=output_dir, - inputs={'shape': [f'pyme-cluster:///{output_dir}/shape.hdf']}) - - rule.push() - -Note that in this case, an HDF file is passed as input. This is opened in the -pipeline as a data source with name ``_``. In this -case, the table name is also named ``shape`` and so ``input_localizations`` in -the recipe is named ``shape_shape``. From 882f9ae7b3598052a2203516453b035d4059ad6b Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Thu, 29 Sep 2022 09:49:59 +1300 Subject: [PATCH 3/5] more doc fixes --- docs/cluster/clusterFilesystem.rst | 2 +- docs/cluster/cluster_compute.rst | 42 ++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/docs/cluster/clusterFilesystem.rst b/docs/cluster/clusterFilesystem.rst index 49c8ee940..298e42633 100644 --- a/docs/cluster/clusterFilesystem.rst +++ b/docs/cluster/clusterFilesystem.rst @@ -44,7 +44,7 @@ Startup / installation - install PYME as a python package - run :py:mod:`PYMEDataServer ` (either using :program:`PYMEClusterOfOne`, or launching individually on each - cluster node after following the `configuration instructions `_ ) + cluster node after following the :ref:`configuration instructions ` ) Data streaming ============== diff --git a/docs/cluster/cluster_compute.rst b/docs/cluster/cluster_compute.rst index 8dfff5976..7554d358a 100644 --- a/docs/cluster/cluster_compute.rst +++ b/docs/cluster/cluster_compute.rst @@ -1,16 +1,33 @@ Distributed computation on the PYME Cluster ******************************************* +The PYME cluster architecture contains tools for distributed analysis of +imaging data in a "data-local" manner where computation is performed on +the nodes housing the data, avoiding IO overhead which would otherwise limit +performance. + Compute tasks ============= +We currently support two types of computational tasks (below), although the +framework is designed to allow additional task types to easily be added in +the future. + Localisation tasks ------------------ +These are a localisation-microscopy specific task type optimised for +performing single-molecule localisation on a single frame of an image series. +A wide range of different localisation algorithms are supported (for more details see :ref:`localisationanalysis`) "Recipe" tasks -------------- +These are a more generic task type which supports a range of different image +and data-processing functions specified by combining analysis building blocks +into :ref:`recipes `. Recipes are specified using a .yaml text format, +and can either be created manually or using the recipe editor. + Rule-based task scheduler ========================= @@ -32,16 +49,16 @@ a very large number (thousands per second) of small tasks. It has 3 main advantages: - It dramatically reduces the network traffic required for task distribution, -with only the rule and integer task IDs transmitted between nodes. + with only the rule and integer task IDs transmitted between nodes. - It distributes the work of assigning the tasks across the entire cluster. -Most notably, it allows data locality checking to be performed locally. -*"Do I have this file?"* turns out out to be a **much, much** cheaper question than -*"Which node has this file?"*. + Most notably, it allows data locality checking to be performed locally. + *"Do I have this file?"* turns out out to be a **much, much** cheaper question than + *"Which node has this file?"*. - It reduces memory usage in storing task descriptions, with task-descriptions -only generated for enough tasks to full each nodes input queue. This is -critically important when, e.g. re-analysing streamed data, where a -conventional task allocation strategy would lead to the generation of 200,000,000 -tasks for a day's worth of streamed image frames. + only generated for enough tasks to full each nodes input queue. This is + critically important when, e.g. re-analysing streamed data, where a + conventional task allocation strategy would lead to the generation of 200,000,000 + tasks for a day's worth of streamed image frames. High-level rule API @@ -59,7 +76,8 @@ points to recipe text. For example, we can create a recipe that will simulate random points and save them in an output directory of our choice. -:: +.. code-block:: python + recipe_text= ''' - simulation.RandomPoints: output: points @@ -86,7 +104,8 @@ the ``__sim`` proxy input. For ``__sim`` "inputs" the filename can be an arbitrary string, and is propagated to the ``sim_tag`` context variable which can be used in the output `filePattern`. -:: +.. code-block:: python + from PYME.cluster import rules r = rules.RecipeRule(recipe=recipe_text, output_dir='test', inputs={'__sim':['1']}) @@ -99,7 +118,8 @@ the recipe. If, instead, we wanted to create a recipe that read in a data set, created a surface, and then saved this surface to file, we would execute the following. -:: +.. code-block:: python + recipe_text = ''' - pointcloud.Octree: input_localizations: shape_shape From 444d2d4c27bcf986fa4737f32db2cacd4a26d60d Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Fri, 30 Sep 2022 10:31:02 +1300 Subject: [PATCH 4/5] fix sat threshold regression --- PYME/Acquire/Hardware/Camera.py | 7 ++++++- PYME/Acquire/Hardware/Simulator/fakeCam.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/PYME/Acquire/Hardware/Camera.py b/PYME/Acquire/Hardware/Camera.py index 46a82f0cf..9396e8bcf 100644 --- a/PYME/Acquire/Hardware/Camera.py +++ b/PYME/Acquire/Hardware/Camera.py @@ -138,6 +138,8 @@ def __init__(self, *args, **kwargs): self.active = True # Should the camera write its metadata? + self._saturation_threshold = (2**16) - 1 # default saturation threshold, if not provided in noise_properties + # Register as a provider of metadata (record camera settings) # this is important so that the camera settings get recorded MetaDataHandler.provideStartMetadata.append(self.GenStartMetadata) @@ -827,7 +829,10 @@ def SaturationThreshold(self): int the full well capacity (in ADU), typically 2^bitdepth - 1 """ - return self.noise_properties['SaturationThreshold'] + try: + return self.noise_properties['SaturationThreshold'] + except (KeyError, RuntimeError): + return self._saturation_threshold def Shutdown(self): diff --git a/PYME/Acquire/Hardware/Simulator/fakeCam.py b/PYME/Acquire/Hardware/Simulator/fakeCam.py index 18cd07d1c..be791dd0d 100755 --- a/PYME/Acquire/Hardware/Simulator/fakeCam.py +++ b/PYME/Acquire/Hardware/Simulator/fakeCam.py @@ -383,7 +383,7 @@ def __init__(self, XVals, YVals, noiseMaker, zPiezo, zOffset=50.0, fluors=None, self._objects=None self.noiseMaker=noiseMaker - self.SaturationThreshold = (2**16) - 1 + self._saturation_threshold = (2**16) - 1 self.DefaultEMGain = 150 self.preampGain = 1 From 00b4482a6ce06617a69678b1a54d4d91d672a083 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Fri, 30 Sep 2022 10:31:22 +1300 Subject: [PATCH 5/5] keep_pairs function --- PYME/Analysis/keep_pairs.py | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 PYME/Analysis/keep_pairs.py diff --git a/PYME/Analysis/keep_pairs.py b/PYME/Analysis/keep_pairs.py new file mode 100644 index 000000000..91b299d4f --- /dev/null +++ b/PYME/Analysis/keep_pairs.py @@ -0,0 +1,69 @@ +""" +A simple utility function to look for (and keep) pairs in 2 sets of localisations. + +Useful for comparing different analysis and/or detection modes on the same data. +Could also be used on, e.g. beads in different colour channels. + +David Baddeley 2022 +""" + +import numpy as np + +def _idx(t): + idx = np.concatenate([[0,], np.argwhere(np.diff(t) > 0).squeeze() + 1]) + idr_r = {t[i]:i for i in idx} + idx_e = {t[i]:i+1 for i in idx[1:]-1} + idx_e [t[-1]] = len(t) - 1 + + return idx, idr_r, idx_e + +def keep_pairs(x0, y0, t0, x1, y1, t1): + """ + Find all localisations in (x0,y0,t0) which have a corresponding localisation in (x1,y1,t1) and return + the matching pairs. Uses a simple distance metric to find pairs within a frame. + + TODO: + - Return indices so that additional columns etc ... can be examined + - Potentially change to finding nearest match **without** replacement. As coded it is theoretically possible + for 2 localisations in set 0 to map to a single localisation in set 1 (and vice versa), although this is + expected to be rare in practive. + """ + idx0, f0, e0, = _idx(t0) + idx1, f1, e1 = _idx(t1) + + o0 = [] + o1 = [] + do = [] + + idxs = np.array(list(f0.keys()), dtype='i') + for i in idxs: + if i in f1: + #skip frames which are completely missing from t1 + + x0_i, y0_i, t0_i = x0[f0[i]:e0[i]], y0[f0[i]:e0[i]], t0[f0[i]:e0[i]] + x1_i, y1_i, t1_i = x1[f1[i]:e1[i]], y1[f1[i]:e1[i]], t1[f1[i]:e1[i]] + + d = (x0_i[:,None] - x1_i[None,:])**2 + (y0_i[:,None] - y1_i[None,:])**2 + + if len(x0_i) > len(x1_i): + mx = np.atleast_1d(d.argmin(0).squeeze()) + dm = d[mx, np.arange(len(mx), dtype='i')].squeeze() + o1.append(np.vstack([x1_i, y1_i, t1_i])) + o0.append(np.vstack([x0_i[mx], y0_i[mx], t0_i[mx]])) + do.append(dm) + else: + mx = np.atleast_1d(d.argmin(1).squeeze()) + dm = d[np.arange(len(mx), dtype='i'),mx].squeeze() + o1.append(np.vstack([x1_i[mx], y1_i[mx], t1_i[mx]])) + o0.append(np.vstack([x0_i, y0_i, t0_i])) + do.append(dm) + + #if i < 50: + # print(d.shape, dm.shape, mx.shape) + # print(dm) + + #print(o0[0].shape, len(o0), o0[1].shape, do[0].shape) + + do = np.hstack(do) + return np.concatenate(o0, 1), np.concatenate(o1, 1), do +