From 72f2fd6d15721da76372c1c2bf51d578bb40cf2f Mon Sep 17 00:00:00 2001
From: Yujin-Bao <yujinbao111@gmail.com>
Date: Wed, 17 Dec 2025 15:01:23 -0500
Subject: [PATCH 1/5] software fix for pco sometimes crashing/capturing black
 frames/being stuck

---
 PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 95 +++++++++++++++++++-----
 PYME/Acquire/frameWrangler.py            | 13 +++-
 2 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
index 61dc1fbea..97aeb900a 100644
--- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
+++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
@@ -67,6 +67,9 @@ def unregCamera(cls):
 MAX_TIMEOUTS = 100
 MAX_QUEUED_BUFFERS = 16  # pco. has a hard limit on attaching no 
                          # more than 16 buffers at a time to the camera
+WAIT_OBJECT_0 = 0x00000000
+WAIT_TIMEOUT  = 0x00000102
+MAX_CONSECUTIVE_ERRORS = 10  # max consecutive wait/buffer errors before we abort recording
 
 class PcoSdkCam(Camera):
     def __init__(self, camNum, debuglevel='off'):
@@ -107,11 +110,14 @@ def Init(self):
         self._binning_x = 1
         self._binning_y = 1
         self._n_timeouts = 0
+        self.hardware_overflowed = False  # used by frameWrangler to know camera is unhappy
         self._i = 0
         self._buffers_to_queue = queue.Queue()
         self._queued_buffers = queue.Queue()
         self._full_buffers = queue.Queue()
         self.SetROI(1, 1, self.GetCCDWidth(), self.GetCCDHeight())
+        self._consecutive_errors = 0
+        self._max_consecutive_errors = MAX_CONSECUTIVE_ERRORS
         self.SetIntegTime(0.025)
         self.SetAcquisitionMode(self.MODE_CONTINUOUS)
         self._cam_type = pco_sdk.get_camera_type(self._handle)
@@ -155,24 +161,74 @@ def _poll_loop(self):
                         
                         # wait for the buffer
                         wait_status = k32_dll.WaitForSingleObject(self._buf_event[_curr_buf], self._timeout)
-                        if wait_status:
-                            logger.warning(f"Waited too long for buffer ({self._timeout} ms).") 
-                            
-                            #TODO: we currently continue as if we got the buffer - is this the right thing to do?
-                            # Presumably the status will be non-zero and we will drop the buffer?, but then what 
-                            # happens to those buffers? do they just dissapear?
+                        if wait_status == WAIT_TIMEOUT:
+                            # Timeout: do NOT deliver a frame; recycle the buffer
+                            self._n_timeouts += 1
+                            self._consecutive_errors += 1
+
+                            logger.warning(
+                                f"PCO: WaitForSingleObject timeout after {self._timeout} ms "
+                                f"(timeouts={self._n_timeouts}, consecutive_errors={self._consecutive_errors}) "
+                                f"on buffer {_curr_buf}. Recycling."
+                            )
+
+                            k32_dll.ResetEvent(self._buf_event[_curr_buf])
+                            self._buffers_to_queue.put(_curr_buf)
+
+                            # After a few consecutive timeouts, flag for higher-level recovery logic
+                            if self._n_timeouts >= 3:
+                                logger.error("PCO: repeated buffer timeouts -> hardware_overflowed=True")
+                                self.hardware_overflowed = True
                         
-                        k32_dll.ResetEvent(self._buf_event[_curr_buf])
-                        # make sure this buffer is safe to use
-                        status = self._buffer_status[_curr_buf]
-                        if status:
-                            logger.warning(f"Error {status} during check of buffer {_curr_buf}.")
-                            #DB: Do you see a lot of these warnings? IE - do we get one every time we have a timeout?
-                            # drop this buffer
+                        elif wait_status == WAIT_OBJECT_0:
+                            # Normal case
+                            self._n_timeouts = 0
+                            self._consecutive_errors = 0
+
+                            k32_dll.ResetEvent(self._buf_event[_curr_buf])
+
+                            status = self._buffer_status[_curr_buf]
+                            if status != 0:
+                                # Bad buffer -> recycle it, don't deliver
+                                logger.warning(f"PCO: error {status} during check of buffer {_curr_buf}. Recycling.")
+                                self._consecutive_errors += 1
+                                self._buffers_to_queue.put(_curr_buf)
+                            else:
+                                # use it
+                                self._full_buffers.put(_curr_buf)
+                                self._n_buffered += 1
+
                         else:
-                            # use it
-                            self._full_buffers.put(_curr_buf)
-                            self._n_buffered += 1
+                            # Other wait error: recycle buffer, flag error
+                            self._consecutive_errors += 1
+                            logger.error(
+                                "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d. Recycling.",
+                                wait_status, _curr_buf
+                            )
+
+                            k32_dll.ResetEvent(self._buf_event[_curr_buf])
+                            self._buffers_to_queue.put(_curr_buf)
+
+                    # If things look persistently bad, stop recording to avoid hard SDK crashes.
+                    if self._consecutive_errors >= self._max_consecutive_errors:
+                        logger.error(
+                            "PCO: too many consecutive buffer errors (%d) -> stopping recording to avoid crash",
+                            self._consecutive_errors
+                        )
+                        try:
+                            pco_sdk.set_recording_state(self._handle, pco_sdk.PCO_CAMERA_STOPPED)
+                            pco_sdk.cancel_images(self._handle)
+                        except Exception:
+                            logger.exception("PCO: error while stopping camera after repeated buffer errors")
+
+                        # Mark stopped on our side; FrameWrangler/UI can restart cleanly.
+                        self._recording = False
+                        self.hardware_overflowed = True
+                        self._n_buffered = 0
+                        self._n_queued = 0
+                        self._n_timeouts = 0
+                        self._consecutive_errors = 0
+                    
                     else:
                         # sleep for a bit longer if there were no buffers queued
                         sleep_time = 0.01
@@ -433,7 +489,9 @@ def _init_buffers(self):
 
     @property
     def _timeout(self):
-        return int(max(2*100*self.GetCycleTime(), 100))
+        # set the time longer for all hardware check
+        # for example, LC settling time is usually set at 300 ms (current minimum is 150 ms)
+        return int(max(2*100*self.GetCycleTime(), 1000))
     
     def StartExposure(self):
         #logger.debug(f'PcoSdkCam: StartExposure called from thread {threading.current_thread().name} at {time.time()}')
@@ -448,6 +506,8 @@ def StartExposure(self):
             if self._recording == False:
                 self._init_buffers()
             self._recording = True
+            self.hardware_overflowed = False
+            self._n_timeouts = 0
 
         self._log_exposure_start()
         
@@ -472,6 +532,7 @@ def StopAq(self):
             self._n_buffered = 0
             self._n_queued = 0
             self._n_timeouts = 0
+            self.hardware_overflowed = False
             self._buf_event = []
             self._buf_addr = []
             self._buf_status_addr = []
diff --git a/PYME/Acquire/frameWrangler.py b/PYME/Acquire/frameWrangler.py
index 5be027328..f1c4fee6c 100644
--- a/PYME/Acquire/frameWrangler.py
+++ b/PYME/Acquire/frameWrangler.py
@@ -419,9 +419,18 @@ def checkHardware(self):
         NB: This is largely legacy code, as the camera is usually used in 
         free-running mode."""
         for callback in self.HardwareChecks:
-            if not callback():
-                logger.debug('Waiting for hardware')
+            try:
+                ready = callback()
+            except Exception:
+                logger.exception('Hardware check %r raised an exception', callback)
+                return False
+
+            if not ready:
+                logger.debug('Waiting for hardware: %r reported not-ready', callback)
                 return False
+            # if not callback():
+            #     logger.debug('Waiting for hardware')
+            #     return False
 
         return True
 

From 5ab54579938cc094cc66888c0ed3e409ed20baf4 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Fri, 19 Dec 2025 14:30:56 +1300
Subject: [PATCH 2/5] make sure we clean up event handles, slightly improved
 diagnostics

---
 PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 49 ++++++++++++++++++++----
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
index 97aeb900a..92c3f9bd3 100644
--- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
+++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
@@ -24,6 +24,8 @@
 k32_dll = ctypes.windll.kernel32  # lets us use the recommended WaitForSingleObject call (see pco.sdk)
                                   # instead of the not-recommended-for-polling pco_sdk.get_buffer_status()
 
+
+
 # Define event handle type (needed for pco_sdk.add_buffer_extern())
 # Generally we will want to use k32_dll.CreateEventA(None, 1, 0, None)
 # See https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-createeventa
@@ -67,8 +69,13 @@ def unregCamera(cls):
 MAX_TIMEOUTS = 100
 MAX_QUEUED_BUFFERS = 16  # pco. has a hard limit on attaching no 
                          # more than 16 buffers at a time to the camera
+
+# Windows WaitForSingleObject return codes
 WAIT_OBJECT_0 = 0x00000000
 WAIT_TIMEOUT  = 0x00000102
+WAIT_ABANDONED = 0x00000080
+WAIT_FAILED    = 0xFFFFFFFF
+
 MAX_CONSECUTIVE_ERRORS = 10  # max consecutive wait/buffer errors before we abort recording
 
 class PcoSdkCam(Camera):
@@ -173,10 +180,13 @@ def _poll_loop(self):
                             )
 
                             k32_dll.ResetEvent(self._buf_event[_curr_buf])
-                            self._buffers_to_queue.put(_curr_buf)
-
-                            # After a few consecutive timeouts, flag for higher-level recovery logic
-                            if self._n_timeouts >= 3:
+                            
+                            if (self._n_timeouts < MAX_TIMEOUTS) and (self.contMode == self.MODE_CONTINUOUS):
+                                # In continuous mode, we can try and re-queue the buffer and carry on
+                                # In single-shot mode, requeuing won't work as we will also need to retrigger - just flag overflow
+                                # and let the FrameWrangler handle a restart
+                                self._buffers_to_queue.put(_curr_buf)
+                            else:
                                 logger.error("PCO: repeated buffer timeouts -> hardware_overflowed=True")
                                 self.hardware_overflowed = True
                         
@@ -192,7 +202,12 @@ def _poll_loop(self):
                                 # Bad buffer -> recycle it, don't deliver
                                 logger.warning(f"PCO: error {status} during check of buffer {_curr_buf}. Recycling.")
                                 self._consecutive_errors += 1
-                                self._buffers_to_queue.put(_curr_buf)
+                                if (self.contMode == self.MODE_CONTINUOUS):
+                                    self._buffers_to_queue.put(_curr_buf)
+                                else:
+                                    # single shot mode - just flag overflow
+                                    logger.error("PCO: buffer error in single-shot mode -> hardware_overflowed=True")
+                                    self.hardware_overflowed = True
                             else:
                                 # use it
                                 self._full_buffers.put(_curr_buf)
@@ -201,9 +216,23 @@ def _poll_loop(self):
                         else:
                             # Other wait error: recycle buffer, flag error
                             self._consecutive_errors += 1
+
+                            # use GetLastError and FormatMessage to get more info
+                            err_code = k32_dll.GetLastError()
+                            msg_buffer = ctypes.create_string_buffer(256)
+                            k32_dll.FormatMessageA(
+                                0x00000000,
+                                None,
+                                err_code,
+                                0,
+                                msg_buffer,
+                                len(msg_buffer),
+                                None
+                            )
+
                             logger.error(
-                                "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d. Recycling.",
-                                wait_status, _curr_buf
+                                "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d.\n GetLastError: %d\n Message: %s\n Recycling buffer.",
+                                wait_status, _curr_buf, err_code, msg_buffer.value.decode()
                             )
 
                             k32_dll.ResetEvent(self._buf_event[_curr_buf])
@@ -533,7 +562,13 @@ def StopAq(self):
             self._n_queued = 0
             self._n_timeouts = 0
             self.hardware_overflowed = False
+            
+            # close and clear buffer events
+            # (Prevents us from leaking handles when running for a long time with many start/stops)
+            for ev in self._buf_event:
+                k32_dll.CloseHandle(ev)
             self._buf_event = []
+
             self._buf_addr = []
             self._buf_status_addr = []
             self._buffer = None

From 3d0f7afbf75633a44812c455e915bd11328d6572 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Fri, 19 Dec 2025 14:52:49 +1300
Subject: [PATCH 3/5] fix potential mutability of buffer and handle indices

---
 PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
index 92c3f9bd3..7491cd67b 100644
--- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
+++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
@@ -20,6 +20,7 @@
 import queue
 import threading
 import time
+import copy
 
 k32_dll = ctypes.windll.kernel32  # lets us use the recommended WaitForSingleObject call (see pco.sdk)
                                   # instead of the not-recommended-for-polling pco_sdk.get_buffer_status()
@@ -580,7 +581,7 @@ def TriggerAq(self):
             res = pco_sdk.force_trigger(self._handle)
             # FIFO queue the queable buffers so we don't
             # grab images before a trigger in _poll_loop
-            self._buffers_to_queue.put(self._i)
+            self._buffers_to_queue.put(copy.deepcopy(self._i)) # ensure we put a copy of the index, not a reference
             self._i += 1
             if self._i >= self._n_buffers:
                 self._i = 0

From 6f0bb267ebddd67365ab34bf8bbb4c8d84cd0659 Mon Sep 17 00:00:00 2001
From: David Baddeley <david.baddeley.nz@gmail.com>
Date: Fri, 19 Dec 2025 15:01:14 +1300
Subject: [PATCH 4/5] docs on timeout

---
 PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
index 7491cd67b..760f7b7fe 100644
--- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
+++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
@@ -519,9 +519,12 @@ def _init_buffers(self):
 
     @property
     def _timeout(self):
-        # set the time longer for all hardware check
-        # for example, LC settling time is usually set at 300 ms (current minimum is 150 ms)
-        return int(max(2*100*self.GetCycleTime(), 1000))
+        # set the timeout (ms)based on cycle time
+        # timeout should be longer than cycle time + readout time + some margin for windows timing uncertainty (default to 2x cycle time)
+        # timeout should not be too long, otherwise one bad buffer can block things and cause overflows.
+        # TODO: reduce the minimum timeout to something more reasonable (e.g. 100 ms)?? Current hardcoded value of 1s is very likely excessive
+        # and may cause issues, especially if the camera is running at a high frame rate.
+        return int(max(2*1000*self.GetCycleTime(), 1000))
     
     def StartExposure(self):
         #logger.debug(f'PcoSdkCam: StartExposure called from thread {threading.current_thread().name} at {time.time()}')

From 5bf88771623ee0113f46a04471e8c7728130cec1 Mon Sep 17 00:00:00 2001
From: Yujin-Bao <yujinbao111@gmail.com>
Date: Sun, 21 Dec 2025 13:02:15 -0500
Subject: [PATCH 5/5] queue buffer before TriggerAq

---
 PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
index 760f7b7fe..1c31ba570 100644
--- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
+++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py
@@ -581,10 +581,11 @@ def StopAq(self):
 
     def TriggerAq(self):
         if (self._mode == self.MODE_SINGLE_SHOT) or (self._mode == self.MODE_SOFTWARE_TRIGGER):
-            res = pco_sdk.force_trigger(self._handle)
+            # res = pco_sdk.force_trigger(self._handle)
             # FIFO queue the queable buffers so we don't
             # grab images before a trigger in _poll_loop
             self._buffers_to_queue.put(copy.deepcopy(self._i)) # ensure we put a copy of the index, not a reference
+            res = pco_sdk.force_trigger(self._handle)
             self._i += 1
             if self._i >= self._n_buffers:
                 self._i = 0