From 72f2fd6d15721da76372c1c2bf51d578bb40cf2f Mon Sep 17 00:00:00 2001 From: Yujin-Bao Date: Wed, 17 Dec 2025 15:01:23 -0500 Subject: [PATCH 1/5] software fix for pco sometimes crashing/capturing black frames/being stuck --- PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 95 +++++++++++++++++++----- PYME/Acquire/frameWrangler.py | 13 +++- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py index 61dc1fbea..97aeb900a 100644 --- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py +++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py @@ -67,6 +67,9 @@ def unregCamera(cls): MAX_TIMEOUTS = 100 MAX_QUEUED_BUFFERS = 16 # pco. has a hard limit on attaching no # more than 16 buffers at a time to the camera +WAIT_OBJECT_0 = 0x00000000 +WAIT_TIMEOUT = 0x00000102 +MAX_CONSECUTIVE_ERRORS = 10 # max consecutive wait/buffer errors before we abort recording class PcoSdkCam(Camera): def __init__(self, camNum, debuglevel='off'): @@ -107,11 +110,14 @@ def Init(self): self._binning_x = 1 self._binning_y = 1 self._n_timeouts = 0 + self.hardware_overflowed = False # used by frameWrangler to know camera is unhappy self._i = 0 self._buffers_to_queue = queue.Queue() self._queued_buffers = queue.Queue() self._full_buffers = queue.Queue() self.SetROI(1, 1, self.GetCCDWidth(), self.GetCCDHeight()) + self._consecutive_errors = 0 + self._max_consecutive_errors = MAX_CONSECUTIVE_ERRORS self.SetIntegTime(0.025) self.SetAcquisitionMode(self.MODE_CONTINUOUS) self._cam_type = pco_sdk.get_camera_type(self._handle) @@ -155,24 +161,74 @@ def _poll_loop(self): # wait for the buffer wait_status = k32_dll.WaitForSingleObject(self._buf_event[_curr_buf], self._timeout) - if wait_status: - logger.warning(f"Waited too long for buffer ({self._timeout} ms).") - - #TODO: we currently continue as if we got the buffer - is this the right thing to do? - # Presumably the status will be non-zero and we will drop the buffer?, but then what - # happens to those buffers? do they just dissapear? + if wait_status == WAIT_TIMEOUT: + # Timeout: do NOT deliver a frame; recycle the buffer + self._n_timeouts += 1 + self._consecutive_errors += 1 + + logger.warning( + f"PCO: WaitForSingleObject timeout after {self._timeout} ms " + f"(timeouts={self._n_timeouts}, consecutive_errors={self._consecutive_errors}) " + f"on buffer {_curr_buf}. Recycling." + ) + + k32_dll.ResetEvent(self._buf_event[_curr_buf]) + self._buffers_to_queue.put(_curr_buf) + + # After a few consecutive timeouts, flag for higher-level recovery logic + if self._n_timeouts >= 3: + logger.error("PCO: repeated buffer timeouts -> hardware_overflowed=True") + self.hardware_overflowed = True - k32_dll.ResetEvent(self._buf_event[_curr_buf]) - # make sure this buffer is safe to use - status = self._buffer_status[_curr_buf] - if status: - logger.warning(f"Error {status} during check of buffer {_curr_buf}.") - #DB: Do you see a lot of these warnings? IE - do we get one every time we have a timeout? - # drop this buffer + elif wait_status == WAIT_OBJECT_0: + # Normal case + self._n_timeouts = 0 + self._consecutive_errors = 0 + + k32_dll.ResetEvent(self._buf_event[_curr_buf]) + + status = self._buffer_status[_curr_buf] + if status != 0: + # Bad buffer -> recycle it, don't deliver + logger.warning(f"PCO: error {status} during check of buffer {_curr_buf}. Recycling.") + self._consecutive_errors += 1 + self._buffers_to_queue.put(_curr_buf) + else: + # use it + self._full_buffers.put(_curr_buf) + self._n_buffered += 1 + else: - # use it - self._full_buffers.put(_curr_buf) - self._n_buffered += 1 + # Other wait error: recycle buffer, flag error + self._consecutive_errors += 1 + logger.error( + "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d. Recycling.", + wait_status, _curr_buf + ) + + k32_dll.ResetEvent(self._buf_event[_curr_buf]) + self._buffers_to_queue.put(_curr_buf) + + # If things look persistently bad, stop recording to avoid hard SDK crashes. + if self._consecutive_errors >= self._max_consecutive_errors: + logger.error( + "PCO: too many consecutive buffer errors (%d) -> stopping recording to avoid crash", + self._consecutive_errors + ) + try: + pco_sdk.set_recording_state(self._handle, pco_sdk.PCO_CAMERA_STOPPED) + pco_sdk.cancel_images(self._handle) + except Exception: + logger.exception("PCO: error while stopping camera after repeated buffer errors") + + # Mark stopped on our side; FrameWrangler/UI can restart cleanly. + self._recording = False + self.hardware_overflowed = True + self._n_buffered = 0 + self._n_queued = 0 + self._n_timeouts = 0 + self._consecutive_errors = 0 + else: # sleep for a bit longer if there were no buffers queued sleep_time = 0.01 @@ -433,7 +489,9 @@ def _init_buffers(self): @property def _timeout(self): - return int(max(2*100*self.GetCycleTime(), 100)) + # set the time longer for all hardware check + # for example, LC settling time is usually set at 300 ms (current minimum is 150 ms) + return int(max(2*100*self.GetCycleTime(), 1000)) def StartExposure(self): #logger.debug(f'PcoSdkCam: StartExposure called from thread {threading.current_thread().name} at {time.time()}') @@ -448,6 +506,8 @@ def StartExposure(self): if self._recording == False: self._init_buffers() self._recording = True + self.hardware_overflowed = False + self._n_timeouts = 0 self._log_exposure_start() @@ -472,6 +532,7 @@ def StopAq(self): self._n_buffered = 0 self._n_queued = 0 self._n_timeouts = 0 + self.hardware_overflowed = False self._buf_event = [] self._buf_addr = [] self._buf_status_addr = [] diff --git a/PYME/Acquire/frameWrangler.py b/PYME/Acquire/frameWrangler.py index 5be027328..f1c4fee6c 100644 --- a/PYME/Acquire/frameWrangler.py +++ b/PYME/Acquire/frameWrangler.py @@ -419,9 +419,18 @@ def checkHardware(self): NB: This is largely legacy code, as the camera is usually used in free-running mode.""" for callback in self.HardwareChecks: - if not callback(): - logger.debug('Waiting for hardware') + try: + ready = callback() + except Exception: + logger.exception('Hardware check %r raised an exception', callback) + return False + + if not ready: + logger.debug('Waiting for hardware: %r reported not-ready', callback) return False + # if not callback(): + # logger.debug('Waiting for hardware') + # return False return True From 5ab54579938cc094cc66888c0ed3e409ed20baf4 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Fri, 19 Dec 2025 14:30:56 +1300 Subject: [PATCH 2/5] make sure we clean up event handles, slightly improved diagnostics --- PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 49 ++++++++++++++++++++---- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py index 97aeb900a..92c3f9bd3 100644 --- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py +++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py @@ -24,6 +24,8 @@ k32_dll = ctypes.windll.kernel32 # lets us use the recommended WaitForSingleObject call (see pco.sdk) # instead of the not-recommended-for-polling pco_sdk.get_buffer_status() + + # Define event handle type (needed for pco_sdk.add_buffer_extern()) # Generally we will want to use k32_dll.CreateEventA(None, 1, 0, None) # See https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-createeventa @@ -67,8 +69,13 @@ def unregCamera(cls): MAX_TIMEOUTS = 100 MAX_QUEUED_BUFFERS = 16 # pco. has a hard limit on attaching no # more than 16 buffers at a time to the camera + +# Windows WaitForSingleObject return codes WAIT_OBJECT_0 = 0x00000000 WAIT_TIMEOUT = 0x00000102 +WAIT_ABANDONED = 0x00000080 +WAIT_FAILED = 0xFFFFFFFF + MAX_CONSECUTIVE_ERRORS = 10 # max consecutive wait/buffer errors before we abort recording class PcoSdkCam(Camera): @@ -173,10 +180,13 @@ def _poll_loop(self): ) k32_dll.ResetEvent(self._buf_event[_curr_buf]) - self._buffers_to_queue.put(_curr_buf) - - # After a few consecutive timeouts, flag for higher-level recovery logic - if self._n_timeouts >= 3: + + if (self._n_timeouts < MAX_TIMEOUTS) and (self.contMode == self.MODE_CONTINUOUS): + # In continuous mode, we can try and re-queue the buffer and carry on + # In single-shot mode, requeuing won't work as we will also need to retrigger - just flag overflow + # and let the FrameWrangler handle a restart + self._buffers_to_queue.put(_curr_buf) + else: logger.error("PCO: repeated buffer timeouts -> hardware_overflowed=True") self.hardware_overflowed = True @@ -192,7 +202,12 @@ def _poll_loop(self): # Bad buffer -> recycle it, don't deliver logger.warning(f"PCO: error {status} during check of buffer {_curr_buf}. Recycling.") self._consecutive_errors += 1 - self._buffers_to_queue.put(_curr_buf) + if (self.contMode == self.MODE_CONTINUOUS): + self._buffers_to_queue.put(_curr_buf) + else: + # single shot mode - just flag overflow + logger.error("PCO: buffer error in single-shot mode -> hardware_overflowed=True") + self.hardware_overflowed = True else: # use it self._full_buffers.put(_curr_buf) @@ -201,9 +216,23 @@ def _poll_loop(self): else: # Other wait error: recycle buffer, flag error self._consecutive_errors += 1 + + # use GetLastError and FormatMessage to get more info + err_code = k32_dll.GetLastError() + msg_buffer = ctypes.create_string_buffer(256) + k32_dll.FormatMessageA( + 0x00000000, + None, + err_code, + 0, + msg_buffer, + len(msg_buffer), + None + ) + logger.error( - "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d. Recycling.", - wait_status, _curr_buf + "PCO: WaitForSingleObject returned unexpected status 0x%08X for buffer %d.\n GetLastError: %d\n Message: %s\n Recycling buffer.", + wait_status, _curr_buf, err_code, msg_buffer.value.decode() ) k32_dll.ResetEvent(self._buf_event[_curr_buf]) @@ -533,7 +562,13 @@ def StopAq(self): self._n_queued = 0 self._n_timeouts = 0 self.hardware_overflowed = False + + # close and clear buffer events + # (Prevents us from leaking handles when running for a long time with many start/stops) + for ev in self._buf_event: + k32_dll.CloseHandle(ev) self._buf_event = [] + self._buf_addr = [] self._buf_status_addr = [] self._buffer = None From 3d0f7afbf75633a44812c455e915bd11328d6572 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Fri, 19 Dec 2025 14:52:49 +1300 Subject: [PATCH 3/5] fix potential mutability of buffer and handle indices --- PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py index 92c3f9bd3..7491cd67b 100644 --- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py +++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py @@ -20,6 +20,7 @@ import queue import threading import time +import copy k32_dll = ctypes.windll.kernel32 # lets us use the recommended WaitForSingleObject call (see pco.sdk) # instead of the not-recommended-for-polling pco_sdk.get_buffer_status() @@ -580,7 +581,7 @@ def TriggerAq(self): res = pco_sdk.force_trigger(self._handle) # FIFO queue the queable buffers so we don't # grab images before a trigger in _poll_loop - self._buffers_to_queue.put(self._i) + self._buffers_to_queue.put(copy.deepcopy(self._i)) # ensure we put a copy of the index, not a reference self._i += 1 if self._i >= self._n_buffers: self._i = 0 From 6f0bb267ebddd67365ab34bf8bbb4c8d84cd0659 Mon Sep 17 00:00:00 2001 From: David Baddeley Date: Fri, 19 Dec 2025 15:01:14 +1300 Subject: [PATCH 4/5] docs on timeout --- PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py index 7491cd67b..760f7b7fe 100644 --- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py +++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py @@ -519,9 +519,12 @@ def _init_buffers(self): @property def _timeout(self): - # set the time longer for all hardware check - # for example, LC settling time is usually set at 300 ms (current minimum is 150 ms) - return int(max(2*100*self.GetCycleTime(), 1000)) + # set the timeout (ms)based on cycle time + # timeout should be longer than cycle time + readout time + some margin for windows timing uncertainty (default to 2x cycle time) + # timeout should not be too long, otherwise one bad buffer can block things and cause overflows. + # TODO: reduce the minimum timeout to something more reasonable (e.g. 100 ms)?? Current hardcoded value of 1s is very likely excessive + # and may cause issues, especially if the camera is running at a high frame rate. + return int(max(2*1000*self.GetCycleTime(), 1000)) def StartExposure(self): #logger.debug(f'PcoSdkCam: StartExposure called from thread {threading.current_thread().name} at {time.time()}') From 5bf88771623ee0113f46a04471e8c7728130cec1 Mon Sep 17 00:00:00 2001 From: Yujin-Bao Date: Sun, 21 Dec 2025 13:02:15 -0500 Subject: [PATCH 5/5] queue buffer before TriggerAq --- PYME/Acquire/Hardware/pco/pco_sdk_cam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py index 760f7b7fe..1c31ba570 100644 --- a/PYME/Acquire/Hardware/pco/pco_sdk_cam.py +++ b/PYME/Acquire/Hardware/pco/pco_sdk_cam.py @@ -581,10 +581,11 @@ def StopAq(self): def TriggerAq(self): if (self._mode == self.MODE_SINGLE_SHOT) or (self._mode == self.MODE_SOFTWARE_TRIGGER): - res = pco_sdk.force_trigger(self._handle) + # res = pco_sdk.force_trigger(self._handle) # FIFO queue the queable buffers so we don't # grab images before a trigger in _poll_loop self._buffers_to_queue.put(copy.deepcopy(self._i)) # ensure we put a copy of the index, not a reference + res = pco_sdk.force_trigger(self._handle) self._i += 1 if self._i >= self._n_buffers: self._i = 0