Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Fix flaky tests in test_external_inspection
Several tests calling unwinder.get_stack_trace() were flaky because they
used retry loops without exception handling. Transient failures like
"Failed to parse initial frame in chain" that occur when sampling at an
inopportune moment would immediately fail the test instead of being
retried.

The fix adds a _get_stack_trace_with_retry helper function and updates
seven locations to use busy_retry with contextlib.suppress for OSError
and RuntimeError, matching the existing pattern in
_get_frames_with_retry. This allows transient failures to be silently
retried while still timing out if the expected condition is never met.
  • Loading branch information
pablogsal committed Dec 23, 2025
commit 691c8dcb53bbc23308cab5279f02f0b5fa68ff36
171 changes: 95 additions & 76 deletions Lib/test/test_external_inspection.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,26 @@ def get_all_awaited_by(pid):
raise RuntimeError("Failed to get all awaited_by after retries")


def _get_stack_trace_with_retry(unwinder, timeout=SHORT_TIMEOUT):
    """Get stack trace from an existing unwinder with retry for transient errors.

    This handles the case where we want to reuse an existing RemoteUnwinder
    instance but still handle transient failures like "Failed to parse initial
    frame in chain" that can occur when sampling at an inopportune moment.

    Args:
        unwinder: Object exposing ``get_stack_trace()``; it is reused across
            attempts rather than re-created.
        timeout: Value handed to ``busy_retry`` to bound the retry loop.
            Defaults to ``SHORT_TIMEOUT``.

    Returns:
        The result of the first ``unwinder.get_stack_trace()`` call that does
        not raise ``OSError`` or ``RuntimeError``.

    Raises:
        RuntimeError: If the retry loop ends without a successful call. The
            last ``OSError``/``RuntimeError`` observed is chained as the
            cause so its traceback is not lost.
    """
    last_error = None
    for _ in busy_retry(timeout):
        try:
            return unwinder.get_stack_trace()
        except (OSError, RuntimeError) as e:
            # Remember the most recent failure for the final error report,
            # then back off briefly before sampling again.
            last_error = e
            time.sleep(0.1)
    # NOTE(review): if busy_retry raises on timeout by itself (presumably the
    # test.support default -- confirm), this statement is never reached and
    # the timeout surfaces as busy_retry's own exception instead.
    raise RuntimeError(
        f"Failed to get stack trace after retries: {last_error}"
    ) from last_error


# ============================================================================
# Base test class with shared infrastructure
# ============================================================================
Expand Down Expand Up @@ -1704,16 +1724,16 @@ def main_work():

# Get stack trace with all threads
unwinder_all = RemoteUnwinder(p.pid, all_threads=True)
for _ in range(MAX_TRIES):
all_traces = unwinder_all.get_stack_trace()
found = self._find_frame_in_trace(
all_traces,
lambda f: f.funcname == "main_work"
and f.location.lineno > 12,
)
if found:
break
time.sleep(0.1)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
all_traces = unwinder_all.get_stack_trace()
found = self._find_frame_in_trace(
all_traces,
lambda f: f.funcname == "main_work"
and f.location.lineno > 12,
)
if found:
break
else:
self.fail(
"Main thread did not start its busy work on time"
Expand All @@ -1723,7 +1743,7 @@ def main_work():
unwinder_gil = RemoteUnwinder(
p.pid, only_active_thread=True
)
gil_traces = unwinder_gil.get_stack_trace()
gil_traces = _get_stack_trace_with_retry(unwinder_gil)

# Count threads
total_threads = sum(
Expand Down Expand Up @@ -1998,15 +2018,15 @@ def busy():
mode=mode,
skip_non_matching_threads=False,
)
for _ in range(MAX_TRIES):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if check_condition(
statuses, sleeper_tid, busy_tid
):
break
time.sleep(0.5)
if check_condition(
statuses, sleeper_tid, busy_tid
):
break

return statuses, sleeper_tid, busy_tid
finally:
Expand Down Expand Up @@ -2150,29 +2170,29 @@ def busy_thread():
mode=PROFILING_MODE_ALL,
skip_non_matching_threads=False,
)
for _ in range(MAX_TRIES):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

# Check ALL mode provides both GIL and CPU info
if (
sleeper_tid in statuses
and busy_tid in statuses
and not (
statuses[sleeper_tid]
& THREAD_STATUS_ON_CPU
)
and not (
statuses[sleeper_tid]
& THREAD_STATUS_HAS_GIL
)
and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
and (
statuses[busy_tid] & THREAD_STATUS_HAS_GIL
)
):
break
time.sleep(0.5)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

# Check ALL mode provides both GIL and CPU info
if (
sleeper_tid in statuses
and busy_tid in statuses
and not (
statuses[sleeper_tid]
& THREAD_STATUS_ON_CPU
)
and not (
statuses[sleeper_tid]
& THREAD_STATUS_HAS_GIL
)
and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
and (
statuses[busy_tid] & THREAD_STATUS_HAS_GIL
)
):
break

self.assertIsNotNone(
sleeper_tid, "Sleeper thread id not received"
Expand Down Expand Up @@ -2296,18 +2316,18 @@ def test_thread_status_exception_detection(self):
mode=PROFILING_MODE_ALL,
skip_non_matching_threads=False,
)
for _ in range(MAX_TRIES):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if (
exception_tid in statuses
and normal_tid in statuses
and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
):
break
time.sleep(0.5)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if (
exception_tid in statuses
and normal_tid in statuses
and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
):
break

self.assertIn(exception_tid, statuses)
self.assertIn(normal_tid, statuses)
Expand Down Expand Up @@ -2339,18 +2359,18 @@ def test_thread_status_exception_mode_filtering(self):
mode=PROFILING_MODE_EXCEPTION,
skip_non_matching_threads=True,
)
for _ in range(MAX_TRIES):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if exception_tid in statuses:
self.assertNotIn(
normal_tid,
statuses,
"Normal thread should be filtered out in exception mode",
)
return
time.sleep(0.5)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if exception_tid in statuses:
self.assertNotIn(
normal_tid,
statuses,
"Normal thread should be filtered out in exception mode",
)
return

self.fail("Never found exception thread in exception mode")

Expand Down Expand Up @@ -2504,18 +2524,17 @@ def _check_exception_status(self, p, thread_tid, expect_exception):

# Collect multiple samples for reliability
results = []
for _ in range(MAX_TRIES):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if thread_tid in statuses:
has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
results.append(has_exc)
for _ in busy_retry(SHORT_TIMEOUT):
with contextlib.suppress(OSError, RuntimeError):
traces = unwinder.get_stack_trace()
statuses = self._get_thread_statuses(traces)

if len(results) >= 3:
break
if thread_tid in statuses:
has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
results.append(has_exc)

time.sleep(0.2)
if len(results) >= 3:
break

# Check majority of samples match expected
if not results:
Expand Down