From 4e56d19dd8d3e4fb54ec8bcbec2669c248b056d9 Mon Sep 17 00:00:00 2001 From: Mohamed Hegazy Date: Fri, 12 Jun 2026 00:53:02 -0700 Subject: [PATCH] =?UTF-8?q?security-guidance:=20handle=20signal-killed=20v?= =?UTF-8?q?env=20builds=20(memory)=20+=20cooldown=20(2.0.5=20=E2=86=92=202?= =?UTF-8?q?.0.6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The real dominant Linux failure, identified by a CCR Linux repro. A CCR container reproduced the production signature — non-zero exit + EMPTY stdout + EMPTY stderr (~60k fires/day, 4,485 Linux users on 2.0.4): running `python -m venv` under a tight memory limit (ulimit -v) kills the memory-heavy venv+ensurepip/pip subprocess with SIGSEGV (-11, RLIMIT_AS) or SIGKILL (-9, kernel OOM-killer) BEFORE it writes anything. This is NOT the ensurepip/packaging case (that always writes to stderr, code 11) and NOT fixable by --target (a --target pip install is also memory-heavy and gets killed too). Three earlier hypotheses (stdout, packaging, Option A fixes Linux) were wrong — the repro corrected them. Changes: - Detect the signal kill (rc<0, or 128+sig: 134/137/139) in the venv/pip and --target paths → err_kind "signal_killed:" (new code 16). The returncode rides in a new sdk_bootstrap_rc metric so prod confirms which signal dominates (-9 OOM-killer vs -11 RLIMIT_AS). - Cooldown: on a signal kill, write a marker and return the new SKIP_COOLDOWN outcome (9) on subsequent sessions for 24h — stops the retry storm (every session was re-attempting a build that just gets re-killed, burning the user's memory/CPU). Retries once per window so a machine that frees memory still recovers. - --no-cache-dir on both pip installs (venv + --target) trims pip's peak memory; may get marginal machines under the OOM threshold. No happy-path change: signal detection is at the top of the existing failure handler; cooldown is checked only after all no-op probes (NOOP_SYSTEM/VENV/TARGET short-circuit first). Verified locally on macOS Python 3.13: - py_compile clean. - 35 new tests (test_signal_kill_cooldown.py): _is_signal_kill across signals/exit-codes, rc decode, signal_killed→code 16, cooldown lifecycle (none→write→expire), and an integration flow — simulated SIGKILL'd venv → BUILD_FAILED/signal_killed:-9 + cooldown written → 2nd run SKIP_COOLDOWN without re-attempting → retry after window; non-signal failure does NOT cool down; --no-cache-dir present on both pip paths; sdk_bootstrap_rc emitted conditionally. - End-to-end harness: the full kill→categorize→cooldown→skip→retry chain confirmed in-process. The original CCR repro (ulimit -v ≤7000 KB → rc=-11, empty streams) is the ground truth this fix is built on. Can be re-validated on CCR with the same ulimit approach. Version 2.0.5 -> 2.0.6 per the per-PR-bump policy (#2114). Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude-plugin/marketplace.json | 2 +- .../.claude-plugin/plugin.json | 2 +- .../hooks/ensure_agent_sdk.py | 100 +++++++++++++++++- 3 files changed, 100 insertions(+), 4 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index ab47e2c9..060e8497 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -2387,7 +2387,7 @@ { "name": "security-guidance", "description": "Security review for Claude-generated code. Pattern-based warnings on edits, LLM-powered diff review on Stop, and an agentic commit reviewer that catches injection, XSS, SSRF, hardcoded secrets, and 25+ other vulnerability classes.", - "version": "2.0.5", + "version": "2.0.6", "author": { "name": "Anthropic", "email": "support@anthropic.com" diff --git a/plugins/security-guidance/.claude-plugin/plugin.json b/plugins/security-guidance/.claude-plugin/plugin.json index 83813a4a..f4cc218e 100644 --- a/plugins/security-guidance/.claude-plugin/plugin.json +++ b/plugins/security-guidance/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "security-guidance", - "version": "2.0.5", + "version": "2.0.6", "description": "Security review for Claude-generated code. Pattern-based warnings on edits, LLM-powered diff review on Stop, and an agentic commit reviewer that catches injection, XSS, SSRF, hardcoded secrets, and 25+ other vulnerability classes.", "author": { "name": "David Dworken", diff --git a/plugins/security-guidance/hooks/ensure_agent_sdk.py b/plugins/security-guidance/hooks/ensure_agent_sdk.py index 69df6a22..23312baa 100644 --- a/plugins/security-guidance/hooks/ensure_agent_sdk.py +++ b/plugins/security-guidance/hooks/ensure_agent_sdk.py @@ -49,6 +49,17 @@ HOOK_PY_INCOMPATIBLE = 6 # hook interpreter is <3.10 — SDK syntax can't load # single-shot review. See #2154 follow-up. BUILT_TARGET = 7 # venv ensurepip failed → SDK pip-installed via --target NOOP_TARGET = 8 # --target libs already present and importable +SKIP_COOLDOWN = 9 # a recent build was signal-killed (memory pressure) — not + # retrying this session to avoid burning the user's + # memory/CPU on a build that keeps getting killed. CCR + # repro confirmed the dominant Linux BUILD_FAILED is a + # SIGKILL/SIGSEGV of the memory-heavy venv+pip subprocess + # (rc<0, empty streams). See #2154 follow-up. + +# How long to skip rebuilds after a signal kill. Retries at most once per +# window so a machine whose memory frees up still recovers (just not every +# session). Keyed by marker mtime. +SIGNAL_KILL_COOLDOWN_SEC = 24 * 3600 # Phase + err-kind integer encoding for sdk_bootstrap_phase / sdk_bootstrap_err. @@ -85,6 +96,11 @@ SDK_BOOTSTRAP_ERR_CODES = { "proxy_auth": 8, "stderr_timeout": 9, # pip stderr containing "timeout"/"timed out" "subprocess_timeout": 10, # subprocess.TimeoutExpired (>120s) + "signal_killed": 16, # venv/pip subprocess killed by a signal + # (rc<0 or 128+sig) — OOM-killer SIGKILL / + # RLIMIT_AS SIGSEGV, empty streams. The + # actual rc rides in sdk_bootstrap_rc. This + # is the dominant Linux failure (CCR repro). # Venv-stage specific categories added after PR #2112 telemetry surfaced # 2,406 phase=2/err=99 sessions in the first 3h of v2.0.1 — venv phase # failing in ways the original pip-flavored patterns didn't catch. These @@ -165,6 +181,10 @@ def _encode_err_kind(s): return 0 if s in SDK_BOOTSTRAP_ERR_CODES: return SDK_BOOTSTRAP_ERR_CODES[s] + # "signal_killed:" carries the returncode in sdk_bootstrap_rc; the + # category maps to the signal_killed code. + if s.startswith("signal_killed"): + return SDK_BOOTSTRAP_ERR_CODES["signal_killed"] # Prefix matches for the catch-all categories if s.startswith("exc:") or s.startswith("other:") or s == "other": return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"] @@ -172,6 +192,52 @@ def _encode_err_kind(s): return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"] +def _encode_rc(err_kind): + """Extract the subprocess returncode embedded in a 'signal_killed:' + err_kind (e.g. -11 SIGSEGV / -9 SIGKILL / 139 shell-wrapped). Emitted as + sdk_bootstrap_rc so BQ can tell OOM-killer (-9) from RLIMIT_AS (-11). + Returns 0 when absent/non-numeric.""" + if not err_kind or not err_kind.startswith("signal_killed:"): + return 0 + try: + return int(err_kind.split(":", 1)[1]) + except (ValueError, IndexError): + return 0 + + +def _is_signal_kill(returncode) -> bool: + """A subprocess killed by a signal rather than a clean non-zero exit. + subprocess.run (no shell, as used here) reports negative rc = -signum + (SIGKILL→-9 OOM-killer, SIGSEGV→-11 RLIMIT_AS, SIGABRT→-6). The 128+sig + forms (134/137/139) are defensive for any shell-wrapped path. Paired with + empty stdout+stderr this is the memory-kill signature (CCR repro).""" + if returncode is None: + return False + return returncode < 0 or returncode in (134, 137, 139) + + +def _cooldown_remaining(state_dir) -> float: + """Seconds left in the signal-kill cooldown (0 if none/expired). Reads the + marker's mtime; a missing/unreadable marker means not in cooldown.""" + marker = Path(state_dir) / "agent-sdk-venv.cooldown" + try: + age = time.time() - marker.stat().st_mtime + except OSError: + return 0.0 + return max(0.0, SIGNAL_KILL_COOLDOWN_SEC - age) + + +def _write_cooldown(state_dir) -> None: + """Start/refresh the signal-kill cooldown so we stop re-attempting a build + that keeps getting killed every session. Best-effort.""" + try: + Path(state_dir).mkdir(parents=True, exist_ok=True) + (Path(state_dir) / "agent-sdk-venv.cooldown").write_text( + time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())) + except OSError: + pass + + def _encode_stderr_sig(err_kind): """Bounded integer hash of the stderr tail captured in "other:" err_kinds. Lets us distinguish patterns INSIDE the _uncategorized @@ -326,12 +392,17 @@ def _build_via_target(state_dir) -> tuple[int, str, str]: subprocess.run( [sys.executable, "-m", "pip", "install", "--target", str(target), "--upgrade", - "--disable-pip-version-check", "--prefer-binary", + "--disable-pip-version-check", "--prefer-binary", "--no-cache-dir", "claude-agent-sdk"], capture_output=True, timeout=120, check=True, ) return BUILT_TARGET, "", "" except subprocess.CalledProcessError as e: + # A --target pip install is also memory-heavy, so it too can be + # signal-killed under memory pressure — cool down, same as the venv path. + if _is_signal_kill(e.returncode): + _write_cooldown(state_dir) + return BUILD_FAILED, "pip_target", f"signal_killed:{e.returncode}" return BUILD_FAILED, "pip_target", _pip_err_from_stderr(e.stderr) except subprocess.TimeoutExpired: return BUILD_FAILED, "pip_target", "subprocess_timeout" @@ -436,6 +507,14 @@ def main() -> tuple[int, str, str]: if _target_sdk_importable(state_dir): return NOOP_TARGET, "", "" + # If a recent build was signal-killed (memory pressure), don't re-attempt + # this session — the memory-heavy venv+pip just gets killed again, burning + # the user's resources. Retry at most once per cooldown window. Reached + # only after all no-op probes, so a machine that later gets the SDK via + # system/venv/target still short-circuits above. + if _cooldown_remaining(state_dir) > 0: + return SKIP_COOLDOWN, "", "" + err_phase = "" err_kind = "" we_own_sentinel = False @@ -468,14 +547,25 @@ def main() -> tuple[int, str, str]: # --prefer-binary tells pip to pick it. Cross-platform safe: no-op # on platforms where the latest version already has a wheel. err_phase = "pip" + # --no-cache-dir trims pip's peak memory (no cache read/write/unpack + # buffering) — helps marginal low-memory machines get under the OOM + # threshold that kills the dominant Linux builds (CCR repro). subprocess.run( [str(venv_py), "-m", "pip", "install", "--quiet", - "--disable-pip-version-check", "--prefer-binary", + "--disable-pip-version-check", "--prefer-binary", "--no-cache-dir", "claude-agent-sdk"], capture_output=True, timeout=120, check=True, ) return BUILT, "", "" except subprocess.CalledProcessError as e: + # Signal kill (OOM-killer SIGKILL / RLIMIT_AS SIGSEGV) — rc<0, empty + # streams. The dominant Linux failure. Record the rc, start a cooldown + # so we stop retry-storming a build that keeps getting killed, and + # skip the stderr categorization (there's nothing in stderr). err_phase + # says whether it died creating the venv or installing via pip. + if _is_signal_kill(e.returncode): + _write_cooldown(state_dir) + return BUILD_FAILED, err_phase, f"signal_killed:{e.returncode}" # Capture a stderr fingerprint so telemetry can split BUILD_FAILED by # root cause (no-network, package-not-found, dns-fail, etc.). # Categorize first, then keep a short raw tail for the long tail of @@ -683,6 +773,12 @@ if __name__ == "__main__": exc_errno = _encode_errno(err_kind) if exc_errno: metrics["sdk_bootstrap_errno"] = exc_errno + # Subprocess returncode for signal kills (-9 OOM-killer / -11 + # RLIMIT_AS / -6 abort). Confirms in prod which signal dominates the + # Linux memory-kill bucket. 0 (omitted) for non-signal failures. + rc = _encode_rc(err_kind) + if rc: + metrics["sdk_bootstrap_rc"] = rc # venv_ensurepip_fail (code 11) is the top categorizable venv # failure, and telemetry shows it's NOT just Debian — macOS has the # most distinct affected users. Probe whether this interpreter has