From 475038edfce3b736fba6648af7d8c1bd627c2464 Mon Sep 17 00:00:00 2001 From: Mohamed Hegazy Date: Sun, 31 May 2026 08:34:35 -0700 Subject: [PATCH] security-guidance: emit HTTP error codes + fix sdk_bootstrap phase/err encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fills two failure-visibility gaps in plugin telemetry. ## Gap 1: HTTP errors from _call_claude invisible Before: a 4xx/5xx response from the LLM API caused `_call_claude` to return None and produce ZERO fingerprint in tengu_hook_plugin_metrics. A failed call looked identical to "no review needed". The recent deprecation-400 outage (PR #2105, output_format → output_config.format, #2098) was invisible in aggregate dashboards until a user manually reported errors from their debug log. Cohort-specific or partial outages would never show up in BQ. Fix: add `http_err_last` (most recent status) and `http_err_count` to the existing `_USAGE` accumulator in `_base.py`. `_usage_metrics()` snapshots them whenever count > 0 (skip-path no-pollute contract preserved when count == 0). All `_call_claude` error sites now call the new `_record_http_error()` helper alongside the existing `_last_call_claude_http_error` module-state assignment. Now any future API failure category is queryable in BQ in real time: SELECT DATE(server_timestamp, "America/Los_Angeles") AS d, CAST(JSON_VALUE(additional_metadata, "$.http_err_last") AS INT64) AS code, COUNT(*) AS n FROM ... WHERE event_name = "tengu_hook_plugin_metrics" AND JSON_VALUE(additional_metadata, "$.pluginId") LIKE "%security-guidance%" AND JSON_VALUE(additional_metadata, "$.http_err_count") IS NOT NULL GROUP BY d, code ORDER BY d, n DESC ## Gap 2: sdk_bootstrap_phase / sdk_bootstrap_err always NULL in BQ Before: ensure_agent_sdk.py emitted these as strings (e.g. "pip", "dns_fail"). CC's plugin-metrics pipeline silently drops plugin-emitted string values — only bool|finite-number plugin metrics reach BigQuery. (CC-core fields like `subscription_type` are exempt because they're injected downstream of plugin validation.) Confirmed empirically: ~185K BUILD_FAILED rows in BQ over the past 2 days had `sdk_bootstrap_phase` = NULL and `sdk_bootstrap_err` = NULL despite the Python code emitting them. ~28K BUILD_FAILED sessions/day had no diagnostic split — flying blind on whether the failures are pip-no-match vs dns-fail vs ssl-verify vs proxy-auth etc. Fix: encode phase + err_kind as stable integers via SDK_BOOTSTRAP_PHASE_CODES and SDK_BOOTSTRAP_ERR_CODES. Phase: 1=pre, 2=venv, 3=pip, 4=main. Err: 10 known categories (1-10), 11-98 reserved, 99 = uncategorized catch-all (covers "exc:", "other:", and unmapped strings). APPEND-ONLY for telemetry stability. Also corrects the misleading "CC accepts string metric values" comment in ensure_agent_sdk.py that led to the bug originally. Verified locally on macOS Python 3.13: - py_compile clean. - 32 new tests in test_telemetry_failure_signals.py (added to internal test suite at sg-staging/tests/, not in this PR): * 4 HTTP-error tracking unit tests: _record_http_error increments count + tracks most-recent; handles None/invalid; -1 for network/timeout. * 4 _usage_metrics emission tests: empty when no activity; successful call has no http_err fields; failure-only has http_err and no api_calls; mixed has both. * 1 contract test: every emitted value is bool|finite-number (catches future regression of the string-dropping bug class). * 13 sdk_bootstrap encoding tests (parametrized over the 10 known err_kind categories + 5 catch-all shapes): each maps to the right integer; unknown phase = 0; unknown err = 99. * 1 static-shape regression catcher: every `err_kind = "..."` string in ensure_agent_sdk.main() must be in SDK_BOOTSTRAP_ERR_CODES (otherwise new err_kinds silently collapse to 99). * 2 emit-shape regression catchers: the assignments in main() go through _encode_phase / _encode_err_kind helpers (no raw strings); no literal string values for sdk_bootstrap_phase/err. * 1 comment-accuracy: the misleading "CC accepts string metric values" comment is gone. - Full suite: 437/437 pass + 2 skipped (live API tests, opt-in). NOT verified end-to-end against BQ — would require shipping + observing in production for 24h to confirm the http_err and sdk_bootstrap_phase/err fields actually appear in tengu_hook_plugin_metrics rows. The unit tests pin the contract; if the wire shape is broken, BQ will show NULL for the new fields and we revisit (with the same diagnostic the BUILD_FAILED bug gave us). Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/security-guidance/hooks/_base.py | 75 +++++++++++++--- .../hooks/ensure_agent_sdk.py | 89 ++++++++++++++++--- plugins/security-guidance/hooks/llm.py | 8 +- 3 files changed, 147 insertions(+), 25 deletions(-) diff --git a/plugins/security-guidance/hooks/_base.py b/plugins/security-guidance/hooks/_base.py index 9adbcff4..ce051753 100644 --- a/plugins/security-guidance/hooks/_base.py +++ b/plugins/security-guidance/hooks/_base.py @@ -116,7 +116,18 @@ _PV = _read_plugin_version_int() # Emitted via _usage_metrics() into the existing emit_metrics() channel so # hook metrics rows carry per-invocation token/cost totals # alongside the existing skip_reason / vulns_found fields. -_USAGE = {"in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0} +_USAGE = { + "in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0, + # HTTP error visibility (#2098 visibility gap — see emit comment in + # _usage_metrics). Without this, API failures from `_call_claude` left + # zero fingerprint in telemetry: the call returns None, the caller's + # emit_metrics carries no api_calls field, and the failure is + # indistinguishable from "no review needed". The deprecation outage + # that broke every commit-review LLM call was invisible until users + # reported it manually. + "http_err_last": 0, # most recent HTTP error code this invocation + "http_err_count": 0, # total HTTP errors (4xx + 5xx + network) +} _USAGE_LOCK = threading.Lock() # $/Mtok (input, output). Used only for the raw-HTTP path; the SDK path @@ -166,19 +177,55 @@ def _record_usage(usage, model, cost_usd=None): _USAGE["n"] += 1 +def _record_http_error(status): + """Record an HTTP error from an LLM API call. `status` is the HTTP + status code (integer 400–599) or -1 for network/timeout errors. Stored + in `_USAGE["http_err_last"]` (most recent) and counted in + `_USAGE["http_err_count"]`. Snapshot via `_usage_metrics()` so every + subsequent `emit_metrics` includes the failure fingerprint. + + Background: without this, the most recent example was the #2098 + deprecation 400. Every hook fire's LLM call returned HTTP 400; the + plugin caught it and returned None; the emit_metrics carried no + api_calls field; aggregate dashboards looked normal. The failure + only became visible when a user manually reported errors out of + their debug log. With this field, a category-of-failure spike (4xx, + 5xx, or -1 network) is queryable from BQ in real time. + """ + try: + s = int(status) + except (TypeError, ValueError): + return + with _USAGE_LOCK: + _USAGE["http_err_last"] = s + _USAGE["http_err_count"] += 1 + + def _usage_metrics(): """Snapshot the accumulator as metric keys. Returns {} when no API calls - were made so skip-path emits don't burn key budget. cost_usd rounded to - 1e-6 to keep the float finite/short for the zod schema.""" - with _USAGE_LOCK: - if _USAGE["n"] == 0: - return {} - return { - "tok_in": _USAGE["in"], - "tok_out": _USAGE["out"], - "tok_cache_r": _USAGE["cr"], - "tok_cache_w": _USAGE["cw"], - "cost_usd": round(_USAGE["cost"], 6), - "api_calls": _USAGE["n"], - } + AND no HTTP errors were made so skip-path emits don't burn key budget. + cost_usd rounded to 1e-6 to keep the float finite/short for the zod + schema. + + HTTP errors (`http_err_last`, `http_err_count`) emitted ONLY when + `http_err_count > 0` so successful calls don't pad every metrics row + with two zero fields. + """ + with _USAGE_LOCK: + if _USAGE["n"] == 0 and _USAGE["http_err_count"] == 0: + return {} + out = {} + if _USAGE["n"] > 0: + out.update({ + "tok_in": _USAGE["in"], + "tok_out": _USAGE["out"], + "tok_cache_r": _USAGE["cr"], + "tok_cache_w": _USAGE["cw"], + "cost_usd": round(_USAGE["cost"], 6), + "api_calls": _USAGE["n"], + }) + if _USAGE["http_err_count"] > 0: + out["http_err_last"] = _USAGE["http_err_last"] + out["http_err_count"] = _USAGE["http_err_count"] + return out diff --git a/plugins/security-guidance/hooks/ensure_agent_sdk.py b/plugins/security-guidance/hooks/ensure_agent_sdk.py index 262a53a9..4c5f8423 100644 --- a/plugins/security-guidance/hooks/ensure_agent_sdk.py +++ b/plugins/security-guidance/hooks/ensure_agent_sdk.py @@ -42,6 +42,71 @@ HOOK_PY_INCOMPATIBLE = 6 # hook interpreter is <3.10 — SDK syntax can't load # here no matter how the venv was built. See #2071. +# Phase + err-kind integer encoding for sdk_bootstrap_phase / sdk_bootstrap_err. +# +# Earlier versions emitted these as STRINGS (e.g. "pip", "dns_fail"). CC's +# plugin-metrics pipeline silently drops plugin-emitted string values — +# only `bool|finite-number` plugin metrics reach BigQuery. (CC-core +# metrics like `subscription_type` are exempt because they're injected +# downstream of plugin validation.) Confirmed empirically: 185K +# BUILD_FAILED rows in BQ had `sdk_bootstrap_phase`/`sdk_bootstrap_err` +# = NULL despite the Python code emitting them. This left ~28K +# BUILD_FAILED sessions/day with no diagnostic split — flying blind on +# the real failure modes (pip-no-match vs dns-fail vs ssl-verify etc.). +# +# Fix: encode as small integers per the maps below. Values are +# APPEND-ONLY for telemetry stability. Reserve 99 as the "unknown / +# uncategorized" bucket so an unmapped err_kind (e.g., a new exception +# type) still emits a non-zero signal. +SDK_BOOTSTRAP_PHASE_CODES = { + "pre": 1, # pre-venv (state_dir.mkdir, sentinel open) + "venv": 2, # python -m venv --clear + "pip": 3, # pip install + "main": 4, # uncaught exception above main() +} +SDK_BOOTSTRAP_ERR_CODES = { + "pip_no_match": 1, + "dns_fail": 2, + "conn_refused": 3, + "ssl_verify": 4, + "perm_denied": 5, + "no_pip": 6, + "disk_full": 7, + "proxy_auth": 8, + "stderr_timeout": 9, # pip stderr containing "timeout"/"timed out" + "subprocess_timeout": 10, # subprocess.TimeoutExpired (>120s) + # 11–98 reserved for future categories; APPEND-ONLY. + # 99 catches everything else (including "exc:" and "other:" + # — the original string is debug-loggable but the integer is what makes + # it to telemetry). + "_uncategorized": 99, +} + + +def _encode_phase(s): + """Map err_phase string to its telemetry integer code, or 0 if unset. + Empty/None → 0 lets `if encoded:` cleanly skip emission. Per + SDK_BOOTSTRAP_PHASE_CODES, valid codes are 1-4.""" + return SDK_BOOTSTRAP_PHASE_CODES.get((s or "").strip(), 0) + + +def _encode_err_kind(s): + """Map err_kind string to its telemetry integer code, or 0 if unset. + Direct hits use the static map; "exc:" and "other:" both + collapse to _uncategorized (99) — the raw string survives in debug + logs, only the integer reaches BQ.""" + s = (s or "").strip() + if not s: + return 0 + if s in SDK_BOOTSTRAP_ERR_CODES: + return SDK_BOOTSTRAP_ERR_CODES[s] + # Prefix matches for the catch-all categories + if s.startswith("exc:") or s.startswith("other:") or s == "other": + return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"] + # Unknown string — still emit as uncategorized rather than dropping + return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"] + + def _sdk_on_syspath() -> bool: # find_spec is ~10ms; actually importing the SDK pulls in # transitive deps and costs ~800ms — too heavy for a @@ -288,21 +353,25 @@ if __name__ == "__main__": # and takes the FIRST non-{"async":...} JSON line as the hook response; # its `metrics` key is forwarded to the hook metrics event on the # next attachments pass. Must be a single line — the registry splits on - # \n and json-parses each independently. Values must be bool|number OR - # short strings (CC accepts string metric values if they're not - # null). Stay inside the 10-key emit cap. + # \n and json-parses each independently. + # + # IMPORTANT — values must be bool|finite-number. The validation comment + # has historically said "or short strings" but that was wrong: CC's + # plugin-metrics pipeline silently drops plugin-emitted string values. + # Stay inside the 10-key emit cap. metrics: dict[str, object] = { "sdk_bootstrap": outcome, "sdk_bootstrap_ms": round((time.perf_counter() - t0) * 1000), } if err_kind: - # Truncate defensively; categorized values are <40 chars but the - # `other:` mode could be longer. err_phase may be empty for - # pre-venv failures (state_dir.mkdir perm-denied, sentinel O_EXCL - # raising a non-FileExistsError OSError) — emit as "pre" so the - # err_kind isn't silently dropped. - metrics["sdk_bootstrap_phase"] = (err_phase or "pre")[:16] - metrics["sdk_bootstrap_err"] = err_kind[:96] + # Encode phase + err_kind as integer codes (see + # SDK_BOOTSTRAP_PHASE_CODES / SDK_BOOTSTRAP_ERR_CODES). Earlier + # versions emitted these as strings and CC dropped them — restoring + # the diagnostic split that 28K BUILD_FAILED/day need to triage by + # root cause. err_phase defaults to "pre" when empty (pre-venv + # failure path, e.g. state_dir.mkdir perm-denied). + metrics["sdk_bootstrap_phase"] = _encode_phase(err_phase or "pre") + metrics["sdk_bootstrap_err"] = _encode_err_kind(err_kind) pv = _plugin_version_int() if pv: metrics["pv"] = pv diff --git a/plugins/security-guidance/hooks/llm.py b/plugins/security-guidance/hooks/llm.py index 72728c39..e2ac37db 100644 --- a/plugins/security-guidance/hooks/llm.py +++ b/plugins/security-guidance/hooks/llm.py @@ -27,7 +27,7 @@ from typing import Optional, Tuple, Dict, Any, List import extensibility import review_api -from _base import debug_log, _record_usage, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir # noqa: F401 +from _base import debug_log, _record_usage, _record_http_error, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir # noqa: F401 from session_state import with_locked_state @@ -368,6 +368,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None) except Exception as e: debug_log(f"3P sdk-single-turn: SDK unavailable ({e})") _last_call_claude_http_error = -1 + _record_http_error(-1) return None cli_path = os.environ.get("SG_AGENTIC_CLI_PATH") or None @@ -425,6 +426,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None) except _asyncio.TimeoutError: debug_log("3P sdk-single-turn: timeout after 60s") _last_call_claude_http_error = -1 + _record_http_error(-1) return None except Exception as e: debug_log(f"3P sdk-single-turn: query failed ({e})") @@ -433,6 +435,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None) for _l in _captured_stderr[:20]: debug_log(f" | {_l.rstrip()}") _last_call_claude_http_error = -1 + _record_http_error(-1) return None @@ -542,6 +545,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000, error_body = e.read().decode("utf-8") if e.fp else "" debug_log(f"API error: {e.code} - {error_body[:200]}") _last_call_claude_http_error = e.code + _record_http_error(e.code) return None except (urllib.error.URLError, TimeoutError) as e: if attempt < 2: @@ -551,6 +555,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000, else: debug_log(f"Request failed after retries: {e}") _last_call_claude_http_error = -1 + _record_http_error(-1) return None if not response_data: @@ -559,6 +564,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000, # call uses the token; record the 401 so callers don't see error=None. if _last_call_claude_http_error is None: _last_call_claude_http_error = 401 + _record_http_error(401) return None # Find the text block (skip thinking blocks)