From 475038edfce3b736fba6648af7d8c1bd627c2464 Mon Sep 17 00:00:00 2001
From: Mohamed Hegazy <mhegazy@anthropic.com>
Date: Sun, 31 May 2026 08:34:35 -0700
Subject: [PATCH] security-guidance: emit HTTP error codes + fix sdk_bootstrap
 phase/err encoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fills two failure-visibility gaps in plugin telemetry.

## Gap 1: HTTP errors from _call_claude invisible

Before: a 4xx/5xx response from the LLM API caused `_call_claude` to
return None and produce ZERO fingerprint in tengu_hook_plugin_metrics.
A failed call looked identical to "no review needed". The recent
deprecation-400 outage (PR #2105, output_format → output_config.format,
#2098) was invisible in aggregate dashboards until a user manually
reported errors from their debug log. Cohort-specific or partial
outages would never show up in BQ.

Fix: add `http_err_last` (most recent status) and `http_err_count`
to the existing `_USAGE` accumulator in `_base.py`. `_usage_metrics()`
snapshots them whenever count > 0 (skip-path no-pollute contract
preserved when count == 0). All `_call_claude` error sites now call
the new `_record_http_error()` helper alongside the existing
`_last_call_claude_http_error` module-state assignment.

Now any future API failure category is queryable in BQ in real time:

  SELECT
    DATE(server_timestamp, "America/Los_Angeles") AS d,
    CAST(JSON_VALUE(additional_metadata, "$.http_err_last") AS INT64) AS code,
    COUNT(*) AS n
  FROM ... WHERE event_name = "tengu_hook_plugin_metrics"
    AND JSON_VALUE(additional_metadata, "$.pluginId") LIKE "%security-guidance%"
    AND JSON_VALUE(additional_metadata, "$.http_err_count") IS NOT NULL
  GROUP BY d, code ORDER BY d, n DESC

## Gap 2: sdk_bootstrap_phase / sdk_bootstrap_err always NULL in BQ

Before: ensure_agent_sdk.py emitted these as strings (e.g. "pip",
"dns_fail"). CC's plugin-metrics pipeline silently drops
plugin-emitted string values — only bool|finite-number plugin metrics
reach BigQuery. (CC-core fields like `subscription_type` are exempt
because they're injected downstream of plugin validation.)

Confirmed empirically: ~185K BUILD_FAILED rows in BQ over the past 2
days had `sdk_bootstrap_phase` = NULL and `sdk_bootstrap_err` = NULL
despite the Python code emitting them. ~28K BUILD_FAILED sessions/day
had no diagnostic split — flying blind on whether the failures are
pip-no-match vs dns-fail vs ssl-verify vs proxy-auth etc.

Fix: encode phase + err_kind as stable integers via
SDK_BOOTSTRAP_PHASE_CODES and SDK_BOOTSTRAP_ERR_CODES. Phase: 1=pre,
2=venv, 3=pip, 4=main. Err: 10 known categories (1-10), 11-98
reserved, 99 = uncategorized catch-all (covers "exc:<X>", "other:<X>",
and unmapped strings). APPEND-ONLY for telemetry stability.

Also corrects the misleading "CC accepts string metric values"
comment in ensure_agent_sdk.py that led to the bug originally.

Verified locally on macOS Python 3.13:

  - py_compile clean.
  - 32 new tests in test_telemetry_failure_signals.py (added to
    internal test suite at sg-staging/tests/, not in this PR):

      * 4 HTTP-error tracking unit tests: _record_http_error increments
        count + tracks most-recent; handles None/invalid; -1 for
        network/timeout.
      * 4 _usage_metrics emission tests: empty when no activity;
        successful call has no http_err fields; failure-only has http_err
        and no api_calls; mixed has both.
      * 1 contract test: every emitted value is bool|finite-number
        (catches future regression of the string-dropping bug class).
      * 13 sdk_bootstrap encoding tests (parametrized over the 10 known
        err_kind categories + 5 catch-all shapes): each maps to the
        right integer; unknown phase = 0; unknown err = 99.
      * 1 static-shape regression catcher: every `err_kind = "..."`
        string in ensure_agent_sdk.main() must be in
        SDK_BOOTSTRAP_ERR_CODES (otherwise new err_kinds silently
        collapse to 99).
      * 2 emit-shape regression catchers: the assignments in main() go
        through _encode_phase / _encode_err_kind helpers (no raw
        strings); no literal string values for sdk_bootstrap_phase/err.
      * 1 comment-accuracy: the misleading "CC accepts string metric
        values" comment is gone.

  - Full suite: 437/437 pass + 2 skipped (live API tests, opt-in).

NOT verified end-to-end against BQ — would require shipping +
observing in production for 24h to confirm the http_err and
sdk_bootstrap_phase/err fields actually appear in
tengu_hook_plugin_metrics rows. The unit tests pin the contract; if
the wire shape is broken, BQ will show NULL for the new fields and we
revisit (with the same diagnostic the BUILD_FAILED bug gave us).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 plugins/security-guidance/hooks/_base.py      | 75 +++++++++++++---
 .../hooks/ensure_agent_sdk.py                 | 89 ++++++++++++++++---
 plugins/security-guidance/hooks/llm.py        |  8 +-
 3 files changed, 147 insertions(+), 25 deletions(-)
diff --git a/plugins/security-guidance/hooks/_base.py b/plugins/security-guidance/hooks/_base.py
index 9adbcff4..ce051753 100644
--- a/plugins/security-guidance/hooks/_base.py
+++ b/plugins/security-guidance/hooks/_base.py
@@ -116,7 +116,18 @@ _PV = _read_plugin_version_int()
 # Emitted via _usage_metrics() into the existing emit_metrics() channel so
 # hook metrics rows carry per-invocation token/cost totals
 # alongside the existing skip_reason / vulns_found fields.
-_USAGE = {"in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0}
+_USAGE = {
+    "in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0,
+    # HTTP error visibility (#2098 visibility gap — see emit comment in
+    # _usage_metrics). Without this, API failures from `_call_claude` left
+    # zero fingerprint in telemetry: the call returns None, the caller's
+    # emit_metrics carries no api_calls field, and the failure is
+    # indistinguishable from "no review needed". The deprecation outage
+    # that broke every commit-review LLM call was invisible until users
+    # reported it manually.
+    "http_err_last": 0,    # most recent HTTP error code this invocation
+    "http_err_count": 0,   # total HTTP errors (4xx + 5xx + network)
+}
 _USAGE_LOCK = threading.Lock()
 
 # $/Mtok (input, output). Used only for the raw-HTTP path; the SDK path
@@ -166,19 +177,55 @@ def _record_usage(usage, model, cost_usd=None):
         _USAGE["n"] += 1
 
 
+def _record_http_error(status):
+    """Record an HTTP error from an LLM API call. `status` is the HTTP
+    status code (integer 400–599) or -1 for network/timeout errors. Stored
+    in `_USAGE["http_err_last"]` (most recent) and counted in
+    `_USAGE["http_err_count"]`. Snapshot via `_usage_metrics()` so every
+    subsequent `emit_metrics` includes the failure fingerprint.
+
+    Background: without this, the most recent example was the #2098
+    deprecation 400. Every hook fire's LLM call returned HTTP 400; the
+    plugin caught it and returned None; the emit_metrics carried no
+    api_calls field; aggregate dashboards looked normal. The failure
+    only became visible when a user manually reported errors out of
+    their debug log. With this field, a category-of-failure spike (4xx,
+    5xx, or -1 network) is queryable from BQ in real time.
+    """
+    try:
+        s = int(status)
+    except (TypeError, ValueError):
+        return
+    with _USAGE_LOCK:
+        _USAGE["http_err_last"] = s
+        _USAGE["http_err_count"] += 1
+
+
 def _usage_metrics():
     """Snapshot the accumulator as metric keys. Returns {} when no API calls
-    were made so skip-path emits don't burn key budget. cost_usd rounded to
-    1e-6 to keep the float finite/short for the zod schema."""
-    with _USAGE_LOCK:
-        if _USAGE["n"] == 0:
-            return {}
-        return {
-            "tok_in": _USAGE["in"],
-            "tok_out": _USAGE["out"],
-            "tok_cache_r": _USAGE["cr"],
-            "tok_cache_w": _USAGE["cw"],
-            "cost_usd": round(_USAGE["cost"], 6),
-            "api_calls": _USAGE["n"],
-        }
+    AND no HTTP errors were made so skip-path emits don't burn key budget.
+    cost_usd rounded to 1e-6 to keep the float finite/short for the zod
+    schema.
+
+    HTTP errors (`http_err_last`, `http_err_count`) emitted ONLY when
+    `http_err_count > 0` so successful calls don't pad every metrics row
+    with two zero fields.
+    """
+    with _USAGE_LOCK:
+        if _USAGE["n"] == 0 and _USAGE["http_err_count"] == 0:
+            return {}
+        out = {}
+        if _USAGE["n"] > 0:
+            out.update({
+                "tok_in": _USAGE["in"],
+                "tok_out": _USAGE["out"],
+                "tok_cache_r": _USAGE["cr"],
+                "tok_cache_w": _USAGE["cw"],
+                "cost_usd": round(_USAGE["cost"], 6),
+                "api_calls": _USAGE["n"],
+            })
+        if _USAGE["http_err_count"] > 0:
+            out["http_err_last"] = _USAGE["http_err_last"]
+            out["http_err_count"] = _USAGE["http_err_count"]
+        return out
 
diff --git a/plugins/security-guidance/hooks/ensure_agent_sdk.py b/plugins/security-guidance/hooks/ensure_agent_sdk.py
index 262a53a9..4c5f8423 100644
--- a/plugins/security-guidance/hooks/ensure_agent_sdk.py
+++ b/plugins/security-guidance/hooks/ensure_agent_sdk.py
@@ -42,6 +42,71 @@ HOOK_PY_INCOMPATIBLE = 6  # hook interpreter is <3.10 — SDK syntax can't load
                           # here no matter how the venv was built. See #2071.
 
 
+# Phase + err-kind integer encoding for sdk_bootstrap_phase / sdk_bootstrap_err.
+#
+# Earlier versions emitted these as STRINGS (e.g. "pip", "dns_fail"). CC's
+# plugin-metrics pipeline silently drops plugin-emitted string values —
+# only `bool|finite-number` plugin metrics reach BigQuery. (CC-core
+# metrics like `subscription_type` are exempt because they're injected
+# downstream of plugin validation.) Confirmed empirically: 185K
+# BUILD_FAILED rows in BQ had `sdk_bootstrap_phase`/`sdk_bootstrap_err`
+# = NULL despite the Python code emitting them. This left ~28K
+# BUILD_FAILED sessions/day with no diagnostic split — flying blind on
+# the real failure modes (pip-no-match vs dns-fail vs ssl-verify etc.).
+#
+# Fix: encode as small integers per the maps below. Values are
+# APPEND-ONLY for telemetry stability. Reserve 99 as the "unknown /
+# uncategorized" bucket so an unmapped err_kind (e.g., a new exception
+# type) still emits a non-zero signal.
+SDK_BOOTSTRAP_PHASE_CODES = {
+    "pre":  1,  # pre-venv (state_dir.mkdir, sentinel open)
+    "venv": 2,  # python -m venv --clear
+    "pip":  3,  # pip install
+    "main": 4,  # uncaught exception above main()
+}
+SDK_BOOTSTRAP_ERR_CODES = {
+    "pip_no_match":       1,
+    "dns_fail":           2,
+    "conn_refused":       3,
+    "ssl_verify":         4,
+    "perm_denied":        5,
+    "no_pip":             6,
+    "disk_full":          7,
+    "proxy_auth":         8,
+    "stderr_timeout":     9,  # pip stderr containing "timeout"/"timed out"
+    "subprocess_timeout": 10, # subprocess.TimeoutExpired (>120s)
+    # 11–98 reserved for future categories; APPEND-ONLY.
+    # 99 catches everything else (including "exc:<TypeName>" and "other:<tail>"
+    # — the original string is debug-loggable but the integer is what makes
+    # it to telemetry).
+    "_uncategorized":     99,
+}
+
+
+def _encode_phase(s):
+    """Map err_phase string to its telemetry integer code, or 0 if unset.
+    Empty/None → 0 lets `if encoded:` cleanly skip emission. Per
+    SDK_BOOTSTRAP_PHASE_CODES, valid codes are 1-4."""
+    return SDK_BOOTSTRAP_PHASE_CODES.get((s or "").strip(), 0)
+
+
+def _encode_err_kind(s):
+    """Map err_kind string to its telemetry integer code, or 0 if unset.
+    Direct hits use the static map; "exc:<X>" and "other:<tail>" both
+    collapse to _uncategorized (99) — the raw string survives in debug
+    logs, only the integer reaches BQ."""
+    s = (s or "").strip()
+    if not s:
+        return 0
+    if s in SDK_BOOTSTRAP_ERR_CODES:
+        return SDK_BOOTSTRAP_ERR_CODES[s]
+    # Prefix matches for the catch-all categories
+    if s.startswith("exc:") or s.startswith("other:") or s == "other":
+        return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"]
+    # Unknown string — still emit as uncategorized rather than dropping
+    return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"]
+
+
 def _sdk_on_syspath() -> bool:
     # find_spec is ~10ms; actually importing the SDK pulls in
     # transitive deps and costs ~800ms — too heavy for a
@@ -288,21 +353,25 @@ if __name__ == "__main__":
     # and takes the FIRST non-{"async":...} JSON line as the hook response;
     # its `metrics` key is forwarded to the hook metrics event on the
     # next attachments pass. Must be a single line — the registry splits on
-    # \n and json-parses each independently. Values must be bool|number OR
-    # short strings (CC accepts string metric values if they're not
-    # null). Stay inside the 10-key emit cap.
+    # \n and json-parses each independently.
+    #
+    # IMPORTANT — values must be bool|finite-number. The validation comment
+    # has historically said "or short strings" but that was wrong: CC's
+    # plugin-metrics pipeline silently drops plugin-emitted string values.
+    # Stay inside the 10-key emit cap.
     metrics: dict[str, object] = {
         "sdk_bootstrap": outcome,
         "sdk_bootstrap_ms": round((time.perf_counter() - t0) * 1000),
     }
     if err_kind:
-        # Truncate defensively; categorized values are <40 chars but the
-        # `other:<tail>` mode could be longer. err_phase may be empty for
-        # pre-venv failures (state_dir.mkdir perm-denied, sentinel O_EXCL
-        # raising a non-FileExistsError OSError) — emit as "pre" so the
-        # err_kind isn't silently dropped.
-        metrics["sdk_bootstrap_phase"] = (err_phase or "pre")[:16]
-        metrics["sdk_bootstrap_err"] = err_kind[:96]
+        # Encode phase + err_kind as integer codes (see
+        # SDK_BOOTSTRAP_PHASE_CODES / SDK_BOOTSTRAP_ERR_CODES). Earlier
+        # versions emitted these as strings and CC dropped them — restoring
+        # the diagnostic split that 28K BUILD_FAILED/day need to triage by
+        # root cause. err_phase defaults to "pre" when empty (pre-venv
+        # failure path, e.g. state_dir.mkdir perm-denied).
+        metrics["sdk_bootstrap_phase"] = _encode_phase(err_phase or "pre")
+        metrics["sdk_bootstrap_err"] = _encode_err_kind(err_kind)
     pv = _plugin_version_int()
     if pv:
         metrics["pv"] = pv
diff --git a/plugins/security-guidance/hooks/llm.py b/plugins/security-guidance/hooks/llm.py
index 72728c39..e2ac37db 100644
--- a/plugins/security-guidance/hooks/llm.py
+++ b/plugins/security-guidance/hooks/llm.py
@@ -27,7 +27,7 @@ from typing import Optional, Tuple, Dict, Any, List
 
 import extensibility
 import review_api
-from _base import debug_log, _record_usage, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir  # noqa: F401
+from _base import debug_log, _record_usage, _record_http_error, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir  # noqa: F401
 from session_state import with_locked_state
 
 
@@ -368,6 +368,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
         except Exception as e:
             debug_log(f"3P sdk-single-turn: SDK unavailable ({e})")
             _last_call_claude_http_error = -1
+            _record_http_error(-1)
             return None
 
     cli_path = os.environ.get("SG_AGENTIC_CLI_PATH") or None
@@ -425,6 +426,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
     except _asyncio.TimeoutError:
         debug_log("3P sdk-single-turn: timeout after 60s")
         _last_call_claude_http_error = -1
+        _record_http_error(-1)
         return None
     except Exception as e:
         debug_log(f"3P sdk-single-turn: query failed ({e})")
@@ -433,6 +435,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
             for _l in _captured_stderr[:20]:
                 debug_log(f"  | {_l.rstrip()}")
         _last_call_claude_http_error = -1
+        _record_http_error(-1)
         return None
 
 
@@ -542,6 +545,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
                 error_body = e.read().decode("utf-8") if e.fp else ""
                 debug_log(f"API error: {e.code} - {error_body[:200]}")
                 _last_call_claude_http_error = e.code
+                _record_http_error(e.code)
                 return None
         except (urllib.error.URLError, TimeoutError) as e:
             if attempt < 2:
@@ -551,6 +555,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
             else:
                 debug_log(f"Request failed after retries: {e}")
                 _last_call_claude_http_error = -1
+                _record_http_error(-1)
                 return None
 
     if not response_data:
@@ -559,6 +564,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
         # call uses the token; record the 401 so callers don't see error=None.
         if _last_call_claude_http_error is None:
             _last_call_claude_http_error = 401
+            _record_http_error(401)
         return None
 
     # Find the text block (skip thinking blocks)