Merge pull request #2112 from anthropics/telemetry-failure-signals

security-guidance: emit HTTP error codes + fix sdk_bootstrap phase/err encoding (telemetry visibility)
This commit is contained in:
Mohamed Hegazy 2026-06-01 10:31:29 -07:00 committed by GitHub
commit fcdcd079e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 147 additions and 25 deletions

View File

@ -116,7 +116,18 @@ _PV = _read_plugin_version_int()
# Emitted via _usage_metrics() into the existing emit_metrics() channel so
# hook metrics rows carry per-invocation token/cost totals
# alongside the existing skip_reason / vulns_found fields.
_USAGE = {"in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0}
_USAGE = {
"in": 0, "out": 0, "cr": 0, "cw": 0, "cost": 0.0, "n": 0,
# HTTP error visibility (#2098 visibility gap — see emit comment in
# _usage_metrics). Without this, API failures from `_call_claude` left
# zero fingerprint in telemetry: the call returns None, the caller's
# emit_metrics carries no api_calls field, and the failure is
# indistinguishable from "no review needed". The deprecation outage
# that broke every commit-review LLM call was invisible until users
# reported it manually.
"http_err_last": 0, # most recent HTTP error code this invocation
"http_err_count": 0, # total HTTP errors (4xx + 5xx + network)
}
_USAGE_LOCK = threading.Lock()
# $/Mtok (input, output). Used only for the raw-HTTP path; the SDK path
@ -166,19 +177,55 @@ def _record_usage(usage, model, cost_usd=None):
_USAGE["n"] += 1
def _record_http_error(status):
"""Record an HTTP error from an LLM API call. `status` is the HTTP
status code (integer 400599) or -1 for network/timeout errors. Stored
in `_USAGE["http_err_last"]` (most recent) and counted in
`_USAGE["http_err_count"]`. Snapshot via `_usage_metrics()` so every
subsequent `emit_metrics` includes the failure fingerprint.
Background: without this, the most recent example was the #2098
deprecation 400. Every hook fire's LLM call returned HTTP 400; the
plugin caught it and returned None; the emit_metrics carried no
api_calls field; aggregate dashboards looked normal. The failure
only became visible when a user manually reported errors out of
their debug log. With this field, a category-of-failure spike (4xx,
5xx, or -1 network) is queryable from BQ in real time.
"""
try:
s = int(status)
except (TypeError, ValueError):
return
with _USAGE_LOCK:
_USAGE["http_err_last"] = s
_USAGE["http_err_count"] += 1
def _usage_metrics():
"""Snapshot the accumulator as metric keys. Returns {} when no API calls
were made so skip-path emits don't burn key budget. cost_usd rounded to
1e-6 to keep the float finite/short for the zod schema."""
AND no HTTP errors were made so skip-path emits don't burn key budget.
cost_usd rounded to 1e-6 to keep the float finite/short for the zod
schema.
HTTP errors (`http_err_last`, `http_err_count`) emitted ONLY when
`http_err_count > 0` so successful calls don't pad every metrics row
with two zero fields.
"""
with _USAGE_LOCK:
if _USAGE["n"] == 0:
if _USAGE["n"] == 0 and _USAGE["http_err_count"] == 0:
return {}
return {
out = {}
if _USAGE["n"] > 0:
out.update({
"tok_in": _USAGE["in"],
"tok_out": _USAGE["out"],
"tok_cache_r": _USAGE["cr"],
"tok_cache_w": _USAGE["cw"],
"cost_usd": round(_USAGE["cost"], 6),
"api_calls": _USAGE["n"],
}
})
if _USAGE["http_err_count"] > 0:
out["http_err_last"] = _USAGE["http_err_last"]
out["http_err_count"] = _USAGE["http_err_count"]
return out

View File

@ -42,6 +42,71 @@ HOOK_PY_INCOMPATIBLE = 6 # hook interpreter is <3.10 — SDK syntax can't load
# here no matter how the venv was built. See #2071.
# Phase + err-kind integer encoding for sdk_bootstrap_phase / sdk_bootstrap_err.
#
# Earlier versions emitted these as STRINGS (e.g. "pip", "dns_fail"). CC's
# plugin-metrics pipeline silently drops plugin-emitted string values —
# only `bool|finite-number` plugin metrics reach BigQuery. (CC-core
# metrics like `subscription_type` are exempt because they're injected
# downstream of plugin validation.) Confirmed empirically: 185K
# BUILD_FAILED rows in BQ had `sdk_bootstrap_phase`/`sdk_bootstrap_err`
# = NULL despite the Python code emitting them. This left ~28K
# BUILD_FAILED sessions/day with no diagnostic split — flying blind on
# the real failure modes (pip-no-match vs dns-fail vs ssl-verify etc.).
#
# Fix: encode as small integers per the maps below. Values are
# APPEND-ONLY for telemetry stability. Reserve 99 as the "unknown /
# uncategorized" bucket so an unmapped err_kind (e.g., a new exception
# type) still emits a non-zero signal.
SDK_BOOTSTRAP_PHASE_CODES = {
"pre": 1, # pre-venv (state_dir.mkdir, sentinel open)
"venv": 2, # python -m venv --clear
"pip": 3, # pip install
"main": 4, # uncaught exception above main()
}
SDK_BOOTSTRAP_ERR_CODES = {
"pip_no_match": 1,
"dns_fail": 2,
"conn_refused": 3,
"ssl_verify": 4,
"perm_denied": 5,
"no_pip": 6,
"disk_full": 7,
"proxy_auth": 8,
"stderr_timeout": 9, # pip stderr containing "timeout"/"timed out"
"subprocess_timeout": 10, # subprocess.TimeoutExpired (>120s)
# 1198 reserved for future categories; APPEND-ONLY.
# 99 catches everything else (including "exc:<TypeName>" and "other:<tail>"
# — the original string is debug-loggable but the integer is what makes
# it to telemetry).
"_uncategorized": 99,
}
def _encode_phase(s):
"""Map err_phase string to its telemetry integer code, or 0 if unset.
Empty/None 0 lets `if encoded:` cleanly skip emission. Per
SDK_BOOTSTRAP_PHASE_CODES, valid codes are 1-4."""
return SDK_BOOTSTRAP_PHASE_CODES.get((s or "").strip(), 0)
def _encode_err_kind(s):
"""Map err_kind string to its telemetry integer code, or 0 if unset.
Direct hits use the static map; "exc:<X>" and "other:<tail>" both
collapse to _uncategorized (99) the raw string survives in debug
logs, only the integer reaches BQ."""
s = (s or "").strip()
if not s:
return 0
if s in SDK_BOOTSTRAP_ERR_CODES:
return SDK_BOOTSTRAP_ERR_CODES[s]
# Prefix matches for the catch-all categories
if s.startswith("exc:") or s.startswith("other:") or s == "other":
return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"]
# Unknown string — still emit as uncategorized rather than dropping
return SDK_BOOTSTRAP_ERR_CODES["_uncategorized"]
def _sdk_on_syspath() -> bool:
# find_spec is ~10ms; actually importing the SDK pulls in
# transitive deps and costs ~800ms — too heavy for a
@ -288,21 +353,25 @@ if __name__ == "__main__":
# and takes the FIRST non-{"async":...} JSON line as the hook response;
# its `metrics` key is forwarded to the hook metrics event on the
# next attachments pass. Must be a single line — the registry splits on
# \n and json-parses each independently. Values must be bool|number OR
# short strings (CC accepts string metric values if they're not
# null). Stay inside the 10-key emit cap.
# \n and json-parses each independently.
#
# IMPORTANT — values must be bool|finite-number. The validation comment
# has historically said "or short strings" but that was wrong: CC's
# plugin-metrics pipeline silently drops plugin-emitted string values.
# Stay inside the 10-key emit cap.
metrics: dict[str, object] = {
"sdk_bootstrap": outcome,
"sdk_bootstrap_ms": round((time.perf_counter() - t0) * 1000),
}
if err_kind:
# Truncate defensively; categorized values are <40 chars but the
# `other:<tail>` mode could be longer. err_phase may be empty for
# pre-venv failures (state_dir.mkdir perm-denied, sentinel O_EXCL
# raising a non-FileExistsError OSError) — emit as "pre" so the
# err_kind isn't silently dropped.
metrics["sdk_bootstrap_phase"] = (err_phase or "pre")[:16]
metrics["sdk_bootstrap_err"] = err_kind[:96]
# Encode phase + err_kind as integer codes (see
# SDK_BOOTSTRAP_PHASE_CODES / SDK_BOOTSTRAP_ERR_CODES). Earlier
# versions emitted these as strings and CC dropped them — restoring
# the diagnostic split that 28K BUILD_FAILED/day need to triage by
# root cause. err_phase defaults to "pre" when empty (pre-venv
# failure path, e.g. state_dir.mkdir perm-denied).
metrics["sdk_bootstrap_phase"] = _encode_phase(err_phase or "pre")
metrics["sdk_bootstrap_err"] = _encode_err_kind(err_kind)
pv = _plugin_version_int()
if pv:
metrics["pv"] = pv

View File

@ -27,7 +27,7 @@ from typing import Optional, Tuple, Dict, Any, List
import extensibility
import review_api
from _base import debug_log, _record_usage, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir # noqa: F401
from _base import debug_log, _record_usage, _record_http_error, _PV, PROVENANCE_TAG, state_dir as _resolve_state_dir # noqa: F401
from session_state import with_locked_state
@ -368,6 +368,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
except Exception as e:
debug_log(f"3P sdk-single-turn: SDK unavailable ({e})")
_last_call_claude_http_error = -1
_record_http_error(-1)
return None
cli_path = os.environ.get("SG_AGENTIC_CLI_PATH") or None
@ -425,6 +426,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
except _asyncio.TimeoutError:
debug_log("3P sdk-single-turn: timeout after 60s")
_last_call_claude_http_error = -1
_record_http_error(-1)
return None
except Exception as e:
debug_log(f"3P sdk-single-turn: query failed ({e})")
@ -433,6 +435,7 @@ def _call_claude_via_sdk(prompt, output_schema, *, max_tokens=16000, model=None)
for _l in _captured_stderr[:20]:
debug_log(f" | {_l.rstrip()}")
_last_call_claude_http_error = -1
_record_http_error(-1)
return None
@ -542,6 +545,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
error_body = e.read().decode("utf-8") if e.fp else ""
debug_log(f"API error: {e.code} - {error_body[:200]}")
_last_call_claude_http_error = e.code
_record_http_error(e.code)
return None
except (urllib.error.URLError, TimeoutError) as e:
if attempt < 2:
@ -551,6 +555,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
else:
debug_log(f"Request failed after retries: {e}")
_last_call_claude_http_error = -1
_record_http_error(-1)
return None
if not response_data:
@ -559,6 +564,7 @@ def _call_claude(prompt, output_schema, thinking_budget=10000, max_tokens=16000,
# call uses the token; record the 401 so callers don't see error=None.
if _last_call_claude_http_error is None:
_last_call_claude_http_error = 401
_record_http_error(401)
return None
# Find the text block (skip thinking blocks)