Merge pull request #2100 from anthropics/fix-2099-text-true-pythonutf8

security-guidance: purge text=True from subprocess.run + bake PYTHONUTF8=1 (#2099)
This commit is contained in:
Mohamed Hegazy 2026-05-30 12:14:03 -07:00 committed by GitHub
commit c7a3e2ffa0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 84 additions and 30 deletions

View File

@ -32,12 +32,17 @@ GIT_CMD = [
def _git_rev_parse_head(cwd):
"""Return the current HEAD SHA, or None if not a git repo / no commits."""
try:
# See #2099: text=True on Windows cp1252 crashes the reader thread on
# any UTF-8 byte undefined in cp1252 (e.g. via a git error message
# referencing a non-ASCII filename in stderr). stdout is a SHA so it
# IS safe; stderr is not. capture_output=True with bytes-by-default
# never decodes, so the reader thread can't crash.
result = subprocess.run(
[*GIT_CMD, "rev-parse", "HEAD"],
cwd=cwd, capture_output=True, text=True, timeout=5
cwd=cwd, capture_output=True, timeout=5
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return result.stdout.decode("utf-8", errors="replace").strip()
return None
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
return None
@ -52,13 +57,17 @@ def _find_git_index(cwd):
Returns the absolute path to the index file, or None.
"""
try:
# See #2099: stdout here is a PATH which can contain non-ASCII bytes
# (e.g. C:\אבטחה\repo\.git). text=True decodes via cp1252 strict on
# Windows → crashes the reader thread → returns stdout=None →
# caller does .strip() on None → AttributeError. Decode manually.
result = subprocess.run(
[*GIT_CMD, "rev-parse", "--git-dir"],
cwd=cwd, capture_output=True, text=True, timeout=5
cwd=cwd, capture_output=True, timeout=5
)
if result.returncode != 0:
return None
git_dir = result.stdout.strip()
git_dir = result.stdout.decode("utf-8", errors="replace").strip()
if not os.path.isabs(git_dir):
git_dir = os.path.join(cwd, git_dir)
index_path = os.path.join(git_dir, "index")
@ -128,9 +137,13 @@ def _temp_index(cwd, untracked_paths=None):
else:
add_args = None
if add_args:
# No stdout used here (only returncode matters), but text=True
# still spawns reader threads that decode stderr — git error
# messages can reference non-ASCII filenames and crash on
# cp1252. See #2099. Drop text=True so bytes stay raw.
subprocess.run(
[*GIT_CMD, "add", "--intent-to-add"] + add_args,
cwd=cwd, capture_output=True, text=True, timeout=10,
cwd=cwd, capture_output=True, timeout=10,
env=env,
)
yield env
@ -144,11 +157,17 @@ def _temp_index(cwd, untracked_paths=None):
def _git_toplevel(cwd):
"""Absolute repo root for `cwd`, or None if not in a work tree."""
try:
# See #2099: stdout is a PATH — `C:\אבטחה\repo` returned as UTF-8
# bytes by git. text=True would decode via cp1252 strict on Windows
# → reader-thread crash. Decode manually with errors="replace".
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--show-toplevel"],
cwd=cwd, capture_output=True, text=True, timeout=5,
cwd=cwd, capture_output=True, timeout=5,
)
return r.stdout.strip() if r.returncode == 0 and r.stdout.strip() else None
if r.returncode != 0:
return None
path = r.stdout.decode("utf-8", errors="replace").strip()
return path if path else None
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
return None
@ -164,13 +183,15 @@ def _git_dir(repo_root):
callers can degrade (push-sweep state is best-effort).
"""
try:
# See #2099: stdout is a PATH (shared gitdir), may be non-ASCII.
# Decode bytes manually to avoid cp1252 reader-thread crash.
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--git-common-dir"],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
if r.returncode != 0:
return None
d = r.stdout.strip()
d = r.stdout.decode("utf-8", errors="replace").strip()
return d if os.path.isabs(d) else os.path.join(repo_root, d)
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
return None
@ -179,13 +200,15 @@ def _git_dir(repo_root):
def _git_rev_list_range(repo_root, base, head="HEAD"):
"""Shas in `base..head`, oldest→newest. Empty list on error."""
try:
# See #2099: stdout is ASCII SHAs, but stderr can carry git error
# messages referencing non-ASCII filenames — keep bytes raw.
r = subprocess.run(
[*GIT_CMD, "rev-list", "--reverse", f"{base}..{head}"],
cwd=repo_root, capture_output=True, text=True, timeout=10,
cwd=repo_root, capture_output=True, timeout=10,
)
if r.returncode != 0:
return []
return [s for s in r.stdout.strip().split("\n") if s]
return [s for s in r.stdout.decode("utf-8", errors="replace").strip().split("\n") if s]
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
return []
@ -220,9 +243,11 @@ def _git_diff_range(repo_root, base, head="HEAD"):
def _detect_main_branch(repo_root):
for ref in ("origin/HEAD", "origin/main", "origin/master", "main", "master"):
try:
# See #2099: stdout is a SHA but stderr can carry non-ASCII git
# warnings — keep bytes raw to avoid cp1252 reader-thread crash.
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", ref],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
if r.returncode == 0 and r.stdout.strip():
return ref
@ -410,9 +435,12 @@ def _is_ancestor(cwd, maybe_ancestor, descendant):
"""True if `maybe_ancestor` is reachable from `descendant` (i.e. HEAD
moved forward via commit/merge, not sideways via checkout)."""
try:
# See #2099: only returncode matters, but text=True spawns reader
# threads that decode stderr — git error messages can carry non-ASCII
# filenames. Drop text=True to keep bytes raw, avoid cp1252 crash.
result = subprocess.run(
[*GIT_CMD, "merge-base", "--is-ancestor", maybe_ancestor, descendant],
cwd=cwd, capture_output=True, text=True, timeout=5,
cwd=cwd, capture_output=True, timeout=5,
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):

View File

@ -549,7 +549,11 @@ def handle_user_prompt_submit(input_data):
elif sha:
debug_log(f"Captured git baseline: {sha[:12]}")
else:
debug_log("Failed to capture git baseline (not a git repo?)")
# Show cwd so the next reporter can immediately see when this isn't
# actually "not a git repo" but a path-encoding / permissions / git
# invocation failure. See #2099.
debug_log(f"Failed to capture git baseline (cwd={cwd!r}) — not a git repo, "
f"or git invocation failed (check log entries above)")
sys.exit(0)
@ -856,23 +860,30 @@ def _detect_prev_upstream(repo_root, bash_output):
# @{u}@{1} — only meaningful if an upstream is configured.
for ref in ("@{u}@{1}", "@{push}@{1}"):
try:
# See #2099: stdout is a SHA but stderr can carry non-ASCII git
# warnings — keep bytes raw to avoid cp1252 reader-thread crash.
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", ref],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
if r.returncode == 0 and r.stdout.strip():
return r.stdout.strip()
sha = r.stdout.decode("utf-8", errors="replace").strip()
if r.returncode == 0 and sha:
return sha
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
pass
main = _detect_main_branch(repo_root)
if main:
try:
# See #2099: drop text=True; decode bytes manually so a
# cp1252-undefined byte in git's stderr doesn't crash the
# reader thread.
r = subprocess.run(
[*GIT_CMD, "merge-base", "HEAD", main],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
if r.returncode == 0 and r.stdout.strip():
return r.stdout.strip()
sha = r.stdout.decode("utf-8", errors="replace").strip()
if r.returncode == 0 and sha:
return sha
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
pass
return None
@ -1324,12 +1335,13 @@ def handle_commit_review_posttooluse(input_data):
try:
full_shas = []
for s in shas:
# See #2099: drop text=True; decode manually for cp1252 safety.
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", s],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
if r.returncode == 0:
full_shas.append(r.stdout.strip())
full_shas.append(r.stdout.decode("utf-8", errors="replace").strip())
_append_reviewed_shas(repo_root, full_shas, vulns_found=len(vulns or []))
except Exception:
pass
@ -1531,9 +1543,10 @@ def handle_push_sweep_posttooluse(input_data):
# both.
head = None
try:
# See #2099: drop text=True; decode manually for cp1252 safety.
r = subprocess.run([*GIT_CMD, "rev-parse", "HEAD"], cwd=repo_root,
capture_output=True, text=True, timeout=5)
head = r.stdout.strip() if r.returncode == 0 else None
capture_output=True, timeout=5)
head = r.stdout.decode("utf-8", errors="replace").strip() if r.returncode == 0 else None
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
pass
push_section = _push_section(bash_output or "")
@ -1563,14 +1576,15 @@ def handle_push_sweep_posttooluse(input_data):
quiet_success = False
if not (bash_output or "").strip() and not interrupted:
try:
# See #2099: drop text=True; decode manually for cp1252 safety.
r_cur = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", "@{u}"],
cwd=repo_root, capture_output=True, text=True, timeout=5)
cwd=repo_root, capture_output=True, timeout=5)
r_prev = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", "@{u}@{1}"],
cwd=repo_root, capture_output=True, text=True, timeout=5)
cur = r_cur.stdout.strip() if r_cur.returncode == 0 else ""
prev_u = r_prev.stdout.strip() if r_prev.returncode == 0 else ""
cwd=repo_root, capture_output=True, timeout=5)
cur = r_cur.stdout.decode("utf-8", errors="replace").strip() if r_cur.returncode == 0 else ""
prev_u = r_prev.stdout.decode("utf-8", errors="replace").strip() if r_prev.returncode == 0 else ""
quiet_success = bool(cur and prev_u and cur == head and prev_u != cur)
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
pass
@ -1584,11 +1598,12 @@ def handle_push_sweep_posttooluse(input_data):
# reviewed-shas state.
for local_ref in new_branch_matches:
try:
# See #2099: drop text=True; decode manually for cp1252 safety.
r = subprocess.run(
[*GIT_CMD, "rev-parse", "--verify", "-q", local_ref],
cwd=repo_root, capture_output=True, text=True, timeout=5,
cwd=repo_root, capture_output=True, timeout=5,
)
local_sha = r.stdout.strip() if r.returncode == 0 else ""
local_sha = r.stdout.decode("utf-8", errors="replace").strip() if r.returncode == 0 else ""
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
local_sha = ""
if local_sha and local_sha != head:

View File

@ -22,6 +22,17 @@
# "${CLAUDE_PLUGIN_ROOT}/hooks/security_reminder_hook.py"
set -e
# Force UTF-8 for ALL Python filesystem + IO operations (PEP 540).
# Without this, Windows Python defaults `locale.getpreferredencoding()` to
# cp1252 — which makes `text=True` in subprocess.run / open() / json.load
# crash the internal reader thread on any byte that's undefined in cp1252
# (e.g. the 0x81 byte from ف, present in any path/filename with
# Arabic/Hebrew/CJK characters). See #2056, #2099.
#
# No-op on macOS/Linux (already UTF-8). Must be set BEFORE Python starts —
# changing it from inside the interpreter has no effect.
export PYTHONUTF8=1
# Git Bash / MSYS on Windows hands script paths to this shim in POSIX form
# (`/c/Users/...`). When we exec a Windows `python.exe` (which we do on
# Windows since `python3` is the Microsoft Store stub), python interprets the