diff --git a/plugins/security-guidance/hooks/diffstate.py b/plugins/security-guidance/hooks/diffstate.py index 7dbe542a..d163715d 100644 --- a/plugins/security-guidance/hooks/diffstate.py +++ b/plugins/security-guidance/hooks/diffstate.py @@ -138,7 +138,17 @@ def restore_unreviewed_stop_state(session_id, paths, baseline_sha): def get_baseline_file_content(session_id, file_path, cwd): - """Get the content of a file at the baseline SHA. Returns None if unavailable.""" + """Get the content of a file at the baseline SHA. Returns None if unavailable. + + Decode the file content as UTF-8 with errors="replace" rather than using + text=True: source files in user repos can be latin-1 / cp1252 / shift-jis + / etc., and on Windows text=True would decode via locale.getpreferredencoding() + in strict mode and raise UnicodeDecodeError in the subprocess reader + thread — leaving result.stdout=None and propagating AttributeError when + the caller tries to use it. Same class as the existing migrations at + security_reminder_hook.py:540 (reflog subjects) and :1115 (commit + diffs); this helper was missed in that pass. See + anthropics/claude-plugins-official#2056.""" baseline_sha = load_baseline_sha(session_id) if not baseline_sha: return None @@ -151,12 +161,12 @@ def get_baseline_file_content(session_id, file_path, cwd): return None result = subprocess.run( [*GIT_CMD, "show", f"{baseline_sha}:{rel_path}"], - cwd=cwd, capture_output=True, text=True, timeout=5 + cwd=cwd, capture_output=True, timeout=5 ) if result.returncode == 0: - return result.stdout + return (result.stdout or b"").decode("utf-8", errors="replace") return None - except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError): return None @@ -173,11 +183,16 @@ def capture_git_baseline(cwd): and `compute_v2_review_set` subtracts that set so pre-existing untracked files are not reviewed as Claude-authored. """ + # stdout is a SHA so text=True is safe on stdout, but a non-ASCII + # filename in `git stash create`'s STDERR warning (e.g. a worktree + # with `Ávila_report.txt` triggers a quotePath/locale warning) would + # trip the stderr reader thread on Windows cp1252. Decode both streams + # leniently for symmetry with _list_untracked. See #2056. try: # Check if HEAD exists (i.e., repo has at least one commit) head_check = subprocess.run( [*GIT_CMD, "rev-parse", "HEAD"], - cwd=cwd, capture_output=True, text=True, timeout=5 + cwd=cwd, capture_output=True, timeout=5 ) if head_check.returncode != 0: # No commits yet — skip review rather than creating commits in the user's repo @@ -186,20 +201,20 @@ def capture_git_baseline(cwd): result = subprocess.run( [*GIT_CMD, "stash", "create"], - cwd=cwd, capture_output=True, text=True, timeout=15 + cwd=cwd, capture_output=True, timeout=15 ) - sha = result.stdout.strip() + sha = (result.stdout or b"").decode("utf-8", errors="replace").strip() if sha: return sha # Working tree is clean — stash create returns empty. Use HEAD. result = subprocess.run( [*GIT_CMD, "rev-parse", "HEAD"], - cwd=cwd, capture_output=True, text=True, timeout=5 + cwd=cwd, capture_output=True, timeout=5 ) - sha = result.stdout.strip() + sha = (result.stdout or b"").decode("utf-8", errors="replace").strip() return sha if sha else None - except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e: + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError) as e: debug_log(f"Failed to capture git baseline: {e}") return None @@ -323,19 +338,35 @@ def _list_untracked(cwd): mtime is captured so an in-place edit during the turn is still reviewed. Uses ls-files (not status) for the UPS path: the index diff isn't needed, - and ls-files --others only walks the worktree against .gitignore.""" + and ls-files --others only walks the worktree against .gitignore. + + Decodes stdout/stderr as UTF-8 with errors="replace" instead of using + text=True. With core.quotePath=false git emits raw UTF-8 bytes for + non-ASCII filenames; text=True decodes via locale.getpreferredencoding() + in strict mode — on Windows that's cp1252 with several undefined bytes + (0x81/0x8D/0x8F/0x90/0x9D), all of which appear in UTF-8 encodings of + common accented capitals (Á Í Ï Ð Ý) and most CJK/emoji codepoints. + A non-ASCII filename in the worktree crashed the subprocess reader + thread, left r.stdout=None, and propagated AttributeError out of the + helper — silently losing the baseline snapshot every UserPromptSubmit. + See anthropics/claude-plugins-official#2056. The sibling helpers in + gitutil.py already follow the lenient pattern; this function and + capture_git_baseline / _git_name_only / _git_status_porcelain were + the holdouts.""" try: repo = _git_toplevel(cwd) or cwd r = subprocess.run( [*GIT_CMD, "-c", "core.quotePath=false", "ls-files", "--others", "--exclude-standard", "-z"], - cwd=repo, capture_output=True, text=True, timeout=15, + cwd=repo, capture_output=True, timeout=15, ) if r.returncode != 0: - debug_log(f"_list_untracked rc={r.returncode}: {r.stderr[:200]}") + stderr_str = (r.stderr or b"").decode("utf-8", errors="replace") + debug_log(f"_list_untracked rc={r.returncode}: {stderr_str[:200]}") return {} + stdout = (r.stdout or b"").decode("utf-8", errors="replace") out = {} - for p in r.stdout.split("\0"): + for p in stdout.split("\0"): if not p: continue try: @@ -346,7 +377,9 @@ def _list_untracked(cwd): debug_log(f"_list_untracked: capped at {UNTRACKED_BASELINE_CAP}") break return out - except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e: + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError) as e: + # ValueError guards against any future strict-decode regression + # so the helper degrades to {} instead of crashing the hook. debug_log(f"_list_untracked error: {e}") return {} diff --git a/plugins/security-guidance/hooks/gitutil.py b/plugins/security-guidance/hooks/gitutil.py index 1a549d76..68e6f13a 100644 --- a/plugins/security-guidance/hooks/gitutil.py +++ b/plugins/security-guidance/hooks/gitutil.py @@ -259,19 +259,29 @@ def _git_reflog_recent_commits(repo_root, max_age_s=120, max_n=5): # %gs (the reflog subject) is `commit: ` and can # contain `|`; put it LAST so split("|", 2) leaves it intact. %H is # hex and %ct is integer, so the first two fields are delimiter-safe. + # + # Bytes + decode utf-8/replace: %gs embeds commit-message subjects + # which git stores as raw bytes — commits can be authored in + # latin-1 / cp1252 / shift-jis etc., and text=True would raise + # UnicodeDecodeError in the subprocess reader thread on Windows + # cp1252 (subprocess.run returns r.stdout=None, then + # r.stdout.splitlines() AttributeErrors). Mirrors the existing + # migration at security_reminder_hook.py:540 — same pattern was + # missed here. See anthropics/claude-plugins-official#2056. r = subprocess.run( [*GIT_CMD, "log", "-g", "-n", str(max_n), "--format=%H|%ct|%gs", "HEAD"], - cwd=repo_root, capture_output=True, text=True, timeout=5, + cwd=repo_root, capture_output=True, timeout=5, ) - except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError): return [], 0 if r.returncode != 0: return [], 0 + stdout = (r.stdout or b"").decode("utf-8", errors="replace") import time as _time now = int(_time.time()) fresh, stale = [], 0 - for idx, line in enumerate(r.stdout.splitlines()): + for idx, line in enumerate(stdout.splitlines()): parts = line.split("|", 2) if len(parts) != 3: continue @@ -306,23 +316,31 @@ def _git_name_only(cwd, base, include_untracked=False): must distinguish None (error → don't trust as a filter) from set() (genuinely nothing changed). `-c core.quotePath=false -z` keeps non-ASCII and space-containing paths intact.""" + # Decode stdout/stderr as UTF-8 with errors="replace" instead of using + # text=True. core.quotePath=false makes git emit raw UTF-8 for non-ASCII + # paths, and text=True on Windows decodes via cp1252 strict — a non-ASCII + # changed path would crash the subprocess reader thread, leave + # result.stdout=None, and propagate AttributeError out of the helper. + # Same fix shape as diffstate._list_untracked. See #2056. def _run(env): result = subprocess.run( [*GIT_CMD, "-c", "core.quotePath=false", "diff", "--name-only", "-z", base], - cwd=cwd, capture_output=True, text=True, timeout=30, + cwd=cwd, capture_output=True, timeout=30, env=env, ) if result.returncode != 0: - debug_log(f"_git_name_only({base!r}) rc={result.returncode}: {result.stderr[:200]}") + stderr_str = (result.stderr or b"").decode("utf-8", errors="replace") + debug_log(f"_git_name_only({base!r}) rc={result.returncode}: {stderr_str[:200]}") return None - return {p for p in result.stdout.split("\0") if p} + stdout = (result.stdout or b"").decode("utf-8", errors="replace") + return {p for p in stdout.split("\0") if p} try: if not include_untracked: return _run(None) with _temp_index(cwd) as env: return _run(env) - except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e: + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError) as e: debug_log(f"_git_name_only({base!r}) error: {e}") return None @@ -339,17 +357,22 @@ def _git_status_porcelain(cwd): collapses to `dir/`). Required so the untracked set subtracts cleanly against the UPS-time `_list_untracked` snapshot, which uses ls-files and therefore always lists individual files.""" + # Lenient decode: same UTF-8 + errors="replace" pattern as the + # sibling helpers — a non-ASCII path in the worktree would otherwise + # crash the cp1252 reader thread on Windows. See #2056. try: r = subprocess.run( [*GIT_CMD, "-c", "core.quotePath=false", "status", "--porcelain=v1", "-uall", "-z"], - cwd=cwd, capture_output=True, text=True, timeout=30, + cwd=cwd, capture_output=True, timeout=30, ) if r.returncode != 0: - debug_log(f"_git_status_porcelain rc={r.returncode}: {r.stderr[:200]}") + stderr_str = (r.stderr or b"").decode("utf-8", errors="replace") + debug_log(f"_git_status_porcelain rc={r.returncode}: {stderr_str[:200]}") return None, None tracked, untracked = set(), set() - entries = r.stdout.split("\0") + stdout = (r.stdout or b"").decode("utf-8", errors="replace") + entries = stdout.split("\0") i = 0 while i < len(entries): e = entries[i] @@ -368,7 +391,9 @@ def _git_status_porcelain(cwd): i += 1 i += 1 return tracked, untracked - except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e: + except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError) as e: + # ValueError guards against any future strict-decode regression + # so the helper degrades to (None, None) instead of crashing. debug_log(f"_git_status_porcelain error: {e}") return None, None