""" Regex-based security pattern definitions for the security-guidance plugin. Pure data + one pure helper. No env-var reads, no I/O, no debug_log — kept side-effect-free so it can be imported in isolation. """ from enum import IntEnum _JS_EXTS = (".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".mts", ".cts", ".vue", ".svelte") _PY_EXTS = (".py", ".pyi", ".ipynb") _DOC_EXTS = (".md", ".mdx", ".txt", ".rst", ".json", ".yaml", ".yml") _UNSAFE_DESERIALIZATION_REMINDER = """⚠️ Security Warning: Loading pickle data (or equivalents: cPickle, cloudpickle, dill, marshal, shelve, joblib, pandas.read_pickle, numpy with allow_pickle=True) from untrusted sources allows arbitrary code execution. For simple data, prefer JSON or msgspec. For typed objects, prefer a schema-validated deserializer (msgspec.Struct, pydantic, marshmallow) that constructs only declared types. If this is safe or is explicitly needed, briefly document that in a comment before continuing.""" _UNSAFE_YAML_LOAD_REMINDER = """⚠️ Security Warning: yaml.load() / yaml.unsafe_load() execute arbitrary Python via !!python/object tags. Use yaml.safe_load() if the file only contains simple data structures (dicts, lists, strings, numbers). If you need typed objects, parse with safe_load and validate the result against a schema (pydantic, msgspec, marshmallow) — never use a custom Loader that constructs arbitrary types.""" _UNSAFE_TORCH_LOAD_REMINDER = """⚠️ Security Warning: torch.load() defaults to weights_only=False, which unpickles arbitrary Python objects and allows arbitrary code execution. If the file only contains tensors and simple data structures, pass weights_only=True (or set TORCH_FORCE_WEIGHTS_ONLY_LOAD=1).""" # Security patterns configuration SECURITY_PATTERNS = [ { "ruleName": "github_actions_workflow", "path_check": lambda path: ".github/workflows/" in path and (path.endswith(".yml") or path.endswith(".yaml")), "reminder": """⚠️ Security Warning: You are editing a GitHub Actions workflow file. 
Be aware of these security risks: 1. **Command Injection**: Never use untrusted input (like issue titles, PR descriptions, commit messages) directly in run: commands without proper escaping 2. **Use environment variables**: Instead of ${{ github.event.issue.title }}, use env: with proper quoting 3. **Review the guide**: https://github.blog/security/vulnerability-research/how-to-catch-github-actions-workflow-injections-before-attackers-do/ Example of UNSAFE pattern to avoid: run: echo "${{ github.event.issue.title }}" Example of SAFE pattern: env: TITLE: ${{ github.event.issue.title }} run: echo "$TITLE" Other risky inputs to be careful with: - github.event.issue.body - github.event.pull_request.title - github.event.pull_request.body - github.event.comment.body - github.event.review.body - github.event.review_comment.body - github.event.pages.*.page_name - github.event.commits.*.message - github.event.head_commit.message - github.event.head_commit.author.email - github.event.head_commit.author.name - github.event.commits.*.author.email - github.event.commits.*.author.name - github.event.pull_request.head.ref - github.event.pull_request.head.label - github.event.pull_request.head.repo.default_branch - github.event.client_payload.* (repository_dispatch events — attacker can set any field) 4. **Ref injection**: Never use untrusted input in `ref:` parameters of `actions/checkout`. For `client_payload.pr_number`, validate it matches `^[0-9]+$` before using in `ref: refs/pull/${{ ... }}/head` - github.head_ref""", }, { "ruleName": "child_process_exec", # Gate to JS/TS files — bare `exec(` otherwise fires on Python's # exec() and on prose/docstrings mentioning exec. "path_filter": lambda p: p.endswith(_JS_EXTS), "substrings": ["child_process.exec", "execSync("], "regex": r"(? o[k], root); for computation use a safe expression parser. 
NEVER interpolate untrusted strings into new Function() bodies.", }, { "ruleName": "eval_injection", # Lookbehind excludes `.` so method calls like PyTorch model.eval(), # redis.eval(), spec.eval() don't match. Skip doc/prose files. "path_filter": lambda p: not p.endswith(_DOC_EXTS), "regex": r"(?]{0,400}integrity\s*=)" r"[^>]{0,200}src\s*=\s*[\x22\x27](?:https?:)?//" r"[^\x22\x27]{1,300}[\x22\x27]" r"[^>]{0,100}>" ), "reminder": '⚠️ Security Warning: Add integrity="sha384-..." crossorigin="anonymous" to external script tags. Loading scripts without Subresource Integrity exposes you to CDN compromise.', }, { "ruleName": "torch_unsafe_load", # Suppressed by weights_only=True on the same line (within 200 chars). weights_only=False # still triggers. Multi-line calls false-positive — same known limitation as unsafe_yaml_load. "regex": r"(?:\btorch\.load|\.torch_load)\s*\((?![^)\n]{0,200}weights_only\s*=\s*True)", "reminder": _UNSAFE_TORCH_LOAD_REMINDER, }, { "ruleName": "yaml_unsafe_load_variants", # yaml.unsafe_load (stdlib alias) plus unsafe wrapper method names seen in the wild. # Bare yaml.load() is unsafe_yaml_load's job (RuleId 12). "regex": r"(?:\byaml\.unsafe_load|\.yaml_unsafe_load)\s*\(", "reminder": _UNSAFE_YAML_LOAD_REMINDER, }, { "ruleName": "pickle_wrapper_load", # Library APIs that unpickle without saying "pickle". numpy.load only triggers # when allow_pickle=True is explicit (defaults to False since numpy 1.16.3). "regex": r"\bjoblib\.load\s*\(|\b(?:pd|pandas)\.read_pickle\s*\(|\.cloudpickle_load\s*\(|\b(?:np|numpy)\.load\s*\([^)\n]{0,200}allow_pickle\s*=\s*True", "reminder": _UNSAFE_DESERIALIZATION_REMINDER, }, ] class RuleId(IntEnum): """ Stable numeric IDs for SECURITY_PATTERNS rules, emitted via the PostToolUse metrics field so telemetry can attribute pattern-warning events to specific checks. The metrics schema only allows bool|number values (no strings), so rule names can't be sent directly. Values are frozen: do not renumber existing entries. 
Append new ones. """ GITHUB_ACTIONS_WORKFLOW = 1 CHILD_PROCESS_EXEC = 2 NEW_FUNCTION_INJECTION = 3 EVAL_INJECTION = 4 REACT_DANGEROUSLY_SET_HTML = 5 DOCUMENT_WRITE_XSS = 6 INNERHTML_XSS = 7 PICKLE_DESERIALIZATION = 8 OS_SYSTEM_INJECTION = 9 PYTHON_SUBPROCESS_SHELL = 10 GO_EXEC_SHELL_INJECTION = 11 UNSAFE_YAML_LOAD = 12 NODE_CREATECIPHER_NO_IV = 13 AES_ECB_MODE = 14 TLS_VERIFICATION_DISABLED = 15 MARSHAL_LOADS = 16 SHELVE_OPEN = 17 XML_UNSAFE_PARSE = 18 PICKLE_VARIANTS_LOAD = 19 OUTERHTML_XSS = 20 INSERTADJACENTHTML_XSS = 21 SCRIPT_SRC_WITHOUT_SRI = 22 TORCH_UNSAFE_LOAD = 23 YAML_UNSAFE_LOAD_VARIANTS = 24 PICKLE_WRAPPER_LOAD = 25 _RULE_NAME_TO_ID = { "github_actions_workflow": RuleId.GITHUB_ACTIONS_WORKFLOW, "child_process_exec": RuleId.CHILD_PROCESS_EXEC, "new_function_injection": RuleId.NEW_FUNCTION_INJECTION, "eval_injection": RuleId.EVAL_INJECTION, "react_dangerously_set_html": RuleId.REACT_DANGEROUSLY_SET_HTML, "document_write_xss": RuleId.DOCUMENT_WRITE_XSS, "innerHTML_xss": RuleId.INNERHTML_XSS, "pickle_deserialization": RuleId.PICKLE_DESERIALIZATION, "os_system_injection": RuleId.OS_SYSTEM_INJECTION, "python_subprocess_shell": RuleId.PYTHON_SUBPROCESS_SHELL, "go_exec_shell_injection": RuleId.GO_EXEC_SHELL_INJECTION, "unsafe_yaml_load": RuleId.UNSAFE_YAML_LOAD, "node_createcipher_no_iv": RuleId.NODE_CREATECIPHER_NO_IV, "aes_ecb_mode": RuleId.AES_ECB_MODE, "tls_verification_disabled": RuleId.TLS_VERIFICATION_DISABLED, "marshal_loads": RuleId.MARSHAL_LOADS, "shelve_open": RuleId.SHELVE_OPEN, "xml_unsafe_parse": RuleId.XML_UNSAFE_PARSE, "pickle_variants_load": RuleId.PICKLE_VARIANTS_LOAD, "outerHTML_xss": RuleId.OUTERHTML_XSS, "insertAdjacentHTML_xss": RuleId.INSERTADJACENTHTML_XSS, "script_src_without_sri": RuleId.SCRIPT_SRC_WITHOUT_SRI, "torch_unsafe_load": RuleId.TORCH_UNSAFE_LOAD, "yaml_unsafe_load_variants": RuleId.YAML_UNSAFE_LOAD_VARIANTS, "pickle_wrapper_load": RuleId.PICKLE_WRAPPER_LOAD, } # Fail loudly at import time if a pattern is added without a 
# RuleId. This check runs under pytest on every PR, so a mismatch between the
# pattern table and the name -> id map is caught before merge.
# NOTE: it is a plain `assert`, so it is skipped when Python runs with -O.
assert {pattern["ruleName"] for pattern in SECURITY_PATTERNS} == set(_RULE_NAME_TO_ID), (
    "RuleId enum out of sync with SECURITY_PATTERNS: "
    f"missing={set(p['ruleName'] for p in SECURITY_PATTERNS) - set(_RULE_NAME_TO_ID)}, "
    f"extra={set(_RULE_NAME_TO_ID) - set(p['ruleName'] for p in SECURITY_PATTERNS)}"
)


def rule_names_to_mask(rule_names):
    """Fold an iterable of rule names into a single integer bitmask.

    For every name that has an entry in ``_RULE_NAME_TO_ID``, the bit at
    position ``RuleId`` is set (so bit N set means RuleId(N) matched).

    Names with no entry are skipped silently. That is how user-defined
    patterns (rule names starting with "user:"), which have no static
    RuleId, stay out of the mask.

    Args:
        rule_names: iterable of rule-name strings; duplicates are harmless.

    Returns:
        int: the OR of ``1 << RuleId`` over all recognised names, or 0 when
        none are recognised (including when ``rule_names`` is empty).
    """
    bits = 0
    for rule_name in rule_names:
        rule_id = _RULE_NAME_TO_ID.get(rule_name)
        if rule_id is None:
            # No static RuleId for this name: contributes nothing.
            continue
        bits |= 1 << rule_id
    return bits