diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100755
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+# .githooks/pre-commit — blocks sensitive files and API key patterns
+#
+# Inspects every path staged for addition or change (staged deletions are
+# skipped so an already-committed secret file can still be removed) and
+# rejects the commit when a path
+#   * equals an entry of BLOCKED_PATHS,
+#   * matches an extended regex in BLOCKED_PATTERNS, or
+#   * has an added line matching one of KEY_REGEXES.
+# Exit status: 0 allows the commit, 1 blocks it.
+set -euo pipefail
+
+RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
+
+# Exact repository-relative paths that must never be committed.
+BLOCKED_PATHS=(
+  "config/user.yaml"
+  "config/server.yaml"
+  "config/llm.yaml"
+  "config/notion.yaml"
+  "config/adzuna.yaml"
+  "config/label_tool.yaml"
+  ".env"
+)
+
+# Extended regexes matched against each staged path.
+BLOCKED_PATTERNS=(
+  'data/.*\.db$'
+  'data/.*\.jsonl$'
+  'demo/data/.*\.db$'
+)
+
+# Extended regexes matched against the added lines of each staged diff.
+# Inside a bracket expression a backslash is literal, so "-" is listed last
+# (not as "\-") and the single quote is spelled directly (not as "\x27").
+KEY_REGEXES=(
+  'sk-[A-Za-z0-9]{20,}'
+  'Bearer [A-Za-z0-9_-]{20,}'
+  "api_key:[[:space:]]*[\"']?[A-Za-z0-9_-]{16,}"
+)
+
+ERRORS=0
+
+# Prints the red "BLOCKED:" headline for one finding ($1 = message).
+report_blocked() {
+  printf '%bBLOCKED:%b %s\n' "$RED" "$NC" "$1"
+}
+
+# Prints an indented hint: $1 = label, $2 = highlighted command or pattern.
+report_hint() {
+  printf '  %s %b%s%b\n' "$1" "$YELLOW" "$2" "$NC"
+}
+
+# Diff against HEAD, or against the empty tree when there is no commit yet.
+# The empty tree id is computed so it is right for SHA-1 and SHA-256 repos.
+if git rev-parse --verify --quiet HEAD >/dev/null; then
+  base="HEAD"
+else
+  base=$(git hash-object -t tree /dev/null)
+fi
+
+# Paths are read NUL-delimited (-z) so git never quotes or escapes them, and
+# with a plain read loop so the hook also runs on bash 3.2 (no mapfile).
+while IFS= read -r -d '' file; do
+  # Exact path blocklist
+  for blocked in "${BLOCKED_PATHS[@]}"; do
+    if [[ "$file" == "$blocked" ]]; then
+      report_blocked "$file is in the sensitive file blocklist."
+      report_hint "Use:" "git restore --staged $file"
+      ERRORS=$((ERRORS + 1))
+    fi
+  done
+
+  # Pattern blocklist
+  for pattern in "${BLOCKED_PATTERNS[@]}"; do
+    if [[ "$file" =~ $pattern ]]; then
+      report_blocked "$file matches sensitive path pattern ($pattern)."
+      report_hint "Add to .gitignore or:" "git restore --staged $file"
+      ERRORS=$((ERRORS + 1))
+    fi
+  done
+
+  # Content scan: only the added lines of the staged diff. Colour and
+  # external diff drivers are disabled so the "+" prefix is always present.
+  added=$(git diff --cached --no-color --no-ext-diff -- "$file" 2>/dev/null \
+    | awk '/^[+]/ && !/^[+][+][+]/') || true
+  for regex in "${KEY_REGEXES[@]}"; do
+    # A here-string (not "echo | grep -q") avoids a SIGPIPE on the writer,
+    # which under pipefail would turn a real match into "no match".
+    if grep -qE -- "$regex" <<<"$added"; then
+      report_blocked "$file appears to contain an API key or token."
+      report_hint "Pattern matched:" "$regex"
+      report_hint "Review with:" "git diff --cached -- $file"
+      report_hint "Use:" "git restore --staged $file"
+      ERRORS=$((ERRORS + 1))
+      break
+    fi
+  done
+done < <(git diff-index --cached --name-only -z --diff-filter=ACMRT "$base")
+
+if ((ERRORS > 0)); then
+  echo ""
+  printf '%bCommit blocked.%b Fix the issues above and try again.\n' "$RED" "$NC"
+  exit 1
+fi
+exit 0
diff --git a/tests/test_hooks.sh b/tests/test_hooks.sh
new file mode 100755
--- /dev/null
+++ b/tests/test_hooks.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# tests/test_hooks.sh — smoke tests for .githooks/pre-commit
+#
+# Each test runs the hook with a stub "git" first on PATH, so the hook sees a
+# controlled staged path and staged diff instead of the real index.
+# Secret-looking fixtures are split into two adjacent string literals so that
+# committing this file does not itself trip the hook's key patterns.
+set -euo pipefail
+HOOK=".githooks/pre-commit"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_DIR"
+
+pass() { echo "  PASS: $1"; }
+fail() { echo "  FAIL: $1"; exit 1; }
+
+# Helper: run hook against a fake staged file list
+#   $1 - path the stub reports as staged
+#   $2 - optional text the stub prints as that path's staged diff
+# Prints the hook's output and returns the hook's exit status.
+run_hook_with() {
+  local staged_file="$1"
+  local staged_content="${2:-}"
+  local tmpdir status=0
+  tmpdir=$(mktemp -d)
+
+  # Create shim that reports our file as staged
+  # NOTE(review): the heredoc body was missing from the patch as received;
+  # this stub is a reconstruction matched to the git calls the hook makes.
+  cat > "$tmpdir/git" <<'SHIM'
+#!/usr/bin/env bash
+case "${1:-}" in
+  diff-index) printf '%s\0' "$FAKE_STAGED_FILE" ;;
+  diff) printf '%s\n' "$FAKE_STAGED_CONTENT" ;;
+  *) exit 0 ;;
+esac
+SHIM
+  chmod +x "$tmpdir/git"
+
+  FAKE_STAGED_FILE="$staged_file" FAKE_STAGED_CONTENT="$staged_content" \
+    PATH="$tmpdir:$PATH" bash "$HOOK" 2>&1 || status=$?
+  rm -rf -- "$tmpdir"
+  return "$status"
+}
+
+echo "Test 1: blocks config/user.yaml"
+if run_hook_with "config/user.yaml"; then
+  fail "should have blocked"
+else
+  pass "blocked user.yaml"
+fi
+
+echo "Test 2: blocks .env"
+if run_hook_with ".env"; then
+  fail "should have blocked"
+else
+  pass "blocked .env"
+fi
+
+echo "Test 3: blocks content with OpenAI key pattern"
+if run_hook_with "app/app.py" "+sk-""abcdefghijklmnopqrstuvwxyz123456"; then
+  fail "should have blocked key pattern"
+else
+  pass "blocked key pattern"
+fi
+
+echo "Test 4: allows safe file"
+if run_hook_with "app/app.py" "import streamlit"; then
+  pass "allowed safe file"
+else
+  fail "should have allowed safe file"
+fi
+
+echo "Test 5: blocks Bearer token containing a hyphen"
+if run_hook_with "app/app.py" "+Authorization: Bearer ""abcdefghij-klmnopqrstuvwxyz"; then
+  fail "should have blocked bearer token"
+else
+  pass "blocked bearer token"
+fi
+
+echo "Test 6: blocks single-quoted api_key value"
+if run_hook_with "config/example.yaml" "+api_key: '""abcdefghijklmnop1234'"; then
+  fail "should have blocked quoted api_key"
+else
+  pass "blocked quoted api_key"
+fi
+
+echo "All pre-commit hook tests passed."