peregrine/scripts/skills_utils.py
pyr0ball 93bf6b3c6f feat: bundled skills suggestion list and content filter utility
- config/skills_suggestions.yaml: 168 curated tags across skills (77),
  domains (40), keywords (51) covering CS/TAM/ops and common tech roles;
  structured for future community aggregate (paid tier backlog)
- scripts/skills_utils.py: filter_tag() rejects blanks, URLs, profanity,
  overlong strings, disallowed chars, and repeated-char runs;
  load_suggestions() reads bundled YAML per category
2026-02-26 13:09:32 -08:00

67 lines
2.5 KiB
Python

"""
skills_utils.py — Content filter and suggestion loader for the skills tagging system.
load_suggestions(category) → list[str] bundled suggestions for a category
filter_tag(tag) → str | None cleaned tag, or None if rejected
"""
from __future__ import annotations
import re
from pathlib import Path
# Path of the bundled suggestions file: "config/skills_suggestions.yaml" under
# the parent of the directory that contains this module.
_SUGGESTIONS_FILE = Path(__file__).parent.parent / "config" / "skills_suggestions.yaml"
# ── Content filter ─────────────────────────────────────────────────────────────
# Tags must be short, human-readable skill/domain labels. No URLs, no abuse.
_BLOCKED = {
    # profanity placeholder — extend as needed
    "fuck", "shit", "ass", "bitch", "cunt", "dick", "bastard", "damn",
}

# Anything that looks like a link: explicit scheme, "www." prefix, or a common
# TLD suffix.
_URL_RE = re.compile(r"https?://|www\.|\.com\b|\.net\b|\.org\b", re.I)

# Technology names written with a ".NET" suffix (".NET", "ASP.NET", "VB.NET",
# "ADO.NET", "ML.NET"). They are masked before the URL check so the ".net" TLD
# rule above does not reject them. The lookbehind refuses a match that is glued
# to a longer host name (e.g. "www.asp.net", "mysite.net").
_DOTNET_NAME_RE = re.compile(r"(?<![\w.\-])(?:asp|ado|vb|ml)?\.net\b", re.I)

_ALLOWED_CHARS = re.compile(r"^[\w\s\-\.\+\#\/\&\(\)]+$", re.UNICODE)

# Five or more consecutive occurrences of the same character.
_REPEATED_RUN_RE = re.compile(r"(.)\1{4,}")

_MIN_TAG_LEN = 2
_MAX_TAG_LEN = 60


def _has_blocked_term(lowered: str) -> bool:
    """Return True if *lowered* contains a blocked term as a standalone word.

    A term counts as standalone when it is not directly adjacent to another
    letter. Digits and underscores are treated as separators, so
    ``"ass_kicker"`` and ``"damn2"`` are caught while ``"class"`` and
    ``"assembly"`` are not.
    """
    # _BLOCKED is read on every call so terms added to the set are honoured.
    return any(
        re.search(rf"(?<![^\W\d_]){re.escape(term)}(?![^\W\d_])", lowered)
        for term in _BLOCKED
    )


def filter_tag(raw: str) -> str | None:
    """Return a cleaned tag string, or None if the tag should be rejected.

    Cleaning collapses every run of whitespace into a single space and trims
    both ends; the original letter case is preserved.

    Rejection criteria:
    - Not a string
    - Blank after stripping
    - Too short (< 2 chars) or too long (> 60 chars)
    - Contains a URL pattern (".NET"-style technology names are not URLs)
    - Contains disallowed characters
    - Matches a blocked term (case-insensitive, whole-word)
    - Repeated character run (e.g. 'aaaaa')
    """
    if not isinstance(raw, str):
        return None

    tag = " ".join(raw.split())  # normalise whitespace
    if not _MIN_TAG_LEN <= len(tag) <= _MAX_TAG_LEN:
        return None

    # Mask ".NET" technology names with a space (not an empty string) so that
    # removing them can never splice the neighbours into a new URL match.
    if _URL_RE.search(_DOTNET_NAME_RE.sub(" ", tag)):
        return None
    if not _ALLOWED_CHARS.match(tag):
        return None

    lower = tag.lower()
    if _has_blocked_term(lower):
        return None
    if _REPEATED_RUN_RE.search(lower):  # 5+ repeated chars
        return None
    return tag
# ── Suggestion loader ──────────────────────────────────────────────────────────
def load_suggestions(category: str) -> list[str]:
    """Return the bundled suggestion list for a category ('skills'|'domains'|'keywords').

    This is a best-effort loader. It returns an empty list when:
    - PyYAML cannot be imported
    - the suggestions file is missing, unreadable, or not valid UTF-8 / YAML
    - the document is not a mapping, or the category is absent
    - the category's value is not a sequence (e.g. a bare string or a mapping)

    Entries that are ``None`` or nested containers are skipped; any other
    non-string scalar is converted with ``str()`` so the result is always a
    list of strings.
    """
    try:
        # Imported inside the function, so importing this module does not
        # require PyYAML.
        import yaml
    except ImportError:
        return []

    try:
        # Read directly rather than checking exists() first: a missing file
        # surfaces as OSError, and there is no window between check and read.
        # The encoding is explicit so decoding does not depend on the locale.
        text = _SUGGESTIONS_FILE.read_text(encoding="utf-8")
        data = yaml.safe_load(text)
    except (OSError, ValueError, yaml.YAMLError):
        # ValueError covers UnicodeDecodeError from a badly encoded file.
        return []

    if not isinstance(data, dict):
        return []

    entries = data.get(category)
    # A plain string would otherwise be split into single characters and a
    # mapping would yield its keys; neither is a suggestion list.
    if not isinstance(entries, (list, tuple)):
        return []

    return [
        str(entry)
        for entry in entries
        if entry is not None and not isinstance(entry, (list, tuple, dict))
    ]