# peregrine/scripts/skills_utils.py

"""
skills_utils.py — Content filter and suggestion loader for the skills tagging system.

load_suggestions(category)  → list[str]   bundled suggestions for a category
filter_tag(tag)             → str | None   cleaned tag, or None if rejected
"""
from __future__ import annotations
import re
from pathlib import Path

# Location of the bundled suggestions YAML, resolved relative to this file:
# <directory containing this file's directory>/config/skills_suggestions.yaml
_SUGGESTIONS_FILE = Path(__file__).parent.parent / "config" / "skills_suggestions.yaml"

# ── Content filter ─────────────────────────────────────────────────────────────
# Tags must be short, human-readable skill/domain labels. No URLs, no abuse.

_BLOCKED = {
    # profanity placeholder — extend as needed
    "fuck", "shit", "ass", "bitch", "cunt", "dick", "bastard", "damn",
}

# Cheap URL heuristics: an http(s) scheme, a "www." prefix, or a common TLD suffix.
_URL_RE = re.compile(r"https?://|www\.|\.com\b|\.net\b|\.org\b", re.I)
# Whole-tag allow-list: word characters, whitespace and  - . + # / & ( )
_ALLOWED_CHARS = re.compile(r"^[\w\s\-\.\+\#\/\&\(\)]+$", re.UNICODE)

# ".NET" and its family names (ASP.NET, ADO.NET, VB.NET, ML.NET) would trip the
# "\.net\b" branch of _URL_RE although they are ordinary skill labels. They are
# masked out before the URL test. The lookbehind keeps real hosts such as
# "example.net" or "www.asp.net" from being masked.
_DOTNET_RE = re.compile(r"(?<![\w.])(?:asp|ado|vb|ml)?\.net\b", re.I)

# A "letter" is a word character that is neither a digit nor an underscore.
# Blocked terms are delimited by non-letters rather than by \b, because \b
# treats "_" and digits as part of the word and would let "term_x" through.
_LETTER = r"[^\W\d_]"

# Five or more consecutive occurrences of the same character.
_REPEAT_RE = re.compile(r"(.)\1{4,}")


def _contains_blocked(text: str) -> bool:
    """Return True if *text* (already lower-cased) contains a blocked term as a whole word."""
    # _BLOCKED is read on every call so edits to the set take effect immediately;
    # the re module caches the compiled per-term patterns.
    return any(
        re.search(rf"(?<!{_LETTER}){re.escape(term)}(?!{_LETTER})", text)
        for term in _BLOCKED
    )


def filter_tag(raw: str) -> str | None:
    """Return a cleaned tag string, or None if the tag should be rejected.

    The cleaned tag is *raw* with leading/trailing whitespace removed and
    every internal whitespace run collapsed to a single space. Original
    letter case is preserved in the returned value.

    Rejection criteria:
    - Not a string
    - Blank after stripping
    - Too short (< 2 chars) or too long (> 60 chars)
    - Contains a URL pattern (".NET"-style technology names are exempt)
    - Contains disallowed characters
    - Matches a blocked term (case-insensitive, whole-word; digits and
      underscores count as word separators)
    - Repeated character run of 5+ (case-insensitive, e.g. 'aaaaa')
    """
    if not isinstance(raw, str):
        return None

    tag = " ".join(raw.split())  # strip + collapse whitespace in one step
    if not 2 <= len(tag) <= 60:
        return None

    # Mask ".NET" family names so only genuine URL-looking text is rejected.
    if _URL_RE.search(_DOTNET_RE.sub("", tag)):
        return None
    if not _ALLOWED_CHARS.match(tag):
        return None

    lowered = tag.lower()
    if _contains_blocked(lowered):
        return None
    if _REPEAT_RE.search(lowered):
        return None
    return tag


# ── Suggestion loader ──────────────────────────────────────────────────────────

def load_suggestions(category: str) -> list[str]:
    """Return the bundled suggestion list for a category ('skills'|'domains'|'keywords').

    The suggestions are read from the YAML file at ``_SUGGESTIONS_FILE`` on
    every call. Loading is best-effort: this function never raises.

    Returns an empty list if the file is missing, cannot be read or parsed,
    its top level is not a mapping, the category is not found, or the
    category's value is not a collection (a bare scalar string is treated as
    malformed rather than being split into characters).
    """
    if not _SUGGESTIONS_FILE.exists():
        # A missing suggestions file is an expected configuration; stay silent.
        return []

    try:
        import yaml  # imported lazily, only when the file is actually read

        # Decode explicitly as UTF-8 instead of relying on the locale default,
        # which differs between platforms.
        data = yaml.safe_load(_SUGGESTIONS_FILE.read_text(encoding="utf-8"))
    except Exception as exc:
        # Deliberate best-effort boundary: suggestions are optional, so a
        # broken file or missing dependency must not crash the caller — but
        # it should not go unnoticed either.
        import logging

        logging.getLogger(__name__).warning(
            "Could not load suggestions from %s: %s", _SUGGESTIONS_FILE, exc
        )
        return []

    if not isinstance(data, dict):
        return []

    entries = data.get(category, [])
    if isinstance(entries, (str, bytes)):
        # list("python") would yield single characters, not a suggestion.
        return []
    try:
        return list(entries)
    except TypeError:
        # Non-iterable value (None, number, ...) under the category key.
        return []