- config/skills_suggestions.yaml: 168 curated tags across skills (77), domains (40), keywords (51) covering CS/TAM/ops and common tech roles; structured for future community aggregate (paid tier backlog) - scripts/skills_utils.py: filter_tag() rejects blanks, URLs, profanity, overlong strings, disallowed chars, and repeated-char runs; load_suggestions() reads bundled YAML per category
67 lines
2.5 KiB
Python
67 lines
2.5 KiB
Python
"""
|
|
skills_utils.py — Content filter and suggestion loader for the skills tagging system.
|
|
|
|
load_suggestions(category) → list[str] bundled suggestions for a category
|
|
filter_tag(tag) → str | None cleaned tag, or None if rejected
|
|
"""
|
|
from __future__ import annotations
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Location of the bundled suggestions YAML: the "config" directory that sits
# two levels above this module file.
_SUGGESTIONS_FILE = Path(__file__).parent.parent.joinpath(
    "config", "skills_suggestions.yaml"
)
|
|
|
|
# ── Content filter ─────────────────────────────────────────────────────────────
|
|
# Tags must be short, human-readable skill/domain labels. No URLs, no abuse.
|
|
|
|
_BLOCKED = {
    # profanity placeholder — extend as needed
    "fuck", "shit", "ass", "bitch", "cunt", "dick", "bastard", "damn",
}

# Accepted tag length range (inclusive), measured after whitespace normalisation.
_MIN_TAG_LEN = 2
_MAX_TAG_LEN = 60

# "<prefix>.NET" names that are technology labels rather than hostnames.
# Matched case-insensitively — extend as needed.
_DOTNET_FAMILY = ("asp", "vb", "ado", "ml")
_DOTNET_ALTERNATION = "|".join(_DOTNET_FAMILY)

_URL_RE = re.compile(
    r"https?://|www\.|\.com\b|\.org\b"
    # ".net" only counts as a URL when a host label precedes it and that label
    # is not one of the .NET family names above. A bare ".NET", "C#/.NET" or
    # "ASP.NET Core" is a legitimate skill tag; "example.net" is still a URL.
    rf"|(?<![\w-])(?!(?:{_DOTNET_ALTERNATION})\.net\b)[\w-]+\.net\b",
    re.I,
)
_ALLOWED_CHARS = re.compile(r"^[\w\s\-\.\+\#\/\&\(\)]+$", re.UNICODE)

# Five or more occurrences of the same character in a row (e.g. 'aaaaa').
_REPEAT_RUN_RE = re.compile(r"(.)\1{4,}")


def filter_tag(raw: str) -> str | None:
    """Return a cleaned tag string, or None if the tag should be rejected.

    The tag is cleaned by trimming it and collapsing every run of whitespace
    to a single space; the original letter case is preserved in the result.

    Rejection criteria:
      - Not a string (e.g. None)
      - Blank after stripping
      - Too short (< 2 chars) or too long (> 60 chars)
      - Contains a URL pattern ('.NET'-style technology names are not URLs)
      - Contains disallowed characters
      - Matches a blocked term (case-insensitive, whole-word)
      - Repeated character run (e.g. 'aaaaa')
    """
    if not isinstance(raw, str):
        # Reject instead of raising AttributeError on .split() below.
        return None

    tag = " ".join(raw.split())  # normalise whitespace (also trims both ends)
    if not _MIN_TAG_LEN <= len(tag) <= _MAX_TAG_LEN:
        return None
    if _URL_RE.search(tag):
        return None
    if not _ALLOWED_CHARS.match(tag):
        return None

    lower = tag.lower()
    # Whole-word match so that e.g. "class" is not caught by the term "ass".
    if any(re.search(rf"\b{re.escape(term)}\b", lower) for term in _BLOCKED):
        return None
    if _REPEAT_RUN_RE.search(lower):  # 5+ repeated chars
        return None
    return tag
|
|
|
|
|
|
# ── Suggestion loader ──────────────────────────────────────────────────────────
|
|
|
|
def load_suggestions(category: str) -> list[str]:
    """Return the bundled suggestion list for a category ('skills'|'domains'|'keywords').

    Reads the YAML file at ``_SUGGESTIONS_FILE`` as UTF-8 and returns a copy of
    the list stored under the top-level key ``category``.

    Returns an empty list if the file is missing or unreadable, the YAML
    library is unavailable, the document cannot be parsed, the category is
    not found, or the category's value is not a list.
    """
    try:
        # Imported inside the try so that a missing YAML library is handled
        # like any other load failure.
        import yaml

        # Explicit encoding: without it the text is decoded with the platform
        # locale, which may not be UTF-8.
        text = _SUGGESTIONS_FILE.read_text(encoding="utf-8")
        data = yaml.safe_load(text) or {}
        entries = data.get(category)
    except Exception:
        # Deliberate best-effort: any failure to load means "no suggestions",
        # never an error for the caller.
        return []

    if not isinstance(entries, list):
        # Covers a missing or empty category, and a scalar value that list()
        # would otherwise split into single characters.
        return []
    return list(entries)
|