# File listing metadata: 196 lines, 8 KiB, Python.
# tests/test_discover.py
|
||
import pytest
|
||
from unittest.mock import patch, MagicMock
|
||
import pandas as pd
|
||
from pathlib import Path
|
||
|
||
# One job record; make_jobs_df() uses it as the default DataFrame row and
# test_push_to_notion_sets_status_new passes it straight to push_to_notion.
SAMPLE_JOB = {
    # Identity
    "title": "Customer Success Manager",
    "company": "Acme Corp",
    # Where
    "location": "Remote",
    "is_remote": True,
    # Origin
    "job_url": "https://linkedin.com/jobs/view/123456",
    "site": "linkedin",
    # Compensation
    "min_amount": 90000,
    "max_amount": 120000,
    "salary_source": "$90,000 - $120,000",
    # Body
    "description": "Great CS role",
}

# Field map handed to push_to_notion and embedded in SAMPLE_NOTION_CFG below.
SAMPLE_FM = {
    "title_field": "Salary",
    "job_title": "Job Title",
    "company": "Company Name",
    "url": "Role Link",
    "source": "Job Source",
    "status": "Status of Application",
    "status_new": "Application Submitted",
    "date_found": "Date Found",
    "remote": "Remote",
    "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps",
    "notes": "Notes",
    "job_description": "Job Description",
}

# Second element of the tuple the patched load_config returns.
SAMPLE_NOTION_CFG = {
    "token": "secret_test",
    "database_id": "fake-db-id",
    "field_map": SAMPLE_FM,
}

# First element of the tuple the patched load_config returns: a single
# search profile that lists only the "linkedin" board.
SAMPLE_PROFILES_CFG = {
    "profiles": [
        {
            "name": "cs",
            "titles": ["Customer Success Manager"],
            "locations": ["Remote"],
            "boards": ["linkedin"],
            "results_per_board": 5,
            "hours_old": 72,
        },
    ],
}
|
||
|
||
|
||
def make_jobs_df(jobs=None):
    """Build the DataFrame handed back by the patched ``scrape_jobs``.

    Args:
        jobs: Optional list of job dicts, one per row. When omitted
            (``None``), a single-row frame holding ``SAMPLE_JOB`` is built.
            An explicitly empty list produces an empty frame.

    Returns:
        A ``pandas.DataFrame`` with one row per job dict.
    """
    # Test for None rather than truthiness: ``jobs or [...]`` would replace
    # an explicit ``[]`` with the sample row, making it impossible to ask for
    # an empty result through this helper.
    if jobs is None:
        jobs = [SAMPLE_JOB]
    return pd.DataFrame(jobs)
|
||
|
||
|
||
def test_discover_writes_to_sqlite(tmp_path):
    """A scraped job ends up as a pending row in the SQLite staging db."""
    from scripts.db import get_jobs_by_status
    from scripts.discover import run_discovery

    db_path = tmp_path / "test.db"

    # Stub config loading, the board scraper, and the Notion client class.
    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch(
        "scripts.discover.scrape_jobs",
        return_value=make_jobs_df(),
    )
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=db_path)

    pending = get_jobs_by_status(db_path, "pending")
    assert len(pending) == 1
    first_row = pending[0]
    assert first_row["title"] == "Customer Success Manager"
|
||
|
||
|
||
def test_discover_skips_duplicate_urls(tmp_path):
    """A scraped job whose URL is already stored is not inserted again."""
    from scripts.db import get_jobs_by_status, init_db, insert_job
    from scripts.discover import run_discovery

    db_path = tmp_path / "test.db"

    # Seed the db with a row that shares its URL with SAMPLE_JOB["job_url"].
    seeded_job = {
        "title": "Old",
        "company": "X",
        "url": "https://linkedin.com/jobs/view/123456",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-01-01",
    }
    init_db(db_path)
    insert_job(db_path, seeded_job)

    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch(
        "scripts.discover.scrape_jobs",
        return_value=make_jobs_df(),
    )
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=db_path)

    pending = get_jobs_by_status(db_path, "pending")
    # Only the seeded row should be present; the scraped duplicate is dropped.
    assert len(pending) == 1
|
||
|
||
|
||
def test_discover_pushes_new_jobs(tmp_path):
    """Legacy path: notion_push=True makes run_discovery call push_to_notion."""
    from scripts.discover import run_discovery

    db_path = tmp_path / "test.db"

    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch(
        "scripts.discover.scrape_jobs",
        return_value=make_jobs_df(),
    )
    push_patch = patch("scripts.discover.push_to_notion")
    # Empty set of existing URLs, so the single scraped job counts as new.
    urls_patch = patch("scripts.discover.get_existing_urls", return_value=set())
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, push_patch as mock_push, urls_patch, client_patch:
        run_discovery(db_path=db_path, notion_push=True)

    assert mock_push.call_count == 1
|
||
|
||
|
||
def test_push_to_notion_sets_status_new():
    """The created page's status select carries the configured status_new."""
    from scripts.discover import push_to_notion

    notion_client = MagicMock()
    push_to_notion(notion_client, "fake-db-id", SAMPLE_JOB, SAMPLE_FM)

    # call_args unpacks as (positional args, keyword args) of the last call.
    _, create_kwargs = notion_client.pages.create.call_args
    status_prop = create_kwargs["properties"]["Status of Application"]
    assert status_prop["select"]["name"] == "Application Submitted"
|
||
|
||
|
||
# ── Custom boards integration ─────────────────────────────────────────────────
|
||
|
||
# Profiles config with no regular boards and one custom board ("adzuna").
_PROFILE_WITH_CUSTOM = {
    "profiles": [
        {
            "name": "cs",
            "titles": ["Customer Success Manager"],
            "locations": ["Remote"],
            "boards": [],
            "custom_boards": ["adzuna"],
            "results_per_board": 5,
            "hours_old": 72,
        },
    ],
}

# Job dict returned by the stand-in "adzuna" scraper in the tests below.
_ADZUNA_JOB = {
    "title": "Customer Success Manager",
    "company": "TestCo",
    "url": "https://www.adzuna.com/jobs/details/999",
    "source": "adzuna",
    "location": "Remote",
    "is_remote": True,
    "salary": "$90,000 – $120,000",
    "description": "Great remote CSM role",
}
|
||
|
||
|
||
def test_discover_custom_board_inserts_jobs(tmp_path):
    """Jobs returned by a registered custom-board scraper are inserted."""
    from scripts.db import get_jobs_by_status
    from scripts.discover import run_discovery

    def fake_adzuna(*args, **kwargs):
        # Stand-in scraper: ignores its arguments and yields one job.
        return [_ADZUNA_JOB]

    db_path = tmp_path / "test.db"

    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG),
    )
    # The regular scraper yields nothing, so any row must come from adzuna.
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    scrapers_patch = patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": fake_adzuna})
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, scrapers_patch, client_patch:
        count = run_discovery(db_path=db_path)

    assert count == 1
    pending = get_jobs_by_status(db_path, "pending")
    inserted = pending[0]
    assert inserted["title"] == "Customer Success Manager"
    assert inserted["source"] == "adzuna"
|
||
|
||
|
||
def test_discover_custom_board_skips_unknown(tmp_path, capsys):
    """An unregistered custom board name is reported on stdout."""
    from scripts.discover import run_discovery

    unknown_board_profile = {
        "name": "cs",
        "titles": ["CSM"],
        "locations": ["Remote"],
        "boards": [],
        "custom_boards": ["nonexistent_board"],
        "results_per_board": 5,
        "hours_old": 72,
    }
    profiles_cfg = {"profiles": [unknown_board_profile]}
    db_path = tmp_path / "test.db"

    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(profiles_cfg, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, client_patch:
        run_discovery(db_path=db_path)

    stdout_text = capsys.readouterr().out
    assert "nonexistent_board" in stdout_text
    assert "Unknown scraper" in stdout_text
|
||
|
||
|
||
def test_discover_custom_board_deduplicates(tmp_path):
    """A custom-board job whose URL is already stored is not re-inserted."""
    from scripts.db import get_jobs_by_status, init_db, insert_job
    from scripts.discover import run_discovery

    def fake_adzuna(*args, **kwargs):
        # Stand-in scraper: ignores its arguments and yields one job.
        return [_ADZUNA_JOB]

    db_path = tmp_path / "test.db"

    # Seed a row carrying the same URL as _ADZUNA_JOB.
    seeded_job = {
        "title": "CSM",
        "company": "TestCo",
        "url": "https://www.adzuna.com/jobs/details/999",
        "source": "adzuna",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-01-01",
    }
    init_db(db_path)
    insert_job(db_path, seeded_job)

    config_patch = patch(
        "scripts.discover.load_config",
        return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG),
    )
    scrape_patch = patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame())
    scrapers_patch = patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": fake_adzuna})
    client_patch = patch("scripts.discover.Client")

    with config_patch, scrape_patch, scrapers_patch, client_patch:
        count = run_discovery(db_path=db_path)

    assert count == 0  # duplicate skipped
    pending = get_jobs_by_status(db_path, "pending")
    assert len(pending) == 1
|
||
|
||
|
||
# ── Blocklist integration ─────────────────────────────────────────────────────
|
||
|
||
def test_is_blocklisted_jobgether():
    """A "jobgether" company entry blocks matching jobs regardless of case."""
    from scripts.discover import _is_blocklisted

    def job_at(company):
        # Minimal job dict: only the company varies between the cases.
        return {"company": company, "location": "", "description": ""}

    blocklist = {"companies": ["jobgether"], "industries": [], "locations": []}

    # Different capitalisation, and the entry as part of a longer name.
    assert _is_blocklisted(job_at("Jobgether"), blocklist)
    assert _is_blocklisted(job_at("jobgether inc"), blocklist)
    # Unrelated company passes through.
    assert not _is_blocklisted(job_at("Acme Corp"), blocklist)
|