"""Tests for URL-based job scraping.""" from unittest.mock import patch, MagicMock def _make_db(tmp_path, url="https://www.linkedin.com/jobs/view/99999/"): from scripts.db import init_db, insert_job db = tmp_path / "test.db" init_db(db) job_id = insert_job(db, { "title": "Importing…", "company": "", "url": url, "source": "manual", "location": "", "description": "", "date_found": "2026-02-24", }) return db, job_id def test_canonicalize_url_linkedin(): from scripts.scrape_url import canonicalize_url messy = ( "https://www.linkedin.com/jobs/view/4376518925/" "?trk=eml-email_job_alert&refId=abc%3D%3D&trackingId=xyz" ) assert canonicalize_url(messy) == "https://www.linkedin.com/jobs/view/4376518925/" def test_canonicalize_url_linkedin_comm(): from scripts.scrape_url import canonicalize_url comm = "https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc" assert canonicalize_url(comm) == "https://www.linkedin.com/jobs/view/4376518925/" def test_canonicalize_url_generic_strips_utm(): from scripts.scrape_url import canonicalize_url url = "https://jobs.example.com/post/42?utm_source=linkedin&utm_medium=email&jk=real_param" result = canonicalize_url(url) assert "utm_source" not in result assert "real_param" in result def test_detect_board_linkedin(): from scripts.scrape_url import _detect_board assert _detect_board("https://www.linkedin.com/jobs/view/12345/") == "linkedin" assert _detect_board("https://linkedin.com/jobs/view/12345/?tracking=abc") == "linkedin" def test_detect_board_indeed(): from scripts.scrape_url import _detect_board assert _detect_board("https://www.indeed.com/viewjob?jk=abc123") == "indeed" def test_detect_board_glassdoor(): from scripts.scrape_url import _detect_board assert _detect_board("https://www.glassdoor.com/job-listing/foo-bar-123.htm") == "glassdoor" def test_detect_board_generic(): from scripts.scrape_url import _detect_board assert _detect_board("https://jobs.example.com/posting/42") == "generic" def test_extract_linkedin_job_id(): from scripts.scrape_url import _extract_linkedin_job_id assert _extract_linkedin_job_id("https://www.linkedin.com/jobs/view/4376518925/") == "4376518925" assert _extract_linkedin_job_id("https://www.linkedin.com/comm/jobs/view/4376518925/?tracking=x") == "4376518925" assert _extract_linkedin_job_id("https://example.com/no-id") is None def test_scrape_linkedin_updates_job(tmp_path): db, job_id = _make_db(tmp_path) linkedin_html = """