peregrine/tests/test_linkedin_scraper.py

213 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# tests/test_linkedin_scraper.py
import io
import json
import sys
import zipfile
from pathlib import Path
from unittest.mock import MagicMock, patch
import tempfile
sys.path.insert(0, str(Path(__file__).parent.parent))
def test_invalid_url_raises():
from scripts.linkedin_scraper import scrape_profile
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
try:
scrape_profile("https://linkedin.com/company/acme", stage)
assert False, "should have raised"
except ValueError as e:
assert "linkedin.com/in/" in str(e)
def test_non_linkedin_url_raises():
from scripts.linkedin_scraper import scrape_profile
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
try:
scrape_profile("https://example.com/profile", stage)
assert False, "should have raised"
except ValueError:
pass
def test_valid_linkedin_url_accepted():
from scripts.linkedin_scraper import scrape_profile
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
fixture_html = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()
mock_page = MagicMock()
mock_page.content.return_value = fixture_html
mock_browser = MagicMock()
mock_browser.new_page.return_value = mock_page
mock_playwright = MagicMock()
mock_playwright.chromium.launch.return_value = mock_browser
with patch("scripts.linkedin_scraper.sync_playwright") as mock_sync_pw:
mock_sync_pw.return_value.__enter__ = MagicMock(return_value=mock_playwright)
mock_sync_pw.return_value.__exit__ = MagicMock(return_value=False)
result = scrape_profile("https://linkedin.com/in/alanw", stage)
assert result["name"] == "Alan Weinstock"
assert stage.exists()
def test_scrape_profile_writes_staging_file():
from scripts.linkedin_scraper import scrape_profile
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
fixture_html = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()
mock_page = MagicMock()
mock_page.content.return_value = fixture_html
mock_browser = MagicMock()
mock_browser.new_page.return_value = mock_page
mock_playwright = MagicMock()
mock_playwright.chromium.launch.return_value = mock_browser
with patch("scripts.linkedin_scraper.sync_playwright") as mock_sync_pw:
mock_sync_pw.return_value.__enter__ = MagicMock(return_value=mock_playwright)
mock_sync_pw.return_value.__exit__ = MagicMock(return_value=False)
scrape_profile("https://linkedin.com/in/alanw", stage)
data = json.loads(stage.read_text())
assert data["source"] == "url_scrape"
assert data["url"] == "https://linkedin.com/in/alanw"
assert "raw_html" in data
assert "extracted" in data
assert data["extracted"]["name"] == "Alan Weinstock"
def _make_export_zip() -> bytes:
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("Position.csv",
"Company Name,Title,Description,Started On,Finished On\n"
"Acme Corp,Staff Engineer,Led migration. Built CI/CD.,Jan 2022,\n"
"Beta Industries,Senior Engineer,Maintained clusters.,Mar 2019,Dec 2021\n"
)
zf.writestr("Education.csv",
"School Name,Degree Name,Field Of Study,Start Date,End Date\n"
"State University,Bachelor of Science,Computer Science,2010,2014\n"
)
zf.writestr("Skills.csv",
"Name,Description\n"
"Python,\n"
"Kubernetes,\n"
)
zf.writestr("Profile.csv",
"First Name,Last Name,Headline,Summary,Email Address\n"
"Alan,Weinstock,Staff Engineer,Experienced engineer.,alan@example.com\n"
)
return buf.getvalue()
def test_parse_export_zip_experience():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(_make_export_zip(), stage)
assert len(result["experience"]) == 2
assert result["experience"][0]["company"] == "Acme Corp"
assert result["experience"][0]["title"] == "Staff Engineer"
def test_parse_export_zip_education():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(_make_export_zip(), stage)
assert result["education"][0]["school"] == "State University"
assert result["education"][0]["field"] == "Computer Science"
def test_parse_export_zip_skills():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(_make_export_zip(), stage)
assert "Python" in result["skills"]
def test_parse_export_zip_name_and_email():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(_make_export_zip(), stage)
assert result["name"] == "Alan Weinstock"
assert result["email"] == "alan@example.com"
def test_parse_export_zip_missing_csv_does_not_raise():
from scripts.linkedin_scraper import parse_export_zip
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("Profile.csv",
"First Name,Last Name,Headline,Summary,Email Address\n"
"Alan,Weinstock,Engineer,Summary here.,alan@example.com\n"
)
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(buf.getvalue(), stage)
assert result["name"] == "Alan Weinstock"
assert result["experience"] == []
def test_parse_export_zip_writes_staging_file():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
parse_export_zip(_make_export_zip(), stage)
data = json.loads(stage.read_text())
assert data["source"] == "export_zip"
assert data["raw_html"] is None
def test_scrape_profile_sets_linkedin_url():
from scripts.linkedin_scraper import scrape_profile
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
fixture_html = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()
mock_page = MagicMock()
mock_page.content.return_value = fixture_html
mock_browser = MagicMock()
mock_browser.new_page.return_value = mock_page
mock_playwright = MagicMock()
mock_playwright.chromium.launch.return_value = mock_browser
with patch("scripts.linkedin_scraper.sync_playwright") as mock_sync_pw:
mock_sync_pw.return_value.__enter__ = MagicMock(return_value=mock_playwright)
mock_sync_pw.return_value.__exit__ = MagicMock(return_value=False)
result = scrape_profile("https://linkedin.com/in/alanw", stage)
assert result["linkedin"] == "https://linkedin.com/in/alanw"
def test_parse_export_zip_bad_zip_raises():
from scripts.linkedin_scraper import parse_export_zip
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
try:
parse_export_zip(b"not a zip file at all", stage)
assert False, "should have raised"
except ValueError as e:
assert "zip" in str(e).lower()
def test_parse_export_zip_current_job_shows_present():
"""Empty Finished On renders as ' Present', not truncated."""
from scripts.linkedin_scraper import parse_export_zip
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("Position.csv",
"Company Name,Title,Description,Started On,Finished On\n"
"Acme Corp,Staff Engineer,,Jan 2022,\n"
)
zf.writestr("Profile.csv",
"First Name,Last Name,Headline,Summary,Email Address\n"
"Alan,Weinstock,Engineer,,\n"
)
with tempfile.TemporaryDirectory() as tmp:
stage = Path(tmp) / "stage.json"
result = parse_export_zip(buf.getvalue(), stage)
assert result["experience"][0]["date_range"] == "Jan 2022 Present"