# pagepiper/tests/test_text_clean.py

# tests/test_text_clean.py
"""Tests for ebook artifact filtering in scripts/text_clean.py."""
from __future__ import annotations

import pytest

from scripts.text_clean import (
    clean_line,
    clean_paragraph,
    filter_paragraphs,
    is_artifact_line,
)


class TestIsArtifactLine:
    """Expectations for which individual lines ``is_artifact_line`` flags."""

    def test_abc_amber_lit(self):
        """The ABC Amber LIT converter banner (with its URL) is flagged."""
        banner = "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        assert is_artifact_line(banner)

    def test_abc_amber_rtf(self):
        """The RTF variant of the converter banner is flagged as well."""
        banner = "Generated by ABC Amber RTF Converter"
        assert is_artifact_line(banner)

    def test_processtext_url_only(self):
        """A line holding only the processtext URL is flagged."""
        url_line = "http://www.processtext.com/abclit.html"
        assert is_artifact_line(url_line)

    def test_standalone_url(self):
        """A line consisting solely of some other URL is flagged."""
        url_line = "https://www.example.com/book"
        assert is_artifact_line(url_line)

    def test_page_number_only(self):
        """Bare page numbers, optionally dash-wrapped or space-padded, are flagged."""
        for candidate in ("42", "- 42 -", " 7 "):
            assert is_artifact_line(candidate)

    def test_downloaded_from(self):
        """A "Downloaded from <site>" notice is flagged."""
        notice = "Downloaded from www.fictionsite.net"
        assert is_artifact_line(notice)

    def test_scanned_by(self):
        """A "Scanned by <name>" credit is flagged."""
        credit = "Scanned by SomeUser"
        assert is_artifact_line(credit)

    def test_normal_prose_not_artifact(self):
        """Ordinary quoted dialogue must not be flagged."""
        dialogue = '"And what if food isn\'t the only reason Jagang is going to Anderith?"'
        assert not is_artifact_line(dialogue)

    def test_url_embedded_in_prose_not_dropped(self):
        """A sentence that merely contains a URL must not be flagged."""
        # Only a line that is nothing but a URL counts as a standalone-URL
        # artifact; here the URL sits in the middle of real prose.
        sentence = "You can read more about this at https://example.com and continue."
        assert not is_artifact_line(sentence)

    def test_short_page_header_not_dropped(self):
        """A short heading that includes a number must not be flagged."""
        # "Chapter 1" pairs a word with a digit, so it is not a bare page
        # number and has to survive the page-number check.
        heading = "Chapter 1"
        assert not is_artifact_line(heading)


class TestCleanLine:
    """Expectations for ``clean_line`` when scrubbing a single line of text."""

    def test_strips_inline_abc_amber(self):
        """A converter banner appended to prose is removed; the prose stays."""
        cleaned = clean_line(
            "Some prose. Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        )
        assert "ABC Amber" not in cleaned
        assert "processtext" not in cleaned
        assert "Some prose." in cleaned

    def test_passes_clean_line_unchanged(self):
        """A line without any artifact text comes back exactly as given."""
        untouched = "He cocked an eyebrow and smiled."
        cleaned = clean_line(untouched)
        assert cleaned == untouched


class TestCleanParagraph:
    """Expectations for ``clean_paragraph`` on newline-separated paragraphs."""

    def test_drops_artifact_lines_from_paragraph(self):
        """The banner line is removed while both prose lines are kept."""
        paragraph = "\n".join(
            [
                "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html",
                '"And what if food isn\'t the only reason Jagang is going to Anderith?"',
                "He cocked an eyebrow.",
            ]
        )
        cleaned = clean_paragraph(paragraph)
        assert "ABC Amber" not in cleaned
        assert "Jagang" in cleaned
        assert "eyebrow" in cleaned

    def test_all_artifact_paragraph_returns_empty(self):
        """A paragraph made up only of artifact lines cleans to an empty string."""
        only_artifacts = "\n".join(
            ("Generated by ABC Amber LIT Converter", "http://www.processtext.com/abclit.html")
        )
        cleaned = clean_paragraph(only_artifacts)
        assert cleaned == ""

    def test_clean_paragraph_unchanged(self):
        """A paragraph without artifacts is returned unchanged."""
        prose = "\n".join(("Richard raised his sword.", "The magic surged through him."))
        cleaned = clean_paragraph(prose)
        assert cleaned == prose


class TestFilterParagraphs:
    """Expectations for ``filter_paragraphs`` over a list of paragraphs."""

    def test_drops_artifact_paragraphs(self):
        """The banner paragraph is dropped, leaving the two prose paragraphs."""
        banner = "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        dialogue = '"And what if food isn\'t the only reason Jagang is going to Anderith?"'
        narration = "He cocked an eyebrow at the question."

        kept = filter_paragraphs([banner, dialogue, narration])

        assert len(kept) == 2
        assert not any("ABC Amber" in paragraph for paragraph in kept)

    def test_drops_short_lines_under_4_words(self):
        """One- and two-word paragraphs are dropped; the full sentence survives."""
        survivor = "Valid sentence with enough words here."
        kept = filter_paragraphs(["Hi", "OK sure", survivor])
        assert kept == [survivor]

    def test_empty_input(self):
        """An empty list in yields an empty list out."""
        kept = filter_paragraphs([])
        assert kept == []