# tests/test_text_clean.py
"""Tests for ebook artifact filtering in scripts/text_clean.py."""

from __future__ import annotations

import pytest

from scripts.text_clean import (
    clean_line,
    clean_paragraph,
    filter_paragraphs,
    is_artifact_line,
)


class TestIsArtifactLine:
    """Cases for ``is_artifact_line``: which whole lines count as artifacts."""

    def test_abc_amber_lit(self):
        assert is_artifact_line(
            "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        )

    def test_abc_amber_rtf(self):
        assert is_artifact_line("Generated by ABC Amber RTF Converter")

    def test_processtext_url_only(self):
        assert is_artifact_line("http://www.processtext.com/abclit.html")

    def test_standalone_url(self):
        assert is_artifact_line("https://www.example.com/book")

    # Parametrized so each page-number shape passes or fails independently
    # instead of the first failing assert masking the rest.
    @pytest.mark.parametrize(
        "line",
        ["42", "- 42 -", " 7 "],
        ids=["bare", "dashed", "padded"],
    )
    def test_page_number_only(self, line):
        assert is_artifact_line(line)

    def test_downloaded_from(self):
        assert is_artifact_line("Downloaded from www.fictionsite.net")

    def test_scanned_by(self):
        assert is_artifact_line("Scanned by SomeUser")

    def test_normal_prose_not_artifact(self):
        assert not is_artifact_line(
            '"And what if food isn\'t the only reason Jagang is going to Anderith?"'
        )

    def test_url_embedded_in_prose_not_dropped(self):
        # A URL inside a sentence is not a standalone-URL artifact line
        assert not is_artifact_line(
            "You can read more about this at https://example.com and continue."
        )

    def test_short_page_header_not_dropped(self):
        # "Chapter 1" is not an artifact — 4-digit number check only drops bare numbers
        assert not is_artifact_line("Chapter 1")


class TestCleanLine:
    """Cases for ``clean_line``: inline artifact text is removed, prose is kept."""

    def test_strips_inline_abc_amber(self):
        # Implicit concatenation only; the runtime value is one unbroken string.
        line = (
            "Some prose. Generated by ABC Amber LIT Converter, "
            "http://www.processtext.com/abclit.html"
        )
        result = clean_line(line)
        assert "ABC Amber" not in result
        assert "processtext" not in result
        assert "Some prose." in result

    def test_passes_clean_line_unchanged(self):
        line = "He cocked an eyebrow and smiled."
        assert clean_line(line) == line


class TestCleanParagraph:
    """Cases for ``clean_paragraph`` applied to newline-separated text."""

    def test_drops_artifact_lines_from_paragraph(self):
        text = (
            "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html\n"
            '"And what if food isn\'t the only reason Jagang is going to Anderith?"\n'
            "He cocked an eyebrow."
        )
        result = clean_paragraph(text)
        assert "ABC Amber" not in result
        assert "Jagang" in result
        assert "eyebrow" in result

    def test_all_artifact_paragraph_returns_empty(self):
        text = "Generated by ABC Amber LIT Converter\nhttp://www.processtext.com/abclit.html"
        assert clean_paragraph(text) == ""

    def test_clean_paragraph_unchanged(self):
        text = "Richard raised his sword.\nThe magic surged through him."
        assert clean_paragraph(text) == text


class TestFilterParagraphs:
    """Cases for ``filter_paragraphs`` applied to a list of paragraph strings."""

    def test_drops_artifact_paragraphs(self):
        paras = [
            "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html",
            '"And what if food isn\'t the only reason Jagang is going to Anderith?"',
            "He cocked an eyebrow at the question.",
        ]
        result = filter_paragraphs(paras)
        assert len(result) == 2
        assert all("ABC Amber" not in p for p in result)

    def test_drops_short_lines_under_4_words(self):
        paras = ["Hi", "OK sure", "Valid sentence with enough words here."]
        result = filter_paragraphs(paras)
        assert result == ["Valid sentence with enough words here."]

    def test_empty_input(self):
        assert filter_paragraphs([]) == []