Skip to content

Commit abd0377

Browse files
committed
✅ Add tests for PDF inputs
1 parent 6b19e70 commit abd0377

File tree

6 files changed: +197 additions, -17 deletions

__test__/artifacts/test_pdf.pdf

425 KB
Binary file not shown.

__test__/test_main.py

Lines changed: 137 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def test_process_pdf_page(self, mock_create_task, mock_get_seq, mock_s3_client):
147147
tmp_dir
148148
)
149149

150-
assert raw_key == "raw/test_document_page001.tiff"
150+
assert raw_key == "raw/test_document_0001.tiff"
151151
mock_page.save.assert_called_once()
152152
mock_s3_client.upload_file.assert_called_once()
153153

@@ -159,9 +159,9 @@ def test_process_pdf_multiple_pages(self, mock_process_page, mock_convert):
159159
mock_pages = [Mock() for _ in range(3)]
160160
mock_convert.return_value = mock_pages
161161
mock_process_page.side_effect = [
162-
"raw/doc_page001.tiff",
163-
"raw/doc_page002.tiff",
164-
"raw/doc_page003.tiff"
162+
"raw/doc_0001.tiff",
163+
"raw/doc_0002.tiff",
164+
"raw/doc_0003.tiff"
165165
]
166166

167167
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -209,9 +209,8 @@ def test_move_original_to_raw(self, mock_s3_client):
209209
)
210210

211211
@patch('ingest_pipeline.main.process_pdf')
212-
@patch('ingest_pipeline.main.move_original_to_raw')
213212
@patch('tempfile.TemporaryDirectory')
214-
def test_process_s3_record_pdf(self, mock_tempdir, mock_move, mock_process_pdf, mock_s3_client):
213+
def test_process_s3_record_pdf(self, mock_tempdir, mock_process_pdf, mock_s3_client):
215214
"""Test processing an S3 record for a PDF file."""
216215
# Setup mocks
217216
mock_temp = MagicMock()
@@ -232,7 +231,8 @@ def test_process_s3_record_pdf(self, mock_tempdir, mock_move, mock_process_pdf,
232231
process_s3_record(record)
233232

234233
mock_s3_client.download_file.assert_called_once()
235-
mock_move.assert_called_once()
234+
# PDFs should not be moved anymore
235+
mock_s3_client.delete_object.assert_not_called()
236236
mock_process_pdf.assert_called_once()
237237

238238
def test_process_s3_record_skip_non_upload(self, mock_s3_client):
@@ -303,9 +303,8 @@ def test_url_decode_in_process_s3_record(self, mock_s3_client):
303303
}
304304

305305
with patch('ingest_pipeline.main.process_pdf'):
306-
with patch('ingest_pipeline.main.move_original_to_raw'):
307-
with patch('tempfile.TemporaryDirectory'):
308-
process_s3_record(record)
306+
with patch('tempfile.TemporaryDirectory'):
307+
process_s3_record(record)
309308

310309
# Check that download was called with decoded key
311310
mock_s3_client.download_file.assert_called_once()
@@ -351,8 +350,132 @@ def test_full_pdf_processing_flow(self, mock_convert, mock_s3):
351350
# Should have downloaded the original
352351
assert mock_s3.download_file.call_count == 1
353352

354-
# Should have uploaded: original + 2 TIFFs + 2 JSONs = 5 files
355-
assert mock_s3.upload_file.call_count == 5
353+
# Should have uploaded: 2 TIFFs + 2 JSONs = 4 files (PDF not moved anymore)
354+
assert mock_s3.upload_file.call_count == 4
355+
356+
# Should NOT have deleted the original PDF
357+
assert mock_s3.delete_object.call_count == 0
358+
359+
360+
class TestRealPDFProcessing:
361+
"""Test with real PDF file and verify TIFF output."""
362+
363+
@pytest.fixture
364+
def test_pdf_path(self):
365+
"""Path to the test PDF file."""
366+
return os.path.join(os.path.dirname(__file__), "artifacts", "test_pdf.pdf")
367+
368+
@patch('ingest_pipeline.main.s3')
369+
def test_process_pdf_real_file_tiff_quality(self, mock_s3, test_pdf_path):
370+
"""Test processing a real PDF and verify TIFF output quality."""
371+
# Setup mock
372+
mock_s3.list_objects_v2.return_value = {}
373+
374+
with tempfile.TemporaryDirectory() as tmp_dir:
375+
# Process the real PDF
376+
processed_keys = process_pdf(test_pdf_path, "test_pdf", "test-bucket", tmp_dir)
377+
378+
# Verify that TIFFs were created
379+
assert len(processed_keys) > 0
380+
381+
# Check each TIFF file
382+
tiff_sizes = []
383+
for i, key in enumerate(processed_keys):
384+
# Expected filename format: raw/test_pdf_0001.tiff, raw/test_pdf_0002.tiff, etc.
385+
expected_key = f"raw/test_pdf_{i+1:04d}.tiff"
386+
assert key == expected_key
387+
388+
# Find the local TIFF file
389+
local_tiff = os.path.join(tmp_dir, f"test_pdf_{i+1:04d}.tiff")
390+
391+
# Verify the file exists and has reasonable size
392+
assert os.path.exists(local_tiff)
393+
file_size = os.path.getsize(local_tiff)
394+
tiff_sizes.append(file_size)
395+
396+
# Ensure TIFF is not suspiciously small
397+
assert file_size > 1000, f"TIFF file {local_tiff} is too small: {file_size} bytes"
398+
399+
# Verify it's a valid TIFF by opening it
400+
img = Image.open(local_tiff)
401+
assert img.format == "TIFF"
402+
assert img.width > 0
403+
assert img.height > 0
404+
405+
# Verify file sizes are reasonable (this replaces snapshot testing)
406+
print(f"TIFF file sizes: {tiff_sizes}")
407+
assert len(tiff_sizes) > 0, "At least one TIFF should be generated"
408+
for size in tiff_sizes:
409+
# Most TIFF files from PDFs should be much larger than 1KB
410+
assert size > 1000, f"TIFF file size too small: {size} bytes"
411+
412+
@patch('ingest_pipeline.main.s3')
413+
def test_process_pdf_page_format(self, mock_s3):
414+
"""Test that PDF pages are named with the correct format."""
415+
mock_s3.list_objects_v2.return_value = {}
416+
417+
# Create a mock page
418+
mock_page = Mock()
419+
420+
with tempfile.TemporaryDirectory() as tmp_dir:
421+
# Process page 1
422+
key1 = process_pdf_page(mock_page, 1, "test_doc", "test-bucket", tmp_dir)
423+
assert key1 == "raw/test_doc_0001.tiff"
424+
425+
# Process page 10
426+
key10 = process_pdf_page(mock_page, 10, "test_doc", "test-bucket", tmp_dir)
427+
assert key10 == "raw/test_doc_0010.tiff"
428+
429+
# Process page 100
430+
key100 = process_pdf_page(mock_page, 100, "test_doc", "test-bucket", tmp_dir)
431+
assert key100 == "raw/test_doc_0100.tiff"
432+
433+
@pytest.fixture
434+
def mock_s3_client(self):
435+
"""Mock S3 client."""
436+
with patch('ingest_pipeline.main.s3') as mock_s3:
437+
yield mock_s3
438+
439+
def test_pdf_not_moved_to_raw(self, mock_s3_client):
440+
"""Test that PDF files are NOT moved from upload/ to raw/."""
441+
record = {
442+
"body": json.dumps({
443+
"Records": [{
444+
"s3": {
445+
"bucket": {"name": "test-bucket"},
446+
"object": {"key": "upload/test.pdf"}
447+
}
448+
}]
449+
})
450+
}
451+
452+
with patch('ingest_pipeline.main.process_pdf'):
453+
with patch('tempfile.TemporaryDirectory'):
454+
process_s3_record(record)
455+
456+
# Should download the file
457+
mock_s3_client.download_file.assert_called_once()
458+
459+
# Should NOT delete the original PDF
460+
mock_s3_client.delete_object.assert_not_called()
461+
462+
def test_tiff_still_moved_to_raw(self, mock_s3_client):
463+
"""Test that TIFF files are still moved from upload/ to raw/."""
464+
record = {
465+
"body": json.dumps({
466+
"Records": [{
467+
"s3": {
468+
"bucket": {"name": "test-bucket"},
469+
"object": {"key": "upload/test.tiff"}
470+
}
471+
}]
472+
})
473+
}
474+
475+
with patch('ingest_pipeline.main.process_tiff'):
476+
with patch('ingest_pipeline.main.move_original_to_raw') as mock_move:
477+
with patch('tempfile.TemporaryDirectory'):
478+
process_s3_record(record)
356479

357-
# Should have deleted the original
358-
assert mock_s3.delete_object.call_count == 1
480+
# Should call move_original_to_raw for TIFF files
481+
mock_move.assert_called_once()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies = [
1212
"pdf2image>=1.17.0",
1313
"pillow>=11.2.1",
1414
"pytest",
15+
"snapshottest>=0.6.0",
1516
]
1617

1718
[build-system]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pdf2image
22
Pillow
33
boto3
4+
pytest

src/ingest_pipeline/main.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def create_and_upload_task(bucket, image_s3_path, seq_num, tmp_dir):
4444
def process_pdf_page(page, page_num, base_name, bucket, tmp_dir):
4545
"""Process a single PDF page: save as TIFF and create task."""
4646
# Create unique filename for this page
47-
page_base = f"{base_name}_page{page_num:03d}"
47+
page_base = f"{base_name}_{page_num:04d}"
4848
page_tif = os.path.join(tmp_dir, f"{page_base}.tiff")
4949

5050
# Save page as TIFF
@@ -145,8 +145,9 @@ def process_s3_record(record):
145145
# Download original file
146146
s3.download_file(bucket, key, local_in)
147147

148-
# Move the original file to "raw/" prefix
149-
move_original_to_raw(local_in, bucket, key, base, ext)
148+
# Only move TIFFs to raw/, leave PDFs in upload/
149+
if ext in (".tiff", ".tif"):
150+
move_original_to_raw(local_in, bucket, key, base, ext)
150151

151152
# Process based on file type
152153
try:

uv.lock

Lines changed: 54 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)