@@ -147,7 +147,7 @@ def test_process_pdf_page(self, mock_create_task, mock_get_seq, mock_s3_client):
147
147
tmp_dir
148
148
)
149
149
150
- assert raw_key == "raw/test_document_page001 .tiff"
150
+ assert raw_key == "raw/test_document_0001 .tiff"
151
151
mock_page .save .assert_called_once ()
152
152
mock_s3_client .upload_file .assert_called_once ()
153
153
@@ -159,9 +159,9 @@ def test_process_pdf_multiple_pages(self, mock_process_page, mock_convert):
159
159
mock_pages = [Mock () for _ in range (3 )]
160
160
mock_convert .return_value = mock_pages
161
161
mock_process_page .side_effect = [
162
- "raw/doc_page001 .tiff" ,
163
- "raw/doc_page002 .tiff" ,
164
- "raw/doc_page003 .tiff"
162
+ "raw/doc_0001 .tiff" ,
163
+ "raw/doc_0002 .tiff" ,
164
+ "raw/doc_0003 .tiff"
165
165
]
166
166
167
167
with tempfile .TemporaryDirectory () as tmp_dir :
@@ -209,9 +209,8 @@ def test_move_original_to_raw(self, mock_s3_client):
209
209
)
210
210
211
211
@patch ('ingest_pipeline.main.process_pdf' )
212
- @patch ('ingest_pipeline.main.move_original_to_raw' )
213
212
@patch ('tempfile.TemporaryDirectory' )
214
- def test_process_s3_record_pdf (self , mock_tempdir , mock_move , mock_process_pdf , mock_s3_client ):
213
+ def test_process_s3_record_pdf (self , mock_tempdir , mock_process_pdf , mock_s3_client ):
215
214
"""Test processing an S3 record for a PDF file."""
216
215
# Setup mocks
217
216
mock_temp = MagicMock ()
@@ -232,7 +231,8 @@ def test_process_s3_record_pdf(self, mock_tempdir, mock_move, mock_process_pdf,
232
231
process_s3_record (record )
233
232
234
233
mock_s3_client .download_file .assert_called_once ()
235
- mock_move .assert_called_once ()
234
+ # PDFs should not be moved anymore
235
+ mock_s3_client .delete_object .assert_not_called ()
236
236
mock_process_pdf .assert_called_once ()
237
237
238
238
def test_process_s3_record_skip_non_upload (self , mock_s3_client ):
@@ -303,9 +303,8 @@ def test_url_decode_in_process_s3_record(self, mock_s3_client):
303
303
}
304
304
305
305
with patch ('ingest_pipeline.main.process_pdf' ):
306
- with patch ('ingest_pipeline.main.move_original_to_raw' ):
307
- with patch ('tempfile.TemporaryDirectory' ):
308
- process_s3_record (record )
306
+ with patch ('tempfile.TemporaryDirectory' ):
307
+ process_s3_record (record )
309
308
310
309
# Check that download was called with decoded key
311
310
mock_s3_client .download_file .assert_called_once ()
@@ -351,8 +350,132 @@ def test_full_pdf_processing_flow(self, mock_convert, mock_s3):
351
350
# Should have downloaded the original
352
351
assert mock_s3 .download_file .call_count == 1
353
352
354
- # Should have uploaded: original + 2 TIFFs + 2 JSONs = 5 files
355
- assert mock_s3 .upload_file .call_count == 5
353
+ # Should have uploaded: 2 TIFFs + 2 JSONs = 4 files (PDF not moved anymore)
354
+ assert mock_s3 .upload_file .call_count == 4
355
+
356
+ # Should NOT have deleted the original PDF
357
+ assert mock_s3 .delete_object .call_count == 0
358
+
359
+
360
class TestRealPDFProcessing:
    """Test with a real PDF file and verify TIFF output and upload routing.

    Covers three things:
      * converting the bundled ``artifacts/test_pdf.pdf`` produces valid,
        non-trivial TIFF files named ``<base>_NNNN.tiff``;
      * page keys are zero-padded to four digits;
      * PDFs under ``upload/`` are left in place, while TIFFs are still
        handed to ``move_original_to_raw``.
    """

    @pytest.fixture
    def test_pdf_path(self):
        """Path to the test PDF file stored next to this test module."""
        return os.path.join(os.path.dirname(__file__), "artifacts", "test_pdf.pdf")

    @pytest.fixture
    def mock_s3_client(self):
        """Patch the module-level S3 client for the duration of one test."""
        with patch('ingest_pipeline.main.s3') as mock_s3:
            yield mock_s3

    @patch('ingest_pipeline.main.s3')
    def test_process_pdf_real_file_tiff_quality(self, mock_s3, test_pdf_path):
        """Test processing a real PDF and verify TIFF output quality."""
        # Empty listing response; presumably this makes the pipeline see no
        # pre-existing objects in the bucket -- TODO confirm against main.py.
        mock_s3.list_objects_v2.return_value = {}

        # Fail with a clear message if the artifact was not checked out,
        # rather than with an obscure error from the PDF converter.
        assert os.path.exists(test_pdf_path), f"Missing test artifact: {test_pdf_path}"

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Process the real PDF
            processed_keys = process_pdf(test_pdf_path, "test_pdf", "test-bucket", tmp_dir)

            # Verify that TIFFs were created
            assert len(processed_keys) > 0, "At least one TIFF should be generated"

            for page_number, key in enumerate(processed_keys, start=1):
                # Expected filename format: raw/test_pdf_0001.tiff, raw/test_pdf_0002.tiff, etc.
                tiff_name = f"test_pdf_{page_number:04d}.tiff"
                assert key == f"raw/{tiff_name}"

                # Verify the local file exists and is not suspiciously small
                # (this size check replaces snapshot testing).
                local_tiff = os.path.join(tmp_dir, tiff_name)
                assert os.path.exists(local_tiff)
                file_size = os.path.getsize(local_tiff)
                assert file_size > 1000, f"TIFF file {local_tiff} is too small: {file_size} bytes"

                # Verify it's a valid TIFF by opening it. The context manager
                # closes the file handle before the temporary directory is
                # removed, avoiding ResourceWarnings and cleanup failures on
                # platforms that refuse to delete open files.
                with Image.open(local_tiff) as img:
                    assert img.format == "TIFF"
                    assert img.width > 0
                    assert img.height > 0

    @patch('ingest_pipeline.main.s3')
    def test_process_pdf_page_format(self, mock_s3):
        """Test that PDF pages are named with the correct format."""
        mock_s3.list_objects_v2.return_value = {}

        # Create a mock page
        mock_page = Mock()

        # Page numbers with one, two and three digits, each paired with the
        # zero-padded four-digit key it must produce.
        expected_keys = {
            1: "raw/test_doc_0001.tiff",
            10: "raw/test_doc_0010.tiff",
            100: "raw/test_doc_0100.tiff",
        }

        with tempfile.TemporaryDirectory() as tmp_dir:
            for page_number, expected_key in expected_keys.items():
                key = process_pdf_page(mock_page, page_number, "test_doc", "test-bucket", tmp_dir)
                assert key == expected_key

    def test_pdf_not_moved_to_raw(self, mock_s3_client):
        """Test that PDF files are NOT moved from upload/ to raw/."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.pdf"}
                    }
                }]
            })
        }

        with patch('ingest_pipeline.main.process_pdf'), \
                patch('tempfile.TemporaryDirectory'):
            process_s3_record(record)

        # Should download the file
        mock_s3_client.download_file.assert_called_once()

        # Should NOT delete the original PDF
        mock_s3_client.delete_object.assert_not_called()

    def test_tiff_still_moved_to_raw(self, mock_s3_client):
        """Test that TIFF files are still moved from upload/ to raw/."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.tiff"}
                    }
                }]
            })
        }

        with patch('ingest_pipeline.main.process_tiff'), \
                patch('ingest_pipeline.main.move_original_to_raw') as mock_move, \
                patch('tempfile.TemporaryDirectory'):
            process_s3_record(record)

        # Should call move_original_to_raw for TIFF files
        mock_move.assert_called_once()
0 commit comments