import json
import os
import tempfile
from unittest.mock import Mock, patch, MagicMock, call

import pytest
from PIL import Image
import io

# Import the functions we want to test
from ingest_pipeline.main import (
    lambda_handler,
    get_next_task_sequence,
    create_and_upload_task,
    process_pdf_page,
    process_pdf,
    process_tiff,
    move_original_to_raw,
    process_s3_record
)
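
# The patch targets used throughout imply that ingest_pipeline.main binds a
# module-level boto3 S3 client named `s3` and imports pdf2image's
# convert_from_path and PIL's Image at module scope. That is inferred from the
# 'ingest_pipeline.main.*' patch strings below (patch where the name is looked
# up, not where it is defined), not from documented interfaces.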


class TestLambdaHandler:
    """Test suite for the Lambda handler and its functions."""

    @pytest.fixture
    def sample_sqs_event(self):
        """Sample SQS event with S3 notification."""
        return {
            "Records": [{
                "messageId": "42e2d2ed-65de-42db-bfaa-9d04ee397dc5",
                "receiptHandle": "AQEBZah8dAcvxL/kRUO9KKQQMwi+njiqWLi4EJP5tdcQ3FrJVwo/Yf5q8gYieTu+diOcAeLZSrC2ig/BWPYi2oz1EyqlaZRvDnjURVzC7H+cAP3WqXQ34VxAPk7Q5IWTorNb+d6z7BYvdhu2aCuVPXMDkrINRAMSCbuGu7H562YXau+0GIQhqWDpZUyX5uX34DBx6RLAl5+At/5S9qwb1UL6XLYoXweIv1yJ5W3yRVq3Vlygtp1HRCq+zKRiG1ORy67NMB52+Bwf4CKdt6OMRL0uKSg+FE9oKwIfKj/PuIG3q4lkyan1Icy9w1anBcTo6U8mQxzQsDEba36eDUCVEA7NBvSifI4GOV0T+0Mz+HSE0kpDgx0CqxtyHGFStkBfEgMtzlHEcE2plqoG28AUeG9gjg==",
                "body": json.dumps({
                    "Records": [{
                        "eventVersion": "2.1",
                        "eventSource": "aws:s3",
                        "awsRegion": "us-east-1",
                        "eventTime": "2025-06-19T22:53:18.720Z",
                        "eventName": "ObjectCreated:Put",
                        "userIdentity": {"principalId": "AWS:AIDATGW6D4T35VO3LM4FT"},
                        "requestParameters": {"sourceIPAddress": "96.250.79.68"},
                        "responseElements": {
                            "x-amz-request-id": "P5GBR9KGYW6HZ3MA",
                            "x-amz-id-2": "IbTG1f+r0tBMobjHdn4j2izKhusX9eBaCx7N8RbIGx7x23e9SAcC2bSCNNoVA8jV24Z7blJ17+j6QYuP8jlJNn6528sfQVbS"
                        },
                        "s3": {
                            "s3SchemaVersion": "1.0",
                            "configurationId": "tf-s3-queue-20250619224812800500000002",
                            "bucket": {
                                "name": "brij-v1-bucket",
                                "ownerIdentity": {"principalId": "AN6L0W7Z1W8K6"},
                                "arn": "arn:aws:s3:::brij-v1-bucket"
                            },
                            "object": {
                                "key": "upload/BD23-25943+FLR+PLN.pdf",
                                "size": 435027,
                                "eTag": "80955dc9417677628d46674340d1acc9",
                                "versionId": "0RqUcc4jvul.yempoufM1IBRyWt0sNWt",
                                "sequencer": "00685494DE8E151A1B"
                            }
                        }
                    }]
                }),
                "attributes": {
                    "ApproximateReceiveCount": "1",
                    "SentTimestamp": "1750373599461",
                    "SenderId": "AROA4R74ZO52XAB5OD7T4:S3-PROD-END",
                    "ApproximateFirstReceiveTimestamp": "1750373599472"
                },
                "messageAttributes": {},
                "md5OfBody": "7d84fe3663343107b186e691b40080ac",
                "eventSource": "aws:sqs",
                "eventSourceARN": "arn:aws:sqs:us-east-1:220582896887:brij-v1-upload-queue",
                "awsRegion": "us-east-1"
            }]
        }

    @pytest.fixture
    def mock_s3_client(self):
        """Mock S3 client."""
        with patch('ingest_pipeline.main.s3') as mock_s3:
            yield mock_s3

    def test_get_next_task_sequence_empty(self, mock_s3_client):
        """Test getting next sequence when no tasks exist."""
        mock_s3_client.list_objects_v2.return_value = {}

        seq = get_next_task_sequence("test-bucket")

        assert seq == 1
        mock_s3_client.list_objects_v2.assert_called_once_with(
            Bucket="test-bucket",
            Prefix="ingest/"
        )

    def test_get_next_task_sequence_with_existing(self, mock_s3_client):
        """Test getting next sequence with existing tasks."""
        mock_s3_client.list_objects_v2.return_value = {
            "Contents": [
                {"Key": "ingest/TASK_0000001.json"},
                {"Key": "ingest/TASK_0000003.json"},
                {"Key": "ingest/TASK_0000002.json"},
            ]
        }

        seq = get_next_task_sequence("test-bucket")

        assert seq == 4
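
    # A minimal sketch of the behavior these two tests pin down, assuming the
    # real get_next_task_sequence parses the zero-padded number out of each
    # ingest/TASK_NNNNNNN.json key (illustrative only, not the actual code):
    #
    #     keys = [o["Key"] for o in resp.get("Contents", [])]
    #     seqs = [int(k.split("_")[1].split(".")[0]) for k in keys]
    #     return max(seqs, default=0) + 1
    #
    # With no Contents this yields 1; with keys 1, 3, 2 it yields 4.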

    def test_create_and_upload_task(self, mock_s3_client):
        """Test creating and uploading a task."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            ingest_key = create_and_upload_task(
                "test-bucket",
                "s3://test-bucket/raw/test.tiff",
                5,
                tmp_dir
            )

            assert ingest_key == "ingest/TASK_0000005.json"
            mock_s3_client.upload_file.assert_called_once()

            # Verify the JSON file was created correctly
            call_args = mock_s3_client.upload_file.call_args
            local_file = call_args[0][0]
            assert os.path.exists(local_file)

            with open(local_file, 'r') as f:
                content = json.load(f)
                assert content == [{"data": {"image": "s3://test-bucket/raw/test.tiff"}}]
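
        # The single-element list with a "data.image" field looks like a
        # Label Studio-style task import payload; treat that as an assumption
        # about the downstream consumer, not a documented contract.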

    @patch('ingest_pipeline.main.get_next_task_sequence')
    @patch('ingest_pipeline.main.create_and_upload_task')
    def test_process_pdf_page(self, mock_create_task, mock_get_seq, mock_s3_client):
        """Test processing a single PDF page."""
        mock_get_seq.return_value = 1
        mock_create_task.return_value = "ingest/TASK_0000001.json"

        # Create a mock PIL Image
        mock_page = Mock()

        with tempfile.TemporaryDirectory() as tmp_dir:
            raw_key = process_pdf_page(
                mock_page,
                1,
                "test_document",
                "test-bucket",
                tmp_dir
            )

            assert raw_key == "raw/test_document_page001.tiff"
            mock_page.save.assert_called_once()
            mock_s3_client.upload_file.assert_called_once()

    @patch('ingest_pipeline.main.convert_from_path')
    @patch('ingest_pipeline.main.process_pdf_page')
    def test_process_pdf_multiple_pages(self, mock_process_page, mock_convert):
        """Test processing a PDF with multiple pages."""
        # Mock 3 pages
        mock_pages = [Mock() for _ in range(3)]
        mock_convert.return_value = mock_pages
        mock_process_page.side_effect = [
            "raw/doc_page001.tiff",
            "raw/doc_page002.tiff",
            "raw/doc_page003.tiff"
        ]

        with tempfile.TemporaryDirectory() as tmp_dir:
            keys = process_pdf("test.pdf", "doc", "test-bucket", tmp_dir)

            assert len(keys) == 3
            assert mock_process_page.call_count == 3

    @patch('ingest_pipeline.main.Image')
    @patch('ingest_pipeline.main.get_next_task_sequence')
    @patch('ingest_pipeline.main.create_and_upload_task')
    def test_process_tiff(self, mock_create_task, mock_get_seq, mock_image, mock_s3_client):
        """Test processing a TIFF file."""
        mock_get_seq.return_value = 1
        mock_create_task.return_value = "ingest/TASK_0000001.json"
        mock_img = Mock()
        mock_image.open.return_value = mock_img

        with tempfile.TemporaryDirectory() as tmp_dir:
            keys = process_tiff("test.tif", "test_image", "test-bucket", tmp_dir)

            assert keys == ["raw/test_image.tiff"]
            mock_img.save.assert_called_once()
            mock_s3_client.upload_file.assert_called_once()

    def test_move_original_to_raw(self, mock_s3_client):
        """Test moving original file to raw prefix."""
        raw_key = move_original_to_raw(
            "local_file.pdf",
            "test-bucket",
            "upload/original.pdf",
            "original",
            ".pdf"
        )

        assert raw_key == "raw/original.pdf"
        mock_s3_client.upload_file.assert_called_once_with(
            "local_file.pdf",
            "test-bucket",
            "raw/original.pdf"
        )
        mock_s3_client.delete_object.assert_called_once_with(
            Bucket="test-bucket",
            Key="upload/original.pdf"
        )
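
    # These assertions pin down a re-upload-then-delete sequence from the
    # already-downloaded local copy. If move_original_to_raw were ever changed
    # to a server-side s3.copy_object, the call expectations here would need
    # to change with it.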

    @patch('ingest_pipeline.main.process_pdf')
    @patch('ingest_pipeline.main.move_original_to_raw')
    @patch('tempfile.TemporaryDirectory')
    def test_process_s3_record_pdf(self, mock_tempdir, mock_move, mock_process_pdf, mock_s3_client):
        """Test processing an S3 record for a PDF file."""
        # Set up mocks
        mock_temp = MagicMock()
        mock_temp.__enter__.return_value = "/tmp/test"
        mock_tempdir.return_value = mock_temp

        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.pdf"}
                    }
                }]
            })
        }

        process_s3_record(record)

        mock_s3_client.download_file.assert_called_once()
        mock_move.assert_called_once()
        mock_process_pdf.assert_called_once()

    def test_process_s3_record_skip_non_upload(self, mock_s3_client):
        """Test skipping files not in the upload/ prefix."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "other/test.pdf"}
                    }
                }]
            })
        }

        process_s3_record(record)

        # Should not download or process
        mock_s3_client.download_file.assert_not_called()

    def test_process_s3_record_skip_unsupported_type(self, mock_s3_client):
        """Test skipping unsupported file types."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.jpg"}
                    }
                }]
            })
        }

        process_s3_record(record)

        # Should not download or process
        mock_s3_client.download_file.assert_not_called()

    @patch('ingest_pipeline.main.process_s3_record')
    def test_lambda_handler_success(self, mock_process_record, sample_sqs_event):
        """Test successful Lambda handler execution."""
        result = lambda_handler(sample_sqs_event, None)

        assert result == {"status": "success"}
        mock_process_record.assert_called_once()

    @patch('ingest_pipeline.main.process_s3_record')
    def test_lambda_handler_error_handling(self, mock_process_record, sample_sqs_event):
        """Test Lambda handler error handling."""
        mock_process_record.side_effect = Exception("Test error")

        # Should handle the exception rather than raising it
        result = lambda_handler(sample_sqs_event, None)

        assert result == {"status": "success"}
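
    # Design note: this test documents that the handler swallows per-record
    # exceptions and still reports success, so SQS deletes the message rather
    # than retrying it. Presumably intentional; if retries or a DLQ are
    # wanted, the handler would need to re-raise or report partial batch
    # failures (batchItemFailures) instead.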

    def test_url_decode_in_process_s3_record(self, mock_s3_client):
        """Test that URL-encoded keys are properly decoded."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/file+with+spaces.pdf"}
                    }
                }]
            })
        }

        with patch('ingest_pipeline.main.process_pdf'):
            with patch('ingest_pipeline.main.move_original_to_raw'):
                with patch('tempfile.TemporaryDirectory'):
                    process_s3_record(record)

        # Check that download was called with the decoded key
        mock_s3_client.download_file.assert_called_once()
        call_args = mock_s3_client.download_file.call_args
        assert call_args[0][1] == "upload/file with spaces.pdf"
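
    # S3 event notifications URL-encode object keys (spaces arrive as '+'),
    # so process_s3_record presumably decodes them, e.g. with
    # urllib.parse.unquote_plus, before calling download_file; that
    # assumption is exactly what this test checks.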


class TestIntegration:
    """Integration tests that exercise the full flow."""

    @patch('ingest_pipeline.main.s3')
    @patch('ingest_pipeline.main.convert_from_path')
    def test_full_pdf_processing_flow(self, mock_convert, mock_s3):
        """Test the complete flow of processing a PDF file."""
        # Set up mocks: no existing tasks, so sequencing starts at 1
        mock_s3.list_objects_v2.return_value = {}

        # Create mock pages
        mock_page1 = Mock()
        mock_page2 = Mock()
        mock_convert.return_value = [mock_page1, mock_page2]

        # Create test event
        event = {
            "Records": [{
                "body": json.dumps({
                    "Records": [{
                        "s3": {
                            "bucket": {"name": "test-bucket"},
                            "object": {"key": "upload/test_document.pdf"}
                        }
                    }]
                })
            }]
        }

        # Execute
        result = lambda_handler(event, None)

        # Verify
        assert result == {"status": "success"}

        # Should have downloaded the original
        assert mock_s3.download_file.call_count == 1

        # Should have uploaded: original + 2 TIFFs + 2 JSONs = 5 files
        assert mock_s3.upload_file.call_count == 5

        # Should have deleted the original
        assert mock_s3.delete_object.call_count == 1