
Commit 6b19e70

👷 Adds a CI test
1 parent 0136539 commit 6b19e70

File tree

2 files changed: +390 −0 lines


.github/workflows/test.yml

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
name: Run Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    name: python
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
          cache-dependency-glob: "uv.lock"

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      - name: Run tests
        run: uv run pytest __test__/ -v
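
With uv installed, the same checks should be reproducible locally by running the two commands the workflow uses:

    uv sync --locked --all-extras --dev
    uv run pytest __test__/ -v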

__test__/test_main.py

Lines changed: 358 additions & 0 deletions
@@ -0,0 +1,358 @@
import json
import os
import tempfile
from unittest.mock import Mock, patch, MagicMock, call
import pytest
from PIL import Image
import io

# Import the functions we want to test
from ingest_pipeline.main import (
    lambda_handler,
    get_next_task_sequence,
    create_and_upload_task,
    process_pdf_page,
    process_pdf,
    process_tiff,
    move_original_to_raw,
    process_s3_record
)


class TestLambdaHandler:
    """Test suite for the Lambda handler and its functions."""

    @pytest.fixture
    def sample_sqs_event(self):
        """Sample SQS event with S3 notification."""
        return {
            "Records": [{
                "messageId": "42e2d2ed-65de-42db-bfaa-9d04ee397dc5",
                "receiptHandle": "AQEBZah8dAcvxL/kRUO9KKQQMwi+njiqWLi4EJP5tdcQ3FrJVwo/Yf5q8gYieTu+diOcAeLZSrC2ig/BWPYi2oz1EyqlaZRvDnjURVzC7H+cAP3WqXQ34VxAPk7Q5IWTorNb+d6z7BYvdhu2aCuVPXMDkrINRAMSCbuGu7H562YXau+0GIQhqWDpZUyX5uX34DBx6RLAl5+At/5S9qwb1UL6XLYoXweIv1yJ5W3yRVq3Vlygtp1HRCq+zKRiG1ORy67NMB52+Bwf4CKdt6OMRL0uKSg+FE9oKwIfKj/PuIG3q4lkyan1Icy9w1anBcTo6U8mQxzQsDEba36eDUCVEA7NBvSifI4GOV0T+0Mz+HSE0kpDgx0CqxtyHGFStkBfEgMtzlHEcE2plqoG28AUeG9gjg==",
                "body": json.dumps({
                    "Records": [{
                        "eventVersion": "2.1",
                        "eventSource": "aws:s3",
                        "awsRegion": "us-east-1",
                        "eventTime": "2025-06-19T22:53:18.720Z",
                        "eventName": "ObjectCreated:Put",
                        "userIdentity": {"principalId": "AWS:AIDATGW6D4T35VO3LM4FT"},
                        "requestParameters": {"sourceIPAddress": "96.250.79.68"},
                        "responseElements": {
                            "x-amz-request-id": "P5GBR9KGYW6HZ3MA",
                            "x-amz-id-2": "IbTG1f+r0tBMobjHdn4j2izKhusX9eBaCx7N8RbIGx7x23e9SAcC2bSCNNoVA8jV24Z7blJ17+j6QYuP8jlJNn6528sfQVbS"
                        },
                        "s3": {
                            "s3SchemaVersion": "1.0",
                            "configurationId": "tf-s3-queue-20250619224812800500000002",
                            "bucket": {
                                "name": "brij-v1-bucket",
                                "ownerIdentity": {"principalId": "AN6L0W7Z1W8K6"},
                                "arn": "arn:aws:s3:::brij-v1-bucket"
                            },
                            "object": {
                                "key": "upload/BD23-25943+FLR+PLN.pdf",
                                "size": 435027,
                                "eTag": "80955dc9417677628d46674340d1acc9",
                                "versionId": "0RqUcc4jvul.yempoufM1IBRyWt0sNWt",
                                "sequencer": "00685494DE8E151A1B"
                            }
                        }
                    }]
                }),
                "attributes": {
                    "ApproximateReceiveCount": "1",
                    "SentTimestamp": "1750373599461",
                    "SenderId": "AROA4R74ZO52XAB5OD7T4:S3-PROD-END",
                    "ApproximateFirstReceiveTimestamp": "1750373599472"
                },
                "messageAttributes": {},
                "md5OfBody": "7d84fe3663343107b186e691b40080ac",
                "eventSource": "aws:sqs",
                "eventSourceARN": "arn:aws:sqs:us-east-1:220582896887:brij-v1-upload-queue",
                "awsRegion": "us-east-1"
            }]
        }

    @pytest.fixture
    def mock_s3_client(self):
        """Mock S3 client."""
        with patch('ingest_pipeline.main.s3') as mock_s3:
            yield mock_s3

    def test_get_next_task_sequence_empty(self, mock_s3_client):
        """Test getting next sequence when no tasks exist."""
        mock_s3_client.list_objects_v2.return_value = {}

        seq = get_next_task_sequence("test-bucket")

        assert seq == 1
        mock_s3_client.list_objects_v2.assert_called_once_with(
            Bucket="test-bucket",
            Prefix="ingest/"
        )

    def test_get_next_task_sequence_with_existing(self, mock_s3_client):
        """Test getting next sequence with existing tasks."""
        mock_s3_client.list_objects_v2.return_value = {
            "Contents": [
                {"Key": "ingest/TASK_0000001.json"},
                {"Key": "ingest/TASK_0000003.json"},
                {"Key": "ingest/TASK_0000002.json"},
            ]
        }

        seq = get_next_task_sequence("test-bucket")

        assert seq == 4

    def test_create_and_upload_task(self, mock_s3_client):
        """Test creating and uploading a task."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            ingest_key = create_and_upload_task(
                "test-bucket",
                "s3://test-bucket/raw/test.tiff",
                5,
                tmp_dir
            )

            assert ingest_key == "ingest/TASK_0000005.json"
            mock_s3_client.upload_file.assert_called_once()

            # Verify the JSON file was created correctly
            call_args = mock_s3_client.upload_file.call_args
            local_file = call_args[0][0]
            assert os.path.exists(local_file)

            with open(local_file, 'r') as f:
                content = json.load(f)
                assert content == [{"data": {"image": "s3://test-bucket/raw/test.tiff"}}]

    @patch('ingest_pipeline.main.get_next_task_sequence')
    @patch('ingest_pipeline.main.create_and_upload_task')
    def test_process_pdf_page(self, mock_create_task, mock_get_seq, mock_s3_client):
        """Test processing a single PDF page."""
        mock_get_seq.return_value = 1
        mock_create_task.return_value = "ingest/TASK_0000001.json"

        # Create a mock PIL Image
        mock_page = Mock()

        with tempfile.TemporaryDirectory() as tmp_dir:
            raw_key = process_pdf_page(
                mock_page,
                1,
                "test_document",
                "test-bucket",
                tmp_dir
            )

            assert raw_key == "raw/test_document_page001.tiff"
            mock_page.save.assert_called_once()
            mock_s3_client.upload_file.assert_called_once()

    @patch('ingest_pipeline.main.convert_from_path')
    @patch('ingest_pipeline.main.process_pdf_page')
    def test_process_pdf_multiple_pages(self, mock_process_page, mock_convert):
        """Test processing a PDF with multiple pages."""
        # Mock 3 pages
        mock_pages = [Mock() for _ in range(3)]
        mock_convert.return_value = mock_pages
        mock_process_page.side_effect = [
            "raw/doc_page001.tiff",
            "raw/doc_page002.tiff",
            "raw/doc_page003.tiff"
        ]

        with tempfile.TemporaryDirectory() as tmp_dir:
            keys = process_pdf("test.pdf", "doc", "test-bucket", tmp_dir)

            assert len(keys) == 3
            assert mock_process_page.call_count == 3

    @patch('ingest_pipeline.main.Image')
    @patch('ingest_pipeline.main.get_next_task_sequence')
    @patch('ingest_pipeline.main.create_and_upload_task')
    def test_process_tiff(self, mock_create_task, mock_get_seq, mock_image, mock_s3_client):
        """Test processing a TIFF file."""
        mock_get_seq.return_value = 1
        mock_create_task.return_value = "ingest/TASK_0000001.json"
        mock_img = Mock()
        mock_image.open.return_value = mock_img

        with tempfile.TemporaryDirectory() as tmp_dir:
            keys = process_tiff("test.tif", "test_image", "test-bucket", tmp_dir)

            assert keys == ["raw/test_image.tiff"]
            mock_img.save.assert_called_once()
            mock_s3_client.upload_file.assert_called_once()

    def test_move_original_to_raw(self, mock_s3_client):
        """Test moving original file to raw prefix."""
        raw_key = move_original_to_raw(
            "local_file.pdf",
            "test-bucket",
            "upload/original.pdf",
            "original",
            ".pdf"
        )

        assert raw_key == "raw/original.pdf"
        mock_s3_client.upload_file.assert_called_once_with(
            "local_file.pdf",
            "test-bucket",
            "raw/original.pdf"
        )
        mock_s3_client.delete_object.assert_called_once_with(
            Bucket="test-bucket",
            Key="upload/original.pdf"
        )

    @patch('ingest_pipeline.main.process_pdf')
    @patch('ingest_pipeline.main.move_original_to_raw')
    @patch('tempfile.TemporaryDirectory')
    def test_process_s3_record_pdf(self, mock_tempdir, mock_move, mock_process_pdf, mock_s3_client):
        """Test processing an S3 record for a PDF file."""
        # Setup mocks
        mock_temp = MagicMock()
        mock_temp.__enter__.return_value = "/tmp/test"
        mock_tempdir.return_value = mock_temp

        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.pdf"}
                    }
                }]
            })
        }

        process_s3_record(record)

        mock_s3_client.download_file.assert_called_once()
        mock_move.assert_called_once()
        mock_process_pdf.assert_called_once()

    def test_process_s3_record_skip_non_upload(self, mock_s3_client):
        """Test skipping files not in upload/ prefix."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "other/test.pdf"}
                    }
                }]
            })
        }

        process_s3_record(record)

        # Should not download or process
        mock_s3_client.download_file.assert_not_called()

    def test_process_s3_record_skip_unsupported_type(self, mock_s3_client):
        """Test skipping unsupported file types."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/test.jpg"}
                    }
                }]
            })
        }

        process_s3_record(record)

        # Should not download or process
        mock_s3_client.download_file.assert_not_called()

    @patch('ingest_pipeline.main.process_s3_record')
    def test_lambda_handler_success(self, mock_process_record, sample_sqs_event):
        """Test successful Lambda handler execution."""
        result = lambda_handler(sample_sqs_event, None)

        assert result == {"status": "success"}
        mock_process_record.assert_called_once()

    @patch('ingest_pipeline.main.process_s3_record')
    def test_lambda_handler_error_handling(self, mock_process_record, sample_sqs_event):
        """Test Lambda handler error handling."""
        mock_process_record.side_effect = Exception("Test error")

        # Should not raise exception, but handle it
        result = lambda_handler(sample_sqs_event, None)

        assert result == {"status": "success"}

    def test_url_decode_in_process_s3_record(self, mock_s3_client):
        """Test that URL-encoded keys are properly decoded."""
        record = {
            "body": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "test-bucket"},
                        "object": {"key": "upload/file+with+spaces.pdf"}
                    }
                }]
            })
        }

        with patch('ingest_pipeline.main.process_pdf'):
            with patch('ingest_pipeline.main.move_original_to_raw'):
                with patch('tempfile.TemporaryDirectory'):
                    process_s3_record(record)

        # Check that download was called with decoded key
        mock_s3_client.download_file.assert_called_once()
        call_args = mock_s3_client.download_file.call_args
        assert call_args[0][1] == "upload/file with spaces.pdf"


class TestIntegration:
    """Integration tests that test the full flow."""

    @patch('ingest_pipeline.main.s3')
    @patch('ingest_pipeline.main.convert_from_path')
    def test_full_pdf_processing_flow(self, mock_convert, mock_s3):
        """Test the complete flow of processing a PDF file."""
        # Setup mocks
        mock_s3.list_objects_v2.return_value = {}

        # Create mock pages
        mock_page1 = Mock()
        mock_page2 = Mock()
        mock_convert.return_value = [mock_page1, mock_page2]

        # Create test event
        event = {
            "Records": [{
                "body": json.dumps({
                    "Records": [{
                        "s3": {
                            "bucket": {"name": "test-bucket"},
                            "object": {"key": "upload/test_document.pdf"}
                        }
                    }]
                })
            }]
        }

        # Execute
        result = lambda_handler(event, None)

        # Verify
        assert result == {"status": "success"}

        # Should have downloaded the original
        assert mock_s3.download_file.call_count == 1

        # Should have uploaded: original + 2 TIFFs + 2 JSONs = 5 files
        assert mock_s3.upload_file.call_count == 5

        # Should have deleted the original
        assert mock_s3.delete_object.call_count == 1
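
The module under test, ingest_pipeline/main.py, is not part of this commit, so the assertions above are the only specification visible here. As a reading aid, here is a minimal sketch of two of the tested functions that would satisfy those assertions; the regex, the module-level s3 client, and the print-based error handling are assumptions inferred from the tests, not the actual implementation:

    import re
    import boto3

    # The tests patch 'ingest_pipeline.main.s3', implying a client created
    # at module import time, as sketched here.
    s3 = boto3.client("s3")

    # Hypothetical: the TASK_NNNNNNN pattern is inferred from the keys the
    # tests assert on (e.g. "ingest/TASK_0000005.json").
    _TASK_RE = re.compile(r"ingest/TASK_(\d+)\.json$")


    def get_next_task_sequence(bucket):
        """Return 1 + the highest task sequence under ingest/, or 1 if none."""
        resp = s3.list_objects_v2(Bucket=bucket, Prefix="ingest/")
        seqs = [
            int(m.group(1))
            for obj in resp.get("Contents", [])
            if (m := _TASK_RE.search(obj["Key"]))
        ]
        return max(seqs, default=0) + 1


    def lambda_handler(event, context):
        """Per test_lambda_handler_error_handling, per-record failures are
        swallowed rather than re-raised, and the handler still reports success."""
        for record in event.get("Records", []):
            try:
                # process_s3_record is defined elsewhere in the real module.
                process_s3_record(record)
            except Exception as exc:
                print(f"Error processing record: {exc}")
        return {"status": "success"}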
