Skip to content
This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit bb3ee5e

Browse files
authored
Merge pull request #59 from pageuppeople-opensource/feature/SP-145_retrieve-result-via-s3
[SP-145] Download dataset via S3
2 parents 57d7b6d + 2f72c60 commit bb3ee5e

File tree

1 file changed

+35
-19
lines changed

1 file changed

+35
-19
lines changed

rdl/data_sources/AWSLambdaDataSource.py

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def __init__(self, connection_string, logger=None):
2929
.split(AWSLambdaDataSource.CONNECTION_STRING_GROUP_SEPARATOR)
3030
)
3131
self.aws_lambda_client = boto3.client("lambda")
32+
self.aws_s3_client = boto3.client("s3")
3233

3334
@staticmethod
3435
def can_handle_connection_string(connection_string):
@@ -43,8 +44,9 @@ def get_connection_string_prefix():
4344
return AWSLambdaDataSource.CONNECTION_STRING_PREFIX
4445

4546
def get_table_info(self, table_config, last_known_sync_version):
46-
column_names, last_sync_version, sync_version, full_refresh_required, data_changed_since_last_sync \
47-
= self.__get_table_info(table_config, last_known_sync_version)
47+
column_names, last_sync_version, sync_version, full_refresh_required, data_changed_since_last_sync = self.__get_table_info(
48+
table_config, last_known_sync_version
49+
)
4850
columns_in_database = column_names
4951
change_tracking_info = ChangeTrackingInfo(
5052
last_sync_version=last_sync_version,
@@ -91,11 +93,13 @@ def __get_table_info(self, table_config, last_known_sync_version):
9193

9294
result = self.__invoke_lambda(pay_load)
9395

94-
return result["ColumnNames"], \
95-
result["LastSyncVersion"], \
96-
result["CurrentSyncVersion"], \
97-
result["FullRefreshRequired"], \
98-
result["DataChangedSinceLastSync"]
96+
return (
97+
result["ColumnNames"],
98+
result["LastSyncVersion"],
99+
result["CurrentSyncVersion"],
100+
result["FullRefreshRequired"],
101+
result["DataChangedSinceLastSync"],
102+
)
99103

100104
def __get_table_data(
101105
self,
@@ -116,23 +120,31 @@ def __get_table_data(
116120
"BatchSize": batch_config["size"],
117121
"LastSyncVersion": change_tracking_info.last_sync_version,
118122
"FullRefresh": full_refresh,
119-
"ColumnNames": list(map(lambda cfg: cfg['source_name'], columns_config)),
123+
"ColumnNames": list(
124+
map(lambda cfg: cfg["source_name"], columns_config)
125+
),
120126
"PrimaryKeyColumnNames": table_config["primary_keys"],
121127
"LastBatchPrimaryKeys": [
122-
{"Key": k, "Value": v} for k, v in batch_key_tracker.bookmarks.items()
128+
{"Key": k, "Value": v}
129+
for k, v in batch_key_tracker.bookmarks.items()
123130
],
124131
},
125132
}
126133

127134
result = self.__invoke_lambda(pay_load)
135+
command_result = self.aws_s3_client.get_object(
136+
Bucket=result["DataBucketName"], Key=result["DataKey"]
137+
)
128138

129-
return result["ColumnNames"], result["Data"]
139+
data = json.loads(command_result["Body"].read())
140+
141+
return result["ColumnNames"], data
130142

131143
def __get_data_frame(self, data: [[]], column_names: []):
132144
return pandas.DataFrame(data=data, columns=column_names)
133145

134146
def __invoke_lambda(self, pay_load):
135-
self.logger.debug('\nRequest being sent to Lambda:')
147+
self.logger.debug("\nRequest being sent to Lambda:")
136148
self.logger.debug(pay_load)
137149

138150
lambda_response = self.aws_lambda_client.invoke(
@@ -142,24 +154,28 @@ def __invoke_lambda(self, pay_load):
142154
Payload=json.dumps(pay_load).encode(),
143155
)
144156

145-
response_status_code = int(lambda_response['StatusCode'])
157+
response_status_code = int(lambda_response["StatusCode"])
146158
response_function_error = lambda_response.get("FunctionError")
147-
self.logger.debug('\nResponse received from Lambda:')
159+
self.logger.debug("\nResponse received from Lambda:")
148160
self.logger.debug(f'Response - StatusCode = "{response_status_code}"')
149161
self.logger.debug(f'Response - FunctionError = "{response_function_error}"')
150162

151-
response_payload = json.loads(lambda_response['Payload'].read())
163+
response_payload = json.loads(lambda_response["Payload"].read())
152164

153165
if response_status_code != 200 or response_function_error:
154-
self.logger.error(F'Error in response from aws lambda {self.connection_data["function"]}')
155-
self.logger.error(f'Response - Status Code = {response_status_code}')
156-
self.logger.error(f'Response - Error Function = {response_function_error}')
157-
self.logger.error(f'Response - Error Details:')
166+
self.logger.error(
167+
f'Error in response from aws lambda {self.connection_data["function"]}'
168+
)
169+
self.logger.error(f"Response - Status Code = {response_status_code}")
170+
self.logger.error(f"Response - Error Function = {response_function_error}")
171+
self.logger.error(f"Response - Error Details:")
158172
# the below is risky as it may contain actual data if this line is reached in case of a successful result
159173
# however, the same Payload field is used to return actual error details in case of real errors
160174
# i.e. StatusCode is 200 (since AWS could invoke the lambda)
161175
# BUT the lambda barfed with an error and therefore the FunctionError would not be None
162176
self.logger.error(response_payload)
163-
raise Exception('Error received when invoking AWS Lambda. See logs for further details.')
177+
raise Exception(
178+
"Error received when invoking AWS Lambda. See logs for further details."
179+
)
164180

165181
return response_payload

0 commit comments

Comments
 (0)