From 1d9405e57955004d180f277405a3c3559148f167 Mon Sep 17 00:00:00 2001 From: Matthieu Napoli Date: Tue, 4 Jan 2022 17:13:58 +0100 Subject: [PATCH] Flush FPM logs in case of timeouts When Lambda times out with the PHP-FPM layer, the logs written by the PHP script are never flushed to stderr by PHP-FPM. That means they never reach CloudWatch, which makes timeouts really hard to debug. With this change, Bref waits for the FPM response until 1 second before the actual Lambda timeout (via a connection timeout on the FastCGI connection). If Bref reaches that point, it will ask PHP-FPM to gracefully restart the PHP-FPM worker, which: - flushes the logs (logs end up in CloudWatch, which is great) - restarts a clean FPM worker, without doing a full FPM restart (which may take longer) Follow up of #770, #772, #895 May address some of #862 Note: this does not change anything for the Function layer (only affects FPM). Also this does not show a full stack track of the place in the code where the timeout happens (#895 did). Still it's an improvement over the current status. --- demo/http.php | 1 + serverless.yml | 10 +++++----- src/Event/Http/FastCgi/Timeout.php | 20 ++++++++++++++++++++ src/Event/Http/FpmHandler.php | 24 +++++++++++++++++++++++- tests/Handler/FpmHandlerTest.php | 24 ++++++++++++++++++++++-- tests/Handler/PhpFpm/timeout.php | 2 ++ tests/Runtime/LambdaRuntimeTest.php | 6 +++--- 7 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 src/Event/Http/FastCgi/Timeout.php diff --git a/demo/http.php b/demo/http.php index c0b35dd12..1e24d5367 100644 --- a/demo/http.php +++ b/demo/http.php @@ -3,6 +3,7 @@ require __DIR__ . '/../vendor/autoload.php'; if (isset($_GET['sleep'])) { + error_log('This is a log'); sleep(10); } diff --git a/serverless.yml b/serverless.yml index 2aebfbdfe..a54bed2e2 100644 --- a/serverless.yml +++ b/serverless.yml @@ -25,7 +25,7 @@ functions: handler: demo/function.php description: 'Bref CLI demo' layers: - - ${bref:layer.php-74} + - ${bref:layer.php-80} environment: BREF_LOOP_MAX: 100 @@ -34,7 +34,7 @@ functions: description: 'Bref HTTP demo' timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds) layers: - - ${bref:layer.php-74-fpm} + - ${bref:layer.php-80-fpm} events: - http: 'ANY /' @@ -43,7 +43,7 @@ functions: description: 'Bref HTTP demo with a PSR-7 handler' timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds) layers: - - ${bref:layer.php-74} + - ${bref:layer.php-80} events: - http: 'ANY /psr7' - httpApi: 'GET /psr7' @@ -55,7 +55,7 @@ functions: description: 'Bref HTTP demo' timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds) layers: - - ${bref:layer.php-74-fpm} + - ${bref:layer.php-80-fpm} events: - httpApi: '*' @@ -64,5 +64,5 @@ functions: description: 'Bref console command demo' timeout: 900 layers: - - ${bref:layer.php-74} + - ${bref:layer.php-80} - ${bref:layer.console} diff --git a/src/Event/Http/FastCgi/Timeout.php b/src/Event/Http/FastCgi/Timeout.php new file mode 100644 index 000000000..51ea2d650 --- /dev/null +++ b/src/Event/Http/FastCgi/Timeout.php @@ -0,0 +1,20 @@ +eventToFastCgiRequest($event, $context); + // The script will timeout 1 second before the remaining time + // to allow some time for Bref/PHP-FPM to recover and cleanup + $margin = 1000; + $timeoutDelayInMs = max(1000, $context->getRemainingTimeInMillis() - $margin); + try { - $response = $this->client->sendRequest($this->connection, $request); + $socketId = $this->client->sendAsyncRequest($this->connection, $request); + + $response = $this->client->readResponse($socketId, $timeoutDelayInMs); + } catch (TimedoutException $e) { + // Send a SIGUSR2 signal to PHP-FPM + // That causes FPM to reload all workers, which allows us to cleanly stop the FPM worker that is stuck in a timeout/waiting state. + // A (intentional) side-effect is that it causes all worker logs buffered by FPM to be written to stderr. + // Without that, all logs written by the PHP script are never written to stderr (and thus never logged to CloudWatch). + // This takes a bit of time (a few ms), but it's still faster than rebooting FPM entirely. + posix_kill($this->fpm->getPid(), SIGUSR2); + + // Throw an exception so that: + // - this is reported as a Lambda execution error ("error rate" metrics are accurate) + // - the CloudWatch logs correctly reflect that an execution error occurred + // - the 500 response is the same as if an exception happened in Bref + throw new Timeout($timeoutDelayInMs); } catch (Throwable $e) { printf( "Error communicating with PHP-FPM to read the HTTP response. A root cause of this can be that the Lambda (or PHP) timed out, for example when trying to connect to a remote API or database, if this happens continuously check for those! Bref will restart PHP-FPM now. Original exception message: %s %s\n", diff --git a/tests/Handler/FpmHandlerTest.php b/tests/Handler/FpmHandlerTest.php index 2568ae53d..3e0f95a3b 100644 --- a/tests/Handler/FpmHandlerTest.php +++ b/tests/Handler/FpmHandlerTest.php @@ -3,7 +3,7 @@ namespace Bref\Test\Handler; use Bref\Context\Context; -use Bref\Event\Http\FastCgi\FastCgiCommunicationFailed; +use Bref\Event\Http\FastCgi\Timeout; use Bref\Event\Http\FpmHandler; use Bref\Test\HttpRequestProxyTest; use DMS\PHPUnitExtensions\ArraySubset\ArraySubsetAsserts; @@ -1071,7 +1071,7 @@ public function test FPM timeouts are recovered from() 'httpMethod' => 'GET', ], $this->fakeContext); $this->fail('No exception was thrown'); - } catch (FastCgiCommunicationFailed $e) { + } catch (Timeout $e) { // PHP-FPM should work after that $statusCode = $this->fpm->handle([ 'version' => '1.0', @@ -1084,6 +1084,26 @@ public function test FPM timeouts are recovered from() } } + /** + * See https://github.com/brefphp/bref/issues/862 + */ + public function test worker logs are still written in case of a timeout() + { + $this->fpm = new FpmHandler(__DIR__ . '/PhpFpm/timeout.php', __DIR__ . '/PhpFpm/php-fpm.conf'); + $this->fpm->start(); + + try { + $this->fpm->handle([ + 'version' => '1.0', + 'httpMethod' => 'GET', + ], new Context('abc', time(), 'abc', 'abc')); + $this->fail('No exception was thrown'); + } catch (Timeout $e) { + $logs = ob_get_contents(); + self::assertStringContainsString('This is a log message', $logs); + } + } + /** * @see https://github.com/brefphp/bref/issues/316 */ diff --git a/tests/Handler/PhpFpm/timeout.php b/tests/Handler/PhpFpm/timeout.php index af608e3ee..8d476703e 100644 --- a/tests/Handler/PhpFpm/timeout.php +++ b/tests/Handler/PhpFpm/timeout.php @@ -1,3 +1,5 @@ expectExceptionMessage('Failed to fetch next Lambda invocation: The requested URL returned error: 404 Not Found'); + $this->expectExceptionMessage('Failed to fetch next Lambda invocation: The requested URL returned error: 404'); Server::enqueue([ new Response( // lambda event 404, // 404 instead of 200 @@ -177,9 +177,9 @@ public function test a wrong response from the runtime API turns the i // Check the lambda result contains the error message $error = json_decode((string) $eventFailureLog->getBody(), true); - $this->assertSame('Error while calling the Lambda runtime API: The requested URL returned error: 400 Bad Request', $error['errorMessage']); + $this->assertSame('Error while calling the Lambda runtime API: The requested URL returned error: 400', $error['errorMessage']); - $this->assertErrorInLogs('Exception', 'Error while calling the Lambda runtime API: The requested URL returned error: 400 Bad Request'); + $this->assertErrorInLogs('Exception', 'Error while calling the Lambda runtime API: The requested URL returned error: 400'); } public function test function results that cannot be encoded are reported as invocation errors()