Skip to content

Commit

Permalink
Flush FPM logs in case of timeouts
Browse files Browse the repository at this point in the history
When Lambda times out with the PHP-FPM layer, the logs written by the PHP script are never flushed to stderr by PHP-FPM. That means they never reach CloudWatch, which makes timeouts really hard to debug.

With this change, Bref waits for the FPM response until 1 second before the actual Lambda timeout (via a connection timeout on the FastCGI connection).

If Bref reaches that point, it will ask PHP-FPM to gracefully restart the PHP-FPM worker, which:

- flushes the logs (logs end up in CloudWatch, which is great)
- restarts a clean FPM worker, without doing a full FPM restart (which may take longer)

Follow up of #770, #772, #895

May address some of #862

Note: this does not change anything for the Function layer (only affects FPM). Also this does not show a full stack track of the place in the code where the timeout happens (#895 did). Still it's an improvement over the current status.
  • Loading branch information
mnapoli committed Jan 4, 2022
1 parent 54c43ba commit 57cf99f
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 8 deletions.
1 change: 1 addition & 0 deletions demo/http.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require __DIR__ . '/../vendor/autoload.php';

if (isset($_GET['sleep'])) {
error_log('This is a log');
sleep(10);
}

Expand Down
10 changes: 5 additions & 5 deletions serverless.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ functions:
handler: demo/function.php
description: 'Bref CLI demo'
layers:
- ${bref:layer.php-74}
- ${bref:layer.php-80}
environment:
BREF_LOOP_MAX: 100

Expand All @@ -34,7 +34,7 @@ functions:
description: 'Bref HTTP demo'
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
layers:
- ${bref:layer.php-74-fpm}
- ${bref:layer.php-80-fpm}
events:
- http: 'ANY /'

Expand All @@ -43,7 +43,7 @@ functions:
description: 'Bref HTTP demo with a PSR-7 handler'
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
layers:
- ${bref:layer.php-74}
- ${bref:layer.php-80}
events:
- http: 'ANY /psr7'
- httpApi: 'GET /psr7'
Expand All @@ -55,7 +55,7 @@ functions:
description: 'Bref HTTP demo'
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
layers:
- ${bref:layer.php-74-fpm}
- ${bref:layer.php-80-fpm}
events:
- httpApi: '*'

Expand All @@ -64,5 +64,5 @@ functions:
description: 'Bref console command demo'
timeout: 900
layers:
- ${bref:layer.php-74}
- ${bref:layer.php-80}
- ${bref:layer.console}
20 changes: 20 additions & 0 deletions src/Event/Http/FastCgi/Timeout.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?php declare(strict_types=1);

namespace Bref\Event\Http\FastCgi;

/**
* There was a timeout while processing the PHP request
*
* @internal
*/
final class Timeout extends \Exception
{
public function __construct(int $taskTimeoutInMs)
{
$message = "The request timed out after $taskTimeoutInMs ms. "
. "Note: that duration may be lower than the Lambda timeout, don't be surprised, that is intentional. "
. "Indeed, Bref stops the PHP-FPM request *before* a hard Lambda timeout, because a hard timeout prevents all logs to be written to CloudWatch.";

parent::__construct($message);
}
}
24 changes: 23 additions & 1 deletion src/Event/Http/FpmHandler.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
use Bref\Context\Context;
use Bref\Event\Http\FastCgi\FastCgiCommunicationFailed;
use Bref\Event\Http\FastCgi\FastCgiRequest;
use Bref\Event\Http\FastCgi\Timeout;
use Exception;
use hollodotme\FastCGI\Client;
use hollodotme\FastCGI\Exceptions\TimedoutException;
use hollodotme\FastCGI\Interfaces\ProvidesRequestData;
use hollodotme\FastCGI\Interfaces\ProvidesResponseData;
use hollodotme\FastCGI\SocketConnections\UnixDomainSocket;
Expand Down Expand Up @@ -110,8 +112,28 @@ public function handleRequest(HttpRequestEvent $event, Context $context): HttpRe
{
$request = $this->eventToFastCgiRequest($event, $context);

// The script will timeout 1 second before the remaining time
// to allow some time for Bref/PHP-FPM to recover and cleanup
$margin = 1000;
$timeoutDelayInMs = max(1000, $context->getRemainingTimeInMillis() - $margin);

try {
$response = $this->client->sendRequest($this->connection, $request);
$socketId = $this->client->sendAsyncRequest($this->connection, $request);

$response = $this->client->readResponse($socketId, $timeoutDelayInMs);
} catch (TimedoutException $e) {
// Send a SIGUSR2 signal to PHP-FPM
// That causes FPM to reload all workers, which allows us to cleanly stop the FPM worker that is stuck in a timeout/waiting state.
// A (intentional) side-effect is that it causes all worker logs buffered by FPM to be written to stderr.
// Without that, all logs written by the PHP script are never written to stderr (and thus never logged to CloudWatch).
// This takes a bit of time (a few ms), but it's still faster than rebooting FPM entirely.
posix_kill($this->fpm->getPid(), SIGUSR2);

// Throw an exception so that:
// - this is reported as a Lambda execution error ("error rate" metrics are accurate)
// - the CloudWatch logs correctly reflect that an execution error occurred
// - the 500 response is the same as if an exception happened in Bref
throw new Timeout($timeoutDelayInMs);
} catch (Throwable $e) {
printf(
"Error communicating with PHP-FPM to read the HTTP response. A root cause of this can be that the Lambda (or PHP) timed out, for example when trying to connect to a remote API or database, if this happens continuously check for those! Bref will restart PHP-FPM now. Original exception message: %s %s\n",
Expand Down
24 changes: 22 additions & 2 deletions tests/Handler/FpmHandlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace Bref\Test\Handler;

use Bref\Context\Context;
use Bref\Event\Http\FastCgi\FastCgiCommunicationFailed;
use Bref\Event\Http\FastCgi\Timeout;
use Bref\Event\Http\FpmHandler;
use Bref\Test\HttpRequestProxyTest;
use DMS\PHPUnitExtensions\ArraySubset\ArraySubsetAsserts;
Expand Down Expand Up @@ -1071,7 +1071,7 @@ public function test FPM timeouts are recovered from()
'httpMethod' => 'GET',
], $this->fakeContext);
$this->fail('No exception was thrown');
} catch (FastCgiCommunicationFailed $e) {
} catch (Timeout $e) {
// PHP-FPM should work after that
$statusCode = $this->fpm->handle([
'version' => '1.0',
Expand All @@ -1084,6 +1084,26 @@ public function test FPM timeouts are recovered from()
}
}

/**
* See https://github.com/brefphp/bref/issues/862
*/
public function test worker logs are still written in case of a timeout()
{
$this->fpm = new FpmHandler(__DIR__ . '/PhpFpm/timeout.php', __DIR__ . '/PhpFpm/php-fpm.conf');
$this->fpm->start();

try {
$this->fpm->handle([
'version' => '1.0',
'httpMethod' => 'GET',
], new Context('abc', time(), 'abc', 'abc'));
$this->fail('No exception was thrown');
} catch (Timeout $e) {
$logs = ob_get_contents();
self::assertStringContainsString('This is a log message', $logs);
}
}

/**
* @see https://github.com/brefphp/bref/issues/316
*/
Expand Down
2 changes: 2 additions & 0 deletions tests/Handler/PhpFpm/timeout.php
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
<?php declare(strict_types=1);

error_log('This is a log message');

sleep((int) ($_GET['timeout'] ?? 10));

0 comments on commit 57cf99f

Please sign in to comment.