From 7e195c17cf1d9beae7f6f068fe505f1334a3a5b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Tue, 11 Jun 2024 20:26:45 +0200 Subject: [PATCH] fix: fix detection of HTTP site when using the `useState` in adaptive crawler (#2530) The crawler state provided in `useState` is now cloned before we run the browser, and the old state is used for the HTTP-based rerun, so it can produce the same results. Since we only do this for the reruns, we don't need to care about persistence of the state; we only check for the detection outcome. --- .../internals/adaptive-playwright-crawler.ts | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 7f2f89b0dfc..305c41760b2 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -300,6 +300,12 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`); this.stats.trackBrowserRequestHandlerRun(); + // Keep a copy of the `useState` value; we need to use the old state when trying the HTTP handler to have + // the same outcome. We don't need to care about its persistence, since we only run this for detection + // purposes. We read the value directly instead of using `useState` so there are no side effects. 
+ const kvs = await crawlingContext.getKeyValueStore(); + const oldState = await kvs.getValue(AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); + const oldStateCopy = JSON.parse(JSON.stringify(oldState)); const browserRun = await this.runRequestHandlerInBrowser(crawlingContext); if (!browserRun.ok) { @@ -310,7 +316,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { if (shouldDetectRenderingType) { crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`); - const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext); + const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy); const detectionResult: RenderingType = (() => { if (!plainHTTPRun.ok) { @@ -438,6 +444,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { protected async runRequestHandlerWithPlainHTTP( crawlingContext: PlaywrightCrawlingContext, + oldStateCopy?: Dictionary, ): Promise> { const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); const logs: LogProxyCall[] = []; @@ -490,7 +497,15 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { }, addRequests: result.addRequests, pushData: result.pushData, - useState: this.allowStorageAccess(result.useState), + useState: async (defaultValue) => { + // return the old state before the browser handler was executed + // when rerunning the handler via HTTP for detection + if (oldStateCopy !== undefined) { + return oldStateCopy ?? defaultValue; // fallback to the default for `null` + } + + return this.allowStorageAccess(result.useState)(defaultValue); + }, getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), }), this.requestHandlerTimeoutInnerMillis,