Skip to content

Commit

Permalink
fix: fix detection of HTTP site when using the useState in adaptive…
Browse files Browse the repository at this point in the history
… crawler (#2530)

The crawler state provided by `useState` is now cloned before we run the
browser handler, and the old state is used for the HTTP-based rerun so that it
can produce the same results. Since we only do this for the reruns, we don't
need to care about persistence of the state; we only check the
detection outcome.
  • Loading branch information
B4nan committed Jun 11, 2024
1 parent 9a80f98 commit 7e195c1
Showing 1 changed file with 17 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,12 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`);
this.stats.trackBrowserRequestHandlerRun();

// Keep a copy of the `useState` value, we need to use the old state when trying the HTTP handler to have
// the same outcome. We don't need to care about its persistence, since we only run this for detection
// purposes. We read the value directly instead of using `useState` so there are no side effects.
const kvs = await crawlingContext.getKeyValueStore();
const oldState = await kvs.getValue(AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
const oldStateCopy = JSON.parse(JSON.stringify(oldState));
const browserRun = await this.runRequestHandlerInBrowser(crawlingContext);

if (!browserRun.ok) {
Expand All @@ -310,7 +316,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {

if (shouldDetectRenderingType) {
crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy);

const detectionResult: RenderingType = (() => {
if (!plainHTTPRun.ok) {
Expand Down Expand Up @@ -438,6 +444,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {

protected async runRequestHandlerWithPlainHTTP(
crawlingContext: PlaywrightCrawlingContext,
oldStateCopy?: Dictionary,
): Promise<Result<RequestHandlerResult>> {
const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
const logs: LogProxyCall[] = [];
Expand Down Expand Up @@ -490,7 +497,15 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
},
addRequests: result.addRequests,
pushData: result.pushData,
useState: this.allowStorageAccess(result.useState),
useState: async (defaultValue) => {
// return the old state before the browser handler was executed
// when rerunning the handler via HTTP for detection
if (oldStateCopy !== undefined) {
return oldStateCopy ?? defaultValue; // fallback to the default for `null`
}

return this.allowStorageAccess(result.useState)(defaultValue);
},
getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
}),
this.requestHandlerTimeoutInnerMillis,
Expand Down

0 comments on commit 7e195c1

Please sign in to comment.