Skip to content

Commit 4ed01c7

Browse files
Fix: jigsawstack.web.ai_scrape params:
1. Types are updated to match the JS library and the source of truth, the zod schema in @jigsawstack/scrape.ts. 2. Updated the type cast to Dict[Any, Any].
1 parent 1e78667 commit 4ed01c7

File tree

1 file changed

+92
-22
lines changed

1 file changed

+92
-22
lines changed

jigsawstack/web.py

Lines changed: 92 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
from typing import Any, Dict, List, cast, Union
2-
from typing_extensions import NotRequired, TypedDict, Optional
1+
from typing import Any, Dict, List, Union, Optional, cast, Literal
2+
from typing_extensions import NotRequired, TypedDict
3+
34
from .request import Request, RequestConfig
45
from .async_request import AsyncRequest, AsyncRequestConfig
56
from ._config import ClientConfig
@@ -14,6 +15,9 @@
1415
from .helpers import build_path
1516

1617

18+
#
19+
# DNS
20+
#
1721
class DNSParams(TypedDict):
1822
domain: str
1923
type: NotRequired[str]
@@ -34,6 +38,9 @@ class DNSResponse(TypedDict):
3438
authority: List
3539

3640

41+
#
42+
# HTML to Any
43+
#
3744
class HTMLToAnyParams(TypedDict):
3845
html: str
3946
url: str
@@ -58,56 +65,117 @@ class HTMLToAnyResponse(TypedDict):
5865
html: str
5966

6067

68+
#
69+
# AI Scrape (request params, including BYO proxy, and response types)
70+
#
71+
class CookieParameter(TypedDict):
72+
name: str
73+
value: str
74+
url: NotRequired[str]
75+
domain: NotRequired[str]
76+
path: NotRequired[str]
77+
secure: NotRequired[bool]
78+
httpOnly: NotRequired[bool]
79+
sameSite: NotRequired[Literal["Strict", "Lax", "None"]]
80+
expires: NotRequired[bool]
81+
priority: NotRequired[str]
82+
sameParty: NotRequired[bool]
83+
84+
85+
class GotoOptions(TypedDict):
    """Value type of the ``goto_options`` key in ``BaseAIScrapeParams``.

    Both keys are required whenever this dict is supplied.
    """

    # NOTE(review): unit is not visible here (presumably milliseconds) - confirm.
    timeout: int
    # NOTE(review): presumably a page-load event name - confirm allowed values.
    wait_until: str
88+
89+
90+
class WaitFor(TypedDict):
    """Value type of the ``wait_for`` key in ``BaseAIScrapeParams``.

    Both keys are required whenever this dict is supplied.
    """

    # NOTE(review): presumably selects how ``value`` is interpreted - confirm.
    mode: str
    # Accepts either a string or an integer.
    value: Union[str, int]
93+
94+
95+
class AdvanceConfig(TypedDict):
    """Value type of the ``advance_config`` key in ``BaseAIScrapeParams``.

    All three boolean flags are required whenever this dict is supplied.
    NOTE(review): presumably each flag toggles capture of that kind of page
    data - confirm against the API documentation.
    """

    console: bool
    network: bool
    cookies: bool
99+
100+
61101
class BYOProxyAuth(TypedDict):
    """Credentials used as the ``auth`` value of ``BYOProxy``.

    Both keys are required whenever this dict is supplied.
    """

    username: str
    password: str
64104

65105

66106
class BYOProxy(TypedDict):
67107
server: str
68-
auth: BYOProxyAuth
108+
auth: NotRequired[BYOProxyAuth]
109+
69110

70111
class BaseAIScrapeParams(TypedDict):
71112
url: str
72-
advance_config: NotRequired[object]
113+
root_element_selectors: NotRequired[str]
114+
page_position: NotRequired[int]
115+
http_headers: NotRequired[Dict[str, Any]]
116+
reject_request_pattern: NotRequired[List[str]]
117+
goto_options: NotRequired[GotoOptions]
118+
wait_for: NotRequired[WaitFor]
119+
advance_config: NotRequired[AdvanceConfig]
73120
size_preset: NotRequired[str]
74121
is_mobile: NotRequired[bool]
75122
scale: NotRequired[int]
76123
width: NotRequired[int]
77124
height: NotRequired[int]
125+
cookies: NotRequired[List[CookieParameter]]
78126
force_rotate_proxy: NotRequired[bool]
79-
reject_request_pattern: NotRequired[List[str]]
80-
http_headers: NotRequired[object]
81-
goto_options: NotRequired[object]
82-
wait_for: NotRequired[object]
83-
cookies: NotRequired[object]
127+
byo_proxy: NotRequired[BYOProxy]
128+
129+
130+
class AIScrapeParamsWithSelector(BaseAIScrapeParams):
    """AI-scrape params variant in which ``selectors`` is mandatory.

    Inherits every key of ``BaseAIScrapeParams``; ``element_prompts`` may be
    omitted. This is one member of the ``AIScrapeParams`` union.
    """

    selectors: List[str]
    element_prompts: NotRequired[List[str]]
133+
84134

85135
class AIScrapeParamsWithPrompts(BaseAIScrapeParams):
86-
selector: Optional[List[str]]
136+
selectors: NotRequired[List[str]]
87137
element_prompts: List[str]
88138

89-
class AIScrapeParamsWithSelector(BaseAIScrapeParams):
90-
selector: List[str]
91-
element_prompts: Optional[List[str]]
92139

93140
AIScrapeParams = Union[AIScrapeParamsWithSelector, AIScrapeParamsWithPrompts]
94141

95-
class LinkData(TypedDict):
96-
type: str # "a" or "img"
97-
href: Optional[str]
142+
143+
class Attribute(TypedDict):
    """A name/value pair; element type of ``Result.attributes``."""

    name: str
    value: str
146+
147+
148+
class Result(TypedDict):
    """One entry of ``DataItem.results``.

    Carries ``html`` and ``text`` strings plus a list of ``Attribute`` pairs.
    NOTE(review): presumably one matched page element - confirm against the
    API response documentation.
    """

    html: str
    text: str
    attributes: List[Attribute]
152+
153+
154+
class DataItem(TypedDict):
    """Element type of ``AIScrapeResponse.data``."""

    key: str
    # NOTE(review): plural name but typed as a single ``str`` - confirm the
    # field name and type against the API response.
    selectors: str
    results: List[Result]
158+
159+
160+
class Link(TypedDict):
    """Element type of ``AIScrapeResponse.link``."""

    href: str
    # The key is always present, but its value may be ``None``.
    text: Optional[str]
    # Restricted to the two literal values "a" and "img".
    type: Literal["a", "img"]
99164

100165

101166
class AIScrapeResponse(TypedDict):
102167
success: bool
103-
data: List[Dict[str, Any]]
104-
selectors: List[str]
105-
context: Dict[str, List[str]]
106-
link: List[LinkData]
168+
data: List[DataItem]
107169
page_position: int
108170
page_position_length: int
171+
context: Dict[str, List[str]]
172+
selectors: Dict[str, List[str]]
173+
link: List[Link]
109174

110175

176+
#
177+
# Web Client
178+
#
111179
class Web(ClientConfig):
112180

113181
config: RequestConfig
@@ -130,7 +198,7 @@ def ai_scrape(self, params: AIScrapeParams) -> AIScrapeResponse:
130198
resp = Request(
131199
config=self.config,
132200
path=path,
133-
params=cast(AIScrapeParams, params),
201+
params=cast(Dict[Any, Any], params),
134202
verb="post",
135203
).perform_with_content()
136204
return resp
@@ -177,6 +245,9 @@ def search_suggestions(
177245
return s.suggestions(params)
178246

179247

248+
#
249+
# Async Web Client
250+
#
180251
class AsyncWeb(ClientConfig):
181252

182253
config: AsyncRequestConfig
@@ -204,7 +275,6 @@ async def ai_scrape(self, params: AIScrapeParams) -> AIScrapeResponse:
204275
).perform_with_content()
205276
return resp
206277

207-
208278
async def html_to_any(self, params: HTMLToAnyParams) -> Any:
209279
path = "/web/html_to_any"
210280
resp = await AsyncRequest(

0 commit comments

Comments
 (0)