11import json
22
3- from scrapling .core ._types import Callable , Dict , Optional , Union
3+ from scrapling .core ._types import (Callable , Dict , Optional ,
4+ SelectorWaitStates , Union )
45from scrapling .core .utils import log , lru_cache
56from scrapling .engines .constants import (DEFAULT_STEALTH_FLAGS ,
67 NSTBROWSER_DEFAULT_QUERY )
@@ -23,7 +24,7 @@ def __init__(
2324 page_action : Callable = None ,
2425 wait_selector : Optional [str ] = None ,
2526 locale : Optional [str ] = 'en-US' ,
26- wait_selector_state : Optional [ str ] = 'attached' ,
27+ wait_selector_state : SelectorWaitStates = 'attached' ,
2728 stealth : Optional [bool ] = False ,
2829 real_chrome : Optional [bool ] = False ,
2930 hide_canvas : Optional [bool ] = False ,
@@ -193,12 +194,21 @@ def fetch(self, url: str) -> Response:
193194 :param url: Target url.
194195 :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
195196 """
197+ from playwright .sync_api import Response as PlaywrightResponse
196198 if not self .stealth or self .real_chrome :
197199 # Because rebrowser_playwright doesn't play well with real browsers
198200 from playwright .sync_api import sync_playwright
199201 else :
200202 from rebrowser_playwright .sync_api import sync_playwright
201203
204+ # Store the final response
205+ final_response = None
206+
207+ def handle_response (finished_response : PlaywrightResponse ):
208+ nonlocal final_response
209+ if finished_response .request .resource_type == "document" :
210+ final_response = finished_response
211+
202212 with sync_playwright () as p :
203213 # Creating the browser
204214 if self .cdp_url :
@@ -212,6 +222,8 @@ def fetch(self, url: str) -> Response:
212222 page = context .new_page ()
213223 page .set_default_navigation_timeout (self .timeout )
214224 page .set_default_timeout (self .timeout )
225+ # Listen for all responses
226+ page .on ("response" , handle_response )
215227
216228 if self .extra_headers :
217229 page .set_extra_http_headers (self .extra_headers )
@@ -223,7 +235,7 @@ def fetch(self, url: str) -> Response:
223235 for script in self .__stealth_scripts ():
224236 page .add_init_script (path = script )
225237
226- res = page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
238+ first_response = page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
227239 page .wait_for_load_state (state = "domcontentloaded" )
228240 if self .network_idle :
229241 page .wait_for_load_state ('networkidle' )
@@ -240,21 +252,24 @@ def fetch(self, url: str) -> Response:
240252 if self .network_idle :
241253 page .wait_for_load_state ('networkidle' )
242254
255+ response_bytes = final_response .body () if final_response else page .content ().encode ('utf-8' )
256+ # In case we didn't catch a document type somehow
257+ final_response = final_response if final_response else first_response
243258 # This will be parsed inside `Response`
244- encoding = res .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
259+ encoding = final_response .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
245260 # Playwright API sometimes gives an empty status text for some reason!
246- status_text = res .status_text or StatusText .get (res .status )
261+ status_text = final_response .status_text or StatusText .get (final_response .status )
247262
248263 response = Response (
249- url = res .url ,
264+ url = final_response .url ,
250265 text = page .content (),
251- body = page . content (). encode ( 'utf-8' ) ,
252- status = res .status ,
266+ body = response_bytes ,
267+ status = final_response .status ,
253268 reason = status_text ,
254269 encoding = encoding ,
255270 cookies = {cookie ['name' ]: cookie ['value' ] for cookie in page .context .cookies ()},
256- headers = res .all_headers (),
257- request_headers = res .request .all_headers (),
271+ headers = final_response .all_headers (),
272+ request_headers = final_response .request .all_headers (),
258273 ** self .adaptor_arguments
259274 )
260275 page .close ()
@@ -266,12 +281,21 @@ async def async_fetch(self, url: str) -> Response:
266281 :param url: Target url.
267282 :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
268283 """
284+ from playwright .async_api import Response as PlaywrightResponse
269285 if not self .stealth or self .real_chrome :
270286 # Because rebrowser_playwright doesn't play well with real browsers
271287 from playwright .async_api import async_playwright
272288 else :
273289 from rebrowser_playwright .async_api import async_playwright
274290
291+ # Store the final response
292+ final_response = None
293+
294+ async def handle_response (finished_response : PlaywrightResponse ):
295+ nonlocal final_response
296+ if finished_response .request .resource_type == "document" :
297+ final_response = finished_response
298+
275299 async with async_playwright () as p :
276300 # Creating the browser
277301 if self .cdp_url :
@@ -285,6 +309,8 @@ async def async_fetch(self, url: str) -> Response:
285309 page = await context .new_page ()
286310 page .set_default_navigation_timeout (self .timeout )
287311 page .set_default_timeout (self .timeout )
312+ # Listen for all responses
313+ page .on ("response" , handle_response )
288314
289315 if self .extra_headers :
290316 await page .set_extra_http_headers (self .extra_headers )
@@ -296,7 +322,7 @@ async def async_fetch(self, url: str) -> Response:
296322 for script in self .__stealth_scripts ():
297323 await page .add_init_script (path = script )
298324
299- res = await page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
325+ first_response = await page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
300326 await page .wait_for_load_state (state = "domcontentloaded" )
301327 if self .network_idle :
302328 await page .wait_for_load_state ('networkidle' )
@@ -313,21 +339,24 @@ async def async_fetch(self, url: str) -> Response:
313339 if self .network_idle :
314340 await page .wait_for_load_state ('networkidle' )
315341
342+ response_bytes = await final_response .body () if final_response else (await page .content ()).encode ('utf-8' )
343+ # In case we didn't catch a document type somehow
344+ final_response = final_response if final_response else first_response
316345 # This will be parsed inside `Response`
317- encoding = res .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
346+ encoding = final_response .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
318347 # Playwright API sometimes gives an empty status text for some reason!
319- status_text = res .status_text or StatusText .get (res .status )
348+ status_text = final_response .status_text or StatusText .get (final_response .status )
320349
321350 response = Response (
322- url = res .url ,
351+ url = final_response .url ,
323352 text = await page .content (),
324- body = ( await page . content ()). encode ( 'utf-8' ) ,
325- status = res .status ,
353+ body = response_bytes ,
354+ status = final_response .status ,
326355 reason = status_text ,
327356 encoding = encoding ,
328357 cookies = {cookie ['name' ]: cookie ['value' ] for cookie in await page .context .cookies ()},
329- headers = await res .all_headers (),
330- request_headers = await res .request .all_headers (),
358+ headers = await final_response .all_headers (),
359+ request_headers = await final_response .request .all_headers (),
331360 ** self .adaptor_arguments
332361 )
333362 await page .close ()
0 commit comments