From c2973463106d48998ca6d6114a2640723615bdee Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 9 Feb 2023 16:54:36 -0300 Subject: [PATCH 01/24] scrapy_playwright._utils module --- scrapy_playwright/_utils.py | 40 ++++++++++++++++++++++++++++++++++++ scrapy_playwright/handler.py | 40 ++---------------------------------- 2 files changed, 42 insertions(+), 38 deletions(-) create mode 100644 scrapy_playwright/_utils.py diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py new file mode 100644 index 00000000..ee4eb8a8 --- /dev/null +++ b/scrapy_playwright/_utils.py @@ -0,0 +1,40 @@ +from typing import Awaitable, Iterator, Tuple + +from scrapy.http.headers import Headers +from scrapy.utils.python import to_unicode +from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding + + +async def _maybe_await(obj): + if isinstance(obj, Awaitable): + return await obj + return obj + + +def _possible_encodings(headers: Headers, text: str) -> Iterator[str]: + if headers.get("content-type"): + content_type = to_unicode(headers["content-type"]) + yield http_content_type_encoding(content_type) + yield html_body_declared_encoding(text) + + +def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]: + for encoding in filter(None, _possible_encodings(headers, text)): + try: + body = text.encode(encoding) + except UnicodeEncodeError: + pass + else: + return body, encoding + return text.encode("utf-8"), "utf-8" # fallback + + +def _is_safe_close_error(error: Exception) -> bool: + """ + Taken verbatim from + https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238 + """ + message = str(error) + return message.endswith("Browser has been closed") or message.endswith( + "Target page, context or browser has been closed" + ) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 21de94aa..ec985e0a 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from ipaddress import ip_address from time import time -from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union +from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union from playwright.async_api import ( Browser, @@ -24,13 +24,12 @@ from scrapy.responsetypes import responsetypes from scrapy.utils.defer import deferred_from_coro from scrapy.utils.misc import load_object -from scrapy.utils.python import to_unicode from scrapy.utils.reactor import verify_installed_reactor from twisted.internet.defer import Deferred, inlineCallbacks -from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding from scrapy_playwright.headers import use_scrapy_headers from scrapy_playwright.page import PageMethod +from scrapy_playwright._utils import _encode_body, _is_safe_close_error, _maybe_await __all__ = ["ScrapyPlaywrightDownloadHandler"] @@ -550,12 +549,6 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) return _request_handler -async def _maybe_await(obj): - if isinstance(obj, Awaitable): - return await obj - return obj - - def _attach_page_event_handlers( page: Page, request: Request, spider: Spider, context_name: str ) -> None: @@ -645,32 +638,3 @@ async def _log_response(response: PlaywrightResponse) -> None: ) return _log_response - - -def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]: - if headers.get("content-type"): - content_type = 
to_unicode(headers["content-type"]) - yield http_content_type_encoding(content_type) - yield html_body_declared_encoding(text) - - -def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]: - for encoding in filter(None, _possible_encodings(headers, text)): - try: - body = text.encode(encoding) - except UnicodeEncodeError: - pass - else: - return body, encoding - return text.encode("utf-8"), "utf-8" # fallback - - -def _is_safe_close_error(error: Exception) -> bool: - """ - Taken verbatim from - https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238 - """ - message = str(error) - return message.endswith("Browser has been closed") or message.endswith( - "Target page, context or browser has been closed" - ) From 43710fb4fcf26b138902495648e905cc1b7a4faf Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 9 Feb 2023 17:01:09 -0300 Subject: [PATCH 02/24] PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL setting --- scrapy_playwright/_utils.py | 7 +++++ scrapy_playwright/handler.py | 51 +++++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index ee4eb8a8..8da26db6 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -1,3 +1,5 @@ +import asyncio + from typing import Awaitable, Iterator, Tuple from scrapy.http.headers import Headers @@ -5,6 +7,11 @@ from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding +async def _async_delay(coro: Awaitable, delay: int) -> None: + await asyncio.sleep(delay) + await coro + + async def _maybe_await(obj): if isinstance(obj, Awaitable): return await obj diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index ec985e0a..74b4c4ce 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -29,7 +29,7 @@ from scrapy_playwright.headers import use_scrapy_headers from scrapy_playwright.page import PageMethod -from scrapy_playwright._utils import _encode_body, _is_safe_close_error, _maybe_await +from scrapy_playwright._utils import _encode_body, _is_safe_close_error, _maybe_await, _async_delay __all__ = ["ScrapyPlaywrightDownloadHandler"] @@ -77,6 +77,14 @@ def __init__(self, crawler: Crawler) -> None: self.context_semaphore = asyncio.Semaphore( value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") ) + self.close_inactive_context_interval: Optional[int] = None + if "PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL" in settings: + with suppress(TypeError, ValueError): + self.close_inactive_context_interval = int( + settings.get("PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL") + ) + if self.close_inactive_context_interval < 0: + self.close_inactive_context_interval = None self.default_navigation_timeout: Optional[float] = None if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: @@ -161,6 +169,13 @@ async def _create_browser_context( extra={"spider": spider, "context_name": name, "persistent": persistent}, ) self.stats.inc_value("playwright/context_count") + if self.close_inactive_context_interval is not None: + asyncio.create_task( + _async_delay( + self._maybe_close_inactive_context(name, spider), + self.close_inactive_context_interval, + ) + ) if self.default_navigation_timeout is not None: context.set_default_navigation_timeout(self.default_navigation_timeout) self.context_wrappers[name] = BrowserContextWrapper( @@ -171,6 +186,39 @@ async def _create_browser_context( self._set_max_concurrent_context_count() return 
self.context_wrappers[name] + async def _maybe_close_inactive_context( + self, name: str, spider: Optional[Spider] = None + ) -> None: + """Close a context if it has no associated pages, + otherwise schedule a task to check again in the future. + """ + page_count = 0 + async with self.context_launch_lock: + ctx_wrapper = self.context_wrappers.get(name) + if ctx_wrapper: + page_count = len(ctx_wrapper.context.pages) + if not page_count: + logger.info( + "[Context=%s] Closing inactive context", + name, + extra={"spider": spider, "context_name": name}, + ) + await ctx_wrapper.context.close() + if page_count and self.close_inactive_context_interval: + logger.debug( + "[Context=%s] page count is %i, checking again in %i seconds", + name, + page_count, + self.close_inactive_context_interval, + extra={"spider": spider, "context_name": name, "context_page_count": page_count}, + ) + asyncio.create_task( + _async_delay( + self._maybe_close_inactive_context(name, spider), + self.close_inactive_context_interval, + ) + ) + async def _create_page(self, request: Request, spider: Spider) -> Page: """Create a new page in a context, also creating a new context if necessary.""" context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) @@ -257,6 +305,7 @@ def close(self) -> Deferred: yield deferred_from_coro(self._close()) async def _close(self) -> None: + logger.info("Closing %i contexts", len(self.context_wrappers)) await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()]) self.context_wrappers.clear() if hasattr(self, "browser"): From 92ad7f18d1b70bc07e9a1a8200948f3bef938b24 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 10 Feb 2023 00:17:42 -0300 Subject: [PATCH 03/24] Simplify logging --- scrapy_playwright/handler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 74b4c4ce..80542e70 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -204,12 +204,11 @@ async def _maybe_close_inactive_context( extra={"spider": spider, "context_name": name}, ) await ctx_wrapper.context.close() - if page_count and self.close_inactive_context_interval: + if page_count and self.close_inactive_context_interval is not None: logger.debug( - "[Context=%s] page count is %i, checking again in %i seconds", + "[Context=%s] Page count is %i, not closing context", name, page_count, - self.close_inactive_context_interval, extra={"spider": spider, "context_name": name, "context_page_count": page_count}, ) asyncio.create_task( From 863d2b460bcaa42343f836f819595b2aa26e5000 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 10 Feb 2023 00:18:06 -0300 Subject: [PATCH 04/24] Simplify setting processing --- scrapy_playwright/handler.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 80542e70..1c0e1074 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -77,14 +77,11 @@ def __init__(self, crawler: Crawler) -> None: self.context_semaphore = asyncio.Semaphore( value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") ) - self.close_inactive_context_interval: Optional[int] = None - if "PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL" in settings: - with suppress(TypeError, ValueError): - self.close_inactive_context_interval = int( - settings.get("PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL") - ) - if self.close_inactive_context_interval < 0: - 
self.close_inactive_context_interval = None
+        self.close_inactive_context_interval: Optional[int] = settings.getint(
+            "PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL"
+        )
+        if self.close_inactive_context_interval <= 0:
+            self.close_inactive_context_interval = None
 
         self.default_navigation_timeout: Optional[float] = None
         if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:

From bf4fbf2c3fa44b78a485b4286705dce866063f90 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 10 Feb 2023 00:36:04 -0300
Subject: [PATCH 05/24] Readme: add PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL

---
 README.md | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index bd3594e6..f8f26d01 100644
--- a/README.md
+++ b/README.md
@@ -240,6 +240,17 @@ See the [notes about leaving unclosed pages](#receiving-page-objects-in-callback
 PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4
 ```
 
+### `PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL`
+Type `Optional[int]`, default `None`
+
+Enables a periodic task to automatically close browser contexts which have no
+active pages. Set to the amount of seconds between checks. Set to `None` (the
+default) to disable the check, i.e. no contexts are automatically closed.
+
+```python
+PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL = 2 * 60  # 2 minutes
+```
+
 ### `PLAYWRIGHT_ABORT_REQUEST`
 Type `Optional[Union[Callable, str]]`, default `None`
 
@@ -563,6 +574,12 @@ yield scrapy.Request(
 Please note that if a context with the specified name already exists,
 that context is used and `playwright_context_kwargs` are ignored.
 
+### Automatically closing inactive contexts
+
+Specifying a positive integer value for the
+[`PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL`](#playwright_close_inactive_context_interval)
+setting enables a periodic task which closes browser contexts which have no active pages.
+
 ### Closing a context during a crawl
 
 After [receiving the Page object in your callback](#receiving-page-objects-in-callbacks),
@@ -595,10 +612,11 @@ async def close_context_on_error(self, failure):
 
 Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the amount
 of concurrent contexts. This setting should be used with caution: it's possible
 to block the whole crawl if contexts are not closed after they are no longer
-used (refer to the above section to dynamically close contexts). Make sure to
-define an errback to still be able to close the context even if there are
-errors with a request.
-
+used (refer to the above section to dynamically close contexts; see also the
+section about
+[automatically closing inactive contexts](#automatically-closing-inactive-contexts)).
+Make sure to define an errback to still be able to close the context even if
+there are errors with a request.
 ## Proxy support

From 95b1c5ed7308c4cd2d85ac1a83a3028b03ba0fa7 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 17 Feb 2023 16:49:16 -0300
Subject: [PATCH 06/24] Reading PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL setting

---
 scrapy_playwright/handler.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index 1c0e1074..bbad0c4c 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -77,11 +77,10 @@ def __init__(self, crawler: Crawler) -> None:
             self.context_semaphore = asyncio.Semaphore(
                 value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
             )
-        self.close_inactive_context_interval: Optional[int] = settings.getint(
-            "PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL"
-        )
-        if self.close_inactive_context_interval <= 0:
-            self.close_inactive_context_interval = None
+        self.close_inactive_context_interval: Optional[int] = None
+        close_context_interval = settings.getint("PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL")
+        if close_context_interval > 0:
+            self.close_inactive_context_interval = close_context_interval
 
         self.default_navigation_timeout: Optional[float] = None
         if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:

From a4ff8603ca0a206bd6503f4395caaa0ee1b33f40 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Sun, 19 Feb 2023 15:08:28 -0300
Subject: [PATCH 07/24] Rename setting

---
 README.md                    |  8 ++++----
 scrapy_playwright/_utils.py  | 14 ++++++++++++--
 scrapy_playwright/handler.py | 34 ++++++++++++++++----------------
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index bbd60e2d..2250a9f4 100644
--- a/README.md
+++ b/README.md
@@ -240,15 +240,15 @@ See the [notes about leaving unclosed pages](#receiving-page-objects-in-callback
 PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4
 ```
 
-### `PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL`
-Type `Optional[int]`, default `None`
+### `PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`
+Type `Optional[float]`, default `None`
 
 Enables a periodic task to automatically close browser contexts which have no
 active pages. Set to the amount of seconds between checks. Set to `None` (the
 default) to disable the check, i.e. no contexts are automatically closed.
 
 ```python
-PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL = 2 * 60  # 2 minutes
+PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL = 2 * 60  # 2 minutes
 ```
 
 ### `PLAYWRIGHT_ABORT_REQUEST`
@@ -578,7 +578,7 @@ that context is used and `playwright_context_kwargs` are ignored.
 
 ### Automatically closing inactive contexts
 
 Specifying a positive integer value for the
-[`PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL`](#playwright_close_inactive_context_interval)
+[`PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`](#playwright_close_context_interval)
 setting enables a periodic task which closes browser contexts which have no active pages.
### Closing a context during a crawl diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index 8da26db6..7c4f2919 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -1,13 +1,15 @@ import asyncio +from contextlib import suppress -from typing import Awaitable, Iterator, Tuple +from typing import Awaitable, Iterator, Optional, Tuple from scrapy.http.headers import Headers +from scrapy.settings import Settings from scrapy.utils.python import to_unicode from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding -async def _async_delay(coro: Awaitable, delay: int) -> None: +async def _async_delay(coro: Awaitable, delay: float) -> None: await asyncio.sleep(delay) await coro @@ -45,3 +47,11 @@ def _is_safe_close_error(error: Exception) -> bool: return message.endswith("Browser has been closed") or message.endswith( "Target page, context or browser has been closed" ) + + +def _read_float_setting(settings: Settings, key: str) -> Optional[float]: + try: + return float(settings[key]) + except (KeyError, TypeError, ValueError): + pass + return None diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index bbad0c4c..73ed5154 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -29,7 +29,13 @@ from scrapy_playwright.headers import use_scrapy_headers from scrapy_playwright.page import PageMethod -from scrapy_playwright._utils import _encode_body, _is_safe_close_error, _maybe_await, _async_delay +from scrapy_playwright._utils import ( + _async_delay, + _encode_body, + _is_safe_close_error, + _maybe_await, + _read_float_setting, +) __all__ = ["ScrapyPlaywrightDownloadHandler"] @@ -77,17 +83,13 @@ def __init__(self, crawler: Crawler) -> None: self.context_semaphore = asyncio.Semaphore( value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") ) - self.close_inactive_context_interval: Optional[int] = None - close_context_interval = settings.getint("PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL") - if close_context_interval > 0: - self.close_inactive_context_interval = close_context_interval - - self.default_navigation_timeout: Optional[float] = None - if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: - with suppress(TypeError, ValueError): - self.default_navigation_timeout = float( - settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") - ) + self.close_context_interval: Optional[float] = _read_float_setting( + settings, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL" + ) + + self.default_navigation_timeout: Optional[float] = _read_float_setting( + settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" + ) # headers if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings: @@ -165,11 +167,11 @@ async def _create_browser_context( extra={"spider": spider, "context_name": name, "persistent": persistent}, ) self.stats.inc_value("playwright/context_count") - if self.close_inactive_context_interval is not None: + if self.close_context_interval is not None: asyncio.create_task( _async_delay( self._maybe_close_inactive_context(name, spider), - self.close_inactive_context_interval, + self.close_context_interval, ) ) if self.default_navigation_timeout is not None: @@ -200,7 +202,7 @@ async def _maybe_close_inactive_context( extra={"spider": spider, "context_name": name}, ) await ctx_wrapper.context.close() - if page_count and self.close_inactive_context_interval is not None: + if page_count and self.close_context_interval is not None: logger.debug( "[Context=%s] Page count is %i, not closing context", name, @@ -210,7 +212,7 @@ 
async def _maybe_close_inactive_context( asyncio.create_task( _async_delay( self._maybe_close_inactive_context(name, spider), - self.close_inactive_context_interval, + self.close_context_interval, ) ) From bd17f9ce820bfb6ebeb09a9c647e0c8f2613e221 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 15:27:21 -0300 Subject: [PATCH 08/24] Test closing inactive contexts --- tests/test_browser_contexts.py | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 91848898..5d24af1e 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -1,4 +1,5 @@ import asyncio +import logging import platform import tempfile from pathlib import Path @@ -244,6 +245,44 @@ async def test_contexts_dynamic(self): assert cookie["value"] == "qwerty" assert cookie["domain"] == "example.org" + @pytest.mark.asyncio + async def test_close_inactive_context(self, caplog): + caplog.set_level(logging.DEBUG) + spider = Spider("foo") + async with make_handler( + { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL": 0.5, + "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1, + } + ) as handler: + assert len(handler.context_wrappers) == 0 + with StaticMockServer() as server: + await asyncio.gather( + *[ + handler._download_request( + Request( + server.urljoin(f"/index.html?a={i}"), meta={"playwright": True} + ), + spider, + ) + for i in range(5) + ] + ) + assert len(handler.context_wrappers) == 1 + assert ( + "scrapy-playwright", + logging.DEBUG, + "[Context=default] Page count is 1, not closing context", + ) in caplog.record_tuples + await asyncio.sleep(0.3) + assert len(handler.context_wrappers) == 0 + assert ( + "scrapy-playwright", + logging.INFO, + "[Context=default] Closing inactive context", + ) in caplog.record_tuples + class TestCaseMultipleContextsChromium(MixinTestCaseMultipleContexts): browser_type = "chromium" From ef0be70c0aece62464a6eef32d92c8e5766daafc Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 15:35:40 -0300 Subject: [PATCH 09/24] Remove unused import --- scrapy_playwright/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index 7c4f2919..64cb1780 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -1,5 +1,4 @@ import asyncio -from contextlib import suppress from typing import Awaitable, Iterator, Optional, Tuple From e18cd704b99d410d9e50405bd29d81c1e0a9b8f4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 16:25:51 -0300 Subject: [PATCH 10/24] Rename context lock --- scrapy_playwright/handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 73ed5154..d3d13cf9 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -76,7 +76,7 @@ def __init__(self, crawler: Crawler) -> None: self.max_pages_per_context: int = settings.getint( "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" ) or settings.getint("CONCURRENT_REQUESTS") - self.context_launch_lock = asyncio.Lock() + self.context_lock = asyncio.Lock() self.context_wrappers: Dict[str, BrowserContextWrapper] = {} self.startup_context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS") if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"): @@ -191,7 +191,7 @@ async def _maybe_close_inactive_context( otherwise schedule a task to check again in the future. 
""" page_count = 0 - async with self.context_launch_lock: + async with self.context_lock: ctx_wrapper = self.context_wrappers.get(name) if ctx_wrapper: page_count = len(ctx_wrapper.context.pages) @@ -221,7 +221,7 @@ async def _create_page(self, request: Request, spider: Spider) -> Page: context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) # this block needs to be locked because several attempts to launch a context # with the same name could happen at the same time from different requests - async with self.context_launch_lock: + async with self.context_lock: ctx_wrapper = self.context_wrappers.get(context_name) if ctx_wrapper is None: ctx_wrapper = await self._create_browser_context( From f14e751ca1609498c23ab009261db4dd0d85b085 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 16:54:53 -0300 Subject: [PATCH 11/24] Update close inactive context test --- tests/test_browser_contexts.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 5d24af1e..e2d12f87 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -11,7 +11,7 @@ from scrapy_playwright.page import PageMethod from tests import make_handler -from tests.mockserver import StaticMockServer +from tests.mockserver import MockServer, StaticMockServer class MixinTestCaseMultipleContexts: @@ -253,21 +253,16 @@ async def test_close_inactive_context(self, caplog): { "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL": 0.5, - "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1, } ) as handler: assert len(handler.context_wrappers) == 0 - with StaticMockServer() as server: - await asyncio.gather( - *[ - handler._download_request( - Request( - server.urljoin(f"/index.html?a={i}"), meta={"playwright": True} - ), - spider, - ) - for i in range(5) - ] + with MockServer() as server: + await handler._download_request( + Request( + url=server.urljoin("/delay/1"), + meta={"playwright": True}, + ), + spider, ) assert len(handler.context_wrappers) == 1 assert ( @@ -275,7 +270,7 @@ async def test_close_inactive_context(self, caplog): logging.DEBUG, "[Context=default] Page count is 1, not closing context", ) in caplog.record_tuples - await asyncio.sleep(0.3) + await asyncio.sleep(0.5) assert len(handler.context_wrappers) == 0 assert ( "scrapy-playwright", From 690a207e93d26c3a12de6652637d1c4d1a4a90cc Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 20:19:24 -0300 Subject: [PATCH 12/24] Waiting event --- README.md | 17 ++++++--- scrapy_playwright/handler.py | 69 ++++++++++++++++------------------ tests/test_browser_contexts.py | 22 +++++------ 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 2250a9f4..f244ed36 100644 --- a/README.md +++ b/README.md @@ -243,12 +243,14 @@ PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4 ### `PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL` Type `Optional[float]`, default `None` -Enables a period task to automatically close browser contexts which have no -active pages. Set to the amount of seconds between checks. Set to `None` (the -default) to disable the check, i.e. no contexts are automaticallly closed. +**_Experimental_** + +If set to a non-zero value, browser contexts will be automatically closed after +spending the specified amount of seconds without open pages. Set to `None` +(the default) to disable, i.e. contexts remain open until explicitly closed. 
 ```python
-PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL = 2 * 60  # 2 minutes
+PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL = 5 * 60  # 5 minutes
 ```
 
 ### `PLAYWRIGHT_ABORT_REQUEST`
@@ -579,7 +581,7 @@ that context is used and `playwright_context_kwargs` are ignored.
 
 Specifying a positive integer value for the
 [`PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`](#playwright_close_context_interval)
-setting enables a periodic task which closes browser contexts which have no active pages.
+setting enables closing browser contexts which have no active pages.
 
 ### Closing a context during a crawl
 
@@ -940,3 +942,8 @@ Deprecated features will be supported for at least six months following the
 release that deprecated them. After that, they may be removed at any time. See
 the [changelog](docs/changelog.md) for more information about deprecations
 and removals.
+
+
+## Experimental features
+
+Features marked as "experimental" might be modified and/or removed at any time.
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index d3d13cf9..2239f14d 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -55,8 +55,10 @@
 @dataclass
 class BrowserContextWrapper:
     context: BrowserContext
-    semaphore: asyncio.Semaphore
     persistent: bool
+    semaphore: asyncio.Semaphore  # limit amount of pages
+    inactive: asyncio.Event
+    waiting_close: asyncio.Event
 
 
 class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
@@ -167,54 +169,43 @@ async def _create_browser_context(
             extra={"spider": spider, "context_name": name, "persistent": persistent},
         )
         self.stats.inc_value("playwright/context_count")
         if self.default_navigation_timeout is not None:
             context.set_default_navigation_timeout(self.default_navigation_timeout)
         self.context_wrappers[name] = BrowserContextWrapper(
             context=context,
-            semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
             persistent=persistent,
+            semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
+            inactive=asyncio.Event(),
+            waiting_close=asyncio.Event(),
         )
+        if self.close_context_interval is not None:
+            asyncio.create_task(
+                self._maybe_close_inactive_context(
+                    context_name=name, context_wrapper=self.context_wrappers[name], spider=spider
+                )
+            )
         self._set_max_concurrent_context_count()
         return self.context_wrappers[name]
 
     async def _maybe_close_inactive_context(
-        self, name: str, spider: Optional[Spider] = None
+        self,
+        context_name: str,
+        context_wrapper: BrowserContextWrapper,
+        spider: Optional[Spider] = None,
     ) -> None:
-        """Close a context if it has no associated pages,
-        otherwise schedule a task to check again in the future.
- """ - page_count = 0 - async with self.context_lock: - ctx_wrapper = self.context_wrappers.get(name) - if ctx_wrapper: - page_count = len(ctx_wrapper.context.pages) - if not page_count: - logger.info( - "[Context=%s] Closing inactive context", - name, - extra={"spider": spider, "context_name": name}, - ) - await ctx_wrapper.context.close() - if page_count and self.close_context_interval is not None: - logger.debug( - "[Context=%s] Page count is %i, not closing context", - name, - page_count, - extra={"spider": spider, "context_name": name, "context_page_count": page_count}, - ) - asyncio.create_task( - _async_delay( - self._maybe_close_inactive_context(name, spider), - self.close_context_interval, + """Close a context if it has had no pages for a certain amount of time.""" + while True: + await context_wrapper.inactive.wait() + context_wrapper.waiting_close.set() + await asyncio.sleep(self.close_context_interval) # type: ignore [arg-type] + if context_wrapper.waiting_close.is_set() and not context_wrapper.context.pages: + logger.info( + "[Context=%s] Closing inactive browser context", + context_name, + extra={"spider": spider, "context_name": context_name}, ) - ) + await context_wrapper.context.close() + break async def _create_page(self, request: Request, spider: Spider) -> Page: """Create a new page in a context, also creating a new context if necessary.""" @@ -229,6 +220,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page: ) await ctx_wrapper.semaphore.acquire() + ctx_wrapper.inactive.clear() + ctx_wrapper.waiting_close.clear() page = await ctx_wrapper.context.new_page() self.stats.inc_value("playwright/page_count") total_page_count = self._get_total_page_count() @@ -476,6 +469,8 @@ def _make_close_page_callback(self, context_name: str) -> Callable: def close_page_callback() -> None: if context_name in self.context_wrappers: self.context_wrappers[context_name].semaphore.release() + if not self.context_wrappers[context_name].context.pages: + self.context_wrappers[context_name].inactive.set() return close_page_callback diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index e2d12f87..879a2d48 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -247,6 +247,9 @@ async def test_contexts_dynamic(self): @pytest.mark.asyncio async def test_close_inactive_context(self, caplog): + # if self.browser_type == "firefox": + # pytest.skip("Gets stuck in Firefox") + caplog.set_level(logging.DEBUG) spider = Spider("foo") async with make_handler( @@ -258,24 +261,19 @@ async def test_close_inactive_context(self, caplog): assert len(handler.context_wrappers) == 0 with MockServer() as server: await handler._download_request( - Request( - url=server.urljoin("/delay/1"), - meta={"playwright": True}, - ), - spider, + Request(server.urljoin("/headers"), meta={"playwright": True}), spider ) assert len(handler.context_wrappers) == 1 - assert ( - "scrapy-playwright", - logging.DEBUG, - "[Context=default] Page count is 1, not closing context", - ) in caplog.record_tuples - await asyncio.sleep(0.5) + await asyncio.sleep(0.3) + await handler._download_request( + Request(server.urljoin("/delay/1"), meta={"playwright": True}), spider + ) + await asyncio.sleep(0.7) assert len(handler.context_wrappers) == 0 assert ( "scrapy-playwright", logging.INFO, - "[Context=default] Closing inactive context", + "[Context=default] Closing inactive browser context", ) in caplog.record_tuples From 2f543af5fe034083881c30d55a1e197b1b0932a2 Mon Sep 17 
00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 20:37:35 -0300 Subject: [PATCH 13/24] Removed unused _async_delay helper --- scrapy_playwright/_utils.py | 7 ------- scrapy_playwright/handler.py | 1 - 2 files changed, 8 deletions(-) diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index 64cb1780..fc9a61e3 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -1,5 +1,3 @@ -import asyncio - from typing import Awaitable, Iterator, Optional, Tuple from scrapy.http.headers import Headers @@ -8,11 +6,6 @@ from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding -async def _async_delay(coro: Awaitable, delay: float) -> None: - await asyncio.sleep(delay) - await coro - - async def _maybe_await(obj): if isinstance(obj, Awaitable): return await obj diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 2239f14d..362cc8a1 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -30,7 +30,6 @@ from scrapy_playwright.headers import use_scrapy_headers from scrapy_playwright.page import PageMethod from scrapy_playwright._utils import ( - _async_delay, _encode_body, _is_safe_close_error, _maybe_await, From 6fdc22905d49eff068832515fb4c6e23440ab8a3 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 19 Feb 2023 20:41:57 -0300 Subject: [PATCH 14/24] Remove irrelevant comment in tests --- tests/test_browser_contexts.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 879a2d48..ea4183eb 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -247,9 +247,6 @@ async def test_contexts_dynamic(self): @pytest.mark.asyncio async def test_close_inactive_context(self, caplog): - # if self.browser_type == "firefox": - # pytest.skip("Gets stuck in Firefox") - caplog.set_level(logging.DEBUG) spider = Spider("foo") async with make_handler( From 5acff74a45ea9016e5781a15a683aa3fa9df4f19 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 13 Jul 2023 17:43:50 -0300 Subject: [PATCH 15/24] Only attempt to close context if it's still working --- scrapy_playwright/handler.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 362cc8a1..e8d4b4c6 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -178,30 +178,24 @@ async def _create_browser_context( waiting_close=asyncio.Event(), ) if self.close_context_interval is not None: - asyncio.create_task( - self._maybe_close_inactive_context( - context_name=name, context_wrapper=self.context_wrappers[name], spider=spider - ) - ) + asyncio.create_task(self._maybe_close_inactive_context(name=name, spider=spider)) self._set_max_concurrent_context_count() return self.context_wrappers[name] async def _maybe_close_inactive_context( - self, - context_name: str, - context_wrapper: BrowserContextWrapper, - spider: Optional[Spider] = None, + self, name: str, spider: Optional[Spider] = None ) -> None: """Close a context if it has had no pages for a certain amount of time.""" - while True: + while name in self.context_wrappers: + context_wrapper = self.context_wrappers[name] await context_wrapper.inactive.wait() context_wrapper.waiting_close.set() await asyncio.sleep(self.close_context_interval) # type: ignore [arg-type] if context_wrapper.waiting_close.is_set() and not context_wrapper.context.pages: logger.info( "[Context=%s] 
Closing inactive browser context", - context_name, - extra={"spider": spider, "context_name": context_name}, + name, + extra={"spider": spider, "context_name": name}, ) await context_wrapper.context.close() break From 65104dbf9d03d4885c5ec6c450a5ba23e21ac216 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 15 Aug 2023 23:59:09 -0300 Subject: [PATCH 16/24] Adapt test for caplog fixture as instance attribute --- tests/tests_asyncio/test_browser_contexts.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/tests_asyncio/test_browser_contexts.py b/tests/tests_asyncio/test_browser_contexts.py index 80c39056..1d7462fe 100644 --- a/tests/tests_asyncio/test_browser_contexts.py +++ b/tests/tests_asyncio/test_browser_contexts.py @@ -16,6 +16,11 @@ class MixinTestCaseMultipleContexts: + @pytest.fixture(autouse=True) + def inject_fixtures(self, caplog): + caplog.set_level(logging.DEBUG) + self._caplog = caplog + @pytest.mark.asyncio async def test_contexts_max_settings(self): settings = { @@ -247,8 +252,7 @@ async def test_contexts_dynamic(self): assert cookie["domain"] == "example.org" @pytest.mark.asyncio - async def test_close_inactive_context(self, caplog): - caplog.set_level(logging.DEBUG) + async def test_close_inactive_context(self): spider = Spider("foo") async with make_handler( { @@ -272,7 +276,7 @@ async def test_close_inactive_context(self, caplog): "scrapy-playwright", logging.INFO, "[Context=default] Closing inactive browser context", - ) in caplog.record_tuples + ) in self._caplog.record_tuples class TestCaseMultipleContextsChromium(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts): From 01649ce9db229b5bf8106ae45af817400690491f Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 24 Aug 2023 15:36:42 -0300 Subject: [PATCH 17/24] Rename util method, add tests --- scrapy_playwright/_utils.py | 7 ++--- scrapy_playwright/handler.py | 6 ++-- tests/tests_asyncio/test_utils.py | 46 ++++++++++++++++++++++++------- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index 66a73963..e668c74f 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -82,12 +82,11 @@ async def _get_page_content( raise -def _read_float_setting(settings: Settings, key: str) -> Optional[float]: +def _get_float_setting(settings: Settings, key: str) -> Optional[float]: try: return float(settings[key]) - except (KeyError, TypeError, ValueError): - pass - return None + except: + return None async def _get_header_value( diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 9f991497..9561f965 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -36,7 +36,7 @@ _get_page_content, _is_safe_close_error, _maybe_await, - _read_float_setting, + _get_float_setting, ) @@ -87,11 +87,11 @@ def __init__(self, crawler: Crawler) -> None: self.context_semaphore = asyncio.Semaphore( value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") ) - self.close_context_interval: Optional[float] = _read_float_setting( + self.close_context_interval: Optional[float] = _get_float_setting( settings, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL" ) - self.default_navigation_timeout: Optional[float] = _read_float_setting( + self.default_navigation_timeout: Optional[float] = _get_float_setting( settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" ) diff --git a/tests/tests_asyncio/test_utils.py b/tests/tests_asyncio/test_utils.py index d3535172..3e11ab8a 100644 --- 
a/tests/tests_asyncio/test_utils.py +++ b/tests/tests_asyncio/test_utils.py @@ -1,4 +1,5 @@ import logging +from decimal import Decimal from unittest import IsolatedAsyncioTestCase from unittest.mock import AsyncMock @@ -6,9 +7,11 @@ from playwright.async_api import Error as PlaywrightError from scrapy import Spider from scrapy.http.headers import Headers +from scrapy.settings import Settings from scrapy_playwright._utils import ( _NAVIGATION_ERROR_MSG, _encode_body, + _get_float_setting, _get_header_value, _get_page_content, _maybe_await, @@ -129,20 +132,19 @@ async def test_encode_mismatch(self): class TestHeaderValue(IsolatedAsyncioTestCase): @pytest.mark.asyncio - async def test_get_header_ok(self): + async def test_get_header_value(self): async def _identity(x): return x - resource = AsyncMock() - resource.header_value = _identity - assert "asdf" == await _get_header_value(resource, "asdf") - assert "qwerty" == await _get_header_value(resource, "qwerty") + res1 = AsyncMock() + res1.header_value = _identity + assert "asdf" == await _get_header_value(res1, "asdf") + assert "qwerty" == await _get_header_value(res1, "qwerty") - async def test_get_header_exception(self): - resource = AsyncMock() - resource.header_value.side_effect = Exception("nope") - assert await _get_header_value(resource, "asdf") is None - assert await _get_header_value(resource, "qwerty") is None + res2 = AsyncMock() + res2.header_value.side_effect = Exception("nope") + assert await _get_header_value(res2, "asdf") is None + assert await _get_header_value(res2, "qwerty") is None class TestMaybeAwait(IsolatedAsyncioTestCase): @@ -157,3 +159,27 @@ async def _awaitable_identity(x): assert await _maybe_await("foo") == "foo" assert await _maybe_await("bar") == "bar" assert await _maybe_await(1234) == 1234 + + +class TestGetFloatSetting(IsolatedAsyncioTestCase): + @pytest.mark.asyncio + async def test_get_float_setting(self): + settings = Settings( + { + "FLOAT": 1.5, + "DECIMAL": Decimal("2.5"), + "INT": 3, + "NUMERIC_STRING": "123", + "NON_NUMERIC_STRING": "asdf", + "NONE": None, + "LIST": [1, 2, 3], + } + ) + assert _get_float_setting(settings, "FLOAT") == 1.5 + assert _get_float_setting(settings, "DECIMAL") == 2.5 + assert _get_float_setting(settings, "INT") == 3.0 + assert _get_float_setting(settings, "NUMERIC_STRING") == 123 + assert _get_float_setting(settings, "NON_NUMERIC_STRING") is None + assert _get_float_setting(settings, "NONE") is None + assert _get_float_setting(settings, "LIST") is None + assert _get_float_setting(settings, "MISSING_KEY") is None From a609ae50b04b1aa994031558195f6be4599fcf44 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 25 Aug 2023 20:04:44 -0300 Subject: [PATCH 18/24] except Exception --- scrapy_playwright/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py index e668c74f..6f5c544f 100644 --- a/scrapy_playwright/_utils.py +++ b/scrapy_playwright/_utils.py @@ -85,7 +85,7 @@ async def _get_page_content( def _get_float_setting(settings: Settings, key: str) -> Optional[float]: try: return float(settings[key]) - except: + except Exception: return None From a1a040d40210d2f9aabf1b2c6f0b8ab0ae4869d4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 29 Aug 2023 16:47:41 -0300 Subject: [PATCH 19/24] Test zero value --- tests/tests_asyncio/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_asyncio/test_utils.py b/tests/tests_asyncio/test_utils.py index 3e11ab8a..e51b61d6 
100644 --- a/tests/tests_asyncio/test_utils.py +++ b/tests/tests_asyncio/test_utils.py @@ -166,6 +166,7 @@ class TestGetFloatSetting(IsolatedAsyncioTestCase): async def test_get_float_setting(self): settings = Settings( { + "ZERO": 0, "FLOAT": 1.5, "DECIMAL": Decimal("2.5"), "INT": 3, @@ -175,6 +176,7 @@ async def test_get_float_setting(self): "LIST": [1, 2, 3], } ) + assert _get_float_setting(settings, "ZERO") == 0.0 assert _get_float_setting(settings, "FLOAT") == 1.5 assert _get_float_setting(settings, "DECIMAL") == 2.5 assert _get_float_setting(settings, "INT") == 3.0 From 3dd74c9b2fc50f5a22d8c48982492c4dfcc702d8 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 29 Nov 2023 12:11:51 -0300 Subject: [PATCH 20/24] Make black & flake8 happy --- scrapy_playwright/handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 344cde80..4b407cfb 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -74,7 +74,7 @@ class Config: max_contexts: Optional[int] startup_context_kwargs: dict navigation_timeout: Optional[float] - close_context_interval : Optional[float] + close_context_interval: Optional[float] @classmethod def from_settings(cls, settings: Settings) -> "Config": @@ -91,7 +91,7 @@ def from_settings(cls, settings: Settings) -> "Config": ), close_context_interval=_get_float_setting( settings, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL" - ) + ), ) cfg.cdp_kwargs.pop("endpoint_url", None) if not cfg.max_pages_per_context: From 98ba7bfbdbb5bee8141f560cf727eee1a0e4aeb5 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 21 Jun 2024 00:59:21 -0300 Subject: [PATCH 21/24] Remove pytest.mark.asyncio --- tests/tests_asyncio/test_browser_contexts.py | 1 - tests/tests_asyncio/test_utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/tests_asyncio/test_browser_contexts.py b/tests/tests_asyncio/test_browser_contexts.py index 50ceb46f..dd0add13 100644 --- a/tests/tests_asyncio/test_browser_contexts.py +++ b/tests/tests_asyncio/test_browser_contexts.py @@ -223,7 +223,6 @@ async def test_contexts_dynamic(self): assert cookie["value"] == "qwerty" assert cookie["domain"] == "example.org" - @pytest.mark.asyncio async def test_close_inactive_context(self): spider = Spider("foo") async with make_handler( diff --git a/tests/tests_asyncio/test_utils.py b/tests/tests_asyncio/test_utils.py index e3ba7cfc..09ce238f 100644 --- a/tests/tests_asyncio/test_utils.py +++ b/tests/tests_asyncio/test_utils.py @@ -153,7 +153,6 @@ async def _awaitable_identity(x): class TestGetFloatSetting(IsolatedAsyncioTestCase): - @pytest.mark.asyncio async def test_get_float_setting(self): settings = Settings( { From 996105c77a25db6399ed97b6fec06f34c5725bfd Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 28 Jun 2024 00:10:02 -0300 Subject: [PATCH 22/24] Remove experimental notes from the readme --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index 0105c353..80cde203 100644 --- a/README.md +++ b/README.md @@ -299,8 +299,6 @@ PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4 ### `PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL` Type `Optional[float]`, default `None` -**_Experimental_** - If set to a non-zero value, browser contexts will be automatically closed after spending the specified amount of seconds without open pages. Set to `None` (the default) to disable, i.e. contexts remain open until explicitly closed. 
@@ -1088,8 +1086,3 @@ Deprecated features will be supported for at least six months following the release that deprecated them. After that, they may be removed at any time. See the [changelog](docs/changelog.md) for more information about deprecations and removals. - - -## Experimental features - -Features marked as "experimental" might be modified and/or removed at any time. From 5ed076420e345a2be5d740a2ccd605ca4199a6a6 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 28 Jun 2024 00:22:38 -0300 Subject: [PATCH 23/24] Allow Windows test for closing inactive contexts --- tests/tests_asyncio/test_browser_contexts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_asyncio/test_browser_contexts.py b/tests/tests_asyncio/test_browser_contexts.py index fec88c08..c90648ca 100644 --- a/tests/tests_asyncio/test_browser_contexts.py +++ b/tests/tests_asyncio/test_browser_contexts.py @@ -230,6 +230,7 @@ async def test_contexts_dynamic(self): assert cookie["value"] == "qwerty" assert cookie["domain"] == "example.org" + @allow_windows async def test_close_inactive_context(self): spider = Spider("foo") async with make_handler( From 33b3f3c35fdc3b5f31a3b1fcbeebcd489135e2c1 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 13 Aug 2025 16:21:38 -0300 Subject: [PATCH 24/24] Add default config value --- scrapy_playwright/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index fd0937ae..11f1f81b 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -100,7 +100,7 @@ class Config: restart_disconnected_browser: bool target_closed_max_retries: int = 3 use_threaded_loop: bool = False - close_context_interval: Optional[float] + close_context_interval: Optional[float] = None @classmethod def from_settings(cls, settings: Settings) -> "Config":
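
For reference, a minimal usage sketch of the feature this series lands. It is not part of the patches: the spider name, URLs, context names, and interval value are illustrative assumptions; the `DOWNLOAD_HANDLERS`/`TWISTED_REACTOR` entries are the standard scrapy-playwright wiring.

```python
# Minimal sketch: enabling automatic closing of inactive browser contexts.
# Assumes the final state of this branch with scrapy-playwright installed;
# spider name, URLs, context names, and interval are illustrative only.
import scrapy


class InactiveContextDemoSpider(scrapy.Spider):
    name = "inactive_context_demo"  # hypothetical name

    custom_settings = {
        # Standard scrapy-playwright wiring.
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        # The setting added by this series: close a context after it has
        # spent 60 seconds without open pages.
        "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL": 60,
        # Optional: cap concurrent contexts; inactive ones are now reclaimed.
        "PLAYWRIGHT_MAX_CONTEXTS": 4,
    }

    def start_requests(self):
        # Each request uses its own named context. Since the page is closed
        # by the handler after the response is downloaded (no
        # playwright_include_page), the context becomes inactive and is
        # closed automatically once the interval elapses.
        for i in range(10):
            yield scrapy.Request(
                f"https://example.org/?page={i}",
                meta={"playwright": True, "playwright_context": f"ctx-{i}"},
            )

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```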