diff --git a/README.md b/README.md index a07a6a9..4294e10 100644 --- a/README.md +++ b/README.md @@ -379,6 +379,17 @@ See the [notes about leaving unclosed pages](#receiving-page-objects-in-callback PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4 ``` +### `PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL` +Type `Optional[float]`, default `None` + +If set to a non-zero value, browser contexts will be automatically closed after +spending the specified number of seconds without open pages. Set to `None` +(the default) to disable, i.e. contexts remain open until explicitly closed. + +```python +PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL = 5 * 60 # 5 minutes +``` + ### `PLAYWRIGHT_ABORT_REQUEST` Type `Optional[Union[Callable, str]]`, default `None` @@ -726,6 +737,12 @@ yield scrapy.Request( Please note that if a context with the specified name already exists, that context is used and `playwright_context_kwargs` are ignored. +### Automatically closing inactive contexts + +Specifying a non-negative number of seconds for the +[`PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`](#playwright_close_context_interval) +setting enables automatically closing browser contexts that have no active pages. 
+ ### Closing contexts while crawling After [receiving the Page object in your callback](#receiving-page-objects-in-callbacks), diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 3b88944..11f1f81 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -66,8 +66,10 @@ @dataclass class BrowserContextWrapper: context: BrowserContext - semaphore: asyncio.Semaphore persistent: bool + semaphore: asyncio.Semaphore # limit amount of pages + inactive: asyncio.Event + waiting_close: asyncio.Event @dataclass @@ -98,6 +100,7 @@ class Config: restart_disconnected_browser: bool target_closed_max_retries: int = 3 use_threaded_loop: bool = False + close_context_interval: Optional[float] = None @classmethod def from_settings(cls, settings: Settings) -> "Config": @@ -123,6 +126,9 @@ def from_settings(cls, settings: Settings) -> "Config": ), use_threaded_loop=platform.system() == "Windows" or settings.getbool("_PLAYWRIGHT_THREADED_LOOP", False), + close_context_interval=_get_float_setting( + settings, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL" + ), ) cfg.cdp_kwargs.pop("endpoint_url", None) cfg.connect_kwargs.pop("ws_endpoint", None) @@ -280,12 +286,34 @@ async def _create_browser_context( context.set_default_navigation_timeout(self.config.navigation_timeout) self.context_wrappers[name] = BrowserContextWrapper( context=context, - semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context), persistent=persistent, + semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context), + inactive=asyncio.Event(), + waiting_close=asyncio.Event(), ) + if self.config.close_context_interval is not None: + asyncio.create_task(self._maybe_close_inactive_context(name=name, spider=spider)) self._set_max_concurrent_context_count() return self.context_wrappers[name] + async def _maybe_close_inactive_context( + self, name: str, spider: Optional[Spider] = None + ) -> None: + """Close a context if it has had no pages for a certain amount of 
time.""" + while name in self.context_wrappers: + context_wrapper = self.context_wrappers[name] + await context_wrapper.inactive.wait() + context_wrapper.waiting_close.set() + await asyncio.sleep(self.config.close_context_interval) # type: ignore [arg-type] + if context_wrapper.waiting_close.is_set() and not context_wrapper.context.pages: + logger.info( + "[Context=%s] Closing inactive browser context", + name, + extra={"spider": spider, "context_name": name}, + ) + await context_wrapper.context.close() + break + async def _create_page(self, request: Request, spider: Spider) -> Page: """Create a new page in a context, also creating a new context if necessary.""" context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) @@ -301,6 +329,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page: ) await ctx_wrapper.semaphore.acquire() + ctx_wrapper.inactive.clear() + ctx_wrapper.waiting_close.clear() page = await ctx_wrapper.context.new_page() self.stats.inc_value("playwright/page_count") total_page_count = self._get_total_page_count() @@ -357,6 +387,7 @@ def close(self) -> Deferred: _ThreadedLoopAdapter.stop(id(self)) async def _close(self) -> None: + logger.info("Closing %i contexts", len(self.context_wrappers)) with suppress(TargetClosedError): await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()]) self.context_wrappers.clear() @@ -673,6 +704,8 @@ def _make_close_page_callback(self, context_name: str) -> Callable: def close_page_callback() -> None: if context_name in self.context_wrappers: self.context_wrappers[context_name].semaphore.release() + if not self.context_wrappers[context_name].context.pages: + self.context_wrappers[context_name].inactive.set() return close_page_callback diff --git a/tests/tests_asyncio/test_browser_contexts.py b/tests/tests_asyncio/test_browser_contexts.py index 97df599..9c0f42f 100644 --- a/tests/tests_asyncio/test_browser_contexts.py +++ 
b/tests/tests_asyncio/test_browser_contexts.py @@ -1,4 +1,5 @@ import asyncio +import logging import platform import tempfile from pathlib import Path @@ -11,10 +12,15 @@ from scrapy_playwright.page import PageMethod from tests import allow_windows, make_handler -from tests.mockserver import StaticMockServer +from tests.mockserver import MockServer, StaticMockServer class MixinTestCaseMultipleContexts: + @pytest.fixture(autouse=True) + def inject_fixtures(self, caplog): + caplog.set_level(logging.DEBUG) + self._caplog = caplog + @allow_windows async def test_context_kwargs(self): settings_dict = { @@ -224,6 +230,33 @@ async def test_contexts_dynamic(self): assert cookie["value"] == "qwerty" assert cookie["domain"] == "example.org" + @allow_windows + async def test_close_inactive_context(self): + spider = Spider("foo") + async with make_handler( + { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL": 0.5, + } + ) as handler: + assert len(handler.context_wrappers) == 0 + with MockServer() as server: + await handler._download_request( + Request(server.urljoin("/headers"), meta={"playwright": True}), spider + ) + assert len(handler.context_wrappers) == 1 + await asyncio.sleep(0.3) + await handler._download_request( + Request(server.urljoin("/delay/1"), meta={"playwright": True}), spider + ) + await asyncio.sleep(0.7) + assert len(handler.context_wrappers) == 0 + assert ( + "scrapy-playwright", + logging.INFO, + "[Context=default] Closing inactive browser context", + ) in self._caplog.record_tuples + class TestCaseMultipleContextsChromium(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts): browser_type = "chromium"