Commit ba2d2e6

initial code

1 parent f24add0

File tree

7 files changed: +188 -0 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
build/
scrapy_proxy_headers.egg-info/
scrapy_proxy_headers/__pycache__/

README.md

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
The `scrapy-proxy-headers` package adds proxy headers to HTTPS requests.

In normal usage, custom headers placed in `request.headers` cannot be read by a proxy when you make an HTTPS request, because the headers are encrypted and passed through the proxy tunnel along with the rest of the request. You can read more about this at [Proxy Server Requests over HTTPS](https://docs.proxymesh.com/article/145-proxy-server-requests-over-https).

Because Scrapy does not have a good way to pass custom headers to a proxy on HTTPS requests, we at ProxyMesh made this extension to support our customers that use Scrapy and want to use custom headers to control our proxy behavior. The extension works with any custom headers sent through a proxy.

To use this extension, do the following:

1. `pip install scrapy_proxy_headers`
2. In your Scrapy `settings.py`, add the following code:

```python
DOWNLOAD_HANDLERS = {
    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}
```

3. When you want to make a request with a custom proxy header, instead of using `request.headers`, use `request.meta["proxy_headers"]` like this:

```python
request.meta["proxy_headers"] = {"X-ProxyMesh-Country": "US"}
```

Currently this package does not yet support reading custom response headers from the proxy; that feature is coming soon.
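For illustration only (not part of this commit), a minimal spider using the handler might look like the sketch below; the spider name and proxy URL are placeholders:

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider name

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com",
            meta={
                # Placeholder proxy URL; substitute your real proxy endpoint.
                "proxy": "http://proxy.example.com:31280",
                # Custom header delivered on the CONNECT request to the proxy.
                "proxy_headers": {"X-ProxyMesh-Country": "US"},
            },
        )

    def parse(self, response):
        self.logger.info("fetched %s through the proxy", response.url)
```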

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
scrapy>=2.0

scrapy_proxy_headers/__init__.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
"""
To use this, in your settings, do the following:

DOWNLOAD_HANDLERS = {
    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}

Then when you make a request with a custom proxy header, instead of using
request.headers, use request.meta["proxy_headers"] like this:

request.meta["proxy_headers"] = {"X-ProxyMesh-Country": "US"}
"""

from .download_handler import HTTP11ProxyDownloadHandler

scrapy_proxy_headers/agent.py

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
# TODO: handle response headers

from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent, HTTP11DownloadHandler
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.python import to_bytes


def tunnel_request_data_with_headers(host: str, port: int, **proxy_headers) -> bytes:
    r"""
    Return binary content of a CONNECT request.

    >>> from scrapy.utils.python import to_unicode as s
    >>> s(tunnel_request_data_with_headers("example.com", 8080))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
    >>> s(tunnel_request_data_with_headers("example.com", 8080, **{"X-ProxyMesh-Country": "US"}))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nX-ProxyMesh-Country: US\r\n\r\n'
    >>> s(tunnel_request_data_with_headers(b"example.com", "8090"))
    'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
    """
    host_value = to_bytes(host, encoding="ascii") + b":" + to_bytes(str(port))
    tunnel_req = b"CONNECT " + host_value + b" HTTP/1.1\r\n"
    tunnel_req += b"Host: " + host_value + b"\r\n"

    for key, val in proxy_headers.items():
        tunnel_req += to_bytes(key) + b": " + to_bytes(val) + b"\r\n"

    tunnel_req += b"\r\n"
    return tunnel_req


class TunnelingHeadersTCP4ClientEndpoint(TunnelingTCP4ClientEndpoint):
    def __init__(
        self,
        reactor,
        host: str,
        port: int,
        proxyConf,
        contextFactory,
        timeout: float = 30,
        bindAddress=None,
        **proxy_headers
    ):
        super().__init__(reactor, host, port, proxyConf, contextFactory, timeout, bindAddress)

        self._proxy_headers = {}
        if self._proxyAuthHeader:
            self._proxy_headers['Proxy-Authorization'] = self._proxyAuthHeader
        self._proxy_headers.update(proxy_headers)

    def requestTunnel(self, protocol):
        """Asks the proxy to open a tunnel."""
        assert protocol.transport
        tunnelReq = tunnel_request_data_with_headers(
            self._tunneledHost, self._tunneledPort, **self._proxy_headers
        )
        protocol.transport.write(tunnelReq)
        self._protocolDataReceived = protocol.dataReceived
        protocol.dataReceived = self.processProxyResponse  # type: ignore[method-assign]
        self._protocol = protocol
        return protocol


class TunnelingHeadersAgent(TunnelingAgent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
    downloads. It may look strange that we have chosen to subclass Agent and not
    ProxyAgent but consider that after the tunnel is opened the proxy is
    transparent to the client; thus the agent should behave like there is no
    proxy involved.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._proxy_headers = {}

    def set_proxy_headers(self, proxy_headers):
        self._proxy_headers = proxy_headers

    def _getEndpoint(self, uri):
        return TunnelingHeadersTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=self._proxyConf,
            contextFactory=self._contextFactory,
            timeout=self._endpointFactory._connectTimeout,
            bindAddress=self._endpointFactory._bindAddress,
            **self._proxy_headers
        )


class ScrapyProxyHeadersAgent(ScrapyAgent):
    _TunnelingAgent = TunnelingHeadersAgent

    def _get_agent(self, request, timeout: float):
        agent = super()._get_agent(request, timeout)

        proxy = request.meta.get("proxy")
        proxy_headers = request.meta.get('proxy_headers')
        if proxy and proxy_headers:
            scheme = _parse(request.url)[0]
            if scheme == b"https":
                agent.set_proxy_headers(proxy_headers)

        return agent


class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyProxyHeadersAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
            crawler=self._crawler,
        )
        return agent.download_request(request)
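
As an illustrative sanity check (not part of the commit), the CONNECT framing produced by `tunnel_request_data_with_headers` can be verified directly:

```python
from scrapy_proxy_headers.agent import tunnel_request_data_with_headers

# The custom header is emitted inside the CONNECT request itself, before the
# TLS tunnel is established, which is why the proxy can read it.
req = tunnel_request_data_with_headers("example.com", 443, **{"X-ProxyMesh-Country": "US"})
assert req == (
    b"CONNECT example.com:443 HTTP/1.1\r\n"
    b"Host: example.com:443\r\n"
    b"X-ProxyMesh-Country: US\r\n"
    b"\r\n"
)
```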
scrapy_proxy_headers/download_handler.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent


class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyProxyHeadersAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
            crawler=self._crawler,
        )
        return agent.download_request(request)

setup.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
from setuptools import setup, find_packages

setup(
    name="scrapy_proxy_headers",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        "scrapy>=2.0",
    ],
    entry_points={
        "scrapy.downloader_handlers": [
            "https = scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        #"License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
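
For local development (illustrative, not part of the commit), the package can be installed in editable mode and the doctests in `agent.py` exercised from the repository root, assuming Scrapy is installed:

```
pip install -e .
python -m doctest scrapy_proxy_headers/agent.py
```

The doctest run prints nothing when all examples pass.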
