Commit ba2d2e6

initial code

1 parent f24add0

File tree

7 files changed: +188 -0 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
build/
scrapy_proxy_headers.egg-info/
scrapy_proxy_headers/__pycache__/

README.md

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
The `scrapy-proxy-headers` package adds proxy headers to HTTPS requests.

In normal usage, custom headers placed in `request.headers` cannot be read by a proxy when you make an HTTPS request, because the headers are encrypted and passed through the proxy tunnel along with the rest of the request. You can read more about this at [Proxy Server Requests over HTTPS](https://docs.proxymesh.com/article/145-proxy-server-requests-over-https).

Because Scrapy does not have a good way to pass custom headers to a proxy on HTTPS requests, we at ProxyMesh made this extension to support our customers that use Scrapy and want to use custom headers to control our proxy behavior. The extension works with any custom headers sent through a proxy.

To use this extension, do the following:

1. `pip install scrapy_proxy_headers`
2. In your Scrapy `settings.py`, add the following code:

```python
DOWNLOAD_HANDLERS = {
    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}
```

3. When you want to make a request with a custom proxy header, instead of using `request.headers`, use `request.meta["proxy_headers"]` like this:

```python
request.meta["proxy_headers"] = {"X-ProxyMesh-Country": "US"}
```

Currently this package does not yet support reading custom response headers from the proxy; that feature is coming soon.
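For illustration only (not part of this commit), a minimal spider using the handler might look like the sketch below; the spider name and proxy URL are placeholders:

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider name

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com",
            meta={
                # Placeholder proxy URL; substitute your real proxy endpoint.
                "proxy": "http://proxy.example.com:31280",
                # Custom header delivered on the CONNECT request to the proxy.
                "proxy_headers": {"X-ProxyMesh-Country": "US"},
            },
        )

    def parse(self, response):
        self.logger.info("fetched %s through the proxy", response.url)
```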

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
scrapy>=2.0

scrapy_proxy_headers/__init__.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
"""
To use this, in your settings, do the following:

DOWNLOAD_HANDLERS = {
    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}

Then when you make a request with a custom proxy header, instead of using
request.headers, use request.meta["proxy_headers"] like this:

request.meta["proxy_headers"] = {"X-ProxyMesh-Country": "US"}
"""

from .download_handler import HTTP11ProxyDownloadHandler

scrapy_proxy_headers/agent.py

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
# TODO: handle response headers

from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent, HTTP11DownloadHandler
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.python import to_bytes


def tunnel_request_data_with_headers(host: str, port: int, **proxy_headers) -> bytes:
    r"""
    Return binary content of a CONNECT request.

    >>> from scrapy.utils.python import to_unicode as s
    >>> s(tunnel_request_data_with_headers("example.com", 8080))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
    >>> s(tunnel_request_data_with_headers("example.com", 8080, **{"X-ProxyMesh-Country": "US"}))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nX-ProxyMesh-Country: US\r\n\r\n'
    >>> s(tunnel_request_data_with_headers(b"example.com", "8090"))
    'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
    """
    host_value = to_bytes(host, encoding="ascii") + b":" + to_bytes(str(port))
    tunnel_req = b"CONNECT " + host_value + b" HTTP/1.1\r\n"
    tunnel_req += b"Host: " + host_value + b"\r\n"

    for key, val in proxy_headers.items():
        tunnel_req += to_bytes(key) + b": " + to_bytes(val) + b"\r\n"

    tunnel_req += b"\r\n"
    return tunnel_req


class TunnelingHeadersTCP4ClientEndpoint(TunnelingTCP4ClientEndpoint):
    def __init__(
        self,
        reactor,
        host: str,
        port: int,
        proxyConf,
        contextFactory,
        timeout: float = 30,
        bindAddress=None,
        **proxy_headers
    ):
        super().__init__(reactor, host, port, proxyConf, contextFactory, timeout, bindAddress)

        self._proxy_headers = {}
        if self._proxyAuthHeader:
            self._proxy_headers['Proxy-Authorization'] = self._proxyAuthHeader
        self._proxy_headers.update(proxy_headers)

    def requestTunnel(self, protocol):
        """Asks the proxy to open a tunnel."""
        assert protocol.transport
        tunnelReq = tunnel_request_data_with_headers(
            self._tunneledHost, self._tunneledPort, **self._proxy_headers
        )
        protocol.transport.write(tunnelReq)
        self._protocolDataReceived = protocol.dataReceived
        protocol.dataReceived = self.processProxyResponse  # type: ignore[method-assign]
        self._protocol = protocol
        return protocol


class TunnelingHeadersAgent(TunnelingAgent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
    downloads. It may look strange that we have chosen to subclass Agent and not
    ProxyAgent but consider that after the tunnel is opened the proxy is
    transparent to the client; thus the agent should behave like there is no
    proxy involved.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._proxy_headers = {}

    def set_proxy_headers(self, proxy_headers):
        self._proxy_headers = proxy_headers

    def _getEndpoint(self, uri):
        return TunnelingHeadersTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=self._proxyConf,
            contextFactory=self._contextFactory,
            timeout=self._endpointFactory._connectTimeout,
            bindAddress=self._endpointFactory._bindAddress,
            **self._proxy_headers
        )


class ScrapyProxyHeadersAgent(ScrapyAgent):
    _TunnelingAgent = TunnelingHeadersAgent

    def _get_agent(self, request, timeout: float):
        agent = super()._get_agent(request, timeout)

        proxy = request.meta.get("proxy")
        proxy_headers = request.meta.get('proxy_headers')
        if proxy and proxy_headers:
            scheme = _parse(request.url)[0]
            if scheme == b"https":
                agent.set_proxy_headers(proxy_headers)

        return agent


class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyProxyHeadersAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
            crawler=self._crawler,
        )
        return agent.download_request(request)
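
As an illustrative sanity check (not part of the commit), the CONNECT framing produced by `tunnel_request_data_with_headers` can be verified directly:

```python
from scrapy_proxy_headers.agent import tunnel_request_data_with_headers

# The custom header is emitted inside the CONNECT request itself, before the
# TLS tunnel is established, which is why the proxy can read it.
req = tunnel_request_data_with_headers("example.com", 443, **{"X-ProxyMesh-Country": "US"})
assert req == (
    b"CONNECT example.com:443 HTTP/1.1\r\n"
    b"Host: example.com:443\r\n"
    b"X-ProxyMesh-Country: US\r\n"
    b"\r\n"
)
```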
scrapy_proxy_headers/download_handler.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent


class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyProxyHeadersAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
            crawler=self._crawler,
        )
        return agent.download_request(request)

setup.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
from setuptools import setup, find_packages

setup(
    name="scrapy_proxy_headers",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        "scrapy>=2.0",
    ],
    entry_points={
        "scrapy.downloader_handlers": [
            "https = scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        #"License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
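
For local development (illustrative, not part of the commit), the package can be installed in editable mode and the doctests in `agent.py` exercised from the repository root, assuming Scrapy is installed:

```
pip install -e .
python -m doctest scrapy_proxy_headers/agent.py
```

The doctest run prints nothing when all examples pass.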
