Scrapy middlewares (the `spider.middlewares` module):
from txsocksx.http import SOCKS5Agent
from twisted.internet import reactor
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, ScrapyAgent
# SOCKS5 proxy connection settings read by ScrapySocks5Agent._get_agent.
# NOTE(review): these look like redacted sample values — substitute real ones,
# and consider loading the credentials from configuration instead of source.
proxyHost = "socks-cla.abuyun.com"
# NOTE(review): `xxxx` is a placeholder, not a defined name; importing this
# module raises NameError until it is replaced with the proxy's port number.
proxyPort = xxxx
proxyUser = "S92IXXXXX2731G8C"
proxyPass = "D7650BABD8BXXXXX"
class Socks5DownloadHandler(HTTP11DownloadHandler):
    """HTTP/1.1 download handler that delegates to ``ScrapySocks5Agent``."""

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download.

        A new ``ScrapySocks5Agent`` is created on every call from this
        handler's ``_contextFactory`` and ``_pool``; ``spider`` is not read.
        """
        socks_agent = ScrapySocks5Agent(
            contextFactory=self._contextFactory,
            pool=self._pool,
        )
        deferred = socks_agent.download_request(request)
        return deferred
class ScrapySocks5Agent(ScrapyAgent):
    """``ScrapyAgent`` subclass whose underlying agent is a ``SOCKS5Agent``."""

    def _get_agent(self, request, timeout):
        """Build the ``SOCKS5Agent`` used to perform the download.

        The proxy address and login come from the module-level ``proxyHost``,
        ``proxyPort``, ``proxyUser`` and ``proxyPass`` values. ``request`` and
        ``timeout`` are not read: no per-request proxy from ``request.meta``,
        bind address, connect timeout, context factory or connection pool is
        passed to the returned agent.
        """
        # Plain TCP endpoint pointing at the proxy server itself.
        socks_server = TCP4ClientEndpoint(reactor, proxyHost, proxyPort)
        # Credentials handed to txsocksx under its 'login' method key.
        login_methods = {'login': (proxyUser, proxyPass)}
        return SOCKS5Agent(
            reactor,
            proxyEndpoint=socks_server,
            endpointArgs=dict(methods=login_methods),
        )
Scrapy settings (register the handler for both URL schemes):
# Dotted path of the handler class, shared by both URL schemes below.
_SOCKS5_HANDLER_PATH = "spider.middlewares.Socks5DownloadHandler"

# Route both "http" and "https" downloads through the same handler class.
DOWNLOAD_HANDLERS = dict.fromkeys(("http", "https"), _SOCKS5_HANDLER_PATH)