抓取代理信息
0x00 创建Spider
class YoudailiSpider(CrawlSpider):
name = 'youdaili'
allowed_domains = ['youdaili.net']
start_urls = ['http://www.youdaili.net/Daili/http/']
rules = (
Rule(LinkExtractor(allow=r'/\d+(_)*\d*\.html'), callback='parse_item', follow=True),
)
def parse_item(self, response):
print "-"*100
items = []
for data in response.xpath("//div[@class='cont_font']/p/text()"):
item = ProxyInfoItem()
data = data.extract().replace('\r\n','')
item['location'] = data.split('#')[1]
data = data.split('#')[0]
item['proxy_type'] = data.split('@')[1]
data = data.split('@')[0]
item['port'] = data.split(':')[1]
item['ip'] = data.split(':')[0]
items.append(item)
return items
pass
0x01 验证proxy是否可用的类
用别人的代码稍微改了一下
class TestTime(threading.Thread):
    """Test a proxy's speed in a new thread by recording its connect time."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.proxy = None          # "ip:port" string, set via setproxy()
        self.time = None           # measured fetch time in seconds, None until success
        self.test_url = None       # URL fetched through the proxy, set via settesturl()
        self.stat = " time out!"   # human-readable result line
        # A response containing "table" is treated as a successful fetch.
        self.test_pattern = re.compile(r"""table""")

    def setproxy(self, proxy):
        """Set the proxy under test; the default status assumes a timeout."""
        self.proxy = proxy
        self.stat = proxy + " time out!"

    def settesturl(self, url):
        """Set the URL that will be fetched through the proxy."""
        self.test_url = url

    def run(self):
        """Fetch test_url through the proxy and record elapsed time in self.stat."""
        start = time.time()
        try:
            f = urllib.urlopen(self.test_url, proxies={"http": "http://" + self.proxy})
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not silently swallowed inside the worker thread.
            self.stat = self.proxy + " fails!"
        else:
            data = f.read()
            f.close()
            end = time.time()
            if self.test_pattern.search(data):  # page content looks valid
                self.time = end - start
                self.stat = self.proxy + " time: " + str(self.time)
            else:
                self.stat = self.proxy + " not matched!"
class ProxyTest(object):
def __init__(self):
self.url = 'http://www.ip138.com/'
self.time_out = 3.0
pass
def starttest(self,proxys):
for proxy in proxys:
self.test = TestTime()
self.test.settesturl(self.url)
self.test.setproxy(proxy)
#self.test.setDaemon(True)
print "testing "+proxy
self.test.start()
self.test.join(self.time_out)
print self.test.stat
if not self.test.time:
self.test.time = 99999
yield (self.test.time,proxy)
pass
pass
0x02 处理每一个Item的信息,并且存入数据库
class ProxyprojectPipeline(object):
    """Pipeline: de-duplicate items, speed-test each proxy, store the fast ones."""

    def __init__(self):
        self.linecount = 0
        self.items = []        # items already accepted, used for de-duplication
        self.db = ProxySql()
        self.test = ProxyTest()

    def process_item(self, item, spider):
        """Keep an item only if unseen and its proxy answers within 5 seconds."""
        if item in self.items:
            raise DropItem("repeate in %s" % item)
        candidates = [item['ip'] + ':' + item['port']]
        for elapsed, proxy in self.test.starttest(candidates):
            if elapsed >= 5:
                raise DropItem("too long time in %s" % item)
            item['proxy_speed'] = str(elapsed)
            self.db.insert('tbProxyInfoItem', item)
            self.items.append(item)
            return item
0x03 使用中间件,测试代理
class CustomHttpProxyMiddleware(object):
def __init__(self):
self.db = ProxySql()
proxylist = self.db.selectproxys()
p = random.choice(proxylist)
print "http://%s:%s" % (p[0],p[1])
print '-'*150
pass
def process_request(self, request, spider):
# TODO implement complex proxy providing algorithm
if self.use_proxy(request):
p = random.choice(proxylist)
try:
request.meta['proxy'] = "http://%s:%s" % (p[0],p[1])
except Exception, e:
print("Exception %s" % e)
def use_proxy(self, request):
"""
using direct download for depth <= 2
using proxy with probability 0.3
"""
if "depth" in request.meta and int(request.meta['depth']) <= 2:
return False
i = random.randint(1, 10)
return i <= 2
class CustomUserAgentMiddleware(object):
def process_request(self, request, spider):
print '*'* 100
agent = random.choice(AGENTS)
遇到的问题
按照我的理解,在配置中启用中间件后,每一次请求时CustomHttpProxyMiddleware的process_request都应该被调用,但是打log后发现它并没有被调用到。
可能的原因:
版本问题,如果想为每个request设置代理,需要重写的函数不是process_request,而是其他。参照Scrapy0.2.4.6的帮助文档,没有找到process_request这个函数。
但是由于貌似还没有被ban掉,所以暂时不考虑proxy,这个proxy只是当备用。