以绿色下载站的“最近更新”页面为例,spider 的核心代码如下:
# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.http import Request
import re
class MySpider(Spider):
    """Spider for the "recent updates" listing pages of downg.com.

    Starts from listing pages 1 through 6, extracts the links that follow
    the ``class=app-name`` marker on each page, and requests every one of
    them with ``getDetail`` as the callback.
    """

    name = "downg"
    allowed_domains = ["downg.com"]
    # Listing pages 0_1.html through 0_6.html.
    start_urls = [
        'http://www.downg.com/new/0_%s.html' % x
        for x in range(1, 7)
    ]

    def parse(self, response):
        """Return one Request per link found in a listing page.

        The links are pulled from the raw response body with a regular
        expression that captures the href right after
        ``class=app-name><A``. The captured value is passed to ``Request``
        unchanged.
        """
        hrefs = re.findall(r'class=app-name><A href="(.*?)"', response.body)
        return [Request(href, self.getDetail) for href in hrefs]

    def getDetail(self, response):
        """Print the URL of the fetched page."""
        # Single-argument print(...) outputs the same text as the
        # statement form.
        print(response.url)
转载于:https://blog.51cto.com/pcliuyang/1534120