# The code is as follows:
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
class TestSpider(BaseSpider):
    """Spider that recursively follows article links found on each page.

    Every response handled by ``parse`` is searched with ``articleUrlXpath``;
    each extracted href is resolved against the page's base URL and requested
    again with ``parse`` as the callback.

    NOTE(review): ``BaseSpider``, ``Selector`` and ``Request`` are not
    imported in the visible part of this file -- confirm they are brought
    into scope elsewhere.
    """

    name = 'test'

    def __init__(self, *args, **kwargs):
        """Initialise the spider and set the article-link XPath.

        Positional and keyword arguments are forwarded unchanged to the base
        class initializer, so the spider can still be constructed with no
        arguments, exactly as before.
        """
        # Call the base initializer so base-class setup is not skipped by
        # this override.
        super(TestSpider, self).__init__(*args, **kwargs)
        # XPath selecting the href of every article link.
        # XPath 1.0 contains() is a plain substring test with no wildcard
        # support: a trailing "*" would be matched literally, so the needle
        # is just "content".
        self.articleUrlXpath = '//a[contains(@href, "content")]/@href'

    def parse(self, response):
        """Yield a follow-up ``Request`` for every article link in *response*.

        Each href selected by ``self.articleUrlXpath`` is joined with the
        base URL of the response, and the resulting request is handled by
        this same method.
        """
        sel = Selector(response)
        base_url = get_base_url(response)
        for url in sel.xpath(self.articleUrlXpath).extract():
            # Hrefs may be relative; resolve them against the page's base URL.
            yield Request(urljoin_rfc(base_url, url), callback=self.parse)