爬取过程中遇到百度基于 robots.txt 的反爬限制，我们可以在 Scrapy 的 settings.py 配置文件中配置
ROBOTSTXT_OBEY = False
最终代码
# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request, HtmlResponse
from scrapy import log
from items import BDzdItem
class BDzdSpider(CrawlSpider):
global qa_number;
qa_number=0;
"""爬取百度知道 银行"""
log.msg("log&