新建scrapy工程
scrapy startproject project_name
进入工程目录,新建spider
scrapy genspider --template=crawl spider_file_name www.baidu.com
修改settings.py文件
# Disable robots.txt compliance so requests are not filtered out by the site's robots policy
ROBOTSTXT_OBEY = False
# Throttle the crawler: wait 2 seconds between consecutive downloads to be polite to the server
DOWNLOAD_DELAY = 2
在工程目录下添加run.py文件
from scrapy import cmdline

# Launch the crawler as if "scrapy crawl spider_name" had been typed on the
# command line; works the same way on Windows and Ubuntu.
cmdline.execute(['scrapy', 'crawl', 'spider_name'])
进入spider目录,修改spider_file_name.py文件
import re
# NOTE: "scrapy.spider" was deprecated in Scrapy 1.0 and removed in 2.0;
# the correct module is "scrapy.spiders".
from scrapy.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import Selector


class tspider(CrawlSpider):
    """Fetch the Baidu homepage and print the texts of the links in the
    top navigation bar (div#u1).

    NOTE(review): this spider overrides both start_requests and parse, so
    none of CrawlSpider's rule machinery is used; plain scrapy.spiders.Spider
    would arguably be the better base class — confirm before changing.
    """

    # Spider name used by "scrapy crawl spider_name".
    name = 'spider_name'

    def start_requests(self):
        # Bug fix: the method must be named "start_requests" (plural);
        # the original "start_request" is never called by Scrapy, so the
        # spider produced no requests at all.
        url = 'https://www.baidu.com'
        yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Callback for the homepage response: select the navigation
        container and print each link's text."""
        selector = Selector(response)
        # XPath tolerates whitespace between location steps, so the pieces
        # are joined by implicit string concatenation instead of the
        # original backslash line continuations inside the literal.
        baidulist = selector.xpath(
            'body/div[@id="wrapper" and @style]/'
            'div[@id="head" and @class=""]/div[@class="head_wrapper"]/'
            'div[@id="u1"]'
        )
        for a in baidulist:
            # print() for Python 3 (the original used the Python 2 print
            # statement, a SyntaxError on the Python versions modern
            # Scrapy supports); .extract() yields the actual link texts
            # instead of Selector reprs.
            print(a.xpath('a/text()').extract())
然后运行run.py即可