新建项目
[root@localhost pytest]# scrapy startproject iteye886
New Scrapy project 'iteye886', using template directory '/usr/lib64/python2.7/site-packages/scrapy/templates/project', created in:
/root/pytest/iteye886
You can start your first spider with:
cd iteye886
scrapy genspider example example.com
[root@localhost pytest]# cd iteye886/
[root@localhost iteye886]# scrapy genspider myblog 886.iteye.com
Created spider 'myblog' using template 'basic' in module:
iteye886.spiders.myblog
[root@localhost iteye886]# tree
.
├── iteye886
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── settings.pyc
│ └── spiders
│ ├── __init__.py
│ ├── __init__.pyc
│ └── myblog.py
└── scrapy.cfg
2 directories, 10 files
编写需要采集的字段
[root@localhost iteye886]# vim iteye886/items.py
1 # -*- coding: utf-8 -*-
2
3 # Define here the models for your scraped items
4 #
5 # See documentation in:
6 # http://doc.scrapy.org/en/latest/topics/items.html
7
8 import scrapy
9
10
11 class Iteye886Item(scrapy.Item):
12 # define the fields for your item here like:
13 # name = scrapy.Field()
14 title = scrapy.Field()  # fields to scrape; title is filled with the post anchor's text by the myblog spider
15 link = scrapy.Field()  # filled with the post anchor's href by the myblog spider
16
编辑代码
[root@localhost iteye886]# vim iteye886/spiders/myblog.py
1 # -*- coding: utf-8 -*-
2 import scrapy
3 from iteye886.items import Iteye886Item #导入item
4 class MyblogSpider(scrapy.Spider):
5 name = "myblog"
6 allowed_domains = ["886.iteye.com"]
7 start_urls = (
8 'http://886.iteye.com/', #删除www
9 )
10
11 def parse(self, response):
12 lis = response.xpath('//*[@id="main"]/div/div[1]/h3/a') #增加xpath
13 item = Iteye886Item()
14 for li in lis:
15 item['title']=response.xpath('a/text()').extract()
16 item['link']=response.xpath('a/@href').extract()
17 yield item
[root@localhost iteye886]# scrapy list
myblog
[root@localhost iteye886]# scrapy crawl myblog -o abc.csv
[root@localhost iteye886]# cat abc.csv
link,title
/blog/2324590,centos7-python:交互界面tab补齐
/blog/2324577,scrapy-0:centos7安装scrapy
....