Scraping a page with Scrapy
1. Store the scraped data in a database
sudo service mysql start    # start the MySQL service
mysql -u root               # connect to MySQL as root
create database test1;      # create the test1 database
2. Create models.py
Install mysqlclient so SQLAlchemy can connect to MySQL: pip install mysqlclient
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String

# root with no password on localhost; adjust the URL to your own setup
engine = create_engine('mysql+mysqldb://root@localhost:3306/test1')
Base = declarative_base()

class Test1(Base):
    __tablename__ = 'test1table'   # name of the table to create
    id = Column(Integer, primary_key=True)
    name = Column(String(64))

if __name__ == '__main__':
    Base.metadata.create_all(engine)   # create the table
3. Run models.py (python models.py) to create the test1table table
Install Scrapy: pip install scrapy
scrapy startproject test    # create the test project
cd test    # enter the project root created by startproject
cd test    # enter the inner Python package of the same name
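A rough sketch of the layout that startproject generates (file names taken from the standard Scrapy project template):
    test/
        scrapy.cfg
        test/
            items.py
            pipelines.py
            settings.py
            spiders/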
Edit items.py to define the Item class that holds the scraped data
import scrapy

class TestItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
scrapy genspider test test.com    # generate the spider file test.py
cd spiders
from test.items import TestItem
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['http://www.test.com']

    def parse(self, response):
        # fill in a real CSS selector for the field you want
        yield TestItem({
            'name': response.css('...').extract_first()
        })
cd ..
pipelines.py
Process the item's data
Check whether the data is a duplicate (see the sketch after the code below)
Store it in the database
from sqlalchemy.orm import sessionmaker
from test.items import TestItem
from models import Test1, engine   # Test1 and engine from models.py; adjust the import path to where models.py lives

class TestPipelines(object):
    def process_item(self, item, spider):
        item['name'] = str(item['name'])
        self.session.add(Test1(**item))
        return item

    def open_spider(self, spider):
        Session = sessionmaker(bind=engine)
        self.session = Session()

    def close_spider(self, spider):
        self.session.commit()
        self.session.close()
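The code above does not yet perform the duplicate check mentioned earlier. A minimal sketch of one way to add it, assuming name should be unique and querying the Test1 model before inserting:

from scrapy.exceptions import DropItem

    def process_item(self, item, spider):
        item['name'] = str(item['name'])
        # drop the item if a row with the same name is already stored
        exists = self.session.query(Test1).filter_by(name=item['name']).first()
        if exists is not None:
            raise DropItem('duplicate item: %s' % item['name'])
        self.session.add(Test1(**item))
        return item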
*The pipeline is disabled by default: in settings.py, uncomment the ITEM_PIPELINES setting so the pipeline is actually used
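A minimal sketch of that setting, using the project and class names from above:

# settings.py
ITEM_PIPELINES = {
    'test.pipelines.TestPipelines': 300,
}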
Run the spider:
scrapy crawl test
Link following
Some pages to crawl cannot be reached directly; their URLs have to be extracted from the current page,
e.g. with response.follow
test.py
from test.items import TestItem
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['']

    def parse(self, response):
        yield TestItem({
            'name': response.css('...').extract_first()   # fill in a real selector
        })
        # option 1: build absolute URLs yourself and yield Requests
        for url in response.xpath('...').extract():
            yield scrapy.Request(url=response.urljoin(url), callback=self.parse)
        # option 2: let response.follow handle relative URLs and selectors
        for url in response.xpath('...'):
            yield response.follow(url, callback=self.parse)
# callback means parse is called again on each page reached through these URLs
Downloading images
Define a TestImageItem in items.py
class TestImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
Generate the spider
scrapy genspider testimage test.com
vi testimage.py
from test.items import TestImageItem
import scrapy

class TestImage(scrapy.Spider):
    name = 'testimage'
    start_urls = ['**']

    def parse(self, response):
        item = TestImageItem()
        # image_urls must be a list of image URLs; fill in a real XPath
        item['image_urls'] = response.xpath('...').extract()
        yield item
Edit settings.py: in ITEM_PIPELINES, comment out the project's own pipeline and
add 'scrapy.pipelines.images.ImagesPipeline': 100,
Configure the directory where downloaded images are stored:
IMAGES_STORE = 'images'
pip install pillow    # the ImagesPipeline needs Pillow to process images
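Put together, a minimal sketch of the image-download settings, assuming the directory name used above:

# settings.py
ITEM_PIPELINES = {
    # 'test.pipelines.TestPipelines': 300,
    'scrapy.pipelines.images.ImagesPipeline': 100,
}
IMAGES_STORE = 'images'   # downloaded images are saved under this directory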
When an item's data is spread across multiple pages
chain Requests and parse callbacks
vi test.py
from test.items import TestItem
import scrapy

class Test(scrapy.Spider):
    name = 'test'
    start_urls = ['']

    def parse(self, response):
        item = TestItem()
        item['name'] = response.xpath('...').extract()     # fill in a real XPath
        test1_url = response.urljoin(response.xpath('...').extract_first())
        request = scrapy.Request(test1_url, callback=self.parse_repo)
        request.meta['item'] = item   # pass the half-filled item to parse_repo via meta
        yield request

    def parse_repo(self, response):
        item = response.meta['item']  # retrieve the item carried in the request meta
        item['age'] = response.xpath('...').extract_first()
        yield item
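Note that this spider also fills in item['age'], which the TestItem defined earlier does not have. A minimal sketch of the extra field, plus the matching column on the Test1 model if age should also reach the database, assuming it is an integer:

# items.py
class TestItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    age = scrapy.Field()   # new field used by parse_repo

# models.py
class Test1(Base):
    __tablename__ = 'test1table'
    id = Column(Integer, primary_key=True)
    name = Column(String(64))
    age = Column(Integer)  # assumed integer column for the new field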