I. Basic Features
Environment: Windows 10, Python 3.7
1. Install the package
pip install scrapy
scrapy #list available commands and confirm the install
scrapy bench #run a quick benchmark
2. Create a new project
scrapy startproject scrapytestone
tree /F #view the project layout (Windows)
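startproject generates the standard Scrapy skeleton; the layout should look roughly like this (file names can vary slightly across Scrapy versions):
scrapytestone/
    scrapy.cfg            # deployment configuration
    scrapytestone/
        __init__.py
        items.py          # item field definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py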
3. Generate a spider
cd scrapytestone #run genspider from inside the project; the new file is placed under spiders/ automatically
scrapy genspider scrapyname "www.xxx.com" #generate a spider named scrapyname with start domain www.xxx.com; run genspider again with different names to add more spiders
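genspider creates spiders/scrapyname.py from the default template; depending on the Scrapy version it looks roughly like this:
import scrapy

class ScrapynameSpider(scrapy.Spider):
    name = 'scrapyname'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        pass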
4. Configuration files
items.py — define the fields to extract (the example below comes from a project named itcom, scraping itcast.cn):
import scrapy

class ItcomItem(scrapy.Item):
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
teachinfo.py — the spider itself:
import scrapy
from itcom.items import ItcomItem

class TeachinfoSpider(scrapy.Spider):
    name = 'teachinfo'
    allowed_domains = ['itcast.cn']
    start_urls = ["http://www.itcast.cn/channel/teacher.shtml"]

    def parse(self, response):
        # grab the page title selector
        context = response.xpath('/html/head/title/text()')
        # extract_first() returns the first match as a string
        title = context.extract_first()
        print(title)
        for each in response.xpath("//div[@class='li_txt']"):
            # wrap the extracted data in an ItcomItem object
            item = ItcomItem()
            # extract() always returns a list of unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()
            # each xpath call here matches a single element, so take index 0
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            yield item
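Before committing XPath expressions to the spider, it helps to test them interactively in scrapy shell; for example, with the same selectors as above:
scrapy shell "http://www.itcast.cn/channel/teacher.shtml"
>>> response.xpath('/html/head/title/text()').extract_first()
>>> response.xpath("//div[@class='li_txt']/h3/text()").extract()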
5. Run the spider
scrapy crawl teachinfo
To save the output to a file (JSON, CSV, and XML are supported):
scrapy crawl teachinfo -o teachers.json
Other settings
Exported JSON defaults to escaped Unicode; switch it to UTF-8 in either of two ways:
1. One-off, on the command line:
scrapy crawl teachinfo -o teachers.json -s FEED_EXPORT_ENCODING=UTF-8
2. Permanently, by appending to settings.py:
FEED_EXPORT_ENCODING = 'utf-8'
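For illustration, with a hypothetical record: by default the JSON exporter escapes non-ASCII, so a value like {"name": "传智"} is written as {"name": "\u4f20\u667a"}; with FEED_EXPORT_ENCODING set to utf-8 the characters are written verbatim.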
II. Example Project
1. Scraping Douyu
scrapy startproject DouYu #create the project
cd DouYu
scrapy genspider douyu "www.douyu.com" #create the spider
#fill in the item fields, spider, and pipeline as shown in step 2 below, then:
scrapy crawl douyu #run the spider
2. Writing the spider
items.py — define the fields to extract:
import scrapy

class DouyuItem(scrapy.Item):
    nickname = scrapy.Field()  # streamer nickname
    title = scrapy.Field()     # room title
    url = scrapy.Field()       # room url
    imgurl = scrapy.Field()    # preview image url
spiders/douyu.py — the spider; it pulls JSON from the directory API and extracts the image info:
import scrapy
import json
from DouYu.items import DouyuItem

class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['www.douyu.com']  # domain restriction
    baseurl = 'https://www.douyu.com/gapi/rknc/directory/yzRec/'
    offset = 1
    start_urls = [baseurl + str(offset)]  # first page of the API

    def parse(self, response):
        data_list = json.loads(response.body)['data']['rl']
        for data in data_list:
            if self.offset > 20:  # stop after 20 pages
                return
            item = DouyuItem()
            # API fields: nn = nickname, rn = room title, url, rs16 = preview image
            item['nickname'] = data['nn']
            item['title'] = data['rn']
            item['url'] = data['url']
            item['imgurl'] = data['rs16']
            yield item  # hand the item to the pipeline
        self.offset += 1
        yield scrapy.Request(self.baseurl + str(self.offset), callback=self.parse)  # schedule the next page, handled again by parse
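For reference, the JSON returned by the directory endpoint has roughly this shape (a sketch; field names are taken from the code above, values are placeholders, and unused fields are omitted):
{
    "data": {
        "rl": [
            {
                "nn": "<streamer nickname>",
                "rn": "<room title>",
                "url": "<room url>",
                "rs16": "<preview image url>"
            }
        ]
    }
}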
pipelines.py — the pipeline; ImagesPipeline downloads each preview image, then we rename it after the streamer:
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from .settings import IMAGES_STORE as images_store

class DouyuPipeline(ImagesPipeline):  # image download and post-processing
    def get_media_requests(self, item, info):  # request the image by its link
        image_link = item['imgurl']
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):  # rename the downloaded image
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:  # download failed; pass the item through unchanged
            return item
        old_name = images_store + image_path[0]
        new_name = images_store + item['nickname'] + '.jpg'
        os.rename(old_name, new_name)
        return item  # pipelines must return the item for downstream stages
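Note that ImagesPipeline relies on Pillow for image processing, so it must be installed alongside Scrapy:
pip install Pillow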
settings.py — project settings:
BOT_NAME = 'DouYu'
SPIDER_MODULES = ['DouYu.spiders']
NEWSPIDER_MODULE = 'DouYu.spiders'
#where downloaded images are stored
IMAGES_STORE = './images/'
#ignore robots.txt
ROBOTSTXT_OBEY = False
#pipeline and its priority (lower values run earlier)
ITEM_PIPELINES = {
    'DouYu.pipelines.DouyuPipeline': 300,
}
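After scrapy crawl douyu finishes, the expected result (assuming the renaming above succeeds; file names below are placeholders) is a flat directory of previews named after each streamer:
images/
    full/             # hash-named originals created by ImagesPipeline, moved out by item_completed
    nickname1.jpg     # one renamed preview per scraped room
    nickname2.jpg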