一. 编写items.py文件
import scrapy
from scrapy import Field,Item
#Item 是用来保存爬取到的数据的容器
class BaiduItem(Item):
    """Item container used to hold the data scraped for one image."""

    # Placeholder kept from the project template; an image-type field
    # is not collected at the moment.
    # image_type = Field()

    # URL data of the scraped image.
    image_url = Field()
二. 在spiders文件夹里创建baiduPicture.py文件并编写该文件
# -*- coding:utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import Selector
from baidu.items import BaiduItem
import json
class GetBaiDuPic(CrawlSpider):
    """Spider that collects image URLs from a Baidu image-search JSON response.

    A single request is sent to the ``acjson`` search endpoint; every entry
    of the returned ``data`` list that carries a ``middleURL`` is emitted as
    a ``BaiduItem``.
    """

    name = 'baiduPicture'

    # Headers sent with the request so that it looks like it comes from a
    # regular browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5',
    }

    def start_requests(self):
        """Yield the single search request this spider starts from.

        ``start_requests`` is overridden (instead of setting ``start_urls``)
        so that the custom headers can be attached to the request.
        """
        # NOTE(review): the query string contains ``queryWord+=`` and a
        # ``word=`` value that holds the keyword both raw and percent-encoded.
        # This looks like a copy/paste artefact -- confirm against the live
        # endpoint before changing it.
        search_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord+=陆毅&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&word=陆毅%E9%99%86%E6%AF%85&z=&ic=0&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&step_word=%E9%99%86%E6%AF%85&pn=0&rn=30&gsm=1e&1528441524820='
        # The callback is passed explicitly: CrawlSpider reserves the default
        # callback for its own rule-following logic, so relying on the
        # default to reach ``parse`` is fragile across Scrapy versions.
        yield Request(search_url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Extract one ``BaiduItem`` per image entry of the JSON response.

        Entries without a ``middleURL`` are skipped. A body that is not valid
        JSON, or that has no ``data`` list, is logged and produces no items.
        """
        try:
            payload = json.loads(response.body)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError.
            self.logger.error('Could not decode JSON from %s: %s', response.url, exc)
            return

        entries = payload.get('data') if isinstance(payload, dict) else None
        if not entries:
            self.logger.warning('No "data" entries in response from %s', response.url)
            return

        for entry in entries:
            # Skip entries that carry no image URL instead of relying on a
            # caught KeyError.
            middle_url = entry.get('middleURL') if isinstance(entry, dict) else None
            if not middle_url:
                continue
            item = BaiduItem()  # item class declared in items.py
            item['image_url'] = [middle_url]
            yield item
三. 编写settings.py文件
# Set ROBOTSTXT_OBEY to False
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
四. 在baidu文件夹下创建main.py文件
from scrapy import cmdline
# Start the crawl exactly as the command line would: run the
# "baiduPicture" spider and write its output to result.csv.
crawl_command = "scrapy crawl baiduPicture -o result.csv"
cmdline.execute(crawl_command.split())