1.下载scrapy框架
pip install scrapy
2.在E盘下创建一个文件夹scrapy01,在命令行窗体中进入该文件夹
3.创建项目:scrapy startproject 项目名
scrapy startproject first_scrapy
4.使用pycharm打开scrapy01文件夹
5.在items.py文件中创建所需的字段,用于保存数据
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FirstScrapyItem(scrapy.Item):
    """Container for one book scraped from the Dangdang search results."""
    # Each attribute is a scrapy.Field(); keys are accessed dict-style
    # (item['title']) by the spider and the pipeline.
    title = scrapy.Field()      # book title
    price = scrapy.Field()      # current price
    author = scrapy.Field()     # author name
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publishing house
6.在spiders文件夹中创建爬虫程序test.py,代码如下:
# author:WN
# datetime:2019/11/3 15:29
from abc import ABC
import scrapy
from .. import items
class MySpider(scrapy.Spider, ABC):
    """Crawl Dangdang search-result pages for "Python" books.

    Yields one FirstScrapyItem per book found on pages 1-100 of the
    search listing; items are consumed by the configured pipeline.
    """

    # Spider name used on the command line: ``scrapy crawl mySpider``.
    name = "mySpider"

    def start_requests(self):
        # Request the first 100 result pages; ``yield`` lets Scrapy
        # schedule requests lazily instead of building them all up front.
        for num in range(1, 101):
            url = "http://search.dangdang.com/?key=Python&act=input&page_index=%d" % num
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract book fields from one result page and yield items."""
        try:
            # Scrapy locates data with XPath via a Selector object.
            select = scrapy.Selector(text=response.text)
            book_data = select.xpath("//ul[@class='bigimg']/li")
            for book in book_data:
                # extract_first() returns None when a node is missing.
                # Guard before calling str methods: the old code called
                # .strip()/.lstrip() unconditionally, so one incomplete
                # <li> raised AttributeError and the surrounding except
                # silently dropped the REST of the page as well.
                title = book.xpath("./a/img/@alt").extract_first()
                price = book.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = book.xpath("./p[@class='search_book_author']/span/a/@title").extract_first()
                date = book.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
                publisher = book.xpath("./p[@class='search_book_author']/span/a[@name='P_cbs']/text()").extract_first()
                # Build a FRESH item per book: the old code reused one
                # shared instance, so later rows could overwrite fields
                # of items still queued in the pipeline.
                item = items.FirstScrapyItem()
                item['title'] = title.strip() if title else ''
                item['price'] = price.lstrip('¥') if price else ''
                item['author'] = author if author else ''
                item['date'] = date.strip() if date else ''
                item['publisher'] = publisher if publisher else ''
                yield item
        except Exception as e:
            print(e)
7.在settings.py中添加配置,以便将test.py中的item推送到pipelines.py的类中
# 设置将item配置到pipelines中的类中
# 项目名.pipelines.类名
# 300是一个默认整数,它可以是任意整数
ITEM_PIPELINES = {
'first_scrapy.pipelines.FirstScrapyPipeline': 300,
}
8.编写pipelines.py的代码,前提先创建mysql数据库book和表books:
create database book;
use book;
set character_set_results=gbk;
create table books(
bTitle varchar(256) primary key,
bPrice varchar(50),
bAuthor varchar(50),
bDate varchar(32),
bPublisher varchar(256)
);
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class FirstScrapyPipeline(object):
    """Persist scraped book items into the MySQL table ``book.books``."""

    # Called once when the spider starts: open the DB connection.
    def open_spider(self, spider):
        print('opened')
        try:
            # NOTE(review): credentials are hard-coded for the tutorial;
            # move them to settings for real deployments.
            self.con = pymysql.connect(host='localhost', port=3306, user='root',
                                       password='root', db='book', charset='utf8')
            self.cursor = self.con.cursor()
            self.opened = True   # guards all later DB access
            self.count = 0       # number of rows inserted
        except Exception as e:
            print(e)
            self.opened = False

    # Called once when the spider finishes: flush and release the DB.
    def close_spider(self, spider):
        if self.opened:
            try:
                # All inserts are committed in one transaction at the end.
                self.con.commit()
            finally:
                # Always release resources, even if commit() raises;
                # the old code leaked the cursor and, on a failed commit,
                # the connection as well.
                self.cursor.close()
                self.con.close()
                self.opened = False
        print("close")
        print("总共爬取:", self.count, "本书籍")

    def process_item(self, item, spider):
        """Insert one item; returns the item so later pipelines can run."""
        try:
            print(item['title'])
            print(item['price'])
            print(item['author'])
            print(item['date'])
            print(item['publisher'])
            if self.opened:
                # Parameterized query: pymysql escapes the values.
                self.cursor.execute(
                    'insert into books(bTitle,bPrice,bAuthor,bDate,bPublisher) values (%s,%s,%s,%s,%s)', (
                        item['title'], item['price'], item['author'], item['date'], item['publisher'])
                )
                self.count += 1
        except Exception as err:
            print(err)
        return item
9.运行此项目
(1)在命令行窗体中运行:scrapy crawl 爬虫程序名 -s LOG_ENABLED=False,后边的参数是不显示调试信息
scrapy crawl mySpider -s LOG_ENABLED=False
(2)在spiders文件夹的上一级文件夹下创建run.py,运行此文件就可以运行该项目(不在dos窗口中运行项目)代码如下:
# author:WN
# datetime:2019/11/3 15:36
from scrapy import cmdline

# Equivalent to typing "scrapy crawl mySpider -s LOG_ENABLED=False" in a
# shell, so the project can be launched from the IDE instead of a console;
# LOG_ENABLED=False suppresses Scrapy's debug logging.
cmdline.execute(["scrapy", "crawl", "mySpider", "-s", "LOG_ENABLED=False"])