Scrapy 是一个基于 Twisted 实现的异步处理爬虫框架，该框架使用纯 Python 语言编写。Scrapy 框架应用广泛，常用于数据采集、网络监测以及自动化测试等。
本次内容将以爬取当当网相关数据为例进行框架测试。
scrapy startproject dangdang
cd .\dangdang
scrapy genspider -t crawl dd_books www.dangdang.com
1、使用 CrawlSpider 模板自动化爬取当当网数据
2、将数据存储到 MySQL 数据库
import scrapy
class DangdangItem(scrapy.Item):
    """Container for the data scraped from one Dangdang listing page."""

    # Value(s) of the @title attribute of the listing's title anchors.
    title = scrapy.Field()
    # Value(s) of the @href attribute of the listing's title anchors.
    link = scrapy.Field()
    # Text of the listing's review anchors.
    comment = scrapy.Field()
import scrapy
from ..items import DangdangItem
from scrapy.http import Request # 实现翻页
class DdBooksSpider(scrapy.Spider):
    """Spider that scrapes book listings from a Dangdang category page.

    Every response yields a single ``DangdangItem`` whose fields hold the
    lists of all matches found on that page, followed by requests for the
    follow-up listing pages, which are handled by ``parse`` as well.
    """

    name = 'dd_books'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cp01.54.26.00.00.00.html']

    def parse(self, response):
        """Yield the page's item, then the requests for further pages."""
        # Item field -> XPath query; evaluated in this order.
        field_queries = (
            ("title", "//a[@name='itemlist-title']/@title"),
            ("link", "//a[@name='itemlist-title']/@href"),
            ("comment", "//a[@name='itemlist-review']/text()"),
        )

        page_item = DangdangItem()
        for field, query in field_queries:
            page_item[field] = response.xpath(query).extract()
        yield page_item

        # Pagination: range(2, 3) covers only page 2.
        for page_no in range(2, 3):
            yield Request(
                f"http://category.dangdang.com/pg{page_no}-cp01.54.26.00.00.00.html",
                callback=self.parse,
            )
import pymysql
class DangdangPipeline:
    """Item pipeline that stores scraped rows in the ``dangdang`` MySQL table."""

    def process_item(self, item, spider):
        """Insert one row per (title, link, comment) triple carried by *item*.

        Args:
            item: Mapping with "title", "link" and "comment" keys, each
                holding a sequence of strings.
            spider: The spider that produced the item (unused).

        Returns:
            The item, unchanged, so that later pipelines can process it.
        """
        # zip() stops at the shortest list, so uneven field lists no longer
        # raise IndexError part-way through the inserts.
        rows = list(zip(item["title"], item["link"], item["comment"]))
        if not rows:
            return item

        # NOTE(review): credentials are hard-coded in source; they should be
        # read from the Scrapy settings or the environment instead.
        conn = pymysql.connect(host="localhost", user="root", password="13459562539zxj", database="data_dangdang", charset="utf8")
        try:
            with conn.cursor() as cursor:
                # Parameterized query: the scraped text is untrusted and may
                # contain quotes, so it must never be spliced into the SQL.
                cursor.executemany(
                    "insert into dangdang(title, link, comment) values (%s, %s, %s)",
                    rows,
                )
            conn.commit()
        finally:
            # Release the connection even if an insert fails.
            conn.close()
        return item
结果截图: