First, change a few options in settings.py:
ROBOTSTXT_OBEY = True
# change this to
ROBOTSTXT_OBEY = False

# and uncomment these three lines
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}
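The number 300 is the pipeline's priority (pipelines run in ascending order; values are conventionally in the 0-1000 range). Optionally, if dangdang rejects requests from Scrapy's default user agent, you can also set a browser-like one in settings.py; the UA string below is only an example:

# Pretend to be an ordinary browser; any mainstream UA string works.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'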
Next, define the fields to scrape in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Product title
    title = scrapy.Field()
    # Product link
    link = scrapy.Field()
    # Number of comments
    comment = scrapy.Field()
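A scrapy.Item behaves like a dict, except that only declared fields may be assigned, which is why each field must be listed above. A quick check in a Python shell (the price field is made up here just to show the error):

>>> from dangdang.items import DangdangItem
>>> item = DangdangItem()
>>> item["title"] = "a test title"   # declared above, works
>>> item["price"] = "10"             # not declared, raises an error
KeyError: 'DangdangItem does not support field: price'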
Then create a file named dd.py under the spiders directory, with the following content:
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4008154.html']

    def parse(self, response):
        item = DangdangItem()
        # Every product title link on the page carries name='itemlist-title'
        item["title"] = response.xpath("//a[@name='itemlist-title']/@title").extract()
        item["link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
        # dd_name='单品评论' marks each product's comment-count link
        item["comment"] = response.xpath("//a[@dd_name='单品评论']/text()").extract()
        yield item
        # Queue pages 2-80 of the same category; Scrapy's built-in
        # duplicate filter drops requests that were already scheduled.
        for i in range(2, 81):
            url = 'http://category.dangdang.com/pg' + str(i) + '-cid4008154.html'
            yield Request(url, callback=self.parse)
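Before launching the full crawl, it is worth verifying the XPath expressions interactively with scrapy shell:

scrapy shell "http://category.dangdang.com/pg1-cid4008154.html"
>>> response.xpath("//a[@name='itemlist-title']/@title").extract()[:3]
>>> response.xpath("//a[@dd_name='单品评论']/text()").extract()[:3]

If these return empty lists, dangdang has changed its page layout and the expressions need updating.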
Next, write the scraped data to MySQL through pipelines.py. First create a database named dangdang, and inside it a table goods with the columns id, title, link and comment.
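For example, in the MySQL client (the column types here are my assumption; adjust the lengths as you see fit):

CREATE DATABASE dangdang DEFAULT CHARACTER SET utf8mb4;
USE dangdang;
CREATE TABLE goods (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(200),
    link VARCHAR(200),
    comment VARCHAR(50)
);

With the table in place, pipelines.py becomes: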
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class DangdangPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="",
                               db="dangdang", charset="utf8mb4")
        try:
            with conn.cursor() as cursor:
                for i in range(0, len(item["title"])):
                    title = item["title"][i]
                    link = item["link"][i]
                    comment = item["comment"][i]
                    # Parameterized query: won't break on quotes in titles
                    sql = "insert into goods(title, link, comment) values(%s, %s, %s)"
                    try:
                        cursor.execute(sql, (title, link, comment))
                    except Exception as err:
                        print(err)
            # pymysql disables autocommit by default, so commit explicitly,
            # otherwise nothing is actually written to the table
            conn.commit()
        finally:
            conn.close()
        return item
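Opening a fresh connection for every scraped item works, but it is wasteful once the crawl spans 80 pages. Scrapy pipelines also provide open_spider/close_spider hooks, so a variant (a sketch, under the same MySQL assumptions as above) can hold one connection for the whole crawl:

import pymysql

class DangdangPipeline(object):
    def open_spider(self, spider):
        # Called once when the crawl starts: open a single connection.
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="",
                                    db="dangdang", charset="utf8mb4")

    def close_spider(self, spider):
        # Called once when the crawl ends: release the connection.
        self.conn.close()

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            for title, link, comment in zip(item["title"], item["link"], item["comment"]):
                cursor.execute(
                    "insert into goods(title, link, comment) values(%s, %s, %s)",
                    (title, link, comment))
        self.conn.commit()
        return item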
Finally, open cmd, change into the project folder, and start the spider with:
scrapy crawl dd
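Once the crawl finishes, you can confirm the rows actually arrived from the MySQL client:

select count(*) from goods;
select title, comment from goods limit 5;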