这些数据并不需要模拟登陆,模拟登陆会在下一篇博客写。
主要步骤:
创建项目scrapy startproject dangdang
使用默认模版创建爬虫scrapy genspider -t basic dd dangdang.com
1。编写items,即需要爬的分类
import scrapy
class DangdangItem(scrapy.Item):
    """Container for one page worth of scraped dangdang.com data."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # book titles extracted from the category page
    link = scrapy.Field()   # detail-page URLs, parallel to `title`
2。开启settings中的pipelines
将注释去除即可
3。编写爬虫文件
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request
class DdSpider(scrapy.Spider):
    """Scrape book titles and links from dangdang.com category listing pages."""
    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://dangdang.com/']
    # Browser-like User-Agent so the site does not reject us as a bot.
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'}

    def start_requests(self):
        # The default start_urls machinery sends no custom headers; issue the
        # first request ourselves so it also carries the browser User-Agent.
        for url in self.start_urls:
            yield Request(url, callback=self.parse, headers=self.header)

    def parse(self, response):
        """Extract (title, link) lists from one page and follow pages 2-10."""
        item = DangdangItem()
        # XPath: every product title anchor on the listing page.
        item['title'] = response.xpath('//p[@name="title"]/a/text()').extract()
        item['link'] = response.xpath('//p[@name="title"]/a/@href').extract()
        yield item
        # Follow pages 2..10 (ten pages total with the first one). The original
        # used range(2, 10), which stopped at page 9 — off-by-one fixed here.
        for page in range(2, 11):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cp01.25.17.00.00.00.html'
            yield Request(url, callback=self.parse, headers=self.header)
4。编写pipelines后续处理
其中需要注意数据的表创建的格式及编码问题
# -*- coding: utf-8 -*-
import mysql.connector
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class DangdangPipeline(object):
    """Persist each scraped (title, link) pair into the MySQL table `dangdang`."""

    def process_item(self, item, spider):
        """Write every title/link pair carried by `item` to the database.

        Returns the item unchanged so later pipelines still receive it.
        """
        db = mysql.connector.connect(host='localhost',
                                     user='root',
                                     passwd='123456',
                                     db='python')  # connect to the database
        try:
            cur = db.cursor()  # database cursor
            try:
                # Parameterized executemany instead of string-concatenated SQL:
                # the original was vulnerable to SQL injection and broke on any
                # title/link containing a single quote.
                rows = list(zip(item['title'], item['link']))
                cur.executemany(
                    "insert into dangdang(book_title,book_link) VALUES (%s,%s)",
                    rows)
                db.commit()  # without commit the rows are not visible in the DB
            finally:
                cur.close()  # close cursor even if an insert fails
        finally:
            db.close()  # always release the connection
        return item
查看数据库