本文使用工具:
cmd pycharm phpstudy/mysql命令行
Xpath表达式
Xpath与正则简单对比
1.Xpath表达式效率更高
2.正则表达式功能更强大
3.一般优先选择Xpath,解决不了再用正则
Xpath提取规则
1. / :逐层提取
2. text() :提取标签下的文本
/html/head/title/text()
3. //标签名 :提取所有名为**的标签
4. //标签名[@属性='属性值'] :提取属性为**的标签
@属性 代表取某个属性
#提取div中<div class="tools">标签的内容
//div[@class='tools']
实例:用scrapy爬取当当网商品信息(名称,链接,评论),并将其存入mysql数据库中
数据库:使用mysql命令行实现
1.用cmd创建scrapy项目文件(略)
2.MySQL命令行
Enter password: //登录密码,初始root
create database dangdang;//创建数据库文件
use dangdang;//使用该数据库文件
create table goods(id int(32) auto_increment primary key ,title varchar(100),link varchar(100) unique,comment varchar(100));//创建goods容器存储信息
scrapy项目中改动部分
1.items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DangdangItem(scrapy.Item):
    """Container for one category page's worth of scraped Dangdang data.

    Each field holds the *list* of values extracted from a single page,
    so index i of every field describes the same product.
    """
    title = scrapy.Field()    # product names
    link = scrapy.Field()     # product detail-page URLs
    comment = scrapy.Field()  # comment/review counts (as displayed text)
2.dd.py
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request
class DdSpider(scrapy.Spider):
    """Scrape product title, link and comment count from a Dangdang category.

    Yields one DangdangItem per page (each field is a parallel list of
    values), then schedules the remaining pages of the category.
    """
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # Page 1 of the target category; pages 2..10 are generated in parse().
    start_urls = ['http://category.dangdang.com/pg1-cid4008149.html']

    def parse(self, response):
        item = DangdangItem()
        # Product anchors are tagged dd_name='单品标题'; the product name is in
        # @title and the detail-page URL in @href of the same anchor.
        item["title"] = response.xpath("//a[@dd_name='单品标题']/@title").extract()
        item["link"] = response.xpath("//a[@dd_name='单品标题']/@href").extract()
        item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
        yield item
        # Crawl the first ten pages: page 1 is start_urls, so continue with
        # pages 2..10 inclusive.  The original range(2, 10) stopped at page 9
        # (off-by-one vs. the stated intent of "first ten pages").
        # Re-yielding these URLs from every page is harmless: Scrapy's
        # duplicate-request filter drops URLs already scheduled.
        for page in range(2, 11):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cid4008149.html'
            yield Request(url, callback=self.parse)
3.pipelines.py
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class DangdangPipeline(object):
    """Persist scraped goods into the `goods` table of the local MySQL
    database `dangdang` (schema: id, title, link UNIQUE, comment)."""

    def process_item(self, item, spider):
        # One connection per item batch; charset='utf8' so Chinese titles
        # round-trip correctly instead of being mangled by the default charset.
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root",
                               db="dangdang", charset="utf8")
        try:
            cursor = conn.cursor()
            # Parameterized statement: the original concatenated raw strings
            # into the SQL, which breaks on quotes in titles and is injectable.
            sql = "insert into goods(title,link,comment) values(%s,%s,%s)"
            # zip() walks the three parallel lists together and stops at the
            # shortest, so a ragged scrape no longer raises IndexError.
            for title, link, comment in zip(item["title"], item["link"],
                                            item["comment"]):
                try:
                    cursor.execute(sql, (title, link, comment))
                except Exception as err:
                    # Best-effort insert: duplicates on the UNIQUE `link`
                    # column (re-crawled pages) are logged, not fatal.
                    print(err)
            # The original used conn.query() with no commit, so inserts were
            # silently discarded whenever autocommit was off.
            conn.commit()
        finally:
            # Always release the connection, even if an insert blows up.
            conn.close()
        return item