前言:通过实例学习了解Scrapy爬虫框架的使用,并把爬取到的数据保存到数据库中和保存成一个Json格式的文件。
项目分析:
项目名:phone 爬虫名:getphone 爬取的网址:http://www.jihaoba.com/escrow/ 集号吧
分析爬取的字段:
//div[@class="numbershow"]/ul
手机号码:
./li[@class="number hmzt"]/a/@href
需要使用正则匹配获取电话号码
re("\\d{11}") # 11位电话号码
价格:
./li[@class="price"]/span/text()
存入数据库需要对人民币符号进行格式处理
replace("¥", "")
归属地:
./li[@class="brand"]/text()
项目流程:
-
创建项目
scrapy startproject phone
-
创建爬虫器
cd phone && scrapy genspider getphone jihaoba.com
-
设计爬虫器
from ..items import PhoneItem class GetphoneSpider(scrapy.Spider): name = 'getphone' allowed_domains = ['jihaoba.com'] start_urls = ['http://www.jihaoba.com/escrow/'] def parse(self, response): # print(response) lists = response.xpath('//div[@class="numbershow"]/ul') for list in lists: phoneitem = PhoneItem() # 电话号码 phoneitem['number'] = list.xpath('li[@class="number hmzt"]/a/@href').re("\\d{11}")[0] # 价格 price = list.xpath('li[@class="price"]/span/text()').extract()[0] phoneitem['price'] = price.replace("¥", "") # 归属地 phoneitem['brand'] = list.xpath('li[@class="brand"]/text()').extract()[0] # print(number, price, brand) yield phoneitem # 继续下一页 # next = "http://www.jihaoba.com" + response.xpath('//a[@class="m-pages-next"]/@href').extract_first() # yield scrapy.Request(next)
-
设置项目 items
class PhoneItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() number = scrapy.Field() price = scrapy.Field() brand = scrapy.Field()
-
设置管道 pipelines
import MySQLdb from .settings import mysql_host, mysql_user, mysql_passwd, mysql_db, mysql_port # 保存到数据库 class putMySQLPipeline: def __init__(self): host = mysql_host user = mysql_user passwd = mysql_passwd db = mysql_db port = mysql_port # 连接数据库 self.connection = MySQLdb.connect(host, user, passwd, db, port, charset='utf8') # 获取游标 self.cursor = self.connection.cursor() def process_item(self, item, spider): sql = "insert into phonetable(number,price,brand) values(%s,%s,%s)" param = list() # 创建列表存放数据 param.append(item['number']) param.append(item['price']) param.append(item['brand']) self.cursor.execute(sql, tuple(param)) # 执行操作,tuple()表示将列表转换为元组 self.connection.commit() # 提交事务 return item def __del__(self): self.cursor.close() self.connection.close() class JsonWithEncodingPipeline(object): def __init__(self): self.file = codecs.open("phone.json", "a", encoding="utf-8") def process_item(self, item, spider): lines = json.dumps(dict(item), ensure_ascii=False) + "\n" # dict() 创建一个字典 self.file.write(lines) return item def __del__(self): self.file.close()
-
设置 settings
# Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html # 管道优先级(数字越小优先级越高) ITEM_PIPELINES = { # 'phone.pipelines.PhonePipeline': 300, 'phone.pipelines.putMySQLPipeline': 300, 'phone.pipelines.JsonWithEncodingPipeline': 400 } mysql_host = '192.168.142.200' # 数据库IP地址 mysql_user = 'root' # 数据库用户名 mysql_passwd = '123456' # 数据库密码 mysql_db = 'db01' # 数据库名 mysql_tb = 'phonetable' # 数据表名 mysql_port = 3306 # 数据库端口号
-
创建 start.py
from scrapy import cmdline cmdline.execute("scrapy crawl getphone".split())
-
数据库
-
建立数据库
create database db01;
-
数据库授权
grant all privileges on db01.* 'root'@'%' identified by '123456';
-
建数据表
create table phone ( number bigint, price int, brand varchar(50) );
-
设置字符集编码
-
查看数据库字符集
show create database db01;
-
修改数据库字符编码
alter database db01 character set utf8;
-
查看数据表字符集
show create table phone;
-
修改数据表字符编码
alter table phone character set utf8;
-
-
-
运行项目
直接运行 start.py