前言:通过实例学习了解Scrapy爬虫框架的使用,并把爬取到的数据保存到数据库中和保存成一个Json格式的文件。
项目分析:
项目名:douban 爬虫器:getmovie 爬取的网址:https://movie.douban.com/chart 豆瓣电影排行榜
分析爬取的字段:
//div[@class="pl2"]
电影名:
./a/span/text()
电影详细介绍链接:
./a/@href
电影评分:
./div/span[@class="rating_nums"]/text()
电影评价人数:
./div/span[@class="pl"]/text()
存入数据库需要对数据进行处理,只保留人数,使用正则匹配
re("\\d+")
项目流程:
-
创建项目
scrapy startproject douban
-
创建爬虫器
cd douban
scrapy genspider getmovie douban.com
-
设计爬虫器
from ..items import DoubanItem


class GetmovieSpider(scrapy.Spider):
    """Spider that scrapes the Douban movie chart (https://movie.douban.com/chart)."""

    name = 'getmovie'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/chart']

    def parse(self, response):
        """Yield one DoubanItem per movie entry on the chart page.

        Each movie lives inside a <div class="pl2"> container; all field
        XPaths below are relative to that container.
        """
        # Renamed loop variable from `list` (shadowed the builtin) to `movie`.
        for movie in response.xpath('//div[@class="pl2"]'):
            item = DoubanItem()
            # Movie title
            item["title"] = movie.xpath('./a/span/text()').extract_first()
            # Link to the movie's detail page
            item["url"] = movie.xpath('./a/@href').extract_first()
            # Average rating
            item["rating"] = movie.xpath(
                './div/span[@class="rating_nums"]/text()').extract_first()
            # Number of ratings: keep only the digits via a regex.
            # .re() returns a list of matches; guard against an empty result
            # instead of indexing [0] blindly (the original raised IndexError
            # whenever a chart entry had no rating count).
            counts = movie.xpath('./div/span[@class="pl"]/text()').re(r"\d+")
            item["commentcount"] = counts[0] if counts else None
            yield item  # hand the item to the configured pipelines
-
设置项目 items
class DoubanItem(scrapy.Item):
    """Container for one movie scraped from the Douban chart page."""

    title = scrapy.Field()         # movie title
    url = scrapy.Field()           # detail-page URL
    rating = scrapy.Field()        # average rating, as scraped (string)
    commentcount = scrapy.Field()  # number of ratings (digits only)
-
设置管道 pipelines
import codecs
import json

import MySQLdb

from .settings import mysql_host, mysql_db, mysql_port, mysql_user, mysql_passwd


# Save items to the database
class MySQLPipeline:
    """Pipeline that inserts each scraped movie into the MySQL `movie` table."""

    def __init__(self):
        # Keyword arguments instead of positionals: MySQLdb.connect()'s
        # positional order (host, user, passwd, db, port) is easy to get
        # wrong, and keywords make the call self-documenting.
        self.connection = MySQLdb.connect(
            host=mysql_host,
            user=mysql_user,
            passwd=mysql_passwd,
            db=mysql_db,
            port=mysql_port,
            charset="utf8",
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes the values, so a title
        # containing quotes cannot break the statement (or inject SQL).
        sql = ("insert into movie (title,url,rating,commentcount) "
               "values(%s,%s,%s,%s)")
        params = (item["title"], item["url"], item["rating"],
                  item["commentcount"])
        try:
            self.cursor.execute(sql, params)
            self.connection.commit()
        except MySQLdb.MySQLError:
            # Roll back so the connection stays usable for the next item,
            # then let Scrapy log the failure.
            self.connection.rollback()
            raise
        return item

    def close_spider(self, spider):
        # Scrapy calls this exactly once when the spider finishes --
        # deterministic cleanup, unlike the original __del__, which the
        # interpreter may run late (or, on interpreter exit, not at all)
        # and which raised AttributeError if connect() had failed.
        self.cursor.close()
        self.connection.close()


# Save items as a JSON file
class JsonWithEncodingPipeline(object):
    """Pipeline that appends every item to doubanmovie.json, one JSON object per line."""

    def __init__(self):
        # Mode "a": append, so repeated runs accumulate rather than overwrite.
        self.file = codecs.open("doubanmovie.json", "a", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese titles human-readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Deterministic close when the spider finishes (see MySQLPipeline).
        self.file.close()
-
设置 settings
# How to find your own User-Agent: inspect any request's headers in the
# browser developer tools.
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Impersonate a desktop Chrome browser so Douban serves the normal page.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/73.0.3683.103 Safari/537.36')

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower number = higher priority: items hit MySQL first, then the JSON file.
ITEM_PIPELINES = {
    # 'douban.pipelines.DoubanPipeline': 300,
    'douban.pipelines.MySQLPipeline': 100,
    'douban.pipelines.JsonWithEncodingPipeline': 200,
}

# MySQL connection settings (imported by pipelines.py).
mysql_host = "192.168.142.200"
mysql_user = "root"
mysql_passwd = "123456"
mysql_db = "db01"
mysql_tb = "movie"
mysql_port = 3306
-
创建 start.py
from scrapy import cmdline

# Launch the spider from inside the IDE instead of typing the
# `scrapy crawl` command in a terminal.
command = 'scrapy crawl getmovie'
cmdline.execute(command.split())
-
数据库
-
建立数据库
-- Create the database that will hold the scraped movie data.
create database db01;
-
数据库授权
-- Allow root to connect from any host ('%') with password '123456',
-- so the pipeline machine can reach this MySQL server remotely.
-- NOTE(review): `GRANT ... IDENTIFIED BY` was removed in MySQL 8.0; on 8.0+
-- run CREATE USER first, then GRANT without the IDENTIFIED BY clause.
grant all privileges on db01.* to 'root'@'%' identified by '123456';
-
建数据表
-- Table that receives one row per scraped movie (see MySQLPipeline).
-- url is widened to varchar(100): Douban detail URLs
-- ("https://movie.douban.com/subject/<id>/") approach 50 characters,
-- so the original varchar(50) left no headroom and risked truncation.
-- Charset is declared up front so Chinese titles store correctly.
create table movie (
    title varchar(50),    -- movie title
    url varchar(100),     -- detail-page URL
    rating float,         -- average rating
    commentcount int      -- number of ratings
) default charset=utf8;
-
设置字符集编码
-
查看数据库的字符集
-- Inspect the database's current default character set.
show create database db01;
-
修改数据库的字符编码
-- Switch the database default charset so new tables store Chinese text.
-- NOTE(review): MySQL's "utf8" is the 3-byte subset; consider utf8mb4 for
-- full Unicode coverage (e.g. emoji in titles).
alter database db01 character set utf8;
-
查看数据表的字符集
-- Inspect the movie table's current character set.
show create table movie;
-
修改数据表的字符编码
-- Change the table's default charset so inserted Chinese text is not mangled.
-- NOTE(review): this only affects new columns; to convert existing column
-- data use ALTER TABLE ... CONVERT TO CHARACTER SET instead.
alter table movie character set utf8;
-
-
-
运行项目
直接运行 start.py