完整爬虫代码
# -*- coding: utf-8 -*-
import scrapy
from spider_douban.items import SpiderDoubanItem
class Douban250Spider(scrapy.Spider):
    """Crawl the Douban Top-250 movie list and yield one item per movie."""
    # Spider name used by `scrapy crawl`; must not clash with the project name.
    name = 'douban250'
    # Off-site requests are filtered against this list.
    allowed_domains = ['movie.douban.com']
    # Entry URL handed to the scheduler.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Default callback: extract every movie on the page, then follow pagination.

        Yields SpiderDoubanItem instances (consumed by the pipelines) and,
        when a next page exists, a scrapy.Request back into this method.
        """
        movie_list = response.xpath('//div[@id="content"]//ol/li')
        for i_item in movie_list:
            douban_item = SpiderDoubanItem()
            # Movie title (first <span> inside the "hd" header block).
            douban_item['movie_name'] = i_item.xpath(
                'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span[1]/text()'
            ).extract_first()
            # Ranking number (1..250) shown on the poster.
            douban_item['serial_number'] = i_item.xpath(
                'div[@class="item"]/div[@class="pic"]/em/text()').extract_first()
            # Movie introduction: join ALL whitespace-normalized lines.
            # (Assigning inside the loop kept only the last line.)
            content = i_item.xpath('.//div[@class="bd"]/p[1]/text()').extract()
            douban_item['introduce'] = ";".join(
                "".join(line.split()) for line in content if line.strip())
            # Star rating (second <span> in the "star" block).
            douban_item['star'] = i_item.xpath(
                './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[2]/text()'
            ).extract_first()
            # Number of ratings; extract_first() may return None, so guard
            # before stripping the "人评价" suffix.
            evalute = i_item.xpath(
                './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()'
            ).extract_first()
            douban_item['evalute'] = evalute.replace('人评价', '') if evalute else evalute
            # Short one-line description (the quote under the rating).
            douban_item['describe'] = i_item.xpath(
                './div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
            ).extract_first()
            # Without yield the item never reaches the pipelines.
            yield douban_item
        # Pagination. NOTE(review): this relies on a <link> tag inside
        # span.next — verify against the current page markup (an <a> tag
        # also carries the same relative href).
        next_link = response.xpath('//span[@class="next"]/link/@href').extract_first()
        if next_link:
            # First argument: absolute next-page URL; callback re-enters parse.
            yield scrapy.Request('https://movie.douban.com/top250' + next_link,
                                 callback=self.parse)
保存数据
scrapy保存信息的最简单的方法主要有四种,即用 -o 输出指定格式的文件,命令如下:
json格式,默认为Unicode编码
scrapy crawl douban250 -o douban.json
json lines格式,默认为Unicode编码
scrapy crawl douban250 -o douban.jsonl
csv 逗号表达式,可用Excel打开,注意编码方式
scrapy crawl douban250 -o douban.csv
xml格式
scrapy crawl douban250 -o douban.xml
将爬取的数据存入mysql数据库
新建MySQLPipeline.py
import pymysql.cursors
class MySQLPipeline(object):
    """Persist scraped Douban items into the local MySQL `movie` table."""

    def __init__(self):
        # Open the database connection once when the pipeline is created.
        self.connect = pymysql.connect(
            host='127.0.0.1',      # database host
            port=3306,             # database port
            db='studydatabase1',   # database name
            user='root',           # user name
            passwd='root',         # password
            charset='utf8',        # connection encoding
            use_unicode=True)
        # Cursor shared by every insert performed by this pipeline.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item; placeholders map 1:1 to the `movie` columns."""
        try:
            self.cursor.execute(
                # `values` is the standard spelling; null lets AUTO_INCREMENT
                # fill the id column.
                "insert into movie values (null , %s, %s, %s, %s, %s, %s)",
                (item['serial_number'], item['movie_name'], item['introduce'],
                 item['star'], item['evalute'], item['describe']))
            # Commit per item so a crash loses at most the current row.
            self.connect.commit()
        except Exception:
            # Undo the failed statement so later items can still be stored.
            self.connect.rollback()
            raise
        # Must return the item so later pipelines can process it too.
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the crawl finishes."""
        self.cursor.close()
        self.connect.close()
打开settings.py 将我们自己编写的pipline进行注册
# Register the custom pipeline with Scrapy; the integer (0-1000) is the
# execution order — lower numbers run earlier.
ITEM_PIPELINES = {
    'spider_douban.MySQLPipeline.MySQLPipeline': 300,
}
继续执行main.py
from scrapy import cmdline

if __name__ == '__main__':
    # Launch the spider exactly as if the command were typed in a shell.
    # The __main__ guard prevents the crawl from firing on a mere import.
    cmdline.execute('scrapy crawl douban250 -o douban.csv'.split())
表结构
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for movie
-- ----------------------------
DROP TABLE IF EXISTS `movie`;
CREATE TABLE `movie` (
  `id` int(20) NOT NULL AUTO_INCREMENT,
  `serial_number` int(10) DEFAULT NULL,
  `movie_name` varchar(100) DEFAULT NULL,
  -- Widened: the joined introduction text can exceed 100 characters.
  `introduce` varchar(255) DEFAULT NULL,
  -- One decimal place: ratings such as 9.6 would be rounded to 10
  -- by double(10,0), which keeps zero decimals.
  `star` double(10,1) DEFAULT NULL,
  `evalute` int(20) DEFAULT NULL,
  `describe` varchar(100) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=501 DEFAULT CHARSET=utf8;
SQLAlchemy
是Python编程语言下的一款开源软件。提供了SQL工具包及对象关系映射(ORM)
class User(object):
    """Plain demo object used to introduce attribute access before ORM models."""

    username = "真的帅"
    real_name = "李易峰"
    sex = "男"

    def address_list(self):
        """Return a hard-coded list of address records for this user."""
        records = []
        for idx, area in ((1, '详细地址1'), (2, '详细地址2')):
            records.append({'id': idx, 'area': area})
        return records
# Show the module name: '__main__' when run directly, the module path on import.
print(__name__)
if __name__ == '__main__':
    # Demonstrate plain attribute and method access on the demo class.
    user = User()
    print(user.real_name)
    print(user.username)
    print(user.address_list())
安装
pip install sqlalchemy
使用sqlalchemy建表
from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base
from sqlalchemy import Column, Integer, String, DateTime

# Step 1: prepare the connection.
# URL format: mysql://user:password@host:port/database?charset=utf8
engine = create_engine('mysql://root:root@127.0.0.1:3306/studydatabase1?charset=utf8',
                       echo=True)  # echo=True logs every emitted SQL statement
print(engine)
# Step 2: declare the common base class for all ORM models.
Base = declarative_base()
class Student(Base):
    """ Student information table. """
    __tablename__ = 'student'

    # Surrogate primary key, stored in the `stu_id` column.
    id = Column('stu_id', Integer, primary_key=True)
    stu_no = Column(Integer, nullable=False, comment='学号')
    stu_name = Column(String(16), nullable=False, comment='姓名')
    created_at = Column(DateTime, doc='注册的时间')
def create_table():
    """Emit CREATE TABLE for every model registered on Base."""
    Base.metadata.create_all(engine)


if __name__ == '__main__':
    create_table()
sqlalchemy表的关联创建
db_engine.py
from sqlalchemy import create_engine

# Shared engine for the demo models; echo=True logs every emitted SQL statement.
engine = create_engine('mysql://root:root@127.0.0.1:3306/studydatabase1?charset=utf8', echo=True)
user_model.py
"""
用户地址信息的ORM模型
"""
from datetime import datetime
from enum import IntEnum
from sqlalchemy.types import CHAR
from sqlalchemy.dialects.mysql import TINYINT
from sqlalchemy import Column, Integer, String, Enum, SmallInteger, DateTime, Boolean, ForeignKey
from sqlalchemy.orm import declarative_base, relationship, backref
from db_engine import engine
Base = declarative_base()
class SexEnum(IntEnum):
    """Integer sex codes persisted in the users table."""

    MAN = 1    # male
    WOMEN = 2  # female
class User(Base):
    """ User account table (`account_user`). """
    __tablename__ = 'account_user'

    id = Column('id', type_=Integer(), primary_key=True)
    username = Column(String(32), nullable=False, unique=True, comment='用户名')
    password = Column(String(512), nullable=False, comment='密码')
    real_name = Column(String(16), comment='真实姓名')
    sex = Column(Enum(SexEnum), default=None, comment='性别')
    age = Column(TINYINT(unsigned=True), default=0, comment='年龄')
    # Pass the callable, NOT its result: datetime.now() would be evaluated
    # once at class-definition time, stamping every row with the same value.
    created_at = Column(DateTime, default=datetime.now, comment='创建的时间')
    is_valid = Column(Boolean, default=True, comment='是否有效')

    # One-to-one link to the user's profile (uselist=False makes it scalar);
    # the backref also exposes `.user` on UserProfile instances.
    profile = relationship('UserProfile', backref='user', uselist=False)
class UserAddress(Base):
    """ User address table (`account_user_address`). """
    __tablename__ = 'account_user_address'

    id = Column(Integer, primary_key=True)
    area = Column(String(256), nullable=False, comment='地址信息')
    # Fixed-width CHAR(11) chosen over VARCHAR for the phone number.
    phone_no = Column(CHAR(11), comment='电话号码')
    remark = Column(String(512), comment='备注信息')
    is_valid = Column(Boolean, default=True)
    # Pass the callable so each inserted row gets its own timestamp;
    # datetime.now() would freeze a single value at import time.
    created_at = Column(DateTime, default=datetime.now)
    # Equivalent to ForeignKey('account_user.id').
    user_id = Column(Integer, ForeignKey(User.id), nullable=False, comment='关联的用户ID')
    # Many-to-one back to User; the backref exposes `.addresses` on User.
    user = relationship('User', backref='addresses')
class UserProfile(Base):
    """ One-to-one extension table with additional user details. """
    __tablename__ = 'account_user_profile'
    id = Column(Integer, primary_key=True)
    hobby = Column(String(255), comment='用户的爱好')
    # Foreign key back to the owning account_user row.
    user_id = Column(Integer, ForeignKey(User.id), nullable=False, comment='关联的用户ID')
    # The one-to-one relationship is already declared on User.profile via its
    # backref; also defining it here would clash with that backref.
    # user = relationship('User', backref=backref('profile', uselist=False))
def create_table():
    """Synchronize the database: create all tables declared on Base."""
    Base.metadata.create_all(engine)


if __name__ == '__main__':
    create_table()
-- List every user together with their addresses. An explicit INNER JOIN is
-- equivalent to the implicit comma join but makes the join condition obvious.
SELECT *
FROM account_user AS au
INNER JOIN account_user_address AS aua
        ON au.id = aua.user_id;