Python Web Scraping in Practice | (10) Scraping Maoyan Reviews of Aquaman into a MySQL Database

In part (6) of this series we already scraped reviews of Aquaman. The fetching and parsing steps in this post are exactly the same; the only difference is how the data is stored. Last time we wrote the results to files (CSV, TXT, JSON, Excel, and so on); this time we store the extracted reviews in a MySQL database. Here is the review API endpoint:

http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00

Here http://m.maoyan.com is Maoyan's mobile site, followed by the API path: 249342 is the movie ID for Aquaman, and startTime is the timestamp (year-month-day hour:minute:second) from which comments are returned, with %20 being the URL-encoded space.
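
Incidentally, instead of replacing the space by hand, the timestamp can be percent-encoded with the standard library. A minimal sketch, assuming the API accepts any standard percent-encoded timestamp:

from urllib.parse import quote

start_time = '2019-01-01 00:00:00'
# quote() encodes the space as %20; safe=':' keeps the colons readable
url = ('http://m.maoyan.com/mmdb/comments/movie/249342.json'
       '?_v_=yes&offset=0&startTime=' + quote(start_time, safe=':'))
print(url)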

The API returns JSON: a series of key-value pairs, where each element of the top-level cmts array is one review. The fields we care about are cityName, content, id, score, and startTime.
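
For orientation, one entry of cmts looks roughly like this (all values below are invented for illustration; the real objects carry many more fields):

{
    "id": 1044519000,
    "cityName": "北京",
    "content": "特效很棒,剧情也不错!",
    "score": 4.5,
    "startTime": "2018-12-31 23:59:12"
}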

First, set up the skeleton of the scraper:

import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql

def get_page(url):
    pass

def parse_page(html):
    pass


def write_tofile(comments):
    pass


def create_db():
    pass

if __name__ == '__main__':
    create_db() # create the MySQL database and table
    '''
    Aquaman review API:
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''

    movie_id = '249342' # Maoyan movie id for Aquaman
    start_time = '2019-01-01 00:00:00' # scrape backwards from this timestamp
    end_time = '2018-01-01 00:00:00' # stop once we reach this timestamp
    # zero-padded 'YYYY-MM-DD HH:MM:SS' strings compare correctly as plain strings
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/'+movie_id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20') # encode the space as %20

        # send the request and get the response
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5) # on failure, wait 0.5s before retrying, to avoid hammering the server
            html = get_page(url)
        else:
            time.sleep(0.1) # on success, still pause 0.1s between requests
        if not html:
            break # non-200 response, stop
        # parse the response body
        comments = parse_page(html)
        if not comments:
            break # no comments returned, stop
        # take the timestamp of the last comment on this page
        start_time = comments[-1]['startTime']
        # convert from str to datetime and subtract 1s so the next page does not repeat these comments
        start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        # convert back to a string
        start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
        # save this batch
        write_tofile(comments)

We mainly want the year of reviews from 2018 into 2019, so the loop pages backwards in time: each request fetches one page of comments (the endpoint returns up to 15 per request), and startTime is then moved to one second before the last comment on that page, so consecutive requests don't return duplicates.

Connect to MySQL and create the database and table:

def create_db():
    # create the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    db.close()
    # connect to the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments") # drop the comments table if it exists, so each run starts fresh
    # create the comments table and its columns
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id INT NOT NULL AUTO_INCREMENT, uid VARCHAR(255), cityName VARCHAR(255), content VARCHAR(255), score FLOAT, startTime DATETIME, PRIMARY KEY (id))')
    db.close()
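
To sanity-check the schema, you can describe the table from Python. A minimal sketch, assuming the same local root/no-password setup as above:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute('DESCRIBE comments') # list column names, types and keys
for row in cursor.fetchall():
    print(row)
db.close()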

Send the request and get the response, in the get_page(url) function:

def get_page(url):

    # put a User-Agent in the headers to masquerade as an ordinary browser
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        response.encoding = 'utf-8' # the API serves UTF-8 encoded JSON
        return response.text
    return None
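
As a quick smoke test, you can call it with the sample endpoint from above and peek at the raw JSON:

url = ('http://m.maoyan.com/mmdb/comments/movie/249342.json'
       '?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00')
html = get_page(url)
print(html[:200] if html else 'request failed') # first 200 characters of the response body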

Parse the response. In earlier posts we used regular expressions to pick data out of HTML; here the body is JSON, which is far easier: once decoded it is just nested dicts and lists that can be indexed like ordinary Python objects:

def parse_page(html):
    data = json.loads(html)['cmts'] # decode the JSON string; 'cmts' is the top-level key holding the list of comments
    comments = []
    for item in data:
        # field order matters: write_tofile passes the dict values positionally into the INSERT statement
        comment = {
            'id':item['id'],
            'cityName':item['cityName'] if 'cityName' in item else '',
            'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' '), # flatten any line breaks inside the comment text
            'score':item['score'],
            'startTime':item['startTime']
        }
        comments.append(comment)
    return comments
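
Fed a made-up response in the same shape as the API's (note the embedded newline being flattened), parse_page produces a list of flat dicts:

sample = '{"cmts": [{"id": 1044519000, "cityName": "北京", "content": "特效很棒\\n剧情也不错", "score": 4.5, "startTime": "2018-12-31 23:59:12"}]}'
print(parse_page(sample))
# [{'id': 1044519000, 'cityName': '北京', 'content': '特效很棒 剧情也不错', 'score': 4.5, 'startTime': '2018-12-31 23:59:12'}]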

Store the data in the MySQL database:

def write_tofile(comments):
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan') # connect to the maoyan database
    cursor = db.cursor()
    # parameterized INSERT: pymysql fills in and escapes the %s placeholders
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    for item in comments:
        try:
            cursor.execute(sql,list(item.values())) # dict values line up with the columns above (dicts preserve insertion order)
            db.commit()
        except Exception:
            db.rollback() # undo the failed insert and keep going
    db.close()
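
Committing per row is simple but slow. As an alternative sketch, pymysql's executemany can insert a whole page in one round trip; the hypothetical write_tofile_batch below (not part of the original post) trades per-row recovery for speed, since a single bad row rolls back the entire batch:

def write_tofile_batch(comments): # hypothetical batch variant
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    try:
        cursor.executemany(sql, [list(item.values()) for item in comments])
        db.commit()
    except Exception:
        db.rollback()
    finally:
        db.close()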

Running the scraper: first make sure MySQL is installed, along with the GUI management tool Navicat for browsing the results.
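
You can inspect the comments table in Navicat, or check it directly from Python. A minimal sketch:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute('SELECT COUNT(*) FROM comments')
print('rows stored:', cursor.fetchone()[0]) # total number of saved reviews
cursor.execute('SELECT uid, cityName, score, startTime FROM comments LIMIT 5')
for row in cursor.fetchall():
    print(row)
db.close()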

Complete code:

import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql

def get_page(url):

    # put a User-Agent in the headers to masquerade as an ordinary browser
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        response.encoding = 'utf-8' # the API serves UTF-8 encoded JSON
        return response.text
    return None

def parse_page(html):
    data = json.loads(html)['cmts'] # decode the JSON string; 'cmts' is the top-level key holding the list of comments
    comments = []
    for item in data:
        # field order matters: write_tofile passes the dict values positionally into the INSERT statement
        comment = {
            'id':item['id'],
            'cityName':item['cityName'] if 'cityName' in item else '',
            'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' '), # flatten any line breaks inside the comment text
            'score':item['score'],
            'startTime':item['startTime']
        }
        comments.append(comment)
    return comments


def write_tofile(comments):
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan') # connect to the maoyan database
    cursor = db.cursor()
    # parameterized INSERT: pymysql fills in and escapes the %s placeholders
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    for item in comments:
        try:
            cursor.execute(sql,list(item.values())) # dict values line up with the columns above (dicts preserve insertion order)
            db.commit()
        except Exception:
            db.rollback() # undo the failed insert and keep going
    db.close()


def create_db():
    # create the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    db.close()
    # connect to the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments") # drop the comments table if it exists, so each run starts fresh
    # create the comments table and its columns
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id INT NOT NULL AUTO_INCREMENT, uid VARCHAR(255), cityName VARCHAR(255), content VARCHAR(255), score FLOAT, startTime DATETIME, PRIMARY KEY (id))')
    db.close()


if __name__ == '__main__':
    create_db() # create the MySQL database and table
    '''
    Aquaman review API:
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''

    movie_id = '249342' # Maoyan movie id for Aquaman
    start_time = '2019-01-01 00:00:00' # scrape backwards from this timestamp
    end_time = '2018-01-01 00:00:00' # stop once we reach this timestamp
    # zero-padded 'YYYY-MM-DD HH:MM:SS' strings compare correctly as plain strings
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/'+movie_id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20') # encode the space as %20

        # send the request and get the response
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5) # on failure, wait 0.5s before retrying, to avoid hammering the server
            html = get_page(url)
        else:
            time.sleep(0.1) # on success, still pause 0.1s between requests
        if not html:
            break # non-200 response, stop
        # parse the response body
        comments = parse_page(html)
        if not comments:
            break # no comments returned, stop
        # take the timestamp of the last comment on this page
        start_time = comments[-1]['startTime']
        # convert from str to datetime and subtract 1s so the next page does not repeat these comments
        start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        # convert back to a string
        start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
        # save this batch
        write_tofile(comments)
