之前在Python爬虫实战(6)中我们曾爬取过《海王》影评,本篇博客的爬取解析过程和之前是完全一样的,唯一不同的是数据存储方式,之前是存储到文件中(csv,txt,json,excel等),这次我们将提取的影评存储到MySql数据库中。下面是海王影评接口:
http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00
其中http://m.maoyan.com是猫眼电影的网址,后面是一个路径,249342是电影《海王》的id,startTime是评论的起始时间,年月日时分秒,%20代表空格。
接口的数据是json格式,一系列的键值对,上图阴影部分是一条影评 。我们关心的字段有cityName,content,id,score,startTime。
首先搭建爬虫主体框架:
import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql
def get_page(url):
    """Stub: fetch *url* and return the response body (filled in below)."""
def parse_page(html):
    """Stub: extract comment records from the JSON body (filled in below)."""
def write_tofile(comments):
    """Stub: persist parsed comments to MySQL (filled in below)."""
def create_db():
    """Stub: create the MySQL database and table (filled in below)."""
if __name__ == '__main__':
    create_db()  # create the MySQL database and the comments table

    '''
    Comment API endpoint for "Aquaman":
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''
    movie_id = '249342'                  # Maoyan id of the movie (avoid shadowing builtin `id`)
    start_time = '2019-01-01 00:00:00'   # crawl backwards starting here
    end_time = '2018-01-01 00:00:00'     # ... down to this timestamp

    # String comparison is valid because both timestamps use the fixed
    # '%Y-%m-%d %H:%M:%S' format, which sorts lexicographically.
    while start_time > end_time:
        url = ('http://m.maoyan.com/mmdb/comments/movie/' + movie_id
               + '.json?_v_=yes&offset=0&startTime='
               + start_time.replace(' ', '%20'))  # URL-encode the space
        # Fetch the page; on failure back off 0.5 s and retry once,
        # on success throttle with a 0.1 s pause.
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5)
            html = get_page(url)
        else:
            time.sleep(0.1)
        # Parse the JSON body into a list of comment dicts.
        comments = parse_page(html)
        if not comments:
            # Empty page: nothing left to paginate from -- stop instead of
            # crashing on an out-of-range index.
            break
        # Seed the next request with the oldest comment on this page.
        # Using [-1] instead of a hard-coded [14] avoids IndexError when
        # the API returns fewer than 15 comments.
        start_time = comments[-1]['startTime']
        # Step back 1 second so the boundary comment is not re-fetched.
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
        # Persist this page of comments.
        write_tofile(comments)
我们主要爬取18年到19年这一年的影评,所以我们从起始时间开始,每隔1s刷新一下页面,进行爬取。
连接mysql,创建数据库和表:
def create_db():
    """(Re)create the ``maoyan`` database and its ``comments`` table.

    Drops any existing ``comments`` table so every crawl starts clean.
    Requires a local MySQL server reachable as root without a password.
    """
    # First connection has no default schema: used only to create the DB.
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    db.close()  # close the schema-less connection instead of leaking it

    # Reconnect with `maoyan` as the default schema to build the table.
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments")  # drop just the comments table
    # `id` is a surrogate auto-increment key; the movie-site comment id is
    # stored in `uid` (see write_tofile's column order).
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255) , content VARCHAR(255), score float ,startTime datetime, PRIMARY KEY (id))')
    db.close()
发送请求,获取响应,编写get_page(url)函数:
def get_page(url):
    """GET *url* with a browser User-Agent.

    Returns the decoded response body on HTTP 200, otherwise None.
    Raises requests exceptions on network errors (caller retries).
    """
    # Pretend to be a desktop browser so the API serves the JSON.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Without a timeout requests can block forever on a stalled connection,
    # freezing the whole crawl loop; 10 s is generous for this API.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        # Let requests guess the charset from the body content.
        response.encoding = response.apparent_encoding
        return response.text
    return None
解析响应,之前都是用正则表达式来解析HTML代码,现在是json格式,很容易解析,因为其内部都是一些键值对,可以像字典一样访问:
def parse_page(html):
    """Extract the comment list from the Maoyan JSON payload.

    Returns a list of dicts with keys id, cityName, content, score,
    startTime. Missing cityName becomes ''. Embedded newlines and tabs
    in the comment text are flattened to single spaces.
    """
    data = json.loads(html)['cmts']  # 'cmts' is the top-level comment array
    comments = []
    for item in data:
        comment = {
            'id': item['id'],
            'cityName': item['cityName'] if 'cityName' in item else '',
            # Flatten line breaks so one comment stays one CSV/DB row.
            # NOTE: the original line was missing the trailing comma,
            # which made this function a SyntaxError.
            'content': item['content'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' '),
            'score': item['score'],
            'startTime': item['startTime']
        }
        comments.append(comment)
    return comments
将数据存储到Mysql数据库:
def write_tofile(comments):
    """Insert parsed comment dicts into the maoyan.comments table.

    Relies on dict insertion order matching the column list
    (uid, cityName, content, score, startTime) -- parse_page builds the
    dicts in exactly that order.
    """
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    try:
        for item in comments:
            try:
                # Parameterized query: pymysql escapes the values, so the
                # comment text cannot inject SQL.
                cursor.execute(sql, list(item.values()))
                db.commit()
            except pymysql.MySQLError:
                # Best-effort insert: skip the bad row, but do not swallow
                # KeyboardInterrupt or programming errors like a bare
                # `except:` would.
                db.rollback()
    finally:
        db.close()  # always release the connection, even on failure
爬取效果,首先确保已经安装好MySql数据库和可视化管理工具Navicat:
完整代码:
import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql
def get_page(url):
    """GET *url* with a browser User-Agent.

    Returns the body decoded as UTF-8 on HTTP 200, otherwise None.
    Raises requests exceptions on network errors (caller retries).
    """
    # Pretend to be a desktop browser so the API serves the JSON.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Without a timeout requests can block forever on a stalled connection,
    # freezing the whole crawl loop; 10 s is generous for this API.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        response.encoding = 'utf-8'  # the API serves UTF-8 JSON
        return response.text
    return None
def parse_page(html):
    """Parse the Maoyan comment JSON into a list of comment dicts.

    Each dict carries id, cityName ('' when absent), content (with line
    breaks and tabs flattened to spaces), score and startTime.
    """
    raw_comments = json.loads(html)['cmts']  # top-level key of the payload
    parsed = []
    for entry in raw_comments:
        text = entry['content']
        # One comment must stay one row: squash line breaks and tabs.
        for ch in ('\n', '\r', '\t'):
            text = text.replace(ch, ' ')
        parsed.append({
            'id': entry['id'],
            'cityName': entry['cityName'] if 'cityName' in entry else '',
            'content': text,
            'score': entry['score'],
            'startTime': entry['startTime'],
        })
    return parsed
def write_tofile(comments):
    """Insert parsed comment dicts into the maoyan.comments table.

    Relies on dict insertion order matching the column list
    (uid, cityName, content, score, startTime) -- parse_page builds the
    dicts in exactly that order.
    """
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    try:
        for item in comments:
            try:
                # Parameterized query: pymysql escapes the values, so the
                # comment text cannot inject SQL.
                cursor.execute(sql, list(item.values()))
                db.commit()
            except pymysql.MySQLError:
                # Best-effort insert: skip the bad row, but do not swallow
                # KeyboardInterrupt or programming errors like a bare
                # `except:` would.
                db.rollback()
    finally:
        db.close()  # always release the connection, even on failure
def create_db():
    """(Re)create the ``maoyan`` database and its ``comments`` table.

    Drops any existing ``comments`` table so every crawl starts clean.
    Requires a local MySQL server reachable as root without a password.
    """
    # First connection has no default schema: used only to create the DB.
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    db.close()  # close the schema-less connection instead of leaking it

    # Reconnect with `maoyan` as the default schema to build the table.
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments")  # drop just the comments table
    # `id` is a surrogate auto-increment key; the movie-site comment id is
    # stored in `uid` (see write_tofile's column order).
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255) , content VARCHAR(255), score float ,startTime datetime, PRIMARY KEY (id))')
    db.close()
if __name__ == '__main__':
    create_db()  # create the MySQL database and the comments table

    '''
    Comment API endpoint for "Aquaman":
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''
    movie_id = '249342'                  # Maoyan id of the movie (avoid shadowing builtin `id`)
    start_time = '2019-01-01 00:00:00'   # crawl backwards starting here
    end_time = '2018-01-01 00:00:00'     # ... down to this timestamp

    # String comparison is valid because both timestamps use the fixed
    # '%Y-%m-%d %H:%M:%S' format, which sorts lexicographically.
    while start_time > end_time:
        url = ('http://m.maoyan.com/mmdb/comments/movie/' + movie_id
               + '.json?_v_=yes&offset=0&startTime='
               + start_time.replace(' ', '%20'))  # URL-encode the space
        # Fetch the page; on failure back off 0.5 s and retry once,
        # on success throttle with a 0.1 s pause.
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5)
            html = get_page(url)
        else:
            time.sleep(0.1)
        # Parse the JSON body into a list of comment dicts.
        comments = parse_page(html)
        if not comments:
            # Empty page: nothing left to paginate from -- stop instead of
            # crashing on an out-of-range index.
            break
        # Seed the next request with the oldest comment on this page.
        # Using [-1] instead of a hard-coded [14] avoids IndexError when
        # the API returns fewer than 15 comments.
        start_time = comments[-1]['startTime']
        # Step back 1 second so the boundary comment is not re-fetched.
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
        # Persist this page of comments.
        write_tofile(comments)