Maoyan Movies - Rankings - Top 100 List
Fields: movie title, starring actors, release date
Data Scraping Implementation
- 1. Confirm the required data exists in the response content
Right-click - View Page Source - search for a keyword - it is there!
- 2. Find the URL pattern
Page 1: https://maoyan.com/board/4?offset=0
Page 2: https://maoyan.com/board/4?offset=10
Page n: offset=(n-1)*10
- 3. Regular expression (sanity-checked in the sketch just after this list)
<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>
- 4. Write the program skeleton, then flesh it out
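Before filling in the class below, the URL rule and the regular expression can be sanity-checked in isolation. This is a minimal sketch; the HTML fragment is made up purely to show the tuple structure that findall() returns, it is not real page source.
import re

# Generate the first three list-page URLs from the offset rule above
base_url = 'https://maoyan.com/board/4?offset={}'
for n in range(1, 4):                      # page number n
    print(base_url.format((n - 1) * 10))   # offset = (n-1)*10

# Hypothetical fragment mimicking the list-page markup (assumption, for testing only)
sample = '''<div class="movie-item-info">
<a href="/films/1203" title="霸王别姬"></a>
<p class="star">主演:张国荣</p>
<p class="releasetime">上映时间:1993-01-01</p>'''

re_bds = r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
pattern = re.compile(re_bds, re.S)
print(pattern.findall(sample))
# [('霸王别姬', '主演:张国荣', '上映时间:1993-01-01')]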
from urllib import request
import re
import time
import random
from useragents import ua_list

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter for scraped records
        self.num = 0

    # Fetch the page
    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # Call the parsing function directly
        self.parse_html(html)

    # Parse the page
    def parse_html(self, html):
        re_bds = r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
        pattern = re.compile(re_bds, re.S)
        # film_list: [('霸王别姬','张国荣','1993'), ...]
        film_list = pattern.findall(html)
        # Call the output function directly
        self.write_html(film_list)

    def write_html(self, film_list):
        item = {}
        for film in film_list:
            item['name'] = film[0].strip()
            item['star'] = film[1].strip()
            item['time'] = film[2].strip()[5:15]
            print(item)
            self.num += 1

    def main(self):
        for offset in range(0, 31, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('Total records scraped:', self.num)

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
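The useragents module imported above is a local helper, not a package from PyPI. A minimal sketch of what it might contain (the User-Agent strings here are only examples; any list of real browser strings will do):
# useragents.py - assumed local helper providing a pool of User-Agent strings
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
]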
Data Persistence
Data Persistence - CSV Files
Store the scraped data in a local CSV file
1. Import the module
2. Open the CSV file
3. Create the writer object
4. Write the data (the argument is a list)
import csv
with open('film.csv','w') as f:
    writer = csv.writer(f)
    writer.writerow([])
Create a test.csv file and write data into it
# Single-row write: writerow([])
import csv
with open('test.csv','w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['步惊云','36'])
    writer.writerow(['超哥哥','25'])
# Multi-row write: writerows([(),(),()])
import csv
with open('test.csv','w',newline='') as f:
    writer = csv.writer(f)
    writer.writerows([('聂风','36'),('秦霜','25'),('孔慈','30')])
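To confirm the rows actually landed in the file, test.csv can be read back with csv.reader from the same module; a minimal sketch, assuming the file written above:
import csv

with open('test.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)   # e.g. ['聂风', '36']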
Store the Maoyan movie data in a local maoyanfilm.csv file - implemented with writerow()
# Save to a CSV file - writerow()
def write_html(self, film_list):
    with open('film.csv', 'a') as f:
        # Create the writer object - don't forget to pass f
        writer = csv.writer(f)
        for film in film_list:
            L = [
                film[0].strip(),
                film[1].strip(),
                film[2].strip()[5:15]
            ]
            # writerow() takes a list
            writer.writerow(L)
Exercise: how would you implement this with writerows()?
# Save to a CSV file - writerows()
def write_html(self, film_list):
    L = []
    with open('film.csv', 'a') as f:
        # Create the writer object - don't forget to pass f
        writer = csv.writer(f)
        for film in film_list:
            t = (
                film[0].strip(),
                film[1].strip(),
                film[2].strip()[5:15]
            )
            L.append(t)
        # writerows() takes a list of tuples
        writer.writerows(L)
Data Persistence - MySQL Database
1. Create the database and table
# Connect to the MySQL server
mysql -h127.0.0.1 -uroot -p123456
# Create the database and table
create database maoyandb charset utf8;
use maoyandb;
create table filmtab(
    name varchar(100),
    star varchar(300),
    time varchar(50)
)charset=utf8;
- 2. Review basic pymysql usage
import pymysql
# Create the two objects: connection and cursor
db = pymysql.connect('localhost','root','123456','maoyandb',charset='utf8')
cursor = db.cursor()
# Execute the SQL statement and commit it to the database
# The second argument of execute() is a list that fills the %s placeholders
ins = 'insert into filmtab values(%s,%s,%s)'
cursor.execute(ins,['霸王别姬','张国荣','1993'])
db.commit()
# Close the cursor and the connection
cursor.close()
db.close()
- Now try the more efficient executemany() method
import pymysql
# Create the two objects: connection and cursor
db = pymysql.connect('192.168.153.137','tiger','123456','maoyandb',charset='utf8')
cursor = db.cursor()
# Scraped data
film_list = [('月光宝盒','周星驰','1994'),('大圣娶亲','周星驰','1994')]
# Execute the SQL statement and commit it to the database
# executemany() takes a sequence of tuples that fill the %s placeholders
cursor.executemany('insert into filmtab values(%s,%s,%s)',film_list)
db.commit()
# Close the cursor and the connection
cursor.close()
db.close()
- 3. Store the movie data in the MySQL database (prefer the executemany() method)
# mysql - executemany([(),(),()])
def write_html(self, film_list):
    L = []
    ins = 'insert into filmtab values(%s,%s,%s)'
    for film in film_list:
        t = (
            film[0].strip(),
            film[1].strip(),
            film[2].strip()[5:15]
        )
        L.append(t)
    self.cursor.executemany(ins, L)
    # Don't forget to commit so the inserts actually reach the database
    self.db.commit()
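The method above assumes the spider already holds self.db and self.cursor. A minimal sketch of how __init__ and the teardown might set them up (host, user and password are assumptions; adjust to your own setup):
import pymysql

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Assumed connection parameters - replace with your own
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='maoyandb',
                                  charset='utf8')
        # Cursor object used by write_html() above
        self.cursor = self.db.cursor()

    def close_db(self):
        # Call once at the end of the run, after all pages have been scraped
        self.cursor.close()
        self.db.close()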
- 4. Run a few SQL queries
1. Title and release date of movies released more than 20 years ago
select name,time from filmtab where time<(now()-interval 20 year);
2. Title and release date of movies released between 1990 and 2000
select name,time from filmtab where time>='1990-01-01' and time<='2000-12-31';
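The same queries can also be run from Python with the cursor's fetchall() method; a minimal sketch reusing the connection style from above (connection parameters are assumptions):
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()
# Movies released between 1990 and 2000
cursor.execute(
    "select name,time from filmtab where time>='1990-01-01' and time<='2000-12-31'")
for name, rtime in cursor.fetchall():
    print(name, rtime)
cursor.close()
db.close()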
Data Persistence - MongoDB Database
Using pymongo to work with MongoDB
import pymongo
# 1. Connection object
conn = pymongo.MongoClient('localhost', 27017)
# 2. Database object
db = conn['db_name']
# 3. Collection object
myset = db['collection_name']
# 4. Insert data - the argument is a dict
myset.insert_one({'key': 'value'})
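Following the same pattern as the MySQL version, a write_html() storing the Maoyan records in MongoDB might look like this. A minimal sketch; the database name 'maoyandb' and collection name 'filmtab' are assumptions, not part of the original notes.
import pymongo

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Connection, database and collection objects (names are assumptions)
        self.conn = pymongo.MongoClient('localhost', 27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmtab']

    def write_html(self, film_list):
        for film in film_list:
            film_dict = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                'time': film[2].strip()[5:15],
            }
            # insert_one() adds an _id to the dict, so build a fresh dict each loop
            self.myset.insert_one(film_dict)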
Food for Thought
1. Can you follow the link to each movie's detail page and scrape its comments?
2. Can you also scrape the movie poster images from the detail page, creating a separate folder for each movie title?
Code Implementation
from urllib import request
import re
import time
import random
from useragents import ua_list
import os

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter for scraped records
        self.num = 0

    # Fetch the page
    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8', 'ignore')
        return html

    def re_func(self, re_bds, html):
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        return r_list

    # Parse the list page
    def parse_html(self, url):
        re_bds = r'<div class="movie-item-info">.*?href="(.*?)".*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
        # Fetch the HTML, then run the regex over it
        html = self.get_html(url)
        film_list = self.re_func(re_bds, html)
        # Call the output function directly
        self.write_html(film_list)

    def write_html(self, film_list):
        film_dict = {}
        for film in film_list:
            film_dict['name'] = film[1].strip()
            film_dict['star'] = film[2].strip()
            film_dict['time'] = film[3].strip()[5:15]
            # Build the detail-page URL from the relative href
            two_url = 'https://maoyan.com{}'.format(film[0].strip())
            film_dict['comment'] = self.get_comment(two_url)
            self.save_image(two_url, film)
            print(film_dict)
            self.num += 1

    def get_comment(self, two_url):
        # Fetch the detail page and extract the comments
        html = self.get_html(two_url)
        re_bds = r'<div class="comment-content">(.*?)</div>'
        comment_list = self.re_func(re_bds, html)
        return comment_list

    def save_image(self, two_url, film):
        re_bds = r'<div class="img.*?"><img class="default-img" data-src="(.*?)" alt=""></div>'
        html = self.get_html(two_url)
        img_link_list = self.re_func(re_bds, html)
        for img_link in img_link_list:
            req = request.Request(img_link)
            res = request.urlopen(req)
            content = res.read()
            # Build the target directory from the movie title
            directory = 'D:\\猫眼\\{}\\'.format(film[1].strip())
            if not os.path.exists(directory):
                os.makedirs(directory)
            filename = directory + img_link.split('/')[-1].split('@')[0]
            with open(filename, 'wb') as f:
                f.write(content)

    # Entry function
    def run(self):
        for offset in range(0, 31, 10):
            url = self.url.format(offset)
            self.parse_html(url)
            time.sleep(random.randint(1, 2))
        print('Total records scraped:', self.num)

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.run()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))