import re
import os
import time
import random
import requests
import pandas as pd
# Scraper for the Maoyan board at https://maoyan.com/board/4, saved as a CSV
# named 猫眼TOP100.csv.
#
# Two pagination strategies are demonstrated:
#   * method one - the page offsets follow a regular pattern (0, 10, 20, ...),
#     so the URLs are generated up front;
#   * method two - pagination is irregular or the page count is unknown, so
#     the offset of the following page is read from the "下一页" (next page)
#     link of the page that was just downloaded.
# Both strategies share the same download, parsing and saving helpers.

BOARD_URL = 'https://maoyan.com/board/4?offset='
CSV_PATH = 'D:\\Desktop\\爬虫_anaconda\\猫眼TOP100.csv'
CSV_COLUMNS = ['排名', '电影名称', '演员', '上映日期', '评分']
PAGE_SIZE = 10       # the offset grows by 10 from one page to the next
PAGE_COUNT = 10      # offsets 0..90
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may block forever
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/77.0.3865.42 Safari/537.36')

# NOTE(review): only the 'http' scheme is mapped here while BOARD_URL is
# https, so requests presumably does not route the board requests through
# these proxies - confirm whether 'https' entries were intended.
PROXIES = [
    {'http': 'http://58.212.42.116:36708'},
    {'http': 'http://117.57.91.53:9999'},
    {'http': 'http://123.169.35.184:9999'},
]

# Patterns are compiled once (not once per page) and written as raw strings
# so that regex escapes such as \s are not treated as string escapes.
INDEX_RE = re.compile(r'<i class="board-index board-index-.*?">(.*?)</i>')
TITLE_RE = re.compile(
    r'<p class="name"><a href=.*?title=.*?data-act=.*?data-val=.*?>(.*?)</a></p>')
STAR_RE = re.compile(r'<p class="star">\n\s+(.*?)\n\s+</p>')
DATE_RE = re.compile(r'<p class="releasetime">(.*?)</p>')
SCORE_RE = re.compile(
    r'<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>')
# The literal "?" must be escaped; the offset itself is restricted to digits.
NEXT_RE = re.compile(r'\?offset=(\d+)">下一页</a>')


def parse_board_page(html):
    """Extract the movie rows contained in one board page.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of ``[rank, title, stars, release_date, score]`` rows, where
        ``score`` is a float and the other fields are the matched strings.
        A page without any match yields an empty list.

    Raises:
        ValueError: if the fields do not all match the same number of times
            (the rows could not be aligned reliably), or if a score is not
            a valid number.
    """
    ranks = INDEX_RE.findall(html)
    titles = TITLE_RE.findall(html)
    stars = STAR_RE.findall(html)
    dates = DATE_RE.findall(html)
    scores = SCORE_RE.findall(html)

    counts = [len(ranks), len(titles), len(stars), len(dates), len(scores)]
    if len(set(counts)) != 1:
        raise ValueError('field counts differ on this page: %s' % counts)

    rows = []
    for rank, title, star, date, score_parts in zip(
            ranks, titles, stars, dates, scores):
        # The score is split into integer and fraction parts in the markup.
        rows.append([rank, title, star, date, float(''.join(score_parts))])
    return rows


def find_next_offset(html):
    """Return the offset string of the "下一页" link, or None if absent."""
    match = NEXT_RE.search(html)
    return match.group(1) if match else None


def fetch_page(url, headers, proxies=None):
    """Download *url* and return its HTML text, or None on any failure.

    Network errors and non-200 responses are reported on stdout rather than
    raised, so that the caller can decide whether to skip the page or stop.
    """
    try:
        response = requests.get(
            url, headers=headers, proxies=proxies, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('爬虫报错: %s' % exc)
        return None
    if response.status_code != 200:
        print('爬虫报错: HTTP %s' % response.status_code)
        return None
    return response.text


def crawl_by_offset(pages=PAGE_COUNT):
    """Method one: crawl *pages* pages whose offsets are 0, 10, 20, ...

    A page that cannot be downloaded or parsed is reported and skipped.

    Returns:
        The list of rows collected from all pages.
    """
    headers = {'User-Agent': USER_AGENT, 'Connection': 'close'}
    rows = []
    for page in range(pages):
        print('正在爬取第%s页' % page)
        url = BOARD_URL + str(page * PAGE_SIZE)
        html = fetch_page(url, headers, proxies=random.choice(PROXIES))
        if html is not None:
            try:
                rows.extend(parse_board_page(html))
            except ValueError as exc:
                print('爬虫报错: %s' % exc)
        time.sleep(random.randint(6, 8))  # pause between requests
    return rows


def crawl_by_next_link():
    """Method two: follow the "下一页" link until there is none.

    Crawling stops at the first page that cannot be downloaded or parsed;
    the rows collected so far are still returned.

    Returns:
        The list of rows collected from all visited pages.
    """
    headers = {'User-Agent': USER_AGENT}
    rows = []
    start = '0'
    while start is not None:
        print('正在爬取第%s页' % int(start))
        html = fetch_page(BOARD_URL + start, headers)
        if html is None:
            break
        try:
            rows.extend(parse_board_page(html))
        except ValueError as exc:
            print('爬虫报错: %s' % exc)
            break
        # The last page has no next link, which ends the loop normally.
        start = find_next_offset(html)
        if start is not None:
            time.sleep(random.randint(6, 8))  # pause between requests
    return rows


def save_rows(rows, path=CSV_PATH):
    """Write *rows* to *path* as CSV and return the DataFrame that was saved."""
    frame = pd.DataFrame(rows, columns=CSV_COLUMNS)
    frame.to_csv(path, index=False)
    return frame


if __name__ == '__main__':
    # Method one: regular offsets.
    datalist = crawl_by_offset()
    df = save_rows(datalist)
    print('爬取完成,文件:猫眼TOP100.csv')

    # Method two: follow the next-page link (overwrites the same CSV file).
    datalist = crawl_by_next_link()
    df = save_rows(datalist)
    print('爬取完成')