目标:爬取猫眼top100电影的名称,主演,上映时间
1.观察所要爬取的内容的页面地址(https://maoyan.com/board/4?offset=0),并找到其规律,从中观察到其URL中只有offset=?在变动
2.按F12观察页面中的名称,主演,上映时间的URL地址用正则进行匹配
3.创建主函数来定义爬取的页面,创建函数来获取页面,创建函数来分析页面,创建函数来保存解析的数据,并将这些函数封装到一个类中。(由于此次数据量较少,我将数据存放在创建的csv文件中)
代码如下:
"""猫眼电影top100数据爬取"""
import re
import csv
from urllib import request
class MaoyanSpider:
    """Scrape title, stars and release time of Maoyan's top-100 movies into maoyan.csv."""

    # One <div class="movie-item-info"> block per film on the board page.
    # re.S lets `.*?` span newlines; compiled once instead of per page.
    PATTERN = re.compile(
        '<div class="movie-item-info">.*?title="(.*?)"'
        '.*?star">(.*?)</p>.*?time">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        # Maoyan rejects urllib's default User-Agent, so send a browser-like one.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 "
                          "(KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
        }
        self.page = 1  # 1-based counter of the page currently being crawled

    # Fetch one board page and hand the HTML to the parser.
    def get_page(self, url):
        """Download *url* and parse it. Raises urllib.error.URLError on failure."""
        req = request.Request(url, headers=self.headers)
        # Context manager closes the connection; timeout avoids hanging forever.
        with request.urlopen(req, timeout=10) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    # Extract the data from one page of HTML.
    def parse_page(self, html):
        """Find all (title, stars, release_time) tuples in *html*, persist
        them via write_csv, and return the raw (unstripped) tuple list."""
        r_list = self.PATTERN.findall(html)
        self.write_csv(r_list)
        return r_list

    # Persist one batch of records.
    def write_csv(self, r_list):
        """Append one CSV row per film to maoyan.csv, stripping the
        surrounding whitespace the regex captures leave behind."""
        with open('maoyan.csv', 'a', newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(
                [field.strip() for field in record] for record in r_list
            )

    # Entry point: crawl every page of the board.
    def work_on(self):
        """Crawl all 10 board pages (offsets 0, 10, ..., 90)."""
        # The top-100 list spans exactly 10 pages of 10 films, offsets 0-90.
        # The original range(0, 101, 10) also requested the non-existent
        # offset=100 page, i.e. an 11th, empty fetch.
        for offset in range(0, 100, 10):
            url = 'https://maoyan.com/board/4?offset=%s' % str(offset)
            self.get_page(url)
            self.page += 1
if __name__ == "__main__":
spider = MaoyanSpider()
spider.work_on()