# Scraper for the Maoyan Movies TOP100 board.
# Modules used: csv, re, and urllib.request (requests would work as well).
import csv
import re
from urllib import request
class Maoyan:
    """Interactive scraper for the Maoyan TOP100 board.

    Every confirmed round downloads one board page (the ``offset`` query
    parameter advances by 10 per page), extracts the title, cast and
    release date of each movie with a regular expression, and appends the
    rows to ``猫眼.csv`` in the current working directory.
    """

    # Compiled once for all pages. re.S lets ``.`` span newlines because the
    # three captured fields sit on different lines of the HTML.
    _MOVIE_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?'
        r'<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set the base URL, request headers and paging counters."""
        self.baseurl = "https://maoyan.com/board/4?offset="
        # The closing parenthesis was missing from the original User-Agent.
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        self.page = 1   # 1-based number of the next page to fetch
        self.offst = 0  # offset value used for the most recent request

    # Download the page
    def loadPage(self, url):
        """Fetch *url*, decode the body as UTF-8 and pass it to parsePage.

        Errors raised by ``urlopen`` or by decoding propagate to the caller.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf8')
        self.parsePage(html)

    # Parse the page
    def parsePage(self, html):
        """Extract (title, cast, release time) tuples from *html* and save them."""
        r_list = self._MOVIE_PATTERN.findall(html)
        self.wirtePage(r_list)

    def wirtePage(self, r_list):
        """Append the rows in *r_list* to ``猫眼.csv``.

        A header row is written first while ``self.page`` is 1. Each item of
        *r_list* must provide at least three strings; surrounding whitespace
        is stripped from each of the first three fields. The file is written
        as UTF-8 so the output does not depend on the platform locale.
        """
        rows = []
        if self.page == 1:
            rows.append(['电影名称', '主演', '上映时间'])
        for r_tuple in r_list:
            rows.append([r_tuple[0].strip(), r_tuple[1].strip(), r_tuple[2].strip()])
        if not rows:
            # Nothing to write; do not create or touch the file.
            return
        # Open the file once per page instead of once per row.
        with open('猫眼.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)

    def workOn(self):
        """Ask the user before each page and scrape while the answer is 'y'.

        Any other answer prints the closing message and ends the loop.
        """
        while True:
            c = input('爬取(y/n):')
            if c.strip().lower() != 'y':
                print('爬取结束')
                break
            self.offst = (self.page - 1) * 10
            self.loadPage(self.baseurl + str(self.offst))
            self.page += 1
if __name__ == "__main__":
    # Script entry point: build the scraper and start the interactive loop.
    spider = Maoyan()
    spider.workOn()