爬取猫眼电影网站电影排行榜TOP100数据 --存入csv
流程
- 确认页面是动态还是静态:在源码搜索关键字看是否能找到
- 查看排行榜页面查询字符串中翻页的规律:规律为:offset=(page-1)*10
- 进入页面源码:右键点击查看元素,复制下需求数据的标签块,如下
<p class="name"><a href="/films/1228" title="天空之城" data-act="boarditem-click" data-val="{movieId:1228}">天空之城</a></p>
<p class="star">
主演:寺田农,鹫尾真知子,龟山助清
</p>
<p class="releasetime">上映时间:1992-05-01</p>
- 写出re:想要的数据为:电影名&主演&上映时间
<p class="name">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>
- 爬虫代码
import csv
import random
import re
import time
from time import sleep
from urllib import request

from fake_useragent import UserAgent
class MaoyanSpider(object):
    """Scrape the Maoyan movie TOP100 board and append rows to film.csv.

    Pagination follows the pattern offset=(page-1)*10 in the query string.
    """

    def __init__(self):
        # URL template for the board; {} is filled with the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Running count of movie rows written to the CSV.
        self.i = 0

    def get_html(self, url):
        """Fetch *url* with a random User-Agent and return the decoded HTML."""
        headers = {'User-Agent': UserAgent().random}
        req = request.Request(url=url, headers=headers)
        resp = request.urlopen(req)
        # BUGFIX: original had decode(utf-8) — a NameError; the codec name
        # must be a string.
        html = resp.read().decode('utf-8')
        return html

    def parse_html(self, html):
        """Extract (title, stars, release-time) tuples from the page HTML.

        Returns the list of matched tuples (may be empty).
        """
        re_bds = '<p class="name">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
        # re.S lets .*? span newlines between the <p> tags.
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        print(r_list)
        # BUGFIX: original returned None, so save_html crashed iterating it.
        return r_list

    def save_html(self, r_list):
        """Append parsed rows to film.csv and bump the row counter."""
        rows = []
        # newline='' is required by the csv module to avoid blank lines on
        # Windows; explicit utf-8 keeps the Chinese titles intact.
        with open('film.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for r in r_list:
                # r[2] looks like '上映时间:1992-05-01'; [5:15] keeps the date.
                rows.append((r[0], r[1].strip(), r[2].strip()[5:15]))
                self.i += 1
            writer.writerows(rows)

    def run(self):
        """Prompt for a page range, scrape each page, and report the total."""
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        for page in range(begin, end + 1):
            url = self.url.format((page - 1) * 10)
            html = self.get_html(url)
            r_list = self.parse_html(html)
            self.save_html(r_list)
            # Short random pause between requests to be polite to the server.
            # BUGFIX: only `sleep` was imported originally, so `time.sleep`
            # and `random.uniform` raised NameError (imports added at top).
            time.sleep(random.uniform(0, 1))
        print('数据数量:', self.i)
# Script entry point: run the spider interactively and time the whole run.
# BUGFIX: `time.time()` needs the `time` module import (added at top of file);
# the original only did `from time import sleep`.
if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.run()
    end = time.time()
    print('执行时间:%.2f' % (end - start))