(.* ?)
.?class="releasetime">(.?)',re.S
练习:爬取猫眼电影top10信息
from urllib import request
import re
import time
import csv
class MaoyanSpider(object):
    """Scrape the Maoyan movie board and append the results to a CSV file.

    Each board page is fetched, parsed with a regular expression into
    (title, stars, release time) tuples, and appended to
    ``猫眼电影top10.csv`` in the current working directory.
    """

    # NOTE(review): the pattern literal was truncated in the original source
    # (only the trailing ``class="releasetime">(...)',re.S`` fragment survived).
    # It is reconstructed here to capture the three fields described by the
    # comment in parse_page (title, stars, release time) - TODO confirm
    # against the live page markup.
    # re.S lets ``.`` span newlines, because one movie entry covers several
    # lines of HTML.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)"'
        r'.*?class="star">(.*?)</p>'
        r'.*?class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set the request headers and the page counter."""
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"}
        # Counts the pages fetched so far; used only for progress output.
        self.page = 1

    def get_page(self, url):
        """Download ``url``, decode it as UTF-8 and hand the HTML to parse_page.

        Args:
            url: Address of the board page to fetch.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        # Call the parser directly.
        self.parse_page(html)

    def parse_page(self, html):
        """Extract the movie tuples from ``html`` and pass them to write_csv.

        Args:
            html: Decoded HTML text of one board page.
        """
        # r_list looks like [('霸王别姬', '张国荣', '1993'), (...), ...]
        r_list = self._FILM_PATTERN.findall(html)
        # Call the save function directly.
        self.write_csv(r_list)

    def write_csv(self, r_list):
        """Append one CSV row per movie to ``猫眼电影top10.csv``.

        Args:
            r_list: Iterable of 3-item sequences of strings; each item is
                stripped of surrounding whitespace before being written.
        """
        # newline='' is what the csv module requires to avoid blank rows on
        # Windows; the explicit encoding keeps the Chinese text writable
        # regardless of the platform's default locale encoding.
        with open('猫眼电影top10.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(
                [r_t[0].strip(), r_t[1].strip(), r_t[2].strip()]
                for r_t in r_list
            )

    def work_om(self):
        """Main entry point: crawl the board pages at offsets 0, 10, ..., 40."""
        for pn in range(0, 41, 10):
            url = 'https://maoyan.com/board/4?offset=%s' % str(pn)
            self.get_page(url)
            print('第%d页爬取成功' % self.page)
            self.page += 1
            # Pause between requests.
            time.sleep(4)
if __name__ == '__main__':
    # Script entry point: run the whole crawl and report how long it took.
    # perf_counter() is monotonic, so the measured duration is not affected
    # by system clock adjustments during the run.
    begin = time.perf_counter()
    spider = MaoyanSpider()
    spider.work_om()
    end = time.perf_counter()
    print("执行时间%.2f" % (end - begin))
运行截图:
希望本文所述对大家Python程序设计有所帮助。