功能
爬取猫眼电影TOP100的排名、图片URL、名称、主演、上映时间以及评分信息，并存入Excel文件中。
示例代码
import requests
from requests.exceptions import RequestException
import re
import xlwt
# Module-level accumulator: parse_one_page appends one
# [index, image, title, actor, time, score] row here per matched movie.
all_info_list=[]
def create_ecxel(sheet_header):
    """Create the module-level workbook and worksheet and write the header row.

    Rebinds the globals ``book`` (a new ``xlwt.Workbook`` created with
    ``encoding='utf-8'``) and ``sheet`` (its ``'Sheet1'`` worksheet), then
    writes each entry of ``sheet_header`` into row 0, one column per entry.

    Args:
        sheet_header: Sequence of column titles, in column order.
    """
    global book, sheet
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for col, caption in enumerate(sheet_header):
        sheet.write(0, col, caption)
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``
        (timeouts included).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract movie records from one page and append them to ``all_info_list``.

    Each ``<dd>...</dd>`` entry matched by the pattern yields one row
    ``[index, image, title, actor, time, score]`` (all strings), appended to
    the module-level ``all_info_list``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (e.g. the
            result of a failed download) is accepted and produces no rows.

    Returns:
        None. Results are accumulated in ``all_info_list``.
    """
    # Nothing to parse: a failed fetch hands us None, which re would reject
    # with a TypeError.
    if not html:
        return

    # Raw strings keep ``\d`` a regex escape instead of an invalid string
    # escape. re.S lets ``.`` span newlines inside a <dd> entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for rank, image, title, star, release, integer, fraction in pattern.findall(html):
        # Drop the first 3 / 5 characters after stripping whitespace
        # (presumably the leading field labels; confirm against page markup).
        actor = star.strip()[3:]
        release_time = release.strip()[5:]
        # The score is split across two tags; join the two halves.
        score = integer + fraction
        all_info_list.append([rank, image, title, actor, release_time, score])
def write_to_file(all_info_list):
    """Write every record into the module-level ``sheet``, one row per record.

    Rows are numbered from 1 and columns from 0, so the first record lands
    on row 1 and its fields fill columns 0, 1, 2, ... in order.

    Args:
        all_info_list: Iterable of records, each an iterable of cell values.
    """
    for row, record in enumerate(all_info_list, start=1):
        for column, value in enumerate(record):
            sheet.write(row, column, value)
if __name__ == '__main__':
    # Create the Excel workbook and write the worksheet's header row.
    sheet_header = ['index', 'image', 'title', 'actor', 'time', 'score']
    create_ecxel(sheet_header)
    # Build the page URLs: offsets 0, 10, ..., 90.
    urls = ['http://maoyan.com/board/4?offset={}'.format(i)
            for i in range(0, 100, 10)]
    # Download each page and parse its contents into all_info_list.
    for url in urls:
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None when the request fails or the status
            # is not 200; skip such pages rather than hand None to the parser.
            print('Failed to fetch {}, skipping'.format(url))
            continue
        parse_one_page(html)
    # Write the collected rows into the worksheet.
    write_to_file(all_info_list)
    # Save the workbook to disk.
    book.save('猫眼电影TOP100.xls')
注意事项