Python爬虫
步骤
1、基本的爬取思路是先从排行榜页面提取每部电影详情页的相对URL
2、将相对URL与站点根地址拼接，得到详情页的完整地址
3、通过分析电影详情页获取电影数据
4、将爬取到的数据保存到Excel文件
代码
# -*- coding: utf-8 -*-
# @Time : 2021/4/26 22:29
# @Author : hqf
# @File : 韩国电影售票评分网站.py
import requests
from bs4 import BeautifulSoup
import xlwt
'''
https://movie.naver.com
韩国电影售票评分网站
'''
def get_html(url, encoding, timeout=10):
    """Fetch *url* and return its body decoded with *encoding*.

    :param url: absolute URL to request.
    :param encoding: charset to decode the response with
        ("euc-kr" for the ranking page, "utf-8" for detail pages).
    :param timeout: seconds before the request is aborted; without it,
        requests.get() can block forever on a dead connection.
    :return: decoded response body, or None on any failure
        (non-200 status or a network error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network failures like a failed page so callers can skip it
        # instead of the whole crawl crashing.
        return None
    if response.status_code == 200:
        response.encoding = encoding
        return response.text
    return None
def page_url_list(baseUrl):
    """Build the URLs of ranking pages 1 through 5.

    :param baseUrl: ranking URL ending in "page=" — the page number is
        appended directly.
    :return: list of five page URLs, in order.
    """
    pages = []
    for page_no in range(1, 6):
        pages.append(baseUrl + str(page_no))
    return pages
def naver_spider_ranking_list():
    """Crawl the movie.naver.com points-ranking pages and save to Excel.

    Walks five ranking pages, follows each movie's detail link via
    naver_spider_detail(), and writes all collected rows to
    ./韩国电影排行榜.xls.
    """
    base_url = "https://movie.naver.com"
    # Points ranking ("sel=pnt") frozen at a fixed snapshot date; the page
    # number is appended by page_url_list().
    url = base_url + "/movie/sdb/rank/rmovie.nhn?sel=pnt&date=20210426&page="
    datalist = []
    for page_url in page_url_list(url):
        print(page_url)
        html = get_html(page_url, "euc-kr")
        if html is None:
            # Request failed — skip this ranking page instead of letting
            # BeautifulSoup(None, ...) raise TypeError.
            continue
        soup = BeautifulSoup(html, "html.parser")
        for table in soup.find_all('table', class_="list_ranking"):
            for tr in table.find_all('tr'):
                score = 0
                point_td = tr.find("td", class_="point")
                if point_td is not None:
                    score = point_td.get_text()
                # Renamed from `page_url` (original shadowed the loop
                # variable above, which made the code confusing to follow).
                link = tr.find('a')
                if link is not None:
                    href = link['href']
                    title = link['title']
                    detail_info = naver_spider_detail(base_url + href, score)
                    print(title)
                    datalist.append(detail_info)
    save_to_excel("./韩国电影排行榜.xls", datalist)
def naver_spider_detail(url, score):
    """Scrape one movie detail page.

    :param url: absolute URL of the movie's detail page.
    :param score: score already scraped from the ranking page; copied
        into the output row unchanged.
    :return: flat row [poster_src, title, score, country(s), year, month,
        runtime, genres, director, actors]; an incomplete/empty list when
        the page failed to load (save_to_excel drops rows shorter than 10).
    """
    html = get_html(url, "utf-8")
    if html is None:
        # Failed request: return an empty (incomplete) row rather than
        # crashing inside BeautifulSoup.
        return []
    soup = BeautifulSoup(html, "html.parser")
    all_info = []
    for item in soup.find_all('div', class_="mv_info_area"):
        poster = item.find('div', class_="poster")  # poster block
        img_src = poster.find('img')['src']
        mv_info = item.find('div', class_="mv_info")  # textual movie info
        mv_title = mv_info.find('h3', class_="h_movie").a.string  # movie title
        mv_des = mv_info.find('dl', class_="info_spec").find_all("dd")  # detail <dd> cells
        all_info.append(img_src)
        all_info.append(mv_title)
        all_info.append(score)
        for index, dd in enumerate(mv_des):
            if index == 0:
                # First <dd>: genre / country links, runtime and release date.
                mv_area = dd.p.find_all("a")
                type_list = []
                length = len(mv_area)
                # BUG FIX: the inner loop used to rebind the name `index`,
                # clobbering the outer enumerate counter, so the director /
                # actor branches below could match on the wrong <dd>.
                for pos, a_tag in enumerate(mv_area):
                    if pos >= length - 3:
                        # Last three links are the release date parts.
                        all_info.append(a_tag.get_text())
                    else:
                        type_list.append(a_tag.get_text())
                mv_time = dd.p.find_all("span")
                # NOTE(review): assumes the third <span> is the runtime —
                # matches the page layout this scraper was written against.
                all_info.append(mv_time[2].get_text())
                all_info.append(",".join(type_list))
            if index == 1 or index == 2:
                # Second and third <dd>: director and actors.
                all_info.append(dd.p.get_text())
    return all_info
def save_to_excel(savepath, datalist):
    """Write the scraped movie rows to an .xls workbook at *savepath*.

    Row 0 holds the column headers; each complete data row (10 fields)
    is written below it. Rows shorter than 10 fields are skipped, which
    leaves a blank spreadsheet row at their index.
    """
    columns = ("海报链接", "电影名", "评分", "制片国家", "年份", "月份", "时长", "类型", "导演", "演员")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('韩国热门电影', cell_overwrite_ok=True)
    # Header row.
    for col_idx, header in enumerate(columns):
        sheet.write(0, col_idx, header)
    # Data rows — only rows with all 10 fields are persisted.
    for row_idx, data in enumerate(datalist):
        print("第{}条".format(row_idx + 1))
        if len(data) >= 10:
            for col_idx in range(10):
                sheet.write(row_idx + 1, col_idx, data[col_idx])
    book.save(savepath)
def main():
    """Entry point: run the full ranking crawl."""
    naver_spider_ranking_list()


if __name__ == '__main__':
    main()