# Python 爬取豆瓣电影 Top250 排行榜 — 爬虫初试
# (scraped article title converted to a comment so the file parses)

from bs4 import BeautifulSoup

import openpyxl

import re

import urllib.request

import urllib.error

# 访问url

def ask_url(url):
    """Fetch *url* and return the page HTML decoded as UTF-8.

    Returns None when the request fails (HTTP error or network error);
    the error code/reason is printed.  Callers must handle the None case.
    """
    # Spoof a desktop Chrome User-Agent so Douban does not reject the request.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.163 Safari/537.36'}
    req = urllib.request.Request(url, headers=head)
    try:
        # 3-second timeout so a stalled connection does not hang the crawl.
        response = urllib.request.urlopen(req, timeout=3)
        html = response.read().decode('utf-8')
        return html
    # HTTPError lives in urllib.error (the original caught
    # urllib.request.HTTPError, which only works via a re-export).
    # It must be caught before URLError because it is a URLError subclass.
    except urllib.error.HTTPError as e:
        if hasattr(e, 'code'):
            print(e.code)
    except urllib.error.URLError as e:
        if hasattr(e, 'reason'):
            print(e.reason)
    return None  # explicit: request failed

# 爬取网页

def crawl_web(base_url):
    """Crawl all 10 pages (25 movies each) of the Douban Top250 list.

    Returns a list of records, one per movie:
    [name, other_name, detail_link, poster_img, introduction, score,
     judge_count, one_line_quote], using '暂无' as the placeholder when a
    field is absent.
    """
    data_list = []

    # NOTE(review): the regex literals were garbled in the pasted source
    # (the HTML tags were stripped by the scrape).  They are reconstructed
    # here from the Douban Top250 item markup — confirm against a live page.
    re_movie_name = re.compile(r'<span class="title">(.*?)</span>')          # movie title(s)
    re_movie_link = re.compile(r'<a href="(.*?)">')                          # detail-page link
    re_movie_img = re.compile(r'<img.*?src="(.*?)"', re.S)                   # poster image URL
    re_movie_introduction = re.compile(r'<p class="">(.*?)</p>', re.S)       # director/cast/year blurb
    re_movie_score = re.compile(
        r'<span class="rating_num" property="v:average">(.*?)</span>')       # rating
    re_movie_judge = re.compile(r'<span>(\d*)人评价</span>')                  # number of raters
    re_moive_inq = re.compile(r'<span class="inq">(.*?)</span>')             # one-line quote

    for page in range(10):
        url = base_url + str(page * 25)  # Douban pages via ?start=0,25,...
        html = ask_url(url)
        if html is None:
            # Request failed (ask_url already printed the error); skip page.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all(class_='item'):
            # The page embeds non-breaking spaces; normalize them.
            item = str(item).replace(u'\xa0', ' ')
            data = []

            movie_name = re.findall(re_movie_name, item)
            if len(movie_name) > 1:
                data.append(movie_name[0])
                # Second title starts with a "/" separator; strip it.
                data.append(movie_name[1].replace('/', ''))
            else:
                # No foreign title: keep the column so rows stay aligned.
                data.append(movie_name[0])
                data.append('暂无')

            data.append(re.findall(re_movie_link, item)[0])
            data.append(re.findall(re_movie_img, item)[0])

            movie_introduction = re.findall(re_movie_introduction, item)[0]
            # Strip the <br/> tags embedded in the introduction text.
            # NOTE(review): reconstructed — the original sub pattern was garbled.
            movie_introduction = re.sub(
                r'<br(\s+)?/?>', ' ', movie_introduction).strip()
            data.append(movie_introduction)

            data.append(re.findall(re_movie_score, item)[0])
            data.append(re.findall(re_movie_judge, item)[0])

            moive_inq = re.findall(re_moive_inq, item)
            # Some movies have no one-line quote; keep the column anyway.
            data.append(moive_inq[0] if moive_inq else '暂无')

            data_list.append(data)

    return data_list

def save_data(save_path, data_list):
    """Write the crawled records to '<save_path>豆瓣TOP250.xlsx'.

    Row 1 is the header; each entry of *data_list* becomes one row below it,
    in the column order produced by crawl_web.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '豆瓣TOP250'
    header = ("电影名", "其他名", "影片详情连接", "影片海报图片",
              "电影简介", "评分", "评分人数", "一句话评价")
    # ws.append writes one full row and advances automatically, replacing
    # the original hand-rolled row/column counters.
    ws.append(header)
    for record in data_list:
        ws.append(record)
    wb.save(save_path + '豆瓣TOP250.xlsx')

if __name__ == "__main__":

base_url = 'https://movie.douban.com/top250?start='

save_path = 'E:\\School\\Study\\Python\\爬虫\\'

data_list = crawl_web(base_url)

save_data(save_path, data_list)

print('Crawl over')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值