爬取豆瓣网中电影信息并保存到本地目录当中
读者可以根据本文源代码来设计自己的爬虫。注意 url 链接不能通用:源代码中后续的查找筛选依赖页面中特定的类名和标签名,仅供参考。另外推荐 B 站上一位老师(路飞学城IT),讲解得很详细,我就是根据他的讲解来设计自己的爬虫的。本来想用 GUI 界面展示,中间遇到一点困难,下次再发。
话不多说,先上代码:
from bs4 import BeautifulSoup
import requests
import os
import re
def restore_poster(src_url):
    """Download a movie poster image and save it under ./douban_poster/.

    src_url: direct URL of the poster image.
    Relies on the module-global `headers` set up in main().
    """
    if not os.path.exists('./douban_poster'):
        os.mkdir('./douban_poster')
    poster_name = src_url.split('/')[-1]          # file name = last URL segment
    poster_path = './douban_poster/' + poster_name
    try:
        # Fetch BEFORE opening the file: a network failure is then reported
        # instead of crashing, and no empty file is left behind.
        poster_data = requests.get(url=src_url, headers=headers).content
        with open(poster_path, 'wb') as fp:
            fp.write(poster_data)
        print('海报下载成功')
    except (OSError, Exception):
        # Narrowed from a bare `except:` so KeyboardInterrupt etc. still propagate.
        print('海报下载失败')
def find_movie(movie_id):
    """Fetch a trailer page, extract the direct .mp4 URL and download it.

    movie_id: URL of the trailer page.
    Returns the direct .mp4 URL on success.
    Raises TypeError when no .mp4 link is present, so the caller's existing
    `except TypeError` fallback ('暂无预告片链接') handles it gracefully.
    Relies on the module-global `headers` set up in main().
    """
    movie_text = requests.get(url=movie_id, headers=headers).text
    # Regex instead of BeautifulSoup: the video markup on these pages varies,
    # so matching the raw .mp4 URL is the only approach that generalizes.
    matches = re.findall(r'(http://.*?\.mp4)', movie_text, re.S)
    if not matches:
        # Previously this crashed with an unguarded matches[0] (IndexError,
        # which the caller does not catch).
        raise TypeError('no .mp4 trailer link found on page')
    # The match may still contain a leading quoted prefix; keep the last
    # quote-delimited piece, which is the clean URL.
    movie_source = matches[0].split('\"')[-1]
    if not os.path.exists('./douban_movie'):
        os.mkdir('./douban_movie')
    movie_data = requests.get(url=movie_source, headers=headers).content
    movie_name = movie_source.split('/')[-1]
    movie_path = './douban_movie/' + movie_name
    with open(movie_path, 'wb') as fp:
        fp.write(movie_data)
        print('预告片下载成功')
    return movie_source
def operate(a1_list):
    """Walk the parsed movie cards; download each poster and trailer and
    append the collected details to ./douban.txt.

    a1_list: bs4 ResultSet of card <div> elements from the coming-soon page.
    Relies on the module-global `headers` set up in main().
    """
    # `with` guarantees the output file is closed even if a request inside
    # the loop raises (the original open()/close() pair leaked it on error).
    with open('./douban.txt', 'w', encoding='utf_8') as fp:
        for a1 in a1_list:
            title = a1.div.h3.a.string           # movie title
            src_url = a1.a.img['src']            # poster URL
            print(title, '爬取成功')
            restore_poster(src_url)
            detail_url = a1.a['href']            # detail-page URL
            try:
                movie_id = a1.div.ul.a['href']   # trailer page URL
                movie_source = find_movie(movie_id)
            except TypeError:
                # Card has no trailer link (a1.div.ul.a is None).
                movie_source = '暂无预告片链接'
            print(movie_source)
            # Fetch the detail page for director/cast info.
            detail_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_text, 'lxml')
            div_tag = detail_soup.find('div', id='info')
            content = div_tag.text
            fp.write('影片名:' + title + ' ' + content + '海报链接:' + src_url + '视频链接:' + movie_source + '\n')
def main():
    """Crawl Douban's coming-soon page for Nanchang and save movie info.

    Sets the module-global `headers` used by the other functions, fetches
    the landing page, parses the movie cards and hands them to operate().
    """
    # `headers` is shared by every request in this module.
    # (The unused `title2` global declared here previously has been removed.)
    global headers
    url = 'https://movie.douban.com/cinema/later/nanchang/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36 Edg/87.0.664.41'
    }
    try:
        page_text = requests.get(url=url, headers=headers).text
        print("成功进入豆瓣网")
    except Exception:
        # Bail out: the original bare `except:` fell through and then used
        # an undefined `page_text`, crashing with a NameError.
        print('访问失败')
        return
    # Parse the movie cards (title + detail URL) on the landing page.
    soup = BeautifulSoup(page_text, 'lxml')
    a1_list = soup.select('#showing-soon.tab-bd > div')
    operate(a1_list)
# Entry point: run the crawler only when executed as a script,
# not when imported as a module.
if __name__=='__main__':
    main()
运行之后可以看到如下运行效果(截图见原文)。