可以爬取豆瓣上符合指定条件的电影和电视剧的全部信息:为爬取到的十部电影和十部电视剧各生成一个对应的文件夹来存储信息,还可以下载封面图片!
导包
pip install requests下载requests包 调用它获取html的功能
pip install bs4 下载bs4包 调用它的解析、遍历功能
os:操作系统包 用来新建文件夹和文件
re正则表达式库
os.path.join:构建文件夹路径;os.makedirs:按该路径创建文件夹
requests.get:获取要爬取的网页内容
用bs4解析爬取到的内容并转换成文本,再通过re正则表达式匹配出我要寻找的动态网址链接中的影视编号,然后根据这个编号拼出详情页链接,再次爬取该链接对应的网页信息,最后通过os操作系统库构建路径,生成文本文件。
再打开电影详细信息页面,根据源码中的类似div 的id值或者class值,一层套一层的调用find方法寻找信息,
将爬取的信息通过Python内置的open函数打开文件、再用write方法写入进去,
再生成txt文件,通过path.join的路径,将文件放在文件夹中。
代码如下:
import requests,bs4,re,os
### Output folders: ./douban/movie for films, ./douban/series for TV series ###
Douban_movie, Douban_series = (
    os.path.join(".", "douban", kind) for kind in ("movie", "series")
)
# exist_ok=True keeps a re-run from failing when the folders are already there.
os.makedirs(Douban_movie, exist_ok=True)
os.makedirs(Douban_series, exist_ok=True)
### Shared settings used by every request ###
douban_url = "https://movie.douban.com/j/new_search_subjects?"
douabnheader = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400'}
# Seconds to wait for a single HTTP response. requests has no default timeout,
# so without this a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10
# Number of titles of each kind that get saved.
SUBJECT_LIMIT = 10
# Placeholder written to the info file when a detail page lacks a field.
MISSING = "无"

# Captures a run of digits that sits between "/" and a backslash.
# NOTE(review): presumably this targets the subject id inside a JSON-escaped
# URL such as "...\/subject\/<id>\/" in the search response - confirm against
# a live response.
pat = r'/([0-9]+)\\'
regex = re.compile(pat)

# Characters that are invalid in Windows file names or that would change the
# path on POSIX ("/"), plus ASCII control characters.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _search_subject_ids(params):
    """Return the subject ids found in one search response, without repeats.

    Args:
        params: query-string dict sent to ``douban_url``.

    Returns:
        List of id strings in the order they first appear in the response.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: no answer within ``REQUEST_TIMEOUT`` seconds.
    """
    page = requests.get(douban_url, headers=douabnheader, params=params,
                        timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    page.encoding = "utf-8"
    # The pattern is matched against the raw body. Routing the body through an
    # HTML parser first adds nothing for a regex scan, and stringifying
    # find_all() repeats the body once per nested tag, duplicating every id.
    # dict.fromkeys drops any repeats while keeping first-seen order.
    return list(dict.fromkeys(regex.findall(page.text)))


def _find_in(parent, *args, **kwargs):
    """Call ``parent.find(...)``, returning None when ``parent`` is None."""
    return None if parent is None else parent.find(*args, **kwargs)


def _node_text(node):
    """Return ``node.text``, or the ``MISSING`` placeholder for a None node."""
    return MISSING if node is None else node.text


def _safe_folder_name(raw):
    """Turn a scraped title into a name usable as a single folder."""
    cleaned = _UNSAFE_NAME_CHARS.sub("_", raw).strip().rstrip(". ")
    return cleaned or "_"


def _save_subject(subject_id, dest_root, label):
    """Scrape one detail page and store its cover image and info file.

    A folder named after the title and release date is created under
    ``dest_root``; the cover is written to ``图片.jpg`` and the text fields to
    ``info.text`` inside it.

    Args:
        subject_id: id captured from the search response.
        dest_root: existing folder that receives the per-title folder.
        label: word prefixed to the name/year/summary captions in the info
            file ("电影" or "电视剧").

    Returns:
        True when the title was saved, False when the page had no title and
        was skipped.

    Raises:
        requests.HTTPError: the detail page answered with an error status.
    """
    subject_url = "https://movie.douban.com/subject/" + str(subject_id)
    page = requests.get(subject_url, headers=douabnheader,
                        timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    page.encoding = "utf-8"
    soup = bs4.BeautifulSoup(page.text, "lxml")

    content = soup.find("div", id="content")
    info = _find_in(content, "div", id="info")

    # Without a title there is nothing to name the folder after, so skip.
    title_node = _find_in(_find_in(content, "h1"),
                          "span", property="v:itemreviewed")
    if title_node is None:
        print("跳过(页面缺少标题):" + subject_url)
        return False
    name = title_node.text

    # Every remaining field is optional: a page lacking one gets the
    # placeholder instead of aborting the whole run.
    year_node = _find_in(info, "span", property="v:initialReleaseDate")
    year = _node_text(year_node)
    cast = _node_text(_find_in(_find_in(info, "span", class_="actor"),
                               "span", class_="attrs"))
    summary = _node_text(_find_in(_find_in(content, "div", class_="related-info"),
                                  "span", property="v:summary"))
    director = _node_text(_find_in(info, "a", rel="v:directedBy"))

    # Folder name is title + release date (date omitted when unknown),
    # sanitised so characters such as ":" or "/" cannot break the path.
    year_suffix = "" if year_node is None else year_node.text
    folder = os.path.join(dest_root, _safe_folder_name(name + year_suffix))
    os.makedirs(folder, exist_ok=True)

    # Cover image: saved only when the page has one and the download succeeds,
    # so an HTTP error page is never written out as a .jpg.
    cover = _find_in(_find_in(content, "div", id="mainpic"), "img")
    cover_src = None if cover is None else cover.get("src")
    if cover_src:
        image = requests.get(cover_src, headers=douabnheader,
                             timeout=REQUEST_TIMEOUT)
        if image.ok:
            with open(os.path.join(folder, "图片.jpg"), "wb") as picture:
                picture.write(image.content)
        else:
            print("封面下载失败:" + cover_src)

    # The link caption is "电影链接:" for both kinds, as in the original output.
    info_lines = [
        label + "名称:" + name,
        label + "年份:" + year,
        "导演:" + director,
        "主演:" + cast,
        label + "简介:" + summary.strip(),
        "电影链接:" + subject_url,
    ]
    with open(os.path.join(folder, "info.text"), "w", encoding="utf-8") as out:
        out.write("\n".join(info_lines))
    return True


### Save movies ###
douban_dy_params = {
    'sort': 'U',
    'range': '7,10',             # rating range
    'tags': '电影',               # form: movie, TV series, variety show, ...
    'start': '0',                # paging offset
    'year_range': '2018,2021',   # release-year range
    'genres': '',                # genre, e.g. drama, comedy, action
    'countries': '',             # region, e.g. mainland China, US
}
res = _search_subject_ids(douban_dy_params)
print(res)
for a in res[:SUBJECT_LIMIT]:
    _save_subject(a, Douban_movie, "电影")
print("十部电影已存入")

### Save TV series ###
douban_dsj_params = {
    'sort': 'U',                 # same sort key as the movie query
    'range': '7,10',             # rating range
    'tags': '电视剧',             # form: movie, TV series, variety show, ...
    'start': '0',                # paging offset
    'year_range': '2018,2021',   # release-year range
    'genres': '',                # genre, e.g. drama, comedy, action
    'countries': '',             # region, e.g. mainland China, US
}
res_dsj = _search_subject_ids(douban_dsj_params)
for x in res_dsj[:SUBJECT_LIMIT]:
    _save_subject(x, Douban_series, "电视剧")
print("十部电视剧已存入")