import requests
import json
import os
# HTTP headers passed as ``headers=`` to ``requests.get``; the User-Agent
# string identifies the client as mobile Chrome on an Android Nexus 5.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.162 Mobile Safari/537.36"
    ),
}
class douban:
    """Collect show titles and cover images from one JSON search-result URL.

    ``get_sourcename`` reads the titles and cover URLs out of the JSON
    response; ``get_source`` then downloads every collected cover into
    ``dir``.

    Attributes:
        url: Address of the JSON endpoint to query.
        dir: Directory the cover images are written into.
        TV_name: Titles collected by ``get_sourcename``.
        TV_picture: Cover-image URLs, parallel to ``TV_name``.
    """

    # Seconds to wait on each HTTP request before giving up, so a stalled
    # connection cannot hang the whole run.
    _TIMEOUT = 10

    # Characters removed from a title before it is used as a file name:
    # the ones Windows rejects in file names, plus spaces.
    _STRIP_TABLE = str.maketrans("", "", '<>/\\ :*?"|')

    def __init__(self, url, dir):
        """Store the endpoint URL and target directory; start with no results.

        Args:
            url: Address of the JSON endpoint queried by ``get_sourcename``.
            dir: Directory that ``get_source`` writes the images into.
        """
        self.url = url
        self.dir = dir
        self.TV_name = []
        self.TV_picture = []

    @staticmethod
    def _clean_filename(title):
        """Return ``title`` with spaces and file-name-hostile characters removed."""
        return title.translate(douban._STRIP_TABLE)

    def _target_path(self, title):
        """Return the ``.jpg`` path inside ``self.dir`` for the given title."""
        # os.path.join picks the platform's separator instead of a
        # hard-coded backslash, and the directory comes from the instance
        # rather than from whatever global happens to be called ``dir``.
        return os.path.join(self.dir, self._clean_filename(title) + ".jpg")

    def get_sourcename(self):
        """Fetch the JSON page and record every entry's title and cover URL.

        Returns:
            The number of entries found under ``"subjects"`` (0 when empty).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the response lacks ``"subjects"``, or an entry
                lacks ``"title"`` or ``"cover"``.
        """
        print("开始获取剧名和剧照................")
        response = requests.get(self.url, headers=header, timeout=self._TIMEOUT)
        # Fail with the HTTP status rather than an obscure JSON decode error.
        response.raise_for_status()
        subjects = response.json()["subjects"]
        count = len(subjects)
        print(count)
        if count == 0:
            return 0
        for subject in subjects:
            self.TV_name.append(subject["title"])
            self.TV_picture.append(subject["cover"])
        print("获取了", count, "张剧照!")
        return count

    def get_source(self):
        """Download every collected cover image into ``self.dir``.

        Each file is named after its cleaned-up title with a ``.jpg``
        suffix. A cover whose download returns an error status is reported
        and skipped, so an error page is never saved as an image.
        """
        print("开始保存到本地................")
        for title, cover_url in zip(self.TV_name, self.TV_picture):
            # Send the same headers as the JSON request.
            response = requests.get(
                cover_url, headers=header, timeout=self._TIMEOUT
            )
            if not response.ok:
                print("下载失败:", cover_url)
                continue
            with open(self._target_path(title), "wb") as file:
                file.write(response.content)
            print("保存成功!")
if __name__ == "__main__":
    # Interactive entry point: ask for a show category and a folder tag,
    # then fetch result pages one after another, saving each page's cover
    # images, until a page comes back with fewer entries than a full page.

    # Entries requested per page; a shorter page ends the crawl.
    PAGE_SIZE = 20
    url = (
        "https://movie.douban.com/j/search_subjects"
        "?type=tv&tag={tag}&page_limit={limit}&page_start={start}"
    )

    category = input("请输入电视的种类(热门 美剧 英剧 韩剧 日剧 国产剧 港剧 日本动画 综艺 纪录片):")
    folder_tag = input("请输入图片存储的目录:")

    # NOTE(review): `dir` shadows the builtin; before renaming it, check
    # that nothing else in the file reads this module-level name.
    dir = "douban_{}_dramas".format(folder_tag)
    # makedirs also creates missing parent directories and tolerates an
    # existing target, so input containing a path separator still works.
    os.makedirs(dir, exist_ok=True)

    page = 1
    print("开始获取:")
    while True:
        print("第", page, "次获取:")
        page_url = url.format(
            tag=category, limit=PAGE_SIZE, start=(page - 1) * PAGE_SIZE
        )
        print(page_url)
        db = douban(page_url, dir)
        fetched = db.get_sourcename()
        db.get_source()
        if fetched < PAGE_SIZE:
            print("获取结束!")
            # Every earlier page was full, so only the last one is partial.
            print("总共获取了", (page - 1) * PAGE_SIZE + fetched, "张剧照")
            break
        page += 1
# A dead-simple Python web-crawler example
# (Blog page residue: latest recommended article published 2024-07-27 12:20:46)