豆瓣实战1

最新推荐文章于 2024-10-08 14:17:48 发布

xuexilangren1

最新推荐文章于 2024-10-08 14:17:48 发布

阅读量126

点赞数

分类专栏： python 文章标签：爬虫 python

本文链接：https://blog.csdn.net/xuexilangren1/article/details/89600787

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

豆瓣实战（1）

import requests
import bs4
import json
import datetime
import os
import sys
import urllib

#建立文件夹函数
def mkdir(path):
    path=path.strip()					#去掉文件夹名两边的空格	
    isExists=os.path.exists(path)		#判断文件夹是否存在
    if not isExists:
        os.makedirs(path)				#不存在则创建
        print("successful")
    else:
        print("content is exits，needn't download!")
        sys.exit(0)
        
#保存图片
def saveImage(url,name):				
    img=requests.get(url)				#请求图片链接
    file_name=name+".jpg"				#文件名
    f=open(file_name,"ab")				#打开文件
    f.write(img.content)				#写入文件
    f.close()
    
def set(**d):		·					#判断图片是否加载完毕
    for i in d:							#分析最后一次json请求返回的headers，找到差异作为判断条件
        if i=="Content-Length":
            return True
    
    return False    

#下载
def download():
	with open("d:\\python_study\\BeautifulPicture\\movie.txt","a",encoding="utf8") as f:#打开文件，以追加的方式
        for i in json:
            #saveImage(i['cover_url'],i['title'])					#保存图片
            for j in i.values():									#遍历字典的values
                if isinstance(j,list):								#判断values的类型
                    f.write(("|".join(j))+",")
                    
                elif isinstance(j,int):
                    f.write(str(j)+",")
                    
                else :
                    f.write(str(j)+",")
                    
            f.write("\n")

os.chdir("d:\python_study\BeautifulPicture")						#切换到父目录
path=datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")			#生成文件夹名，以系统时间为文件夹名
mkdir(path)															#生成文件夹
os.chdir("d:\\python_study\\BeautifulPicture\\"+path)				#切换到目录

headers={
    'Host': 'movie.douban.com',
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85%E7%89%87&type=11&interval_id=100:90&action='

}

referer="https://movie.douban.com"
url="https://movie.douban.com/j/chart/top_list?type=11&interval_id=100:90&action=&start=600&limit=20"


#print(type(json))


i=0

while True:
    url="https://movie.douban.com/j/chart/top_list?type=11&interval_id=100:90&action=&start="+str(i)+"&limit=20"
    response=requests.get(url,headers=headers)
    json=response.json()
    d=response.headers
    if set(**d):
        download()
        break
    else:
        download()
        
    i=i+20