#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download every photo from a Douban photo album.

Rewritten for Python 3.  The regex patterns in the original source had
been mangled by a WYSIWYG blog editor (backslashes turned into forward
slashes, quoting destroyed); they are reconstructed here from context.

NOTE(review): regex-scraping HTML is fragile by nature -- an HTML parser
(html.parser / lxml) would be far more robust.  The Douban markup this
targets is ancient, so the reconstructed patterns should be re-verified
against live pages before real use.
"""
import os
import re
import urllib.request


def get_html_data(url):
    """Fetch *url* and return the response body as text.

    Uses a context manager so the connection is always closed; decodes
    as UTF-8 with replacement so a stray byte cannot abort the crawl.
    """
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8", errors="replace")


def get_jpg_source(url, file_name):
    """Locate the JPEG on the photo page *url* and save it to *file_name*.

    Tries the img*.douban.com host first, then falls back to the
    t.douban.com thumbnail host.  Non-greedy ``*?`` takes the FIRST
    image match; the original's greedy ``*`` ran to the last ``jpg" />``
    on the page and needed convoluted re-matching to shorten the result.
    """
    data = get_html_data(url)
    pattern = re.compile(r'(?<=<img src="http://img)[\w\W]*?(?=jpg" />)',
                         re.IGNORECASE | re.S)
    matches = pattern.findall(data)
    if matches:
        jpg_url = "http://img" + matches[0] + "jpg"
    else:
        # Fallback host for pages that only expose the thumbnail server.
        pattern = re.compile(r'(?<=<img src="http://t.douban)[\w\W]*?(?=jpg" />)',
                             re.I | re.S)
        matches = pattern.findall(data)
        if not matches:
            # Original message: "这个真的下载不了!!"
            print("This one really cannot be downloaded!!")
            return
        jpg_url = "http://t.douban" + matches[0] + "jpg"
        print(jpg_url)
    urllib.request.urlretrieve(jpg_url, file_name)


def first_get_source(url):
    """Return the photo-page URLs found on one album page at *url*.

    Photo ids are the digits in links of the form
    ``.../photos/photo/<digits>"``; ``[0-9]+`` (not ``*``) so an empty
    match can never produce a bogus URL.
    """
    data = get_html_data(url)
    ids = re.findall(r'(?<=com/photos/photo/)[0-9]+(?=")', data, re.I | re.S)
    return ["http://www.douban.com/photos/photo/" + pid for pid in ids]


def get_photo(url, dir):
    """Crawl the album starting at *url* and download everything into *dir*.

    Follows the ``rel="next"`` pagination link, collecting photo-page
    URLs from every album page, then hands the list to download_all().
    """
    photo_list = list(first_get_source(url))
    data = get_html_data(url)
    # Pagination link prefix: href value up to (but excluding) "start=N".
    next_pattern = re.compile(r'(?<=rel="next" href=")[\w\W]*?(?=start=[0-9]*")',
                              re.S | re.I)
    page = 1
    while check_next_page(data):
        found = next_pattern.findall(data)
        if not found:
            break  # next-link present but unparsable; stop rather than loop forever
        url = found[0] + "start=" + str(18 * page)  # Douban shows 18 photos per page
        page += 1
        data = get_html_data(url)
        photo_list.extend(first_get_source(url))
    download_all(photo_list, dir)


def check_next_page(data):
    """Return True if the page *data* contains a rel="next" pagination link."""
    return re.search(r'rel="next"', data, re.S | re.I) is not None


def download_all(photo_list, dir):
    """Download each photo page in *photo_list* into the directory *dir*.

    os.path.join fixes the original's ``dir+str(i)`` concatenation, which
    dropped files outside the directory unless *dir* ended with a slash;
    makedirs(exist_ok=True) also creates missing parent directories.
    """
    os.makedirs(dir, exist_ok=True)
    for i, page_url in enumerate(photo_list):
        get_jpg_source(page_url, os.path.join(dir, str(i) + ".jpg"))


if __name__ == "__main__":
    url = input("input the url: ")
    location = input("input the location: ")
    get_photo(url, location)

# (original author's note, translated) I don't know how to use an HTML
# parser, so it could only be written in this ugly way.
以后要学学如何解析 HTML 文档了。