import requests
import re
import os
import random
# Step 1: collect the top-level category URLs
# Step 2: collect the sub-category URLs under each category
# Step 3: extract the image addresses and download the images
def get_html(url, timeout=10):
    """Return the decoded response body (text) for *url*.

    ``timeout`` (new, default 10 s, backward-compatible) stops the crawler from
    hanging forever on a stalled connection.  The module-level ``headers`` are
    sent with every request so the site serves the scraper consistently.
    """
    return requests.get(url, headers=headers, timeout=timeout).text
# Desktop-Chrome user agent sent with every request; split across lines via
# implicit string concatenation purely for readability.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.181 Safari/537.36'
)

# HTTP headers used when downloading image bytes: the UA above, the session
# cookie, and a Referer pointing at a picture page the site already served.
headers = {
    'User-Agent': _USER_AGENT,
    'Cookie': (
        'Hm_lvt_c605a31292b623d214d012ec2a737685=1525873877,1525915564,'
        '1525966732,1526005433; '
        'Hm_lpvt_c605a31292b623d214d012ec2a737685=1526006213'
    ),
    'Referer': 'http://www.umei.cc/bizhitupian/diannaobizhi/132330.htm',
}
# --- Scraper entry point -----------------------------------------------------
# Walks www.umei.cc: front-page categories -> sub-categories -> numbered
# picture pages, then downloads each single-image page into
# E:\pycode\<category>\<sub-category>\<name>.jpg.

BASE = 'http://www.umei.cc'
SAVE_ROOT = r'E:\pycode'  # raw string: avoids relying on '\p' not being an escape


def _download(adress, title, last_title, name):
    """Fetch one image URL and write it as <SAVE_ROOT>/<title>/<last_title>/<name>.jpg."""
    content = requests.get(adress, headers=headers).content
    print('content:', content)
    if not content:
        return
    print(title, last_title)
    folder = os.path.join(SAVE_ROOT, str(title), str(last_title))
    print(folder)
    if not os.path.exists(folder):
        os.makedirs(folder)
        print('创建文件夹成功')
    print('开始写入图片')
    with open(os.path.join(folder, name + '.jpg'), 'wb') as f:
        f.write(content)
    print('***********又一张图片下载完成*******************')


def _handle_picture_page(cat_param, sub_param, pic_id, title, last_title):
    """Process one picture page: list multi-page galleries, download single pages."""
    pic_url = BASE + '/' + cat_param + '/' + sub_param + '/' + pic_id + '.htm'
    pic_detail = get_html(pic_url)
    # ids like "132329_2" inside the page mean the gallery spans several pages
    page_ids = set(re.findall(pic_id + r'_\d+', pic_detail))
    if page_ids:
        print('下为多张的照片')
        print(page_ids)
        # TODO(review): gallery pages are only printed, never saved to disk —
        # this mirrors the original behavior; confirm whether they should
        # download too before "fixing" it.
        for w in page_ids:
            print(w)
            # e.g. <a href='/bizhitupian/diannaobizhi/132329_2.htm'>
            page_url = BASE + '/' + cat_param + '/' + sub_param + '/' + w + '.htm'
            print(page_url)
            page = get_html(page_url)
            names = re.findall(r'<strong>(.*?)</strong>', page)
            # BUG FIX: the original wrapped findall in str(set(...)) and then
            # indexed [0], which printed the character '{' instead of a URL.
            adresses = re.findall(r'center.*?(http:.*?\.jpg)', page, re.S)
            if names and adresses:
                print('图片名字为:', names[0])
                print('图片地址为:', adresses[0])
    else:
        print(page_ids)
        print('下为单独的照片')
        names = re.findall(r'<strong>(.*?)</strong>', pic_detail)
        print('图片名字为:', names)
        if not names:
            return
        # re.escape: the picture title may contain regex metacharacters.
        # e.g. <img alt="漂亮的星云壁纸桌面" src="http://i1.umei.cc/uploads/tu/...jpg">
        adresses = re.findall(
            r'alt.*?' + re.escape(names[0]) + r'.*?(http.*?\.jpg)', pic_detail)
        print('***********************************')
        # BUG FIX: the original tested str(set(...)), which is truthy even for
        # an empty result ("set()"), then crashed in split("'")[1]; test the
        # match list itself and index it directly.
        if adresses:
            print(adresses)
            adress = adresses[0]
            print('图片地址为:', adress)
            _download(adress, title, last_title, names[0])


def main():
    """Crawl every picture category on the front page and download its images."""
    print('开始')
    # timeout raised from 1 s to 10 s: one second made the front-page fetch flaky
    index = requests.get(BASE, timeout=10).text
    # picture categories as (url fragment, display name) pairs
    girl_list = re.findall(
        r'<h2><a href="http://www\.umei\.cc/(.*?)/" title=".*?" class="MainNav">(\w+)</a></h2>',
        index)
    print(girl_list)
    for cat_param, title in girl_list:
        try:
            title = str(title)
            print('当前为大分类:', title)
            child_page = get_html(BASE + '/' + cat_param + '/')
            # e.g. <a href="http://www.umei.cc/bizhitupian/diannaobizhi/" title="电脑壁纸">
            sub_list = re.findall(
                r'<a href="http://www\.umei\.cc/.*?/(.*?)/" title="(.*?)">', child_page)
            print(sub_list)
            for sub_param, last_title in sub_list:
                try:
                    print('当前分类为:', last_title)
                    print(last_title, sub_param)
                    sub_page = get_html(BASE + '/' + cat_param + '/' + sub_param + '/')
                    # numeric ids of the picture pages, e.g. .../132330.htm
                    pic_ids = set(re.findall(
                        re.escape(BASE + '/' + cat_param + '/' + sub_param + '/')
                        + r'(\d+)\.htm', sub_page))
                    for pic_id in pic_ids:
                        try:
                            _handle_picture_page(cat_param, sub_param, pic_id,
                                                 title, last_title)
                        except Exception:  # skip one broken picture page, keep crawling
                            continue
                except Exception:  # skip one broken sub-category, keep crawling
                    continue
        except Exception as e:  # report and skip one broken category
            print(e)
            continue


if __name__ == '__main__':
    main()