大家都知道,下载网页时,图片的名称会发生变化;要保留原来的名称,需要做一些处理。可以用正则表达式
抓取你想要的图片:
'(photos.sdgcbbs.com.*?png)|(photos.sdgcbbs.com.*?jpg)|(photos.sdgcbbs.com.*?jpeg)'
上面的正则表示匹配 photos.sdgcbbs.com 下的 png、jpg、jpeg 图片,匹配到之后再进行下一步处理:
def get_imgurl(r):
    """Extract original image file names from the page source ``r``.

    Every ``photos.sdgcbbs.com`` reference ending in ``png``, ``jpg`` or
    ``jpeg`` is located, the ``photos.sdgcbbs.com/img/`` prefix is stripped,
    and what is left is kept only when it is a bare file name (contains no
    ``/``) longer than 20 characters.

    Args:
        r: Page source text to scan.

    Returns:
        list[str]: The matching file names in order of appearance
        (duplicates are kept).
    """
    # Dots in the host name are escaped so they match a literal "." only.
    rep = r'(photos\.sdgcbbs\.com.*?png)|(photos\.sdgcbbs\.com.*?jpg)|(photos\.sdgcbbs\.com.*?jpeg)'
    repg = re.compile(rep)
    names = []
    # With three groups, findall() yields one 3-tuple per match; only the
    # group belonging to the alternative that matched is non-empty.
    for groups in repg.findall(r):
        for matched in groups:
            if matched == '':
                continue
            name = matched.replace('photos.sdgcbbs.com/img/', '')
            # NOTE(review): the 20-character minimum presumably singles out
            # the long generated file names - confirm against real pages.
            if '/' not in name and len(name) > 20:
                names.append(name)
    return names
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import requests
import os
# URL paths of the site sections to crawl, one per line; each is appended to
# the base address and also used as the sub-directory the images go into.
a='''/anquan
/erjian
/huanping
/jianli
/jingji
/xiaofang
/zaojia
/zckjs
/erjian
/zjkjs
/cjkjs
/yaoshi
/yijian'''
# '/erjian' is listed twice above. dict.fromkeys() drops the repeat while
# keeping first-seen order, so that section is not fetched a second time.
b=list(dict.fromkeys(a.split('\n')))
def openurl(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        str: The response body decoded as UTF-8; bytes that cannot be decoded
        are dropped rather than raising.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}
    req = requests.get(url, headers=headers, timeout=timeout)
    r = req.content.decode('utf-8', 'ignore')
    return r
def get_imgurl(r):
    """Find ``photos.sdgcbbs.com`` image references in the page source ``r``.

    Args:
        r: Page source text to scan.

    Returns:
        list[tuple[str, str, str]]: One 3-tuple per match, as produced by
        ``findall`` for a pattern with three groups. The slots correspond to
        the ``png``, ``jpg`` and ``jpeg`` alternatives; only the slot of the
        alternative that matched is non-empty. The matched text has no
        ``http://`` scheme in front of it.
    """
    # Dots in the host name are escaped so they match a literal "." only;
    # unescaped, "photosXsdgcbbsXcom" would have been accepted as well.
    rep = r'(photos\.sdgcbbs\.com.*?png)|(photos\.sdgcbbs\.com.*?jpg)|(photos\.sdgcbbs\.com.*?jpeg)'
    repg = re.compile(rep)
    url_list = repg.findall(r)
    return url_list
def downimg(path, url, timeout=30):
    """Download the image at ``url`` into directory ``path`` under its own name.

    The file name is the part of ``url`` after its last ``/``, so the image
    keeps the name it has on the server. The destination is printed before
    the download starts.

    Args:
        path: Existing directory to write the image into.
        url: Full address of the image.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the destination file cannot be written.
    """
    # Only the text after the last '/' is used as the name. The earlier
    # prefix-stripping step was overwritten before use, so it is dropped.
    img_name = url.rsplit('/', 1)[-1].strip()
    target = path + '/' + img_name
    print(target)
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not save an HTTP error page under an image file name.
        print('skipped %s: HTTP %s' % (url, response.status_code))
        return
    with open(target, 'wb') as f:
        f.write(response.content)
# Local directory that receives one sub-directory per site section.
path='C:\\Users\\Administrator\\Desktop\\work2'
# Base address that each section path is appended to.
wangzhi='http://bd2.sdjiantu.com'
for xx in b:
    wangzhiwz = wangzhi + xx
    # Failures are handled per section and per image, so that one bad page or
    # picture no longer abandons everything that comes after it.
    try:
        os.makedirs(path + xx, exist_ok=True)
        r = openurl(wangzhiwz)
    except Exception as e:
        print(e)
        continue
    for url in get_imgurl(r):
        # Each match is a tuple of groups; the empty ones are skipped.
        for urlr in url:
            if urlr == '':
                continue
            # The matched text carries no scheme, so add one before fetching.
            urlr = 'http://' + urlr
            try:
                downimg(path + xx, urlr)
            except Exception as e:
                print(e)