import os
import re
import sys
import time
from urllib.parse import urljoin

import bs4
import requests
def check_url(url, domain_url):
    """Normalize an image URL found in a page into an absolute URL.

    Args:
        url: The raw ``src`` value (absolute, protocol-relative, or relative).
        domain_url: The URL of the page the image was found on.

    Returns:
        An absolute ``http(s)`` URL for the image.
    """
    # Already absolute — return unchanged.
    if url.startswith(('http://', 'https://')):
        return url
    # Protocol-relative ("//host/path") — default to http, matching the
    # original crawler's behavior.
    if url.startswith('//'):
        return 'http://' + url[2:]
    # Relative or root-relative path: resolve against the page URL.
    # Plain string concatenation mis-joined these (e.g. "img/x.png" onto
    # ".../subject/1" produced ".../subject/1img/x.png"); urljoin resolves
    # both "/abs/path" and "rel/path" correctly. The trailing '/' makes the
    # page URL act as a directory for relative paths.
    return urljoin(domain_url + '/', url)
def replace_filename(filename):
    """Sanitize *filename* for use on disk by removing invalid characters.

    Strips every character that is illegal in Windows filenames
    (``\\ / : * ? " < > |``). The original regex instead *truncated* the
    name at an invalid character — and, with several invalid characters,
    its greedy group still kept earlier ones (``"a:b:c"`` -> ``"a:b"``);
    it also never actually matched a backslash (``\\/`` in a character
    class is just ``/``).

    Args:
        filename: A candidate file name (typically a URL basename).

    Returns:
        The name with all invalid characters removed; unchanged if it was
        already clean.
    """
    return re.sub(r'[\\/:*?"<>|]', '', filename)
def now():
    """Return the current Unix time, rounded to a whole number of seconds."""
    return round(time.time())
def get_path():
    """Return the relative directory where downloaded images are saved."""
    target_dir = os.path.join('c', 'img')
    return target_dir
def download_img(url):
    """Download every ``<img>`` on the page at *url* into the local image dir.

    Fetches the page, extracts each image's ``src``, resolves it to an
    absolute URL, and streams it to ``get_path()``. Name collisions are
    avoided by prefixing a Unix timestamp. Images whose request fails are
    skipped rather than aborting the whole run.

    Args:
        url: The page URL to scrape (a single trailing '/' is stripped so
            relative-URL resolution treats it consistently).

    Raises:
        requests.HTTPError: If fetching the *page itself* fails.
    """
    os.makedirs(get_path(), exist_ok=True)
    if url[-1] == '/':
        url = url[:-1]
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Explicit parser: omitting it emits a warning and makes behavior
    # depend on which parsers happen to be installed.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    print('开始下载图片...')
    for tag in soup.select('img'):
        # BUG FIX: hasattr(tag, 'src') asked bs4 for a *child tag* named
        # <src>, not the HTML attribute, so it was False for normal <img>
        # tags and nothing was ever downloaded. Use .get() and skip tags
        # with no/empty src.
        img_url = tag.get('src')
        if not img_url:
            continue
        img_url = check_url(img_url, url)
        img_name = replace_filename(os.path.basename(img_url))
        img_path = os.path.join(get_path(), img_name)
        if os.path.exists(img_path):
            # Avoid overwriting an existing file: prefix a timestamp.
            img_name = str(now()) + img_name
            img_path = os.path.join(get_path(), img_name)
        print('开始解析图片地址:', img_url)
        try:
            with requests.get(img_url, stream=True, timeout=30) as img_res:
                # Don't write an HTML error page to disk as an "image".
                img_res.raise_for_status()
                with open(img_path, 'wb') as fd:
                    for block in img_res.iter_content(8192):
                        fd.write(block)
        except requests.RequestException as exc:
            # Best-effort: skip this image, keep downloading the rest.
            print('下载图片失败:', img_url, exc)
            continue
        print('下载图片完成:', img_name)
    print('Done')
if __name__ == '__main__':
    # Demo scrape — run only when executed as a script, not on import.
    download_img('https://book.douban.com/subject/10779534/')
# Simple image-scraping crawler.
# (Blog-article residue, kept as a comment so the file parses:
#  "最新推荐文章于 2023-10-09 13:12:32 发布" — latest recommended article
#  published 2023-10-09 13:12:32.)