坑了这么久,现在填上。
运行环境:Windows 10,Python 3。
之前爬过一些图片网站:https://images.pexels.com 和 https://unsplash.com。根据网上找的一些资料和自己的摸索,现在贴出代码。
import requests
import re
import os
import time
def get_url(url):
    """Fetch *url* with a desktop User-Agent and return the Response.

    Returns None when the request fails (network error or non-2xx
    status), after printing a diagnostic — callers must check for None.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        # timeout so a stalled connection cannot hang the whole crawl
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        # Use the encoding sniffed from the body so .text decodes correctly.
        r.encoding = r.apparent_encoding
        return r
    except requests.RequestException:
        # Narrow except: only request/HTTP failures, not arbitrary bugs.
        print('wrong!!!!!!!!!!!')
        return None
def get_photourl(photo_url):
    """Fetch the image at *photo_url* and return the Response.

    Returns the string ``'wrong'`` on any request/HTTP failure; ``save``
    relies on that sentinel, so the return contract is kept unchanged.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        # timeout so one dead image URL cannot stall the crawl
        r = requests.get(photo_url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except requests.RequestException:
        # Narrow except: only request/HTTP failures, not arbitrary bugs.
        return 'wrong'
def get_photos(url, new_fpath):
    """Scrape one Pexels results page and save every photo it links.

    url:       a search-results page URL.
    new_fpath: directory path passed through to ``save``.
    """
    result = get_url(url)
    if result is None:
        # Page fetch failed (get_url already printed a diagnostic);
        # the original crashed here on result.text.
        return
    # Thumbnails are served from images.pexels.com; the real downloadable
    # file lives under static.pexels.com with the same id and name.
    pattern = re.compile(
        r'src="https://images.pexels.com/photos/(\d+)/(.*?)'
        r'\?h=350&auto=compress&cs=tinysrgb"',
        re.S)
    for photo_id, photo_name in re.findall(pattern, result.text):
        photo_url = 'https://static.pexels.com/photos/' + photo_id + '/' + photo_name
        print(photo_url)
        try:
            save(photo_url, (photo_id, photo_name), new_fpath)
        except Exception:
            # Best effort: one broken photo must not stop the whole page.
            continue
        time.sleep(1)  # be polite to the server between downloads
def makedir(new_fpath, i, key):
    """Create *new_fpath* and chdir into it, logging the result.

    i, key: only used to build the log message (page index / keyword).
    Uses EAFP instead of exists()+makedirs() to avoid the check/create race.
    """
    try:
        os.makedirs(new_fpath)
    except FileExistsError:
        print('文件夹已存在!')
        return
    # NOTE(review): chdir is a process-wide side effect; save() builds
    # absolute-ish paths itself, so this is only kept for compatibility.
    os.chdir(new_fpath)
    print('文件夹' + key + '_page' + str(i + 1) + '创建成功!')
def save(photo_url, item, new_fpath):
    """Download *photo_url* and write it under *new_fpath*.

    item: (photo_id, file_name) pair parsed from the page; the two parts
          are concatenated to form the local file name.
    Skips the write when the download failed or the file already exists.
    """
    final_fpath = new_fpath + '/' + str(item[0]) + str(item[1])
    print('正在下载图片......')
    result = get_photourl(photo_url)
    if result == 'wrong':
        # Download failed — the original fell through and tried
        # result.content anyway, relying on a bare except to hide it.
        print('失败')
        return
    print('下载成功!')
    if os.path.exists(final_fpath):
        print('图片已存在')
        return
    try:
        with open(final_fpath, 'wb') as f:
            f.write(result.content)
    except OSError:
        # Narrow except: disk/path problems only.
        print('下载失败!')
def main():
    """Prompt for a keyword and page count, then crawl each results page.

    Page 1 uses the bare search URL; later pages append ``?page=N``.
    """
    key = input('请输入搜索关键词(英文):')
    url = 'https://www.pexels.com/search/' + key + '/'
    num = int(input('请输入一共要下载的页数:'))  # 默认从第1页开始下载
    # NOTE(review): placeholder — set this to a real base directory.
    fpath = '*****'
    # Was range(2, num): that skipped pages 1-3, downloaded one page too
    # few, and made the else-branch below unreachable. range(num) matches
    # the "download pages 1..num" prompt.
    for i in range(num):
        new_fpath = fpath + '/Photo2.0/' + key + '_page' + str(i + 1)
        makedir(new_fpath, i, key)
        if i >= 1:
            new_url = url + '?page=' + str(i + 1)
            print(new_url)
            get_photos(new_url, new_fpath)
        else:
            # First page has no ?page= query parameter.
            get_photos(url, new_fpath)
        time.sleep(3)  # pause between pages to avoid hammering the site


if __name__ == '__main__':
    main()
爬取 https://unsplash.com 的时候需要利用 selenium 模拟下拉操作:先 pip 安装 selenium,再下载与本机 Chrome 版本匹配的 chromedriver,放在谷歌浏览器的安装目录(Chrome\Application)下。
from selenium import webdriver#实现自动下拉
from lxml import etree#定位元素(更加高效)
from urllib.parse import urlparse#解析图片的名称
import urllib.request#urlretrieve()下载保存图片
import re
import time
class Unsplash:
    """Scrape image URLs from an unsplash.com search page with Selenium.

    The page lazy-loads images, so the driver scrolls to the bottom a
    number of times before the HTML is parsed; matched images are then
    downloaded with urllib.
    """

    def __init__(self):
        # Search-results page to open.
        self.url = 'https://unsplash.com/search/photos/label'
        # NOTE(review): placeholder — set to a real directory (with
        # trailing separator, since save_img concatenates the name).
        self.save_path = "****"
        # Requires a chromedriver matching the installed Chrome version.
        self.driver = webdriver.Chrome()

    def do_scroll(self, times):
        """Open self.url, scroll to the bottom *times* times, and return
        the resulting page source (a string)."""
        driver = self.driver
        driver.get(self.url)
        for i in range(times):
            print('正在下拉' + str(i + 1) + '次:')
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print('等待' + str(i + 1) + '次网页加载')
            # NOTE(review): 40 s per scroll (13+ min for 20 scrolls) is
            # very conservative — confirm and tune down if possible.
            time.sleep(40)
        return driver.page_source

    def save_img(self, src, img_name):
        """Download *src* and store it as save_path + img_name."""
        urllib.request.urlretrieve(src, filename=self.save_path + img_name)

    def get_pic(self, html):
        """Extract every unsplash photo URL from *html* and save each
        one as 1.jpg, 2.jpg, ... in order of appearance."""
        pattern = re.compile(r'img src="https://images.unsplash.com/photo(.*?)"', re.S)
        items = re.findall(pattern, html)
        # Was: count started at 1 and was incremented BEFORE saving, so
        # files were named 2.jpg..n+1.jpg and 1.jpg never existed.
        for count, tail in enumerate(items, start=1):
            url = 'https://images.unsplash.com/photo' + tail
            self.save_img(url, str(count) + '.jpg')

    def main(self):
        """Scroll 20 times, then download everything that was loaded."""
        html = self.do_scroll(20)
        print("开始下载图片")
        self.get_pic(html)
if __name__ == '__main__':
    # Guarded so importing this module does not launch a Chrome window.
    img = Unsplash()
    img.main()