Python爬虫--Python爬虫爬取Demo图片数据
GitHub:https://github.com/kevinten10/Lifecat-Python
pixabay是优秀的图片搜索网站:https://pixabay.com/
下载情况:
version1.1快速实现(单线程操作):
# coding: utf-8
from urllib.request import urlretrieve
import requests
import re
# Request headers -- pretend to be a regular browser.
# Fix: the header name must be 'User-Agent' (hyphen); the original
# 'User_Agent' key was sent verbatim and ignored by the server, so the
# browser spoofing never took effect.
headers = {
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
def getHtml(url):
    """
    Fetch a page and return the response.

    :param url: page url to request
    :return: requests Response object
    """
    # module-level `headers` carries the cookie / user-agent spoofing
    return requests.get(url, headers=headers)
def getImg(html):
    """
    Extract .jpg image urls from a page with a regex.

    Bug fix: the original set a loop-local variable to ``None`` for urls
    longer than 100 characters but still returned the unfiltered list, so
    the length check did nothing. Over-long matches are now actually
    dropped before the list is returned.

    :param html: response object exposing a ``.text`` attribute
    :return: list of .jpg urls, each at most 100 characters long
    """
    # raw string so the backslash escape reaches the regex engine intact;
    # non-greedy so each match stops at the first ".jpg"
    imgre = re.compile(r'https.+?\.jpg')
    # drop implausibly long matches -- the non-greedy regex can still glue
    # unrelated markup onto a url when no ".jpg" follows closely
    imglist = [url for url in imgre.findall(html.text) if len(url) <= 100]
    for imgurl in imglist:
        print(imgurl)
    print('过滤前共有total:', len(imglist), 'urls')
    return imglist
def img_filter(imgs):
    """
    Deduplicate the url list.

    On this site every image url appears three times in a row, so keeping
    one entry out of every three removes the duplicates.

    :param imgs: raw url list (each url repeated three times back to back)
    :return: deduplicated url list
    """
    fimgs = [img for position, img in enumerate(imgs, 1) if position % 3 == 0]
    print('过滤后共有total:', len(fimgs), 'urls')
    return fimgs
def download(imgurls, count, path=r"D://Python/download/"):
    """
    Download each image url to a local file.

    Fixes: the bare ``except:`` also swallowed KeyboardInterrupt/SystemExit
    and hid the failure reason -- it now catches only the errors urlretrieve
    can raise and reports them. The target directory is also a parameter
    now (with the original value as the default, so callers are unchanged).

    :param imgurls: iterable of image urls
    :param count: index of the current page -- part of the file name so a
                  later page does not overwrite an earlier one
    :param path: target directory; must already exist
    :return: None
    """
    name = 'page' + str(count) + '_image'
    x = 0
    for url in imgurls:
        try:
            urlretrieve(url, path + name + str(x) + '.jpg')
            # only advance the index on success so file numbers stay dense
            x += 1
        except (OSError, ValueError) as err:
            # OSError covers URLError/HTTPError/ContentTooShortError;
            # ValueError covers malformed urls
            print('下载失败:', err)
if __name__ == '__main__':
    # Download images from pixabay.
    # ?q=<keyword typed into the search box>
    print('不要忘记填写headers中的cookie...')
    word = input("请输入搜索关键词:")
    pages = input("请输入要爬取多少页(1页约100张):")
    total_pages = int(pages)
    # one search-result url per page
    base = "https://pixabay.com/zh/photos/?q=" + word + "&pagi={}"
    urls = [base.format(page_no) for page_no in range(1, total_pages + 1)]
    # fetch every page, reporting progress as we go
    htmls = []
    for fetched, url in enumerate(urls, 1):
        print('已获取', fetched, '页/', total_pages)
        htmls.append(getHtml(url))
    # scrape, deduplicate and download page by page
    for count, html in enumerate(htmls, 1):
        imgs = getImg(html)
        # the same image url appears three times -> keep one of the three
        fimgs = img_filter(imgs)
        download(fimgs, count)