Recently I needed to build a recognizer for a particular class of images, which calls for a large number of pictures, so I collected them with a Python crawler.
The latest version of the code — good quality and fast:
import re
import os
import requests
# Fetch the page source of the search endpoint
def get_html(url, headers, params):
    response = requests.get(url, headers=headers, params=params)
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    else:
        print("Failed to fetch the page source")
# Pull the thumbnail URLs out of the JSON response
def parse_pic_url(html):
    result = re.findall('thumbURL":"(.*?)"', html, re.S)
    return result

# Download the binary content of a single image
def get_pic_content(url):
    response = requests.get(url)
    return response.content
# Save the image bytes as <fold_name>/<pic_name>.jpg
def save_pic(fold_name, content, pic_name):
    with open(fold_name + "/" + str(pic_name) + ".jpg", "wb") as f:
        f.write(content)
# Create the download folder
def create_fold(fold_name):
    try:
        os.mkdir(fold_name)
    except FileExistsError:
        print("Folder already exists")
def main():
    # The folder name doubles as the search keyword
    fold_name = input("Enter the name of the images you want to crawl: ")
    # Create the folder
    create_fold(fold_name)
    # Number of result pages to crawl (30 images per page)
    page_name = input("Enter the number of pages to crawl (0,1,2,3,4......): ")
    pic_name = 0
    for i in range(int(page_name)):
        # Base endpoint only; the full query is supplied via `params` below
        url = "https://image.baidu.com/search/acjson"
        headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Cookie": "winWH=%5E6_1659x838; BDIMGISLOGIN=0; BDqhfp=%E5%A4%A7%E7%86%8A%E7%8C%AB%26%26-10-1undefined%26%268568%26%267; BIDUPSID=84AA588D485BC5D9748C16152F786E4A; PSTM=1664863489; BDUSS=9UelhFRmVxQ2FYRURpM2hnanRSb09DcE5BcDFIYmdhM25DSXd3bWFMLX5mbWhqRVFBQUFBJCQAAAAAAAAAAAEAAABc%7EUGiAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL%7ExQGO%7E8UBjc2; BDUSS_BFESS=9UelhFRmVxQ2FYRURpM2hnanRSb09DcE5BcDFIYmdhM25DSXd3bWFMLX5mbWhqRVFBQUFBJCQAAAAAAAAAAAEAAABc%7EUGiAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL%7ExQGO%7E8UBjc2; BAIDUID=AA120298DBC668808E941F202EDAFE7D:FG=1; BAIDUID_BFESS=AA120298DBC668808E941F202EDAFE7D:FG=1; ZFY=ZkM1wYgsnkzHUCE:B8RSn0l9c2wZElo2ztkkXles7ZEQ:C; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; cleanHistoryStatus=0; BDRCVFR[Tp5-T0kH1pb]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; indexPageSugList=%5B%22%E5%A4%A7%E7%86%8A%E7%8C%AB%22%2C%22%E7%9C%BC%E9%95%9C%E6%A1%86%E7%A2%8E%22%5D; userFrom=null; ab_sr=1.0.1_ZjU4YWMxNDUwYzdmOTA5MzNlOTcwMzU1Y2Q2Yzg5N2EyNDAxYTJmY2E1NGU4MTFjZDYzMDllMmQ1ZTcyYzE2NmJhNTNmY2I3YzAyOWNkZDEzYzhiMmRlMWUxMWEzMTdiNGNkZTEzNTk3N2JiOGY2NjUxZTYyZGYwMTYwNTkzZWI3YWU1MmVmMThhNWU5ZWMwYThkYmIyY2UxNWFhM2RiZg==",
            "Host": "image.baidu.com",
            "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B4%F3%D0%DC%C3%A8&fr=ala&ala=1&alatpl=normal&pos=0&dyTabStr=MTEsMCwxLDYsMyw1LDQsMiw4LDcsOQ%3D%3D",
            "Sec-Ch-Ua": '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
            "X-Requested-With": "XMLHttpRequest",
        }
        params = {
            "tn": "resultjson_com",
            "logid": "11836243366050550448",
            "ipn": "rj",
            "ct": "201326592",
            "fp": "result",
            "fr": "ala",
            "word": fold_name,
            "queryWord": fold_name,
            "cl": "2",
            "lm": "-1",
            "ie": "utf-8",
            "oe": "utf-8",
            "pn": str(int(i + 1) * 30),  # result offset: 30 images per page
            "rn": "30",
            "gsm": "3c"
        }
        html = get_html(url, headers, params)
        result = parse_pic_url(html)
        for item in result:
            pic_content = get_pic_content(item)
            save_pic(fold_name, pic_content, pic_name)
            pic_name += 1
            print("Saving image No. " + str(pic_name))

if __name__ == "__main__":
    main()
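One caveat: `get_pic_content` above calls `requests.get` with no timeout or error handling, so a single dead link can hang or crash the whole run. Here is a minimal hardening sketch of my own (not part of the original script) that adds a timeout and a couple of retries:

import time
import requests

# Sketch: a drop-in replacement for get_pic_content with a timeout and simple
# retries; returns None when the image cannot be fetched.
def get_pic_content(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.content
        except requests.RequestException:
            time.sleep(1)  # brief pause before trying again
    return None

With this version the download loop should check for None (e.g. `if pic_content is not None: save_pic(...)`) so failed images are skipped instead of being written as empty files.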
1. Crawling a specific image site
Main reference: https://www.cnblogs.com/franklv/p/6829387.html
Page crawled: https://www.ivsky.com/search.php?q=%E7%BD%82%E7%B2%9F%E8%8A%B1 (this page has too few images, though)
1.1 Code
Once the required libraries are installed and the target URL is set, the script can be run as-is.
The crawl URL and the image save path can be changed at the marked places in the program.
'''
Purpose: crawl poppy-flower images
Author: 哥
Date: 2019.5.15
Change notes:
'''
import requests
from bs4 import BeautifulSoup
import os
def getHtmlurl(url):  # fetch the page source
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""
def getpic(html):  # extract the image URLs and download them
    soup = BeautifulSoup(html, 'html.parser')
    all_img = soup.find('ul', class_='pli').find_all('img')
    for img in all_img:
        src = img['src']
        img_url = src
        print(img_url)
        root = 'F:/poppy_pic/'  # image save path; change as needed
        path = root + img_url.split('/')[-1]
        try:  # create the folder if needed and download images not already there
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(img_url)
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("File saved")
            else:
                print("File already exists")
        except Exception:
            print("Download failed")
def main():
    url = 'https://www.ivsky.com/search.php?q=%E7%BD%82%E7%B2%9F%E8%8A%B1'  # my search URL for poppy flowers; replace it with your own search URL
    html = getHtmlurl(url)
    getpic(html)
    # Paging is not supported yet; to download more pages, copy in the URLs of
    # page 2, page 3, and so on, as below (a looping sketch follows this script):
    # url = 'https://www.ivsky.com/search.php?q=%E7%BD%82%E7%B2%9F%E8%8A%B1&PageNo=2'
    # html = getHtmlurl(url)
    # getpic(html)

main()
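The manual copy-paste approach above works, but if the site really does paginate with a PageNo query parameter, as the commented-out URL suggests (an assumption I have not verified against ivsky.com), a small loop covers several pages in one run:

# Sketch: crawl several result pages, assuming the site paginates via the
# PageNo query parameter shown in the commented-out URL above.
def main_multi(pages=3):
    base = 'https://www.ivsky.com/search.php?q=%E7%BD%82%E7%B2%9F%E8%8A%B1'
    for page_no in range(1, pages + 1):
        url = base if page_no == 1 else base + '&PageNo=' + str(page_no)
        html = getHtmlurl(url)
        if html:  # getHtmlurl returns "" on failure
            getpic(html)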
Introductions to the libraries used: https://www.cnblogs.com/mzc1997/p/7813801.html
https://blog.csdn.net/qq_42156420/article/details/80784673
2. Crawling Baidu Images
Main reference: https://blog.csdn.net/xiligey1/article/details/73321152
To collect more images I turned to Baidu Images; the code is below.
2.1 Code
'''
Purpose: crawl poppy-flower images from Baidu Images
Author: 哥
Date: 2019.5.15
Change notes:
'''
# -*- coding: utf-8 -*-
"""Download Baidu images for a given search keyword"""
import re
import urllib.parse
import requests
def get_onepage_urls(onepageurl):
    """Return all image URLs on one flip page, plus the URL of the next flip page"""
    if not onepageurl:
        print('Reached the last page, stopping')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # The link text 下一页 is Baidu's "next page" button
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls):
    """Download every image in the given list of URLs"""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
if __name__ == '__main__':
    keyword = '罂粟花'  # search keyword; change it to whatever you would type into Baidu Images
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    fanye_count = 0  # number of flip pages crawled so far
    while 1:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        # print('Page %s' % str(fanye_count))
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    down_pic(list(set(all_pic_urls)))
Running it
The images are saved in the same folder as the .py file.
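If you would rather keep the downloads out of the script's own directory, a small change to down_pic does it. A sketch of my own (the output_dir parameter is an illustrative addition, not part of the original):

import os
import requests

# Sketch: save the downloads into a named subfolder instead of the current
# working directory. output_dir is a hypothetical addition for illustration.
def down_pic(pic_urls, output_dir='baidu_pics'):
    os.makedirs(output_dir, exist_ok=True)  # create the folder if it is missing
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            path = os.path.join(output_dir, str(i + 1) + '.jpg')
            with open(path, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (str(i + 1), pic_url))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i + 1), pic_url))
            print(e)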