首先感谢刘鸿亮的文章《python爬虫之图片爬虫》,我在学习过程中采用了其中的思路和代码,特此声明!
文章链接:python爬虫之图片爬虫
下面是我的代码。初次学习,如有不足之处,还望见谅。
爬取图片的目标网站:天堂图片网
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
#获取页面数据
def get_data(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response on success.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests`` response object when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: On connection errors or when the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response
    return None
#获取不同类别图片的链接
def get_category(url):
    """Collect the category links found on the page at ``url``.

    Every ``<div class="kw">`` on the page contributes the ``href`` of its
    first ``<a>`` element, resolved against ``url``.

    Args:
        url: Address of the page that lists the categories.

    Returns:
        A list of absolute category URLs. The list is empty when the page
        could not be fetched with status 200.
    """
    data = get_data(url)
    if data is None:
        # get_data() yields None for any non-200 answer; nothing to parse.
        return []

    soup = BeautifulSoup(data.text, 'lxml')
    category_it = []
    for div in soup.find_all('div', class_='kw'):
        anchor = div.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A div without a usable link is skipped instead of crashing.
            continue
        # urljoin avoids the "//" that plain concatenation produces when
        # the base ends with "/" and the href starts with "/".
        category_it.append(urljoin(url, href))
    return category_it
#获取不同组图片链接
def get_group(group):
    """Collect the picture-group links found on a category page.

    The page text is scanned for paths of the form ``/tupian/<word>/`` and
    each match is resolved against ``group``, so the function no longer
    depends on a module-level ``url`` variable being defined.

    Args:
        group: Address of the category page.

    Returns:
        A list of absolute URLs built from the first 18 matches. The list
        is empty when the page could not be fetched with status 200.
    """
    data = get_data(group)
    if data is None:
        # get_data() yields None for any non-200 answer; nothing to scan.
        return []

    # NOTE(review): the limit of 18 is inherited from the original code;
    # presumably it matches the page layout - confirm.
    paths = re.findall(r'/tupian/\w*?/', data.text)[:18]
    return [urljoin(group, path) for path in paths]
#获取图片链接
def get_web(web):
    """Collect the picture-page links of one group.

    Requests ``index_2.html`` through ``index_5.html`` below ``web`` and,
    on each page, takes the ``href`` of the first ``<a>`` inside every
    ``<div class="il_img">``.

    NOTE(review): the first listing page is never requested - confirm
    that starting at page 2 is intended.

    Args:
        web: Address of the group; the page file name is appended to it
            directly, so it is expected to end with ``/``.

    Returns:
        A list of absolute URLs. Pages that cannot be fetched with
        status 200 contribute nothing.
    """
    web_it = []
    for page_no in range(2, 6):
        page_url = web + "index_" + str(page_no) + ".html"
        data = get_data(page_url)
        if data is None:
            # A group may have fewer pages than requested; skip the gap
            # instead of failing on ``None.text``.
            continue
        soup = BeautifulSoup(data.text, 'lxml')
        for div in soup.find_all('div', class_='il_img'):
            anchor = div.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                # Resolve against the page itself rather than a global
                # base URL, which only exists when run as a script.
                web_it.append(urljoin(page_url, href))
    return web_it
#下载图片
def download(down_url, dir):
    """Save the images referenced on ``down_url`` into ``img_folder_<dir>``.

    The folder is created below the current working directory. Images are
    taken from the ``<img src>`` of every ``<div class="il_img">`` and
    written as ``pic_1.jpg``, ``pic_2.jpg``, ... in page order. The
    working directory that was active on entry is restored before
    returning, so repeated calls create sibling folders instead of
    nesting each one inside the previous.

    Args:
        down_url: Address of the page that shows the pictures.
        dir: Suffix for the target folder name (a string).

    Raises:
        requests.RequestException: If fetching ``down_url`` itself fails
            at the network level.
    """
    data = get_data(down_url)
    if data is None:
        print("Page not available: " + down_url)
        return

    soup = BeautifulSoup(data.text, 'lxml')
    containers = soup.find_all('div', class_='il_img')

    start_dir = os.getcwd()
    get_dir("img_folder_" + dir)
    try:
        num = 1
        for div in containers:
            img_tag = div.img
            src = img_tag.get('src') if img_tag is not None else None
            if not src:
                print("Picture not exist!")
                continue

            # Fetch first, so a failed request leaves no empty file.
            # urljoin also copes with relative and scheme-less sources.
            try:
                img = get_data(urljoin(down_url, src))
            except requests.RequestException:
                img = None
            if img is None:
                print("Picture not exist!")
                continue

            try:
                with open("pic_%s.jpg" % num, 'wb') as p:
                    p.write(img.content)
            except OSError:
                print("Picture could not be saved!")
                continue
            num += 1
    finally:
        # Leave the image folder even when something above raised.
        os.chdir(start_dir)
# 创建并进入文件夹
def get_dir(dir):
    """Enter sub-directory ``dir`` of the current directory, creating it first if needed.

    Args:
        dir: Name of the directory, interpreted relative to the current
            working directory.
    """
    target = os.path.join(os.path.abspath('.'), dir)
    if not os.path.exists(target):
        os.mkdir(dir)
    # The change of directory happens whether or not the folder was new.
    os.chdir(target)
# 退出当前文件夹
def return_dir():
    """Change the working directory to the parent of the current one."""
    here = os.path.abspath(".")
    os.chdir(os.path.dirname(here))
def go(url):
    """Walk categories, groups and picture pages, downloading as it goes.

    For the N-th category a ``category_N`` folder is entered, and inside
    it a ``group_M`` folder for the M-th group. Every picture page of a
    group is handed to ``download`` together with its 1-based position
    (as a string). After a group, and again after a category, the working
    directory is moved up one level.

    Args:
        url: Address of the site's start page, passed to ``get_category``.
    """
    for category_no, category in enumerate(get_category(url), start=1):
        get_dir("category_" + str(category_no))
        for group_no, web in enumerate(get_group(category), start=1):
            get_dir("group_" + str(group_no))
            print("The group is: " + str(group_no))
            for page_no, page in enumerate(get_web(web), start=1):
                download(page, str(page_no))
            return_dir()
        return_dir()
if __name__ == "__main__":
    # Script entry point: everything is stored below ./test1.
    # ``url`` is bound at module level here, so keep the name stable;
    # code elsewhere in the file may read it as a global.
    url = "http://www.ivsky.com/"
    get_dir("test1")
    go(url)