代码运行时如图
运行后打开目标文件夹
然后点开
下面来看看怎么实现图片爬取的
工具准备
pycharm+BeautifulSoup+requests+threading
具体代码
import requests
import random
import os
from bs4 import BeautifulSoup
import threading
class crawler_pic(threading.Thread):
begin_index = 0
end_index = 0
grads = 20
base_url = "http://www.win4000.com/wallpaper_big_154{}.html"
file_root = "D://pics_multi//"
UA = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]
headers = {
"User-Agent": random.choice(UA)
}
def __init__(self, name, begin):
threading.Thread.__init__(self)
self.name = name
self.begin_index = begin
self.end_index = begin + self.grads
def get_html(self, url):
try:
HTML = requests.get(url,headers=self.headers)
HTML.raise_for_status()
HTML.encoding = HTML.apparent_encoding
return HTML.text
except:
print("In "+self.name+":ERROR Load "+url)
return "NULL"
def store_pics(self,pic_urls):
fileName = pic_urls[0]+"//"
for picurl in pic_urls[1:]:
path = self.file_root + fileName + picurl.split('/')[-1]
print(path)
try:
if not os.path.exists(self.file_root):
os.mkdir(self.file_root)
if not os.path.exists(self.file_root+fileName):
os.mkdir(self.file_root+fileName)
if not os.path.exists(path):
pic = requests.get(picurl)
with open(path, 'wb') as f:
f.write(pic.content)
f.close()
print("图片:" + picurl + " 成功下载")
else:
print("图片已存在")
except:
print("爬取失败")
return 1
def get_pic_urls(self, HTML):
pic_urls = ["filename"]
soup = BeautifulSoup(HTML, "html.parser")
for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants:
if tag.name == 'img':
pic_urls.append(tag.attrs['src'])
pic_urls[0] = tag.attrs['title']
global pic_num
pic_num += len(pic_urls) - 1
return pic_urls
def run(self):
for i in range(self.begin_index,self.end_index):
html = self.get_html(self.base_url.format(i))
if html != "NULL":
pic_urls = self.get_pic_urls(html)
self.store_pics(pic_urls)
if __name__ == '__main__':
threads = []
count = 0
pic_num = 0
for begin in range(700,900,20):
threads.append(crawler_pic("Thread-begin:"+str(begin),begin))
for thread in threads:
thread.start()
for thread in threads:
thread.join()
print(pic_num)
代码分析
构造多线程类
class crawler_pic(threading.Thread):
begin_index = 0 # 起始页面
end_index = 0 # 终止页
grads = 20 # 爬取梯度:每个线程爬虫需要执行的爬取页数
# 链接
base_url = "http://www.win4000.com/wallpaper_big_154{}.html"
# 图片保存根目录
file_root = "D://pics_multi//"
# 伪装浏览器
UA = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]
# 随机构造头部信息
headers = {
"User-Agent": random.choice(UA)
}
初始化多线程
def __init__(self, name, begin):
threading.Thread.__init__(self)
self.name = name
self.begin_index = begin
self.end_index = begin + self.grads
获取网页信息
# 获取
def get_html(self, url):
try:
HTML = requests.get(url,headers=self.headers)
HTML.raise_for_status()
HTML.encoding = HTML.apparent_encoding
return HTML.text
except:
print("In "+self.name+":ERROR Load "+url)
return "NULL"
将获取的图片存储至根目录下
def store_pics(self,pic_urls):
fileName = pic_urls[0]+"//"
for picurl in pic_urls[1:]:
# 构造图片存储地址
path = self.file_root + fileName + picurl.split('/')[-1]
print(path)
try:
# 需要逐层创建目录
if not os.path.exists(self.file_root):
os.mkdir(self.file_root)
# 如无该目录,先行构建
if not os.path.exists(self.file_root+fileName):
os.mkdir(self.file_root+fileName)
# 图片存在,不重复保存
# 不存在,创建
if not os.path.exists(path):
# request获取图片内容
pic = requests.get(picurl)
with open(path, 'wb') as f:
f.write(pic.content)
f.close()
print("图片:" + picurl + " 成功下载")
else:
print("图片已存在")
except:
print("爬取失败")
return 1
在html页面中获取图片链接,返回链接列表
def get_pic_urls(self, HTML):
pic_urls = ["filename"]
soup = BeautifulSoup(HTML, "html.parser")
"""
页面分析:
图片链接位于标签<div "id": "picBox", "class": "picBox"> -- <li> -- <img> [href:pic_url]
获取最上层:div 全部子孙标签 选取a 获取a的属性信息
"""
for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants:
if tag.name == 'img':
pic_urls.append(tag.attrs['src'])
pic_urls[0] = tag.attrs['title']
"""
for a_tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).findAll("a"):
pic_urls.append(a_tag.attrs['href'])
"""
# 全局,记录图片数量
global pic_num
pic_num += len(pic_urls) - 1
return pic_urls
线程方法
def run(self):
# 爬取一遍分配的页面
for i in range(self.begin_index,self.end_index):
html = self.get_html(self.base_url.format(i))
# 页面爬取成功的情况下获取图片链接
if html != "NULL":
pic_urls = self.get_pic_urls(html)
self.store_pics(pic_urls)
"""
for pic in pic_urls:
print("in "+self.name+":"+pic)
"""
if __name__ == '__main__':
threads = []
count = 0
pic_num = 0
# 构造爬虫
for begin in range(700,900,20):
threads.append(crawler_pic("Thread-begin:"+str(begin),begin))
# 开始爬取
for thread in threads:
thread.start()
for thread in threads:
thread.join()
print(pic_num)
因为开启了20个线程,所以很快就爬完了
欣赏一波~害羞