import requests
import urllib.request
from bs4 import BeautifulSoup
import os, re
import datetime
from multiprocessing import Pool
# Running total of photos discovered across all galleries.
# NOTE(review): this is incremented inside Pool worker processes, so the
# parent process's copy stays 0 — the counter is effectively per-child.
total_page = 0
class Spider:
    """Scrape model photo galleries from zbjuran.com and download their images."""

    def __init__(self):
        # Base URL of the photo section.
        self.url = "http://www.zbjuran.com/mei"

    def get_image_urls(self):
        """Collect gallery links from list pages 1..29.

        Returns:
            list: ``[name, absolute_gallery_url]`` pairs, one per gallery.
        """
        msg_list = []
        for page in range(1, 30):
            list_url = self.url + "/qingchun/list_14_" + str(page) + ".html"
            html = requests.get(list_url).text
            soup = BeautifulSoup(html, 'lxml')
            # Each gallery thumbnail sits inside a node with class "picbox".
            # (Renamed loop var from `url` to `box` — it shadowed the page URL.)
            for box in soup.find_all(class_="picbox"):
                img_name = box.find('img')['alt']
                img_url = 'http://www.zbjuran.com' + box.find('a')['href']
                msg_list.append([img_name, img_url])
        print(len(msg_list))
        return msg_list

    def createDirectory(self, filename, base="E:/爬虫/图片/"):
        """Create (if missing) and return the download directory for a gallery.

        Args:
            filename: gallery-specific directory name.
            base: root download directory. Default preserves the original
                hard-coded path for backward compatibility; pass another
                root to relocate downloads.
        """
        path = base + filename
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def down_image(self, url, path):
        """Download every photo of one gallery into *path*.

        Args:
            url: ``[gallery_name, gallery_url]`` pair from get_image_urls().
            path: directory the .jpg files are written into.
        """
        name, gallery_url = url[0], url[1]
        resp = requests.get(gallery_url)
        # The site serves GB2312-encoded pages; decode explicitly.
        resp.encoding = 'gb2312'
        soup = BeautifulSoup(resp.text, 'lxml')
        # The pager's first <li><a> text contains the photo count; strip
        # everything that is not a digit.  (raw string fixes the invalid
        # escape sequence '\D' the original used.)
        page_num = soup.find(class_='page').li.a.text
        page_num = re.sub(r'\D', '', page_num)
        # NOTE(review): under multiprocessing this increments the child's
        # copy only; the parent never sees the total.
        global total_page
        total_page += int(page_num)
        print("本页面共有%s张照片" % page_num)
        for i in range(1, int(page_num) + 1):
            # Page 1 is "xxx.html"; page N>1 is "xxx_N.html".
            suffix = '.html' if i == 1 else '_%s.html' % i
            page_resp = requests.get(gallery_url.replace('.html', suffix))
            page_resp.encoding = 'gb2312'
            if page_resp.status_code != 200:
                continue
            page_soup = BeautifulSoup(page_resp.text, 'lxml')
            # Guard both a missing picbox and a missing <img> (the original
            # crashed with AttributeError when the box itself was absent).
            box = page_soup.find(class_='picbox')
            if box is None or box.img is None:
                continue
            src = box.img['src']
            # Relative upload paths need the site prefix; absolute URLs and
            # non-upload paths pass through unchanged.
            if "http://www.zbjuran.com" not in src and 'uploads' in src:
                desrc = 'http://www.zbjuran.com' + src
            else:
                desrc = src
            print("正在下载%s的第%s张照片" % (name, i))
            urllib.request.urlretrieve(desrc, path + '/' + name + '_%s.jpg' % i)
if __name__ == "__main__":
    spider = Spider()
    urls = spider.get_image_urls()
    print(urls)
    start = datetime.datetime.now()
    pool = Pool(20)
    async_results = []
    for index, item in enumerate(urls):
        # Suffix the directory name with the index so galleries sharing a
        # title do not collide in one folder.
        gallery_path = spider.createDirectory(item[0] + str(index))
        async_results.append(
            pool.apply_async(spider.down_image, args=(item, gallery_path)))
    pool.close()
    # BUG FIX: apply_async silently swallows worker exceptions unless
    # .get() is called on the AsyncResult — surface failures instead of
    # letting downloads vanish without a trace.
    for res in async_results:
        try:
            res.get()
        except Exception as exc:
            print("download task failed: %s" % exc)
    pool.join()
    end = datetime.datetime.now()
    print(end - start)
# Python crawler practice: downloading images with multiprocessing.
# (Blog footer: latest recommended article published 2024-08-14 18:20:48.)