Python实现的一个小爬虫
记录一个用 Python 实现的爬虫(Spider),下面是 demo 示例:
import time
import requests
from bs4 import BeautifulSoup
import os
class Mztu(object):
    """Crawler that downloads image albums linked from an index page.

    The index page is expected to contain a ``div`` with class ``all`` whose
    ``<a>`` elements each link to one album. Every album is saved into its own
    folder (named after the link text) below ``save_path``.
    """

    # Characters that are replaced with a space when an album title is turned
    # into a folder name.
    _TITLE_TRANSLATION = str.maketrans('?/:', '   ')

    def __init__(self, start_url, headers, save_path, start_no=1, end_no=1):
        """Store the crawl configuration.

        Args:
            start_url: URL of the index page that lists all albums.
            headers: dict of HTTP headers sent with every request. The dict is
                kept by reference and its ``Referer`` entry is rewritten while
                images are downloaded.
            save_path: directory below which album folders are created.
            start_no: start index (inclusive) of the slice of album links.
            end_no: end index (exclusive) of the slice of album links.
        """
        self.save_path = save_path
        self.url = start_url
        self.headers = headers
        self.start_no = start_no
        self.end_no = end_no
        # Resolved once, up front: generate_folder() changes the working
        # directory, so a relative save_path resolved later would end up
        # nested inside the previously created album folder.
        self._root_dir = os.path.abspath(save_path)

    def get_all_pic_a(self):
        """Fetch the index page and return the selected album links.

        Returns:
            The ``<a>`` elements found inside ``div.all``, sliced with
            ``[start_no:end_no]``.
        """
        start_html = requests.get(self.url, headers=self.headers)
        soup = BeautifulSoup(start_html.content, 'lxml')
        all_a = soup.find('div', class_='all').find_all('a')
        return all_a[self.start_no:self.end_no]

    def generate_folder(self, folder_name):
        """Create the folder for one album and make it the working directory.

        Args:
            folder_name: name of the folder to create below ``save_path``.

        Returns:
            The absolute path of the created folder.

        Raises:
            FileExistsError: if the folder already exists.
            OSError: if the folder cannot be created or entered.
        """
        folder_path = os.path.join(self._root_dir, folder_name)
        os.makedirs(folder_path)
        os.chdir(folder_path)
        return folder_path

    def update_hreader(self, referer):
        """Set the ``Referer`` header used for subsequent requests.

        Args:
            referer: value to store; it is formatted into a string.
        """
        # Use the instance's headers, not a module-level global, so the class
        # also works when it is imported from another module.
        self.headers['Referer'] = '{}'.format(referer)

    def get_pic(self):
        """Download every selected album, one folder per album.

        Albums whose folder already exists are skipped. Any other ``OSError``
        raised while creating a folder is printed and stops the crawl.
        Progress is reported with ``print``; the method sleeps two seconds
        after each downloaded album.
        """
        # Fetch the album list once; it is needed both for iteration and for
        # the "everything downloaded" check at the end of each iteration.
        groups = self.get_all_pic_a()
        group_total = len(groups)
        for img_group_no, a in enumerate(groups, start=1):
            folder_name = a.get_text().translate(self._TITLE_TRANSLATION).strip()
            print('正在下载第 %s 组,:%s,保存路径:%s' % (img_group_no, folder_name, self.save_path))
            print('共要下载 %s 组,剩余 %s 组' % (self.end_no - self.start_no, self.end_no - self.start_no - img_group_no))
            try:
                folder_path = self.generate_folder(folder_name)
            except FileExistsError:
                # Folder already present: skip this album.
                continue
            except OSError as e:
                # NotADirectoryError is a subclass of OSError, so this single
                # clause covers both.
                print(e)
                break
            self._download_group(a['href'], folder_path, folder_name)
            print("恭喜你!!! 第 %s 组: %s 已下载完毕" % (img_group_no, folder_name))
            print('-------------------华丽的分割线-------------------')
            time.sleep(2)
            if img_group_no >= self.end_no:
                print("已完成所有下载")
                break
            if img_group_no >= group_total:
                print("%s 下载完毕" % img_group_no)
                break

    def _download_group(self, img_group_url, folder_path, folder_name):
        """Download all images of one album into ``folder_path``.

        Args:
            img_group_url: URL of the album's first page.
            folder_path: existing directory the image files are written to.
            folder_name: album name, used only in progress messages.
        """
        response_img_group = requests.get(img_group_url, headers=self.headers)
        group_soup = BeautifulSoup(response_img_group.text, 'lxml')
        # The text of the second-to-last <span> in div.pagenavi is read as the
        # number of pages in the album.
        img_group_pages = group_soup.find('div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(img_group_pages) + 1):
            page_url = img_group_url + '/' + str(page)
            response_img = requests.get(page_url, headers=self.headers)
            img_soup = BeautifulSoup(response_img.content, 'lxml')
            img_src = img_soup.find('div', class_='main-image').find('img')['src']
            # The file name is the last nine characters of the image URL.
            img_name = img_src[-9:].replace('/', ' ')
            self.update_hreader(img_src)
            rs_img = requests.get(img_src, headers=self.headers)
            # Write to an explicit path instead of relying on the current
            # working directory; 'wb' so an existing file is replaced rather
            # than having a second image appended to it.
            with open(os.path.join(folder_path, img_name), 'wb') as f:
                f.write(rs_img.content)
            print('%s -- 第 %02d 张已下载完毕' % (folder_name, page))
if __name__ == '__main__':
    # Script entry point: crawl the album index and save the selected
    # albums below D:/mztu/.
    headers = {}
    headers['User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1'
    headers['Referer'] = 'http://www.mzitu.com/'

    # The album links are sliced with [start_no:end_no].
    Mztu(
        start_url='https://www.mzitu.com/all/',
        headers=headers,
        save_path='D:/mztu/',
        start_no=1,
        end_no=666,
    ).get_pic()