# 表示放假看了看 python,一直想写个爬虫玩一下,但是好多网站都有反爬虫的机制 orz..,导致爬下来的图片根本没法看(哭)。最近发现一个很良心的站,于是写了个小爬虫嘿嘿嘿。主要用到了 BeautifulSoup 库和 requests 库,比较简单,效率比较差,有待改进,就当自娱自乐了 hhh
import os
import requests
from bs4 import BeautifulSoup
import re
class Mmonly(object):
    """Scraper for mmonly.cc galleries.

    Walks every list page of one category, creates one local folder per
    gallery (skipping galleries whose folder already exists), and downloads
    each gallery's images as 1.jpg, 2.jpg, ... inside that folder.
    """

    # Local root under which one folder per gallery is created.
    # Raw string so the Windows backslashes are never mis-read as escapes.
    ROOT = r'G:\python\mmonly.cc'

    def all_page(self, url):
        """Iterate over list pages 1..387 derived from *url*.

        *url* must end in '<n>.html'; the trailing '1.html' (6 chars) is
        replaced by each page number in turn.
        """
        for page in range(1, 388):
            print('当前第', page, '页')
            page_html = url[:-6] + str(page) + '.html'
            self.all_url(page_html)

    def all_url(self, url):
        """Parse one list page and process every gallery it links to."""
        html = self.request(url)
        all_a = BeautifulSoup(html.text, 'html5lib').find_all('div', class_='ABox')
        for a in all_a:
            href = a.contents[0]        # <a> wrapping the gallery thumbnail
            name = href.contents[0]     # <img> whose alt text is the gallery title
            gallery_url = href['href']
            title = name['alt']
            path = str(title)
            # Only fetch galleries we have not downloaded before:
            # mkdir() returns False when the folder already exists.
            if self.mkdir(path):
                self.html(gallery_url)

    def mkdir(self, path):
        """Create (and chdir into) a folder named *path* under ROOT.

        Returns True when a new folder was created, False when it already
        existed (the gallery is then assumed to be downloaded already).
        """
        path = path.strip()
        target = os.path.join(self.ROOT, path)
        if os.path.exists(target):
            print(path, '文件夹已经存在')
            return False
        print('创建一个叫', path, '的文件夹')
        # makedirs also creates ROOT itself on a fresh machine.
        os.makedirs(target)
        # save() writes by bare filename, so move into the new folder.
        os.chdir(target)
        return True

    def html(self, url):
        """Fetch a gallery's first page, read its total page count, and
        fetch every image page of the gallery."""
        pic_html = self.request(url)
        pic_num = BeautifulSoup(pic_html.text, 'html5lib').find('span', class_='totalpage').get_text()
        for page in range(1, int(pic_num) + 1):
            # Image pages are '<base>_<page>.html'; strip the '.html' suffix.
            page_url = url[:-5] + '_' + str(page) + '.html'
            # The page number doubles as the saved file's name.
            self.img(page_url, page)

    def img(self, url, cnt):
        """Extract the full-size image URL from one image page and save it."""
        img_html = self.request(url)
        img_url = BeautifulSoup(img_html.text, 'html5lib').find('div', class_='big-pic').find('img')['src']
        self.save(img_url, cnt)

    def save(self, url, cnt):
        """Download *url* and write it as '<cnt>.jpg' in the current dir."""
        img = self.request(url)
        # 'wb' (not the original 'ab'): a re-run overwrites the file instead
        # of appending a second JPEG stream and corrupting it.
        with open(str(cnt) + '.jpg', 'wb') as f:
            f.write(img.content)

    def request(self, url):
        """GET *url* with a desktop browser User-Agent; return the Response."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
        # Timeout so one stalled connection cannot hang the whole crawl.
        return requests.get(url, headers=headers, timeout=30)
if __name__ == '__main__':
    # Guard the entry point so importing this module does not immediately
    # launch a 387-page crawl.
    mmonly = Mmonly()
    url = 'http://www.mmonly.cc/ktmh/list_28_1.html'
    mmonly.all_page(url)