整体思路分三步:
模拟浏览器 → 截取浏览器当前屏幕 → 保存漫画截图
目标网址
http://www.1kkk.com/ch1000-514226/
(1)获取浏览器(模拟浏览器)
def getBrowser(self):
    """Start a PhantomJS WebDriver and load ``self.startUrl`` in it.

    Returns:
        The WebDriver instance. It is returned even when loading the start
        URL fails, so the caller stays responsible for quitting it.
    """
    browser = webdriver.PhantomJS()
    try:
        browser.get(self.startUrl)
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        print("error url")
    return browser
(2)打开开发者工具,分析需要爬取的漫画总页数,并定位“下一页”按钮(id 为 next)
代码如下
def saveCartoon(self, broswer):
    """Save a screenshot of every page of the chapter shown in ``broswer``.

    A directory named ``1漫画`` is created via ``self.createDir`` and made
    the current working directory; pages are written there as ``1.png``,
    ``2.png``, ... up to the page count read from the page itself.

    Args:
        broswer: WebDriver already displaying the first page of the chapter.
            (The misspelled parameter name is kept for keyword callers.)
    """
    # broswer.title.split('_')[0] could be used to derive the title instead.
    cartoonTitle = '1漫画'
    self.createDir(cartoonTitle)
    os.chdir(cartoonTitle)

    # Total page count; full path is /html/body/div[2]/h1/font/span[2].
    # Read it from the browser that was passed in, not from self.broswer,
    # so the method works with whatever driver the caller supplies.
    pageCountTag = broswer.find_element_by_xpath('//font[@class="zf40"]/span[2]')
    sumPage = int(pageCountTag.text)

    for page in range(1, sumPage + 1):
        broswer.get_screenshot_as_file(str(page) + '.png')
        if page == sumPage:
            # Last page is saved: do not click "next" (and wait) once more.
            break
        # Turn to the next page and give it time to load.
        broswer.find_element_by_id('next').click()
        time.sleep(5)
(3)创建目录函数
def createDir(self, dirName):
    """Create directory ``dirName`` if needed and print what happened.

    Args:
        dirName: Path of the directory to create (intermediate directories
            are created as well).
    """
    if os.path.exists(dirName):
        # Nothing to do; this is not a failure.
        print("directory already exists")
        return
    try:
        os.makedirs(dirName)
    except OSError:
        # Best effort: report the problem and let the caller carry on.
        print("create directory failed")
    else:
        print("create directory success")
(4)爬取的内容如下
完整代码
from selenium import webdriver
import os
import time
class GetCartoon(object):
    """Screenshot every page of one comic chapter using a PhantomJS driver.

    Constructing an instance runs the whole job: open the start URL, save
    one screenshot per page, then quit the browser.
    """

    def __init__(self):
        self.startUrl = 'http://www.1kkk.com/ch1-406302'
        self.broswer = self.getBrowser()
        try:
            self.saveCartoon(self.broswer)
        finally:
            # Always release the browser process, even if saving fails.
            self.broswer.quit()

    def getBrowser(self):
        """Start a PhantomJS WebDriver and load ``self.startUrl`` in it.

        Returns:
            The WebDriver instance. It is returned even when loading the
            start URL fails, so the caller can still quit it.
        """
        browser = webdriver.PhantomJS()
        try:
            browser.get(self.startUrl)
        except Exception:
            # Catch Exception instead of using a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            print("error url")
        return browser

    def saveCartoon(self, broswer):
        """Save a screenshot of every page of the chapter shown in ``broswer``.

        A directory named ``1漫画`` is created and made the current working
        directory; pages are written there as ``1.png``, ``2.png``, ...

        Args:
            broswer: WebDriver already displaying the first page.
                (The misspelled parameter name is kept for keyword callers.)
        """
        # broswer.title.split('_')[0] could be used to derive the title.
        cartoonTitle = '1漫画'
        self.createDir(cartoonTitle)
        os.chdir(cartoonTitle)

        # Total page count; full path is /html/body/div[2]/h1/font/span[2].
        # Read it from the browser that was passed in, not self.broswer.
        pageCountTag = broswer.find_element_by_xpath('//font[@class="zf40"]/span[2]')
        sumPage = int(pageCountTag.text)

        for page in range(1, sumPage + 1):
            broswer.get_screenshot_as_file(str(page) + '.png')
            if page == sumPage:
                # Last page is saved: do not click "next" (and wait) again.
                break
            # Turn to the next page and give it time to load.
            broswer.find_element_by_id('next').click()
            time.sleep(5)

    def createDir(self, dirName):
        """Create directory ``dirName`` if needed and print what happened."""
        if os.path.exists(dirName):
            # Nothing to do; this is not a failure.
            print("directory already exists")
            return
        try:
            os.makedirs(dirName)
        except OSError:
            # Best effort: report the problem and let the caller carry on.
            print("create directory failed")
        else:
            print("create directory success")
# Script entry point: constructing GetCartoon runs the whole crawl
# (its __init__ opens the browser, saves the pages and quits).
if __name__ == "__main__":
    GC = GetCartoon()