Java爬虫进阶-phantomJS+selenium2抓取网站图片和小说

闲来无事,应小伙伴要求,最近写了一个专门爬取小说和美女图片的爬虫工具类,有不足之处欢迎小伙伴们指出。

准备工作:

            新建maven工程,导入pom依赖如下:

           

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>zhy.crawler</groupId>
  <artifactId>zhy_crawler</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
	<properties>
		<jonguoLib.version>0.0.1</jonguoLib.version>
		<HighEncoder.version>0.0.1</HighEncoder.version>
		<RedisTool.version>0.0.2</RedisTool.version>
		<comLog.version>1.1.1</comLog.version>
		<slf4j.version>1.6.1</slf4j.version>
		<jackson.version>1.9.13</jackson.version>
		<spring.version>4.0.2.RELEASE</spring.version>
	</properties>
	
	<dependencies>
	    <dependency>
			<groupId>net.sf.json-lib</groupId>
			<artifactId>json-lib</artifactId>
			<version>2.4</version>
			<classifier>jdk15</classifier><!--指定jdk版本 -->
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>JonguoLib</artifactId>
			<version>${jonguoLib.version}</version>
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>HighEncoder</artifactId>
			<version>${HighEncoder.version}</version>
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>RedisTool</artifactId>
			<version>${RedisTool.version}</version>
		</dependency>
		<dependency>
			<groupId>redis.clients</groupId>
			<artifactId>jedis</artifactId>
			<version>2.9.0</version>
		</dependency>
	 	<dependency>
	 		<groupId>commons-logging</groupId>
	 		<artifactId>commons-logging</artifactId>
	 		<version>${comLog.version}</version>
	 	</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>${slf4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-log4j12</artifactId>
			<version>${slf4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-core-asl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-core-lgpl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-jaxrs</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-mapper-asl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-mapper-lgpl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-smile</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-xc</artifactId>
			<version>${jackson.version}</version>
		</dependency>		
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-aop</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-beans</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context-support</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-core</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-expression</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-web</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-webmvc</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-webmvc-portlet</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.mortbay.jetty</groupId>
			<artifactId>servlet-api-2.5</artifactId>
			<version>6.1.14</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.aspectj</groupId>
			<artifactId>aspectjweaver</artifactId>
			<version>1.6.8</version>
		</dependency>
		<dependency>
			<groupId>org.freemarker</groupId>
			<artifactId>freemarker</artifactId>
			<version>2.3.16</version>
		</dependency>
		<dependency>
			<groupId>joda-time</groupId>
			<artifactId>joda-time</artifactId>
			<version>2.7</version>
		</dependency>
		<dependency>
			<groupId>commons-lang</groupId>
			<artifactId>commons-lang</artifactId>
			<version>2.6</version>
		</dependency>
		<dependency>
			<groupId>org.apache.activemq</groupId>
			<artifactId>activemq-all</artifactId>
			<version>5.14.1</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-jms</artifactId>
			<version>${spring.version}</version>
		</dependency>
		
		<!-- https://mvnrepository.com/artifact/com.codeborne/phantomjsdriver -->
		<dependency>
		    <groupId>com.codeborne</groupId>
		    <artifactId>phantomjsdriver</artifactId>
		    <version>1.2.1</version>
		</dependency>
		
	</dependencies>
	

	<build>
		<finalName>${project.artifactId}-${project.version}</finalName>
		<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
		<outputDirectory>${basedir}/src/main/webapp/WEB-INF/classes</outputDirectory>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
			<plugin>
				<artifactId>maven-resources-plugin</artifactId>
				<executions>
					<execution>
						<id>copy-resources</id>
						<phase>process-resources</phase>
						<goals>
							<goal>copy-resources</goal>
						</goals>
						<configuration>
							<overwrite>true</overwrite>
							<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
							<outputDirectory>${basedir}/src/main/webapp/WEB-INF/classes</outputDirectory>
							<resources>
								<resource>
									<directory>src/main/resources/env/${env}</directory>
									<targetPath>${basedir}/src/main/webapp/WEB-INF/classes/conf</targetPath>
								</resource>
							</resources>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>

工具类源码如下(小说):

package com.zhy.crawler.base;

import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Novel crawling tool.
 *
 * <p>Opens a book's chapter index in a headless PhantomJS browser, collects the link of every
 * chapter, then visits each chapter page and appends its text to a local file. Links are
 * gathered as plain strings before any chapter is opened, so nothing read from the index page
 * is needed once the browser has navigated away from it.
 *
 * <p>Created 2018-06-01.
 *
 * @author John_Hawkings
 * @version V1.0
 */
public class NovelCrawler {

	/** Path of the PhantomJS executable handed to the driver. */
	private static final String PHANTOMJS_PATH =
			"D:\\Devlop\\PhantomJS\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

	/** Chapter index page of the book to download. */
	private static final String INDEX_URL = "http://www.biqule.com/book_57885/";

	/** File that the text of all chapters is appended to. */
	private static final String OUTPUT_PATH = "D:\\Novel\\极道天魔.txt";

	/**
	 * Crawls every chapter listed on {@link #INDEX_URL} and appends the text to
	 * {@link #OUTPUT_PATH}. Failures are printed to standard error; the browser is always shut
	 * down afterwards.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		PhantomJSDriver driver = createDriver();
		try {
			driver.get(INDEX_URL);
			// Crude pause to let the index page finish rendering before it is queried.
			Thread.sleep(1000L);
			List<String> linkLst = collectChapterLinks(driver);

			File file = new File(OUTPUT_PATH);
			File parent = file.getParentFile();
			if (parent != null) {
				// Opening the stream fails when the target directory is missing.
				parent.mkdirs();
			}

			// Configured once, and only after the index scan, so that index entries without a
			// link are not each delayed by the implicit wait.
			driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);

			// One append-mode stream for the whole run; try-with-resources closes it even when
			// a page load or a write fails part-way through.
			try (FileOutputStream output = new FileOutputStream(file, true)) {
				for (String link : linkLst) {
					driver.get(link);
					String text = driver.findElementById("content").getText();
					// Explicit charset: the no-arg getBytes() depends on the platform default.
					output.write(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
				}
			}
		} catch (InterruptedException e) {
			// Restore the flag so code further up the stack can still observe the interrupt.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// quit() ends the session and closes every window, so no separate close() is needed.
			driver.quit();
		}
	}

	/**
	 * Builds the PhantomJS driver with the capabilities this crawler relies on.
	 *
	 * @return a started driver; the caller is responsible for calling {@code quit()}
	 */
	private static PhantomJSDriver createDriver() {
		DesiredCapabilities dcaps = new DesiredCapabilities();
		// Accept SSL certificates.
		dcaps.setCapability("acceptSslCerts", true);
		// Screenshot support.
		dcaps.setCapability("takesScreenshot", true);
		// CSS selector support.
		dcaps.setCapability("cssSelectorsEnabled", true);
		// JavaScript support.
		dcaps.setJavascriptEnabled(true);
		// Location of the PhantomJS binary.
		dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH);
		return new PhantomJSDriver(dcaps);
	}

	/**
	 * Reads the chapter links from the index page currently loaded in {@code driver}.
	 *
	 * @param driver browser positioned on the chapter index page
	 * @return the {@code href} of the first {@code <a>} inside each {@code <dd>} of the
	 *         {@code article-list} element, in page order; entries without a link are skipped
	 */
	private static List<String> collectChapterLinks(PhantomJSDriver driver) {
		WebElement chapterList = driver.findElementByClassName("article-list");
		List<String> links = new ArrayList<>();
		for (WebElement entry : chapterList.findElements(By.tagName("dd"))) {
			// findElements returns an empty list instead of throwing, so a single lookup both
			// tests for the anchor and fetches it.
			List<WebElement> anchors = entry.findElements(By.tagName("a"));
			if (anchors.isEmpty()) {
				continue;
			}
			String href = anchors.get(0).getAttribute("href");
			if (href != null) {
				links.add(href);
			}
		}
		return links;
	}

	/**
	 * Tells whether {@code element} contains a descendant matching {@code selector}.
	 *
	 * @param element  element to search within
	 * @param selector locator of the descendant being looked for
	 * @return {@code true} if a match is found, {@code false} if the lookup raises
	 *         {@link NoSuchElementException}
	 */
	public static boolean doesWebElementExist(WebElement element, By selector) {
		try {
			element.findElement(selector);
			return true;
		} catch (NoSuchElementException e) {
			return false;
		}
	}

}

工具类源码如下(图片):

package com.zhy.crawler.base;

import java.awt.image.BufferedImage;
import java.io.File;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import javax.imageio.ImageIO;

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Image crawling tool.
 *
 * <p>Opens a photo gallery in a headless PhantomJS browser, collects the image URLs from every
 * page of the gallery, then downloads each image and stores it locally as a numbered JPEG.
 *
 * <p>Created 2018-06-01.
 *
 * @author John_Hawkings
 * @version V1.0
 */
public class GirlsCrawler {

	/** Path of the PhantomJS executable handed to the driver. */
	private static final String PHANTOMJS_PATH =
			"D:\\Devlop\\PhantomJS\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

	/** First page of the gallery. */
	private static final String FIRST_PAGE_URL = "http://www.94img.com/photos/QingDouKe-16286.html";

	/** URL prefix of pages 2..n; the page number and ".html" are appended. */
	private static final String PAGE_URL_PREFIX = "http://www.94img.com/photos/QingDouKe-16286-";

	/** Directory the downloaded images are written to. */
	private static final String OUTPUT_DIR = "D:\\Novel\\Girls\\2\\";

	/** XPath of the pager element on the first gallery page. */
	private static final String PAGER_XPATH =
			"//*[@id=\"bodywrap\"]/table/tbody/tr/td/div/div[1]/div/div[10]";

	/**
	 * Collects the image links of the whole gallery and downloads them. Failures are printed to
	 * standard error; the browser is always shut down afterwards.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		PhantomJSDriver driver = createDriver();
		try {
			driver.get(FIRST_PAGE_URL);
			// Crude pause to let the first page finish rendering before it is queried.
			Thread.sleep(1000L);

			List<String> linkLst = new ArrayList<String>();
			// Images of the first page.
			collectImageLinks(driver, linkLst);

			// Parsed once, while the browser is still on the first page.
			int pageCount = readPageCount(driver);

			// Images of the remaining pages.
			driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);
			for (int i = 2; i <= pageCount; i++) {
				driver.get(PAGE_URL_PREFIX + i + ".html");
				collectImageLinks(driver, linkLst);
			}

			downloadImages(linkLst);
		} catch (InterruptedException e) {
			// Restore the flag so code further up the stack can still observe the interrupt.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// quit() ends the session and closes every window, so no separate close() is needed.
			driver.quit();
		}
	}

	/**
	 * Builds the PhantomJS driver with the capabilities this crawler relies on.
	 *
	 * @return a started driver; the caller is responsible for calling {@code quit()}
	 */
	private static PhantomJSDriver createDriver() {
		DesiredCapabilities dcaps = new DesiredCapabilities();
		// Accept SSL certificates.
		dcaps.setCapability("acceptSslCerts", true);
		// Screenshot support.
		dcaps.setCapability("takesScreenshot", true);
		// CSS selector support.
		dcaps.setCapability("cssSelectorsEnabled", true);
		// JavaScript support.
		dcaps.setJavascriptEnabled(true);
		// Location of the PhantomJS binary.
		dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH);
		return new PhantomJSDriver(dcaps);
	}

	/**
	 * Appends the {@code src} of the first {@code <img>} inside every {@code gallary_item}
	 * element of the currently loaded page to {@code links}. Items without an image or without
	 * a {@code src} are skipped.
	 *
	 * @param driver browser positioned on a gallery page
	 * @param links  list that receives the image URLs, in page order
	 */
	private static void collectImageLinks(PhantomJSDriver driver, List<String> links) {
		for (WebElement item : driver.findElementsByClassName("gallary_item")) {
			// findElements returns an empty list instead of throwing, so one item without an
			// image does not abort the whole crawl.
			List<WebElement> images = item.findElements(By.tagName("img"));
			if (images.isEmpty()) {
				continue;
			}
			String src = images.get(0).getAttribute("src");
			if (src != null && !src.isEmpty()) {
				links.add(src);
			}
		}
	}

	/**
	 * Reads the number of gallery pages from the pager on the currently loaded page.
	 *
	 * @param driver browser positioned on the first gallery page
	 * @return the page count, or 1 when the pager is missing or cannot be parsed, so that the
	 *         links already collected from the first page are still downloaded
	 */
	private static int readPageCount(PhantomJSDriver driver) {
		try {
			List<WebElement> pageElements =
					driver.findElementByXPath(PAGER_XPATH).findElements(By.tagName("a"));
			if (pageElements.size() < 2) {
				return 1;
			}
			// The second-to-last pager link is read as the last page number; presumably the
			// final link is a "next page" control - verify against the site's markup.
			return Integer.parseInt(pageElements.get(pageElements.size() - 2).getText().trim());
		} catch (NoSuchElementException | NumberFormatException e) {
			e.printStackTrace();
			return 1;
		}
	}

	/**
	 * Downloads every link into {@link #OUTPUT_DIR} as {@code 1.jpeg}, {@code 2.jpeg}, ... and
	 * reports the outcome of each one. A failed image is reported and skipped; it does not stop
	 * the remaining downloads.
	 *
	 * @param links image URLs in download order
	 */
	private static void downloadImages(List<String> links) {
		File outputDir = new File(OUTPUT_DIR);
		// Writing an image fails when the target directory is missing.
		outputDir.mkdirs();
		for (int k = 0; k < links.size(); k++) {
			int number = k + 1;
			try {
				if (downloadImage(links.get(k), new File(outputDir, number + ".jpeg"))) {
					// Reported only once the file has actually been written.
					System.out.println("第" + number + "张图片下载成功");
				} else {
					System.err.println("第" + number + "张图片下载失败");
				}
			} catch (java.io.IOException e) {
				System.err.println("第" + number + "张图片下载失败");
				e.printStackTrace();
			}
		}
	}

	/**
	 * Fetches one image over HTTP and re-encodes it as a JPEG file.
	 *
	 * @param link   URL of the image
	 * @param target file to write
	 * @return {@code true} if the image was fetched, decoded and written; {@code false} if the
	 *         URL is not an HTTP(S) URL, the response code is not 200, the content cannot be
	 *         decoded as an image, or no JPEG writer accepts the decoded image
	 * @throws java.io.IOException if the URL is malformed or the transfer or the write fails
	 */
	private static boolean downloadImage(String link, File target) throws java.io.IOException {
		java.net.URLConnection raw = new URL(link).openConnection();
		if (!(raw instanceof HttpURLConnection)) {
			return false;
		}
		HttpURLConnection connection = (HttpURLConnection) raw;
		try {
			if (connection.getResponseCode() != 200) {
				return false;
			}
			BufferedImage image;
			try (java.io.InputStream in = connection.getInputStream()) {
				image = ImageIO.read(in);
			}
			// ImageIO.read returns null when no registered reader understands the content;
			// passing null on to ImageIO.write would throw IllegalArgumentException.
			if (image == null) {
				return false;
			}
			return ImageIO.write(image, "jpeg", target);
		} finally {
			connection.disconnect();
		}
	}

	/**
	 * Tells whether {@code element} contains a descendant matching {@code selector}.
	 *
	 * @param element  element to search within
	 * @param selector locator of the descendant being looked for
	 * @return {@code true} if a match is found, {@code false} if the lookup raises
	 *         {@link NoSuchElementException}
	 */
	public static boolean doesWebElementExist(WebElement element, By selector) {
		try {
			element.findElement(selector);
			return true;
		} catch (NoSuchElementException e) {
			return false;
		}
	}

}
思路:

      先获取小说或者图片的所有链接并加入到一个集合里面,然后遍历集合,依次访问页面获取数据。有人会问:为什么不在获取主页面数据时一次性写完爬虫?因为这里面会存在一个小问题,就是在多个页面之间跳转的时候链接会失效。这个问题可以解决,但是有点繁琐,这里不多说,我们以最通俗易懂的方式来写,方便记忆与学习。


总结:

       phantomJS+selenium真的很好用,特别是它支持的截图功能在特定的需求下完全就是个神器,这里不多说,后面有更多的爬虫实例演示。欢迎小伙伴留言互相交流

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值