A jsoup crawler

Maven pom dependencies:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <modelVersion>4.0.0</modelVersion>
  <packaging>war</packaging>

  <name>jsoup</name>
  <groupId>com.yj</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- MySQL JDBC driver -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.44</version>
    </dependency>

    <!-- Apache HttpClient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>

    <!-- jsoup HTML parser -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.1</version>
    </dependency>


    <!-- log4j logging -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.16</version>
    </dependency>

    <!-- Ehcache caching -->
    <dependency>
      <groupId>net.sf.ehcache</groupId>
      <artifactId>ehcache</artifactId>
      <version>2.10.3</version>
    </dependency>

    <!-- Commons IO -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>


    <!-- fastjson JSON library -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>
  </dependencies>

</project>

Utility classes:

util:

DateUtil

package yj.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Date utility class.
 * @author user
 *
 */
public class DateUtil {

	/**
	 * Returns the current date as a "yyyy/MM/dd" path segment.
	 * @return
	 */
	public static String getCurrentDatePath(){
		Date date=new Date();
//		Crawled images are grouped into per-date directories so they are easy to locate.
		SimpleDateFormat sdf=new SimpleDateFormat("yyyy/MM/dd");
		return sdf.format(date);
	}
	
	public static void main(String[] args) {
		System.out.println(getCurrentDatePath());
	}
}
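A note on thread safety: SimpleDateFormat is not thread-safe. The crawlers here are single-threaded, so this is fine, but if downloads are ever parallelized, a java.time-based body (Java 8+) is a safe drop-in. A minimal sketch (the PATH_FMT field name is ours):

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

// DateTimeFormatter is immutable and thread-safe, unlike SimpleDateFormat.
private static final DateTimeFormatter PATH_FMT = DateTimeFormatter.ofPattern("yyyy/MM/dd");

public static String getCurrentDatePath() {
	return LocalDate.now().format(PATH_FMT);
}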

DbUtil

package yj.util;

import java.sql.Connection;
import java.sql.DriverManager;

/**
 * Database utility class
 * @author user
 *
 */
public class DbUtil {

	/**
	 * Get a database connection
	 * @return
	 * @throws Exception
	 */
	public Connection getCon()throws Exception{
		Class.forName(PropertiesUtil.getValue("jdbcName"));
		Connection con=DriverManager.getConnection(PropertiesUtil.getValue("dbUrl"), PropertiesUtil.getValue("dbUserName"), PropertiesUtil.getValue("dbPassword"));
		return con;
	}
	
	/**
	 * Close a connection
	 * @param con
	 * @throws Exception
	 */
	public void closeCon(Connection con)throws Exception{
		if(con!=null){
			con.close();
		}
	}
	
	public static void main(String[] args) {
		DbUtil dbUtil=new DbUtil();
		try {
			dbUtil.getCon();
			System.out.println("Database connection succeeded");
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println("Database connection failed");
		}
	}
}

PropertiesUtil

package yj.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Properties utility class
 * @author user
 *
 */
public class PropertiesUtil {

	/**
	 * Returns the value for the given key from crawler.properties on the classpath.
	 * @param key
	 * @return
	 */
	public static String getValue(String key){
		Properties prop=new Properties();
		// try-with-resources ensures the stream is closed
		try (InputStream in = PropertiesUtil.class.getResourceAsStream("/crawler.properties")) {
			prop.load(in);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return prop.getProperty(key);
	}
}

Configuration files:

crawler.properties

dbUrl=jdbc:mysql://localhost:3306/yj?autoReconnect=true
dbUserName=root
dbPassword=123
jdbcName=com.mysql.jdbc.Driver
ehcacheXmlPath=F://blogCrawler/ehcache.xml
blogImages=F://blogCrawler/blogImages/

log4j.properties

log4j.rootLogger=INFO, stdout,D  
  
#Console  
log4j.appender.stdout=org.apache.log4j.ConsoleAppender  
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout  
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

#D
log4j.appender.D = org.apache.log4j.RollingFileAppender
log4j.appender.D.File = C://blogCrawler/bloglogs/log.log
log4j.appender.D.MaxFileSize=100KB
log4j.appender.D.MaxBackupIndex=100  
log4j.appender.D.Append = true
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n 

The following file is required:
ehcache.xml:

<?xml version="1.0" encoding="UTF-8"?>
<ehcache xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:noNamespaceSchemaLocation="http://ehcache.org/ehcache.xsd"
         updateCheck="false">
    <!-- diskStore: spills cache entries that are not currently needed in memory to disk,
         much like Windows virtual memory.
         path: the directory used for on-disk storage. java.io.tmpdir is the default temp
         path; print it with System.out.println(System.getProperty("java.io.tmpdir")). -->
    <diskStore path="F:\blogCrawler\blogehcache"/>


    <!-- defaultCache: the default cache configuration.
         eternal: if true, cached elements never expire; if false, expiry follows
                  timeToIdleSeconds and timeToLiveSeconds.
         maxElementsInMemory: maximum number of elements held in memory.
         overflowToDisk: whether to spill to disk once the in-memory limit is exceeded.
         diskPersistent: whether entries survive a JVM restart. Default false.
         timeToIdleSeconds: seconds an element may sit idle before expiring (only when
                  eternal=false). Default 0, meaning no idle limit.
         timeToLiveSeconds: seconds from creation to expiry (only when eternal=false).
                  Default 0, meaning no lifetime limit.
         memoryStoreEvictionPolicy: one of three eviction policies:
           FIFO: first in, first out.
           LFU: least frequently used; each element keeps a hit count, and the element
                with the smallest hit count is evicted.
           LRU: least recently used (the Ehcache default); each element keeps a timestamp,
                and when the cache is full the element with the oldest timestamp is evicted. -->
    <defaultCache maxElementsInMemory="1" eternal="true" overflowToDisk="true" />



    <!-- name: the cache name; must be unique (Ehcache keys its caches by name). -->
    <cache name="cnblog" maxElementsInMemory="1" eternal="true"
           overflowToDisk="true" diskPersistent="true"/>
    <cache name="8gli_movies" maxElementsInMemory="1" eternal="true"
           overflowToDisk="true" diskPersistent="true"/>
</ehcache>
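The crawlers below use these two caches purely as a persistent set of already-crawled URLs, so a page that is already in the database is never inserted twice. A minimal sketch of that dedup pattern, pulled out of the crawler code (the class name CacheDedupSketch and the sample URL are ours; the Ehcache calls are exactly the ones used below):

package yj.crawler;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
import net.sf.ehcache.Status;
import yj.util.PropertiesUtil;

public class CacheDedupSketch {
	public static void main(String[] args) {
		CacheManager manager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
		Cache cache = manager.getCache("cnblog");

		String url = "https://www.cnblogs.com/some-post";	// sample key
		if (cache.get(url) == null) {
			// ...crawl the page and insert it into the database, then record the URL
			cache.put(new Element(url, url));
		}

		// flush so the dedup set survives a restart (diskPersistent="true")
		if (cache.getStatus() == Status.STATUS_ALIVE) {
			cache.flush();
		}
		manager.shutdown();
	}
}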

Examples:

Crawling a single image:

DownloadImg:

package yj.crawler;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.log4j.Logger;
import yj.util.DateUtil;
import yj.util.PropertiesUtil;

import java.io.File;
import java.io.IOException;
import java.util.UUID;


public class DownloadImg {
	private static Logger logger = Logger.getLogger(DownloadImg.class);
	private static String URL = "http://www.yidianzhidao.com/UploadFiles/img_1_446119934_1806045383_26.jpg";

	public static void main(String[] args) {

		logger.info("Start downloading image: " + URL);
		

		CloseableHttpClient httpClient = HttpClients.createDefault();
		HttpGet httpGet = new HttpGet(URL);
//		Set the connect timeout (setConnectTimeout) and the socket read timeout (setSocketTimeout)
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info("No response received!");
			} else {
				HttpEntity entity = response.getEntity();
				String imgPath = PropertiesUtil.getValue("blogImages");
				String dateDir = DateUtil.getCurrentDatePath();
				String uuid = UUID.randomUUID().toString();
				// derive the file extension from the Content-Type header, e.g. "image/jpeg" -> "jpeg"
				String suffix = entity.getContentType().getValue().split("/")[1];
				String localFile = imgPath+dateDir+"/"+uuid+"."+suffix;
//				System.out.println(localFile);
				FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
			}
		} catch (ClientProtocolException e) {
			logger.error(URL+"-ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(URL+"-IOException", e);
		} catch (Exception e) {
			logger.error(URL+"-Exception", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}
				if(httpClient != null) {
					httpClient.close();
				}
			} catch (IOException e) {
				logger.error(URL+"-IOException", e);
			}
		}
		

		logger.info("Finished downloading image: " + URL);
	
	}
}
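Since HttpClient 4.3, CloseableHttpClient and CloseableHttpResponse are both AutoCloseable, so the manual finally block above can be folded into try-with-resources. A sketch of the same download under that assumption (the helper name fetchImage is ours; Java 7+):

// Try-with-resources variant of the download above (Java 7+, HttpClient 4.3+).
public static void fetchImage(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setConfig(RequestConfig.custom()
			.setConnectTimeout(5000).setSocketTimeout(8000).build());
	try (CloseableHttpClient client = HttpClients.createDefault();
	     CloseableHttpResponse response = client.execute(httpGet)) {
		HttpEntity entity = response.getEntity();
		// derive the extension from the Content-Type header, e.g. "image/jpeg" -> "jpeg"
		String suffix = entity.getContentType().getValue().split("/")[1];
		String localFile = PropertiesUtil.getValue("blogImages")
				+ DateUtil.getCurrentDatePath() + "/" + UUID.randomUUID() + "." + suffix;
		FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
	}
}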


Crawling blog sites and other URLs:

BlogCrawlerStarter

package yj.crawler;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import yj.util.DateUtil;
import yj.util.DbUtil;
import yj.util.PropertiesUtil;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

/**
 * @author Administrator
 *
 */
public class BlogCrawlerStarter {

	private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class);
//	https://www.csdn.net/nav/newarticles
	private static String HOMEURL = "https://www.cnblogs.com/";
	private static CloseableHttpClient httpClient;
	private static Connection con;
	private static CacheManager cacheManager;
	private static Cache cache;

	/**
	 * Fetch the home page with HttpClient and hand its content to the parser.
	 */
	public static void parseHomePage() {
		logger.info("Start crawling home page: " + HOMEURL);
		
		cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
		cache = cacheManager.getCache("cnblog");
		
		httpClient = HttpClients.createDefault();
		HttpGet httpGet = new HttpGet(HOMEURL);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info(HOMEURL + ": no response");
				return;
			}

			if (response.getStatusLine().getStatusCode() == 200) {
				HttpEntity entity = response.getEntity();
				String homePageContent = EntityUtils.toString(entity, "utf-8");
				// System.out.println(homePageContent);
				parseHomePageContent(homePageContent);
			}

		} catch (ClientProtocolException e) {
			logger.error(HOMEURL + "-ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(HOMEURL + "-IOException", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}

				if (httpClient != null) {
					httpClient.close();
				}
			} catch (IOException e) {
				logger.error(HOMEURL + "-IOException", e);
			}
		}

		if(cache.getStatus() ==  Status.STATUS_ALIVE) {
			cache.flush();
		}
		cacheManager.shutdown();
		logger.info("Finished crawling home page: " + HOMEURL);

	}

	/**
	 * Parse the home page content with jsoup and extract the blog post links.
	 * 
	 * @param homePageContent
	 */
	private static void parseHomePageContent(String homePageContent) {
		Document doc = Jsoup.parse(homePageContent);
		//#feedlist_id .list_con .title h2 a
		Elements aEles = doc.select("#post_list .post_item .post_item_body h3 a");
		for (Element aEle : aEles) {
//			URL of a single blog post in the home-page list
			String blogUrl = aEle.attr("href");
			if (null == blogUrl || "".equals(blogUrl)) {
				logger.info("Post has no link; skipping database insert!");
				continue;
			}
			if(cache.get(blogUrl) != null) {
				logger.info("URL already crawled into the database; skipping!");
				continue;
			}
//			System.out.println("************************"+blogUrl+"****************************");
			
			parseBlogUrl(blogUrl);
		}
	}

	/**
	 * Fetch a blog post's title and content by its URL.
	 * 
	 * @param blogUrl
	 */
	private static void parseBlogUrl(String blogUrl) {

		logger.info("Start crawling blog page: " + blogUrl);
		httpClient = HttpClients.createDefault();
		HttpGet httpGet = new HttpGet(blogUrl);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info(blogUrl + ": no response");
				return;
			}

			if (response.getStatusLine().getStatusCode() == 200) {
				HttpEntity entity = response.getEntity();
				String blogContent = EntityUtils.toString(entity, "utf-8");
				parseBlogContent(blogContent, blogUrl);
			}

		} catch (ClientProtocolException e) {
			logger.error(blogUrl + "-ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(blogUrl + "-IOException", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(blogUrl + "-IOException", e);
			}
		}

		logger.info("Finished crawling blog page: " + blogUrl);

	}

	/**
	 * Parse the blog content and extract the title and full body.
	 * 
	 * @param blogContent
	 * @param link
	 */
	private static void parseBlogContent(String blogContent, String link) {
		Document doc = Jsoup.parse(blogContent);
		Elements titleEles = doc
				//#mainBox main .blog-content-box .article-header-box .article-header .article-title-box h1
				.select("#topics .post h1 a");

		if (titleEles.size() == 0) {
			logger.info("Blog title is empty; not inserting into the database!");
			return;
		}
		String title = titleEles.get(0).html();

		Elements blogContentEles = doc.select("#cnblogs_post_body ");
		if (blogContentEles.size() == 0) {
			logger.info("Blog content is empty; not inserting into the database!");
			return;
		}
		String blogContentBody = blogContentEles.get(0).html();

//		Elements imgEles = doc.select("img");
//		List<String> imgUrlList = new LinkedList<String>();
//		if(imgEles.size() > 0) {
//			for (Element imgEle : imgEles) {
//				imgUrlList.add(imgEle.attr("src"));
//			}
//		}
//		
//		if(imgUrlList.size() > 0) {
//			Map<String, String> replaceUrlMap = downloadImgList(imgUrlList);
//			blogContent = replaceContent(blogContent,replaceUrlMap);
//		}

		String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
		try (PreparedStatement pst = con.prepareStatement(sql)) {
			pst.setObject(1, title);
			pst.setObject(2, blogContentBody);
			pst.setObject(3, link);
			if(pst.executeUpdate() == 0) {
				logger.info("Failed to insert crawled blog into the database");
			}else {
				cache.put(new net.sf.ehcache.Element(link, link));
				logger.info("Inserted crawled blog into the database");
			}
		} catch (SQLException e) {
			logger.error("Database error - SQLException:",e);
		}
	}

	/**
	 *
	 * 1. blogContent is the raw crawled blog content; its images still live on CSDN (assume hotlink protection).
	 * 2. The image references inside the raw content need to be replaced.
	 * 3. The images do not exist locally yet, so they must first be downloaded via the original URLs.
	 *
	 *
	 * Rewrites crawled content, swapping the original image URLs for local ones.
	 * @param blogContent
	 * @param replaceUrlMap
	 * @return
	 */
	private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) {
		for(Map.Entry<String, String> entry: replaceUrlMap.entrySet()) {
			blogContent = blogContent.replace(entry.getKey(), entry.getValue());
		}
		return blogContent;
	}

	/**
	 * Downloads images from remote servers to local storage.
	 * @param imgUrlList
	 * @return
	 */
	private static Map<String, String> downloadImgList(List<String> imgUrlList) {
		Map<String, String> replaceMap = new HashMap<String, String>();
		for (String imgUrl : imgUrlList) {
			CloseableHttpClient httpClient = HttpClients.createDefault();
			HttpGet httpGet = new HttpGet(imgUrl);
			RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
			httpGet.setConfig(config);
			CloseableHttpResponse response = null;
			try {
				response = httpClient.execute(httpGet);
				if (response == null) {
					logger.info(HOMEURL + ": no response");
				}else {
					if (response.getStatusLine().getStatusCode() == 200) {
						HttpEntity entity = response.getEntity();
						String blogImagesPath = PropertiesUtil.getValue("blogImages");
						String dateDir = DateUtil.getCurrentDatePath();
						String uuid = UUID.randomUUID().toString();
						String suffix = entity.getContentType().getValue().split("/")[1];
						String fileName = blogImagesPath + dateDir + "/" + uuid + "." + suffix;
						
						FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName));
						replaceMap.put(imgUrl, fileName);
					}
				}
			} catch (ClientProtocolException e) {
				logger.error(imgUrl + "-ClientProtocolException", e);
			} catch (IOException e) {
				logger.error(imgUrl + "-IOException", e);
			} catch (Exception e) {
				logger.error(imgUrl + "-Exception", e);
			} finally {
				try {
					if (response != null) {
						response.close();
					}
				} catch (IOException e) {
					logger.error(imgUrl + "-IOException", e);
				}
			}
		
		}
		return replaceMap;
	}

	public static void start() {
		while(true) {
			DbUtil dbUtil = new DbUtil();
			try {
				con = dbUtil.getCon();
				parseHomePage();
			} catch (Exception e) {
				logger.error("Database connection failed!", e);
			} finally {
				try {
					if (con != null) {
						con.close();
					}
				} catch (SQLException e) {
					logger.error("Error closing database connection - SQLException:",e);
				}
			}
			try {
				Thread.sleep(1000*60);
			} catch (InterruptedException e) {
				logger.error("Main thread sleep interrupted - InterruptedException:",e);
			}
		}
	}

	public static void main(String[] args) {
		start();
	}
}
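For comparison, jsoup can also fetch pages by itself, which removes most of the HttpClient boilerplate. A minimal sketch of the same home-page walk using Jsoup.connect (the selector is copied from parseHomePageContent above; the class name JsoupOnlySketch is ours, and error handling is trimmed):

package yj.crawler;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupOnlySketch {
	public static void main(String[] args) throws Exception {
		// Jsoup.connect performs the HTTP round trip; timeout covers connect and read.
		Document doc = Jsoup.connect("https://www.cnblogs.com/")
				.timeout(8000)
				.get();
		for (Element a : doc.select("#post_list .post_item .post_item_body h3 a")) {
			System.out.println(a.text() + " -> " + a.attr("href"));
		}
	}
}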



PanZhaoZhaoCrawler3:

package yj.crawler;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import yj.util.DbUtil;
import yj.util.PropertiesUtil;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;

public class PanZhaoZhaoCrawler3 {
	private static Logger logger = Logger.getLogger(PanZhaoZhaoCrawler3.class);
	private static String URL = "http://www.13910.com/daren/";
	private static String PROJECT_URL = "http://www.13910.com";

	private static Connection con;
	private static CacheManager manager;
	private static Cache cache;
	private static CloseableHttpClient httpClient;
	private static long total = 0;

	/**
	 * Fetch the listing page with HttpClient.
	 */
	public static void parseHomePage() {
		logger.info("Start crawling: " + URL);

		manager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
		cache = manager.getCache("cnblog");

		httpClient = HttpClients.createDefault();
		HttpGet httpGet = new HttpGet(URL);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info("Connection timed out!");
			} else {
				if (response.getStatusLine().getStatusCode() == 200) {
					HttpEntity entity = response.getEntity();
					String pageContent = EntityUtils.toString(entity, "utf-8");
					parsePageContent(pageContent);
				}
			}
		} catch (ClientProtocolException e) {
			logger.error(URL + " - parse error - ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(URL + " - parse error - IOException", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}

				if (httpClient != null) {
					httpClient.close();
				}
			} catch (IOException e) {
				logger.error(URL + " - parse error - IOException", e);
			}
		}

		// Finally flush the cached data to disk
		if (cache.getStatus() == Status.STATUS_ALIVE) {
			cache.flush();
		}
		manager.shutdown();
		logger.info("Finished crawling: " + URL);

	}
	
	/**
	 * Parse the listing page content with jsoup.
	 * @param pageContent
	 */
	private static void parsePageContent(String pageContent) {
		Document doc = Jsoup.parse(pageContent);
		Elements aEles = doc.select(".showtop .key-right .darenlist .list-info .darentitle a");
		for (Element aEle : aEles) {
			String aHref = aEle.attr("href");
			logger.info("Extracted user share page: "+aHref);
			
			String panZhaoZhaoUserShareUrl = PROJECT_URL + aHref;
			
			List<String> panZhaoZhaoUserShareUrls = getPanZhaoZhaoUserShareUrls(panZhaoZhaoUserShareUrl);
			for (String singlePanZhaoZhaoUserShareUrl : panZhaoZhaoUserShareUrls) {
//				System.out.println("**********************************************************"+singlePanZhaoZhaoUserShareUrl+"**********************************************************");
//				continue;
				parsePanZhaoZhaoUserShareUrl(singlePanZhaoZhaoUserShareUrl);
			}
		}
	}
	
	/**
	 * Collects the first 15 pages of a user's share list.
	 * @param panZhaoZhaoUserShareUrl
	 * @return
	 */
	private static List<String> getPanZhaoZhaoUserShareUrls(String panZhaoZhaoUserShareUrl){
		List<String> list = new LinkedList<String>();
		list.add(panZhaoZhaoUserShareUrl);
		for (int i = 2; i < 16; i++) {
			list.add(panZhaoZhaoUserShareUrl+"page-"+i+".html");
		}
		return list;
	}
	
	/**
	 * Parses the panzhaozhao-rewritten user share URL.
	 * Original: http://yun.baidu.com/share/home?uk=1949795117
	 * Rewritten: http://www.13910.com/u/1949795117/share/
	 * @param panZhaoZhaoUserShareUrl	the rewritten URL
	 */
	private static void parsePanZhaoZhaoUserShareUrl(String panZhaoZhaoUserShareUrl) {

		logger.info("Start crawling user share page: "+panZhaoZhaoUserShareUrl);
		
		HttpGet httpGet = new HttpGet(panZhaoZhaoUserShareUrl);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if(response == null) {
				logger.info("Connection timed out!");
			}else {
				if(response.getStatusLine().getStatusCode() == 200) {
					HttpEntity entity = response.getEntity();
					String pageContent = EntityUtils.toString(entity, "utf-8");
					parsePanZhaoZhaoUserSharePageContent(pageContent,panZhaoZhaoUserShareUrl);
				}
			}
		} catch (ClientProtocolException e) {
			logger.error(panZhaoZhaoUserShareUrl+" - parse error - ClientProtocolException",e);
		} catch (IOException e) {
			logger.error(panZhaoZhaoUserShareUrl+" - parse error - IOException",e);
		} finally {
			try {
				if(response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(panZhaoZhaoUserShareUrl+" - parse error - IOException",e);
			}
		}
		
		logger.info("Finished crawling user share page: "+panZhaoZhaoUserShareUrl);
	
	
	}

	/**
	 * Parses the content of a user's share page and extracts all rewritten links.
	 * @param pageContent
	 * @param panZhaoZhaoUserShareUrl	the rewritten user share page link
	 */
	private static void parsePanZhaoZhaoUserSharePageContent(String pageContent, String panZhaoZhaoUserShareUrl) {
		Document doc = Jsoup.parse(pageContent);
		Elements aEles = doc.select("#flist li a");
		if(aEles.size() == 0) {
			logger.info("No Baidu Yun links found");
			return;
		}
		
		for (Element aEle : aEles) {
			String ahref = aEle.attr("href");
			parseUserHandledTargetUrl(PROJECT_URL + ahref);
		}
	}

	/**
	 * Parses a detail page that contains the rewritten Baidu Yun address.
	 * @param handledTargetUrl	page containing the rewritten Baidu Yun address
	 */
	private static void parseUserHandledTargetUrl(String handledTargetUrl) {
		logger.info("Start crawling detail page: "+handledTargetUrl);
		if(cache.get(handledTargetUrl) != null) {
			logger.info("Record already crawled; skipping");
			return;
		}
		
		HttpGet httpGet = new HttpGet(handledTargetUrl);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if(response == null) {
				logger.info("Connection timed out!");
			}else {
				if(response.getStatusLine().getStatusCode() == 200) {
					HttpEntity entity = response.getEntity();
					String pageContent = EntityUtils.toString(entity, "utf-8");
					parseHandledTargetUrlPageContent(pageContent,handledTargetUrl);
				}
			}
		} catch (ClientProtocolException e) {
			logger.error(handledTargetUrl+" - parse error - ClientProtocolException",e);
		} catch (IOException e) {
			logger.error(handledTargetUrl+" - parse error - IOException",e);
		} finally {
			try {
				if(response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(handledTargetUrl+" - parse error - IOException",e);
			}
		}
		
		logger.info("Finished crawling detail page: "+handledTargetUrl);
	
	
	
	}

	/**
	 * Parses the content of the detail page and extracts the rewritten Baidu Yun address.
	 * @param pageContent
	 * @param handledTargetUrl	the rewritten Baidu Yun address
	 */
	private static void parseHandledTargetUrlPageContent(String pageContent, String handledTargetUrl) {
		Document doc = Jsoup.parse(pageContent);
		Elements aEles = doc.select(".fileinfo .panurl a");
		if(aEles.size() == 0) {
			logger.info("No Baidu Yun link found");
			return;
		}
		String ahref = aEles.get(0).attr("href");
//		System.out.println("**********************************************************"+ahref+"**********************************************************");
		getUserBaiduYunUrl(PROJECT_URL+ahref);
	}

	/**
	 * Fetches the content behind the rewritten Baidu Yun link.
	 * @param handledBaiduYunUrl	the rewritten Baidu Yun link
	 */
	private static void getUserBaiduYunUrl(String handledBaiduYunUrl) {

		logger.info("Start crawling rewritten Baidu Yun page: "+handledBaiduYunUrl);
		
		HttpGet httpGet = new HttpGet(handledBaiduYunUrl);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if(response == null) {
				logger.info("Connection timed out!");
			}else {
				if(response.getStatusLine().getStatusCode() == 200) {
					HttpEntity entity = response.getEntity();
					String pageContent = EntityUtils.toString(entity, "utf-8");
					parseHandledBaiduYunUrlPageContent(pageContent,handledBaiduYunUrl);
				}
			}
		} catch (ClientProtocolException e) {
			logger.error(handledBaiduYunUrl+" - parse error - ClientProtocolException",e);
		} catch (IOException e) {
			logger.error(handledBaiduYunUrl+" - parse error - IOException",e);
		} finally {
			try {
				if(response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(handledBaiduYunUrl+" - parse error - IOException",e);
			}
		}
		
		logger.info("Finished crawling rewritten Baidu Yun page: "+handledBaiduYunUrl);
	
	
	
	
	}

	/**
	 * Extracts the real Baidu Yun link and stores it.
	 * @param pageContent
	 * @param handledBaiduYunUrl
	 */
	private static void parseHandledBaiduYunUrlPageContent(String pageContent, String handledBaiduYunUrl) {
		Document doc = Jsoup.parse(pageContent);
		Elements aEles = doc.select("#check-result-no a");
		if(aEles.size() == 0) {
			logger.info("No Baidu Yun link found");
			return;
		}
		String ahref = aEles.get(0).attr("href");
		if((!ahref.contains("yun.baidu.com")) && (!ahref.contains("pan.baidu.com"))) return;
		logger.info("Captured target #" + (++total) + ": " + ahref);
		String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
		try (PreparedStatement pst = con.prepareStatement(sql)) {
			pst.setObject(1, "test content");
			pst.setObject(2, ahref);
			pst.setObject(3, ahref);
			if(pst.executeUpdate() == 0) {
				logger.info("Failed to insert crawled link into the database!");
			}else {
				cache.put(new net.sf.ehcache.Element(handledBaiduYunUrl, handledBaiduYunUrl));
				logger.info("Inserted crawled link into the database!");
			}
		} catch (SQLException e) {
			logger.error(ahref+" - parse error - SQLException",e);
		}
	}
	
	public static void start() {
		DbUtil dbUtil = new DbUtil();
		try {
			con = dbUtil.getCon();
			parseHomePage();
		} catch (Exception e) {
			logger.error("Database connection failed",e);
		}
	}

	public static void main(String[] args) {
		start();
	}
}


Crawling movies:

MovieCrawlerStarter:

package yj.crawler;


import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import yj.util.DbUtil;
import yj.util.PropertiesUtil;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;


public class MovieCrawlerStarter {
	private static Logger logger = Logger.getLogger(MovieCrawlerStarter.class);
	private static String URL = "http://www.8gw.com/";
	private static String PROJECT_URL = "http://www.8gw.com";

	private static Connection con;
	private static CacheManager manager;
	private static Cache cache;
	private static CloseableHttpClient httpClient;
	private static long total = 0;

	/**
	 * The 52 listing-page URLs waiting to be crawled.
	 * 
	 * @return
	 */
	private static List<String> getUrls() {
		List<String> list = new LinkedList<String>();
		list.add("http://www.8gw.com/8gli/index8.html");
		for (int i = 2; i < 53; i++) {
			list.add("http://www.8gw.com/8gli/index8_" + i + ".html");
		}
		return list;
	}

	/**
	 * Fetch the main content of a listing URL.
	 * 
	 * @param url
	 */
	private static void parseUrl(String url) {

		logger.info("Start crawling listing page: " + url);

		HttpGet httpGet = new HttpGet(url);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info("Connection timed out!");
			} else {
				if (response.getStatusLine().getStatusCode() == 200) {
					HttpEntity entity = response.getEntity();
					String pageContent = EntityUtils.toString(entity, "GBK");
					parsePageContent(pageContent, url);
				}
			}
		} catch (ClientProtocolException e) {
			logger.error(url + " - parse error - ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(url + " - parse error - IOException", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(url + " - parse error - IOException", e);
			}
		}
		logger.info("Finished crawling listing page: " + url);

	}

	/**
	 * Extracts the individual movie links from the current listing page.
	 * @param pageContent
	 * @param url
	 */
	private static void parsePageContent(String pageContent, String url) {
//		System.out.println("****************" + url + "***********************");
		Document doc = Jsoup.parse(pageContent);
		Elements liEles = doc.select(".span_2_800 #list_con li");
		for (Element liEle : liEles) {
			
			String movieUrl = liEle.select(".info a").attr("href");
			if (null == movieUrl || "".equals(movieUrl)) {
				logger.info("Movie has no link; skipping database insert!");
				continue;
			}
			if(cache.get(movieUrl) != null) {
				logger.info("URL already crawled into the database; skipping!");
				continue;
			}
			parseSingleMovieUrl(PROJECT_URL+movieUrl);
		}
		
	}

	/**
	 * Fetches a single movie page.
	 * @param movieUrl
	 */
	private static void parseSingleMovieUrl(String movieUrl) {
		logger.info("Start crawling movie page: " + movieUrl);
		httpClient = HttpClients.createDefault();
		HttpGet httpGet = new HttpGet(movieUrl);
		RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
		httpGet.setConfig(config);
		CloseableHttpResponse response = null;
		try {
			response = httpClient.execute(httpGet);
			if (response == null) {
				logger.info(movieUrl + ": no response");
				return;
			}

			if (response.getStatusLine().getStatusCode() == 200) {
				HttpEntity entity = response.getEntity();
				String blogContent = EntityUtils.toString(entity, "GBK");
				parseSingleMovieContent(blogContent, movieUrl);
			}

		} catch (ClientProtocolException e) {
			logger.error(movieUrl + "-ClientProtocolException", e);
		} catch (IOException e) {
			logger.error(movieUrl + "-IOException", e);
		} finally {
			try {
				if (response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(movieUrl + "-IOException", e);
			}
		}

		logger.info("Finished crawling movie page: " + movieUrl);

			
	}

	/**
	 * Parses the page body (movie name, description, poster image).
	 * @param pageContent
	 * @param movieUrl
	 */
	private static void parseSingleMovieContent(String pageContent, String movieUrl) {
//		System.out.println("****************" + movieUrl + "***********************");
		Document doc = Jsoup.parse(pageContent);
		Elements divEles = doc.select(".wrapper .main .moviedteail");
//		.wrapper .main .moviedteail .moviedteail_tt h1
//		.wrapper .main .moviedteail .moviedteail_list .moviedteail_list_short a
//		.wrapper .main .moviedteail .moviedteail_img img	
		
		Elements h1Eles = divEles.select(".moviedteail_tt h1");
		if (h1Eles.size() == 0) {
			logger.info("Movie name is empty; not inserting into the database!");
			return;
		}
		String mname = h1Eles.get(0).html();
		
		Elements aEles = divEles.select(".moviedteail_list .moviedteail_list_short a");
		if (aEles.size() == 0) {
			logger.info("Movie description is empty; not inserting into the database!");
			return;
		}
		String mdesc = aEles.get(0).html();
		
		Elements imgEles = divEles.select(".moviedteail_img img");
		if (imgEles.isEmpty()) {
			logger.info("Movie image is empty; not inserting into the database!");
			return;
		}
		String mimg = imgEles.attr("src");
		String sql = "insert into movie(mname,mdesc,mimg,mlink) values(?,?,?,99)";
		
		try (PreparedStatement pst = con.prepareStatement(sql)) {
			pst.setObject(1, mname);
			pst.setObject(2, mdesc);
			pst.setObject(3, mimg);
			if(pst.executeUpdate() == 0) {
				logger.info("Failed to insert crawled movie into the database");
			}else {
				cache.put(new net.sf.ehcache.Element(movieUrl, movieUrl));
				logger.info("Inserted crawled movie into the database");
			}
		} catch (SQLException e) {
			logger.error("Database error - SQLException:",e);
		}
	}

	public static void main(String[] args) {
		manager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
		cache = manager.getCache("8gli_movies");
		httpClient = HttpClients.createDefault();
		DbUtil dbUtil = new DbUtil();
		
		try {
			con = dbUtil.getCon();
			List<String> urls = getUrls();
			for (String url : urls) {
				try {
					parseUrl(url);
				} catch (Exception e) {
					// Do not re-add url to urls here: mutating the list inside a
					// for-each loop throws ConcurrentModificationException.
					// See the retry sketch after this class.
					logger.error(url + " - parse error", e);
				}
			}
		} catch (Exception e1) {
			logger.error("Database connection failed!", e1);
		} finally {
			try {
				if (httpClient != null) {
					httpClient.close();
				}
				if (con != null) {
					con.close();
				}
			} catch (IOException e) {
				logger.error("Error closing HTTP connection - IOException:",e);
			} catch (SQLException e) {
				logger.error("Error closing database connection - SQLException:",e);
			}
		}
		// Finally flush the cached data to disk
		if (cache.getStatus() == Status.STATUS_ALIVE) {
			cache.flush();
		}
		manager.shutdown();
	}

}
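As the comment in the catch block above notes, re-adding a failed url to urls inside the for-each loop would throw ConcurrentModificationException. If retries are wanted, the usual fix is a separate work queue; a sketch under that assumption (Java 8+; the method name crawlWithRetries and the retryLimit parameter are ours):

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Retries failed listing pages up to retryLimit times, using an explicit
// work queue instead of mutating the list being iterated over.
private static void crawlWithRetries(List<String> urls, int retryLimit) {
	Deque<String> work = new ArrayDeque<String>(urls);
	Map<String, Integer> attempts = new HashMap<String, Integer>();
	while (!work.isEmpty()) {
		String url = work.poll();
		try {
			parseUrl(url);	// the existing method in MovieCrawlerStarter
		} catch (Exception e) {
			int n = attempts.merge(url, 1, Integer::sum);
			if (n < retryLimit) {
				work.addLast(url);	// requeue for a later attempt
			}
		}
	}
}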


