The approach: (1) use Jsoup to fetch and parse the HTML, crawl the key content of each article, and store it in MySQL; (2) package the project into a jar with Maven; (3) have Jenkins build and run the jar on a schedule, so that all crawled articles end up in the database.
Step 1: Deploy and configure Jenkins (see the Jenkins configuration link in the references at the end).
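For scheduling, a minimal sketch assuming a Jenkins freestyle job and the shaded jar built in Step 3 (the cron schedule and jar path below are example values, not part of the original setup):

Build periodically (cron syntax): H 2 * * *   (once a day, at a Jenkins-chosen minute in the 2 AM hour)
Execute shell build step: java -jar target/Crawlers-1.0.jar

Since the crawler skips article URLs that already exist in the table, rerunning it on a schedule only inserts new articles.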
Step 2: Create the database table:
CREATE TABLE `plant_protection_dynamics` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
`Plant_protection_net_name` varchar(100) DEFAULT NULL,
`Plant_protection_net_url` varchar(200) DEFAULT NULL,
`Get_Plant_protection_content_url` varchar(500) DEFAULT NULL COMMENT 'article URL',
`Plant_protection_Title` varchar(200) DEFAULT NULL COMMENT 'article title',
`Plant_protection_Author` varchar(200) DEFAULT NULL COMMENT 'article author',
`Plant_protection_html_Content` longtext,
`Plant_protection_Content` longtext COMMENT 'article text',
`Plant_protection_Writing_time` varchar(50) DEFAULT NULL COMMENT 'article publication time',
`Create_time` datetime DEFAULT NULL COMMENT 'row creation time',
`update_time` datetime DEFAULT NULL COMMENT 'last modification time',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='article information table';
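Once the crawler has run, the newest rows can be spot-checked with a query such as (an example, not part of the pipeline):

SELECT id, Plant_protection_Title, Plant_protection_Writing_time
FROM plant_protection_dynamics
ORDER BY id DESC
LIMIT 10;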
Step 3: Create a Maven project in Eclipse, write the crawler, and store the results in the MySQL table.
Project structure diagram:
The main package:
package main;
import java.sql.Connection;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;
import java.io.IOException;
import org.apache.commons.lang.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Element;
import Common.MysqlConn;
public class crawlers {
@SuppressWarnings("unused")
private static Connection conn;
@SuppressWarnings("rawtypes")
private static void fighting(int num) throws Exception {
String mysql_driver = "com.mysql.jdbc.Driver";
String mysql_url = "jdbc:mysql://localhost:3306/auto_testcase?useUnicode=true&characterEncoding=UTF-8&connectTimeout=5000";
String mysql_DB_user = "root";
String mysql_DB_password = "123456";
ResultSet rs = null;
conn = MysqlConn.getConnection(mysql_driver, mysql_url, mysql_DB_user, mysql_DB_password);
Random random = new Random();
Date d = new Date();
SimpleDateFormat datetime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
try {
System.out.println("*****************The crawler begins**************************");
for (int i = 1; i <= num; i++) {
String Url = "http://www.jszhibao.com/?action-category-catid-36-page-";
Url = Url + i;
System.out.println("页请求地址¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥:" + Url);
Document documents = Jsoup.connect(Url).get();
// grab the <a> tags inside the elements with class "newli_l"
Elements get_href = documents.getElementsByClass("newli_l").select("a");
if (get_href != null && !get_href.isEmpty()) {
for (Element el : get_href) {
String sqls = "SELECT Get_Plant_protection_content_url,Plant_protection_Content from plant_protection_dynamics where Get_Plant_protection_content_url = '" + el.attr("href") + "';";
rs = MysqlConn.queryx(sqls);
if (rs != null) {
if (rs.next()) {
System.out.println("数据库表中已存在!");
int nums=random.nextInt(3000);
Thread.sleep(nums);
}else{
Document documentss = Jsoup.connect(el.attr("href")).get();
System.out.println("当前获取数据请求地址$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$:"+ el.attr("href"));
Elements classElement = documentss.getElementsByClass("con6");
classElement.select("#article_pn").remove();
classElement.select("#article_tag").remove();
/* // Debug output: print each extracted field
System.out.println("Article title>>>>>>>>>>>>>>>>>:"
+ classElement.select("h1").get(0).text());
System.out.println("Article author>>>>>>>>>>>>>>>>>:"
+ classElement.select("label").select("a").get(0).text());
System.out.println("Article time>>>>>>>>>>>>>>>>>:"
+ classElement.select("label").get(1).text());
System.out.println("Article content>>>>>>>>>>>>>>>>>:"
+ classElement.html());
*/
String Plant_protection_net_name="'江苏植保网'";
System.out.println("test>>>>>>>>>>>>>>>>>:"+classElement.text());
String Get_Plant_protection_content_url="'" + el.attr("href") + "'";
String Plant_protection_Title = "'" + classElement.select("h1").get(0).text() + "'";
String Plant_protection_Author = "'" + classElement.select("label").get(0).text() + "'";
String Plant_protection_Content = "'"+classElement.text()+"'";
String Plant_protection_html_Content = "\""+StringEscapeUtils.escapeHtml(classElement.html())+"\"";
SimpleDateFormat sdf = new SimpleDateFormat("yyyy年M月d日 HH:mm");
Date date = sdf.parse(classElement.select("label").get(1).text());
sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
String Plant_protection_Writing_time ="'" + sdf.format(date) + "'";
String Create_time = "'" + datetime.format(d) + "'";
String update_time = "'" + datetime.format(d) + "'";
String sql = "insert plant_protection_dynamics(Plant_protection_net_name,Plant_protection_net_url,Get_Plant_protection_content_url,Plant_protection_Title,Plant_protection_Author,Plant_protection_html_Content,Plant_protection_Content,Plant_protection_Writing_time,Create_time,update_time) "
+ "values("+Plant_protection_net_name+",'"+Url+"',"+ Get_Plant_protection_content_url +","+ Plant_protection_Title +","+Plant_protection_Author +","+Plant_protection_html_Content+","+ Plant_protection_Content + ","+ Plant_protection_Writing_time +","+ Create_time +","+ update_time +")";
MysqlConn.execute_Update(sql);
int nums=random.nextInt(3000);
Thread.sleep(nums);
}
}else{
System.out.println("连接数据库异常,请检查!");
}
}
} else {
System.out.println("请求页面无数据!"+Url);
}
}
System.out.println("*****************The end of the reptile**************************");
MysqlConn.closeDB();
} catch (IOException e) {
e.printStackTrace();
}
}
/* private static void test() throws Exception {
String mysql_driver = "com.mysql.jdbc.Driver";
String mysql_url = "jdbc:mysql://localhost:3306/auto_testcase?useUnicode=true&characterEncoding=UTF-8&connectTimeout=3000";
String mysql_DB_user = "root";
String mysql_DB_password = "123456";
conn = MysqlConn.getConnection(mysql_driver, mysql_url, mysql_DB_user, mysql_DB_password);
String sqls = "SELECT Plant_protection_html_Content from plant_protection_dynamics where Get_Plant_protection_content_url = 'http://www.jszhibao.com/?action-viewnews-itemid-2642';";
ResultSet rss = MysqlConn.queryx(sqls);
if(rss.next()){
//System.out.println("First row>>>>>>>>>>>>>>>>>:\n"+rss.getString(1));
}
System.out.println("Unescaped HTML: "+StringEscapeUtils.unescapeHtml(rss.getString(1)));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy年M月d日 HH:mm");
Date d = sdf.parse("2018年3月16日 14:49");
sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
System.out.println(sdf.format(d));
}*/
public static void main(String[] args) throws Exception {
fighting(75);
//test();
}
}
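One weakness in fighting() above: the INSERT is built by string concatenation with hand-wrapped quotes, so any article title or body containing a single quote produces a malformed statement (and concatenation is injection-prone in general). A safer variant, shown here as a sketch that reuses the variables from the crawl loop (conn, Url, el, classElement, sdf, date, datetime, d) and needs an extra import of java.sql.PreparedStatement:

// Sketch: parameterized insert; the driver handles quoting, so no manual
// quote-wrapping is needed. The original code HTML-escapes the html column
// before storing; with parameters that is no longer required for SQL safety,
// but keep the escaping if you want the stored format unchanged.
String sql = "INSERT INTO plant_protection_dynamics"
        + " (Plant_protection_net_name, Plant_protection_net_url, Get_Plant_protection_content_url,"
        + " Plant_protection_Title, Plant_protection_Author, Plant_protection_html_Content,"
        + " Plant_protection_Content, Plant_protection_Writing_time, Create_time, update_time)"
        + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
PreparedStatement ps = conn.prepareStatement(sql);
ps.setString(1, "江苏植保网");
ps.setString(2, Url);
ps.setString(3, el.attr("href"));
ps.setString(4, classElement.select("h1").get(0).text());
ps.setString(5, classElement.select("label").get(0).text());
ps.setString(6, classElement.html());
ps.setString(7, classElement.text());
ps.setString(8, sdf.format(date));
ps.setString(9, datetime.format(d));
ps.setString(10, datetime.format(d));
ps.executeUpdate();
ps.close();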
The database helper class:
package Common;
import java.sql.*;
public class MysqlConn {
private static Connection conn;
// obtain a database connection
public static Connection getConnection(String driver, String url, String userName, String userPassword) throws Exception {
try {
// load the MySQL driver class
Class.forName(driver);
// open the database connection
conn = DriverManager.getConnection(url, userName, userPassword);
if(!conn.isClosed()){
System.out.println("Successful connection testcase DB!");
}else{
System.out.println("Failure connection testcase DB!");
return null;
}
} catch (ClassNotFoundException e) {
System.out.println("加载驱动器失败:" + e.getMessage());
e.printStackTrace();
return null;
} catch (SQLException e) {
System.out.println("注册驱动器失败:" + e.getMessage());
e.printStackTrace();
return null;
}
return conn;
}
// execute an insert, delete, or update statement
public static String execute_Update(String sql) throws Exception {
//getConnection(driver, url, userName, userPassword);// initialize the connection first
Statement stmt = conn.createStatement();// create a Statement
if (stmt.executeUpdate(sql) != 1) {
return "failure";
}
return "success";
}
// execute a query
public static ResultSet queryx(String sql) throws Exception {
//getConnection(driver, url, userName, userPassword);
Statement stmt = conn.createStatement();
return stmt.executeQuery(sql);
}
// close the database connection
public static void closeDB() {
try {
conn.close();
} catch (SQLException ex) {
System.out.println(ex.getMessage());
}
}
}
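A caveat with queryx(): the Statement it creates is never closed, and it cannot be closed inside the method because that would also close the ResultSet handed back to the caller. One way around the leak, sketched below (a hypothetical addition to MysqlConn, not part of the original class), is to iterate inside the helper and hand each row to a callback:

// Sketch: a row callback plus a query helper that owns and closes its
// Statement/ResultSet via try-with-resources (works on the Java 7 target).
public interface RowHandler {
    void handle(ResultSet rs) throws SQLException;
}

public static void queryEach(String sql, RowHandler handler) throws SQLException {
    try (Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            handler.handle(rs);
        }
    }
}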
The POM file:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>Crawlers</groupId>
<artifactId>Crawlers</artifactId>
<packaging>jar</packaging>
<version>1.0</version>
<url>http://maven.apache.org</url>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.18</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.3</version>
</dependency>
</dependencies>
<build>
<defaultGoal>compile</defaultGoal>
<sourceDirectory>src</sourceDirectory>
<resources>
<resource>
<filtering>false</filtering>
<directory>${basedir}/src/main</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding><!-- match project.build.sourceEncoding below -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>main.crawlers</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>
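With this POM, mvn clean package produces a single runnable jar: the shade plugin bundles the dependencies and sets Main-Class to main.crawlers, so the Jenkins job from Step 1 can run it with java -jar target/Crawlers-1.0.jar (the jar name follows artifactId-version).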
References:
https://www.cnblogs.com/horizonli/p/5332645.html ----- Jenkins configuration
https://my.oschina.net/axes/blog/119063 ----- POM walkthrough
http://blog.csdn.net/chenleixing/article/details/43456987 ---- HTML escaping and unescaping
http://blog.csdn.net/u012983749/article/details/52179795 ---- Jsoup, a third-party HTML parsing library
Note: this article is for learning purposes only. Do not use it to maliciously attack other people's websites; you alone are responsible for any consequences. Thanks!