The approach: (1) use Jsoup to fetch and parse the HTML, crawl the key content of each article, and store it in MySQL; (2) package the project into a jar with Maven; (3) have Jenkins build and run the jar on a schedule, so that all crawled articles end up in the database.
Step 1: Deploy and configure Jenkins (see the Jenkins configuration link in the references at the end).
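For scheduling, a minimal sketch assuming a Jenkins freestyle job and the shaded jar built in Step 3 (the cron schedule and jar path below are example values, not part of the original setup):

Build periodically (cron syntax): H 2 * * *   (once a day, at a Jenkins-chosen minute in the 2 AM hour)
Execute shell build step: java -jar target/Crawlers-1.0.jar

Since the crawler skips article URLs that already exist in the table, rerunning it on a schedule only inserts new articles.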
Step 2: Create the database table:
CREATE TABLE `plant_protection_dynamics` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
`Plant_protection_net_name` varchar(100) DEFAULT NULL,
`Plant_protection_net_url` varchar(200) DEFAULT NULL,
`Get_Plant_protection_content_url` varchar(500) DEFAULT NULL COMMENT 'article URL',
`Plant_protection_Title` varchar(200) DEFAULT NULL COMMENT 'article title',
`Plant_protection_Author` varchar(200) DEFAULT NULL COMMENT 'article author',
`Plant_protection_html_Content` longtext,
`Plant_protection_Content` longtext COMMENT 'article text',
`Plant_protection_Writing_time` varchar(50) DEFAULT NULL COMMENT 'article publication time',
`Create_time` datetime DEFAULT NULL COMMENT 'row creation time',
`update_time` datetime DEFAULT NULL COMMENT 'last modification time',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='article information table';
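Once the crawler has run, the newest rows can be spot-checked with a query such as (an example, not part of the pipeline):

SELECT id, Plant_protection_Title, Plant_protection_Writing_time
FROM plant_protection_dynamics
ORDER BY id DESC
LIMIT 10;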
Step 3: Create a Maven project in Eclipse, write the crawler, and store the results in the MySQL table.
Project structure diagram:
The main package:
package main;
import java.sql.Connection;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;
import java.io.IOException;
import org.apache.commons.lang.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Element;
import Common.MysqlConn;
public class crawlers {
@SuppressWarnings("unused")
private static Connection conn;
@SuppressWarnings("rawtypes")
private static void fighting(int num) throws Exception {
String mysql_driver = "com.mysql.jdbc.Driver";
String mysql_url = "jdbc:mysql://localhost:3306/auto_testcase?useUnicode=true&characterEncoding=UTF-8&connectTimeout=5000";
String mysql_DB_user = "root";
String mysql_DB_password = "123456";
ResultSet rs = null;
conn = MysqlConn.getConnection(mysql_driver, mysql_url, mysql_DB_user, mysql_DB_password);
Random random = new Random();
Date d = new Date();
SimpleDateFormat datetime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
try {
System.out.println("*****************The crawler begins**************************");
for (int i = 1; i <= num; i++) {
String Url = "http://www.jszhibao.com/?action-category-catid-36-page-";
Url = Url + i;
System.out.println("页请求地址¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥:" + Url);
Document documents = Jsoup.connect(Url).get();
// grab the <a> tags inside the elements with class "newli_l"
Elements get_href = documents.getElementsByClass("newli_l").select("a");
if (get_href != null && !get_href.isEmpty()) {
for (Element el : get_href) {
String sqls = "SELECT Get_Plant_protection_content_url,Plant_protection_Content from plant_protection_dynamics where Get_Plant_protection_content_url = '" + el.attr("href") + "';";
rs = MysqlConn.queryx(sqls);
if (rs != null) {
if (rs.next()) {
System.out.println("数据库表中已存在!");
int nums=random.nextInt(3000);
Thread.sleep(nums);
}else{
Document documentss = Jsoup.connect(el.attr("href")).get();
System.out.println("当前获取数据请求地址$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$:"+ el.attr("href"));
Elements classElement = documentss.getElementsByClass("con6");
classElement.select("#article_pn").remove();
classElement.select("#article_tag").remove();
/* // Debug output: print each extracted field
System.out.println("Article title>>>>>>>>>>>>>>>>>:"
+ classElement.select("h1").get(0).text());
System.out.println("Article author>>>>>>>>>>>>>>>>>:"
+ classElement.select("label").select("a").get(0).text());
System.out.println("Article time>>>>>>>>>>>>>>>>>:"
+ classElement.select("label").get(1).text());
System.out.println("Article content>>>>>>>>>>>>>>>>>:"
+ classElement.html());
*/
String Plant_protection_net_name="'江苏植保网'";
System.out.println("test>>>>>>>>>>>>>>>>>:"+classElement.text());
String Get_Plant_protection_content_url="'" + el.attr("href") + "'";
String Plant_protection_Title = "'" + classElement.select("h1").get(0).text() + "'";
String Plant_protection_Author = "'" + classElement.select("label").get(0).text() + "'";
String Plant_protection_Content = "'"+classElement.text()+"'";
String Plant_protection_html_Content = "\""+StringEscapeUtils.escapeHtml(classElement.html())+"\"";
SimpleDateFormat sdf = new SimpleDateFormat("yyyy年M月d日 HH:mm");
Date date = sdf.parse(classElement.select("label").get(1).text());
sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
String Plant_protection_Writing_time ="'" + sdf.format(date) + "'";
String Create_time = "'" + datetime.format(d) + "'";
String update_time = "'" + datetime.format(d) + "'";
String sql = "insert plant_protection_dynamics(Plant_protection_net_name,Plant_protection_net_url,Get_Plant_protection_content_url,Plant_protection_Title,Plant_protection_Author,Plant_protection_html_Content,Plant_protection_Content,Plant_protection_Writing_time,Create_time,update_time) "
+ "values("+Plant_protection_net_name+",'"+Url+"',"+ Get_Plant_protection_content_url +","+ Plant_protection_Title +","+Plant_protection_Author +","+Plant_protection_html_Content+","+ Plant_protection_Content + ","+ Plant_protection_Writing_time +","+ Create_time +","+ update_time +")";
MysqlConn.execute_Update(sql);
int nums=random.nextInt(3000);
Thread.sleep(nums);
}
}else{
System.out.println("连接数据库异常,请检查!");
}
}
} else {
System.out.println("请求页面无数据!"+Url);
}
}
System.out.println("*****************The end of the reptile**************************");
MysqlConn.closeDB();
} catch (IOException e) {
e.printStackTrace();
}
}
/* private static void test() throws Exception {
String mysql_driver = "com.mysql.jdbc.Driver";
String mysql_url = "jdbc:mysql://localhost:3306/auto_testcase?useUnicode=true&characterEncoding=UTF-8&connectTimeout=3000";
String mysql_DB_user = "root";
String mysql_DB_password = "123456";
conn = MysqlConn.getConnection(mysql_driver, mysql_url, mysql_DB_user, mysql_DB_password);
String sqls = "SELECT Plant_protection_html_Content from plant_protection_dynamics where Get_Plant_protection_content_url = 'http://www.jszhibao.com/?action-viewnews-itemid-2642';";
ResultSet rss = MysqlConn.queryx(sqls);
if(rss.next()){
//System.out.println("First row>>>>>>>>>>>>>>>>>:\n"+rss.getString(1));
}
System.out.println("Unescaped HTML: "+StringEscapeUtils.unescapeHtml(rss.getString(1)));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy年M月d日 HH:mm");
Date d = sdf.parse("2018年3月16日 14:49");
sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
System.out.println(sdf.format(d));
}*/
public static void main(String[] args) throws Exception {
fighting(75);
//test();
}
}
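One weakness in fighting() above: the INSERT is built by string concatenation with hand-wrapped quotes, so any article title or body containing a single quote produces a malformed statement (and concatenation is injection-prone in general). A safer variant, shown here as a sketch that reuses the variables from the crawl loop (conn, Url, el, classElement, sdf, date, datetime, d) and needs an extra import of java.sql.PreparedStatement:

// Sketch: parameterized insert; the driver handles quoting, so no manual
// quote-wrapping is needed. The original code HTML-escapes the html column
// before storing; with parameters that is no longer required for SQL safety,
// but keep the escaping if you want the stored format unchanged.
String sql = "INSERT INTO plant_protection_dynamics"
        + " (Plant_protection_net_name, Plant_protection_net_url, Get_Plant_protection_content_url,"
        + " Plant_protection_Title, Plant_protection_Author, Plant_protection_html_Content,"
        + " Plant_protection_Content, Plant_protection_Writing_time, Create_time, update_time)"
        + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
PreparedStatement ps = conn.prepareStatement(sql);
ps.setString(1, "江苏植保网");
ps.setString(2, Url);
ps.setString(3, el.attr("href"));
ps.setString(4, classElement.select("h1").get(0).text());
ps.setString(5, classElement.select("label").get(0).text());
ps.setString(6, classElement.html());
ps.setString(7, classElement.text());
ps.setString(8, sdf.format(date));
ps.setString(9, datetime.format(d));
ps.setString(10, datetime.format(d));
ps.executeUpdate();
ps.close();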
The database helper class:
package Common;
import java.sql.*;
public class MysqlConn {
private static Connection conn;
// obtain a database connection
public static Connection getConnection(String driver, String url, String userName, String userPassword) throws Exception {
try {
// load the MySQL driver class
Class.forName(driver);
// open the database connection
conn = DriverManager.getConnection(url, userName, userPassword);
if(!conn.isClosed()){
System.out.println("Successful connection testcase DB!");
}else{
System.out.println("Failure connection testcase DB!");
return null;
}
} catch (ClassNotFoundException e) {
System.out.println("加载驱动器失败:" + e.getMessage());
e.printStackTrace();
return null;
} catch (SQLException e) {
System.out.println("注册驱动器失败:" + e.getMessage());
e.printStackTrace();
return null;
}
return conn;
}
// execute an insert, delete, or update statement
public static String execute_Update(String sql) throws Exception {
//getConnection(driver, url, userName, userPassword);// initialize the connection first
Statement stmt = conn.createStatement();// create a Statement
if (stmt.executeUpdate(sql) != 1) {
return "failure";
}
return "success";
}
// execute a query
public static ResultSet queryx(String sql) throws Exception {
//getConnection(driver, url, userName, userPassword);
Statement stmt = conn.createStatement();
return stmt.executeQuery(sql);
}
// close the database connection
public static void closeDB() {
try {
conn.close();
} catch (SQLException ex) {
System.out.println(ex.getMessage());
}
}
}
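A caveat with queryx(): the Statement it creates is never closed, and it cannot be closed inside the method because that would also close the ResultSet handed back to the caller. One way around the leak, sketched below (a hypothetical addition to MysqlConn, not part of the original class), is to iterate inside the helper and hand each row to a callback:

// Sketch: a row callback plus a query helper that owns and closes its
// Statement/ResultSet via try-with-resources (works on the Java 7 target).
public interface RowHandler {
    void handle(ResultSet rs) throws SQLException;
}

public static void queryEach(String sql, RowHandler handler) throws SQLException {
    try (Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
        while (rs.next()) {
            handler.handle(rs);
        }
    }
}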
The POM file:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>Crawlers</groupId>
<artifactId>Crawlers</artifactId>
<packaging>jar</packaging>
<version>1.0</version>
<url>http://maven.apache.org</url>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.18</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.3</version>
</dependency>
</dependencies>
<build>
<defaultGoal>compile</defaultGoal>
<sourceDirectory>src</sourceDirectory>
<resources>
<resource>
<filtering>false</filtering>
<directory>${basedir}/src/main</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding><!-- match project.build.sourceEncoding below -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>main.crawlers</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>
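With this POM, mvn clean package produces a single runnable jar: the shade plugin bundles the dependencies and sets Main-Class to main.crawlers, so the Jenkins job from Step 1 can run it with java -jar target/Crawlers-1.0.jar (the jar name follows artifactId-version).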
References:
https://www.cnblogs.com/horizonli/p/5332645.html ----- Jenkins configuration
https://my.oschina.net/axes/blog/119063 ----- POM walkthrough
http://blog.csdn.net/chenleixing/article/details/43456987 ---- HTML escaping and unescaping
http://blog.csdn.net/u012983749/article/details/52179795 ---- Jsoup, a third-party HTML parsing library
Note: this article is for learning purposes only. Do not use it to maliciously attack other people's websites; you alone are responsible for any consequences. Thanks!