1、maven工程配置jar依赖:
<dependencies>
<dependency>
<groupId>org.xerial</groupId>
<artifactId>sqlite-jdbc</artifactId>
<version>3.21.0.1</version>
<scope>test</scope>
</dependency>
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
</dependencies>
package com.xuqun.archive;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Reads stored HTML fragments from the {@code mainData} table of a SQLite
 * database, extracts the links found inside each fragment's
 * {@code div.breadcrumb} element, and records every link that is not yet
 * present in the {@code adressInfo} table.
 *
 * <p>Tables touched by this class:
 * <ul>
 *   <li>{@code mainData} - read; the {@code data} column holds the HTML text.</li>
 *   <li>{@code adressInfo} - read and written; rows are matched on the
 *       {@code adressName} and {@code adressUrl} columns and inserted as
 *       three positional values (id, name, url).</li>
 * </ul>
 */
public class SQLiteJDBC {

    /** Fully qualified name of the SQLite JDBC driver class. */
    private static final String DRIVER_CLASS = "org.sqlite.JDBC";

    /** JDBC URL of the SQLite database file. */
    private static final String DB_URL = "jdbc:sqlite:D:\\DEV_SPACE\\archive_data\\SQLiteDataBase.db";

    /** Prefix prepended to every extracted {@code href} to build the stored URL. */
    private static final String HEAD_URL = "https://jsoup.org";

    /**
     * Opens the database, runs the breadcrumb import and closes the connection.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Connection connection = createConnection();
        if (connection == null) {
            // createConnection() has already printed the underlying cause.
            System.err.println("Could not open database: " + DB_URL);
            return;
        }
        try {
            func1(connection);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                connection.close();
            } catch (SQLException e) {
                // connection close failed.
                System.err.println(e);
            }
        }
    }

    /**
     * Walks every row of {@code mainData}, parses its {@code data} column as
     * HTML and inserts each breadcrumb link into {@code adressInfo} unless a
     * row with the same name and URL already exists.
     *
     * <p>The connection is left open; the caller owns it. SQL errors are
     * printed and end the import.
     *
     * @param connection an open connection to the SQLite database
     */
    private static void func1(Connection connection) {
        // NOTE(review): the original code used a counter that was never
        // advanced, so every inserted row gets id '1'. That is preserved here;
        // confirm the intended id scheme. The original also built (but never
        // executed) inserts into adressRelationship - relationship rows are
        // still not written.
        String nodeId = "1";

        // Each query gets its own statement: running a second query on a
        // Statement closes the ResultSet it produced earlier, which would
        // break the outer iteration over mainData.
        try (Statement pageStatement = connection.createStatement();
             ResultSet pages = pageStatement.executeQuery("select * from mainData");
             PreparedStatement findAddress = connection.prepareStatement(
                     "select * from adressInfo where adressName=? and adressUrl=?");
             PreparedStatement insertAddress = connection.prepareStatement(
                     "insert into adressInfo values(?,?,?)")) {

            while (pages.next()) {
                String dataInfo = pages.getString("data");
                System.out.println("dataInfo: " + dataInfo);
                if (dataInfo == null) {
                    continue; // nothing to parse in this row
                }

                Document doc = Jsoup.parse(dataInfo);
                Element breadcrumb = doc.select("div.breadcrumb").first();
                if (breadcrumb == null) {
                    continue; // fragment has no breadcrumb section
                }

                Elements links = breadcrumb.select("a");
                for (Element link : links) {
                    String linkText = link.text();
                    String linkUrl = HEAD_URL + link.attr("href");
                    if (!addressExists(findAddress, linkText, linkUrl)) {
                        // Bound parameters keep quotes in link text from
                        // breaking (or injecting into) the statement.
                        insertAddress.setString(1, nodeId);
                        insertAddress.setString(2, linkText);
                        insertAddress.setString(3, linkUrl);
                        insertAddress.executeUpdate();
                    }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether {@code adressInfo} already holds a row for the given link.
     *
     * @param findAddress prepared lookup with two parameters: name, then URL
     * @param name        link text to match against {@code adressName}
     * @param url         absolute URL to match against {@code adressUrl}
     * @return {@code true} if at least one matching row exists
     * @throws SQLException if the lookup fails
     */
    private static boolean addressExists(PreparedStatement findAddress, String name, String url)
            throws SQLException {
        findAddress.setString(1, name);
        findAddress.setString(2, url);
        try (ResultSet matches = findAddress.executeQuery()) {
            // Advance the cursor exactly once: a second next() would skip
            // past a single matching row and report it as missing.
            return matches.next();
        }
    }

    /**
     * Creates the SQLite database connection.
     *
     * @return an open connection, or {@code null} if the driver class cannot
     *         be loaded or the database cannot be opened (the cause is printed)
     */
    private static Connection createConnection() {
        try {
            Class.forName(DRIVER_CLASS);
            return DriverManager.getConnection(DB_URL);
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return null;
        }
    }
}
package com.xuqun.archive;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches a jsoup cookbook page and prints the links found in its
 * {@code div.breadcrumb} element.
 *
 * @author xuqun
 * Created 2018-03-27 09:09:42.
 */
public class TraverseHTML {

    /**
     * Downloads the page, prints the whole document and the breadcrumb
     * element, then prints the absolute URL and text of each breadcrumb link.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "https://jsoup.org/cookbook/input/parse-body-fragment";
        // Breadcrumb hrefs are concatenated onto this prefix to form the printed URL.
        String headUrl = "https://jsoup.org";
        try {
            Document doc = Jsoup.connect(url).get();
            System.out.println(doc);

            Element masthead = doc.select("div.breadcrumb").first();
            if (masthead == null) {
                // first() yields null when nothing matches; stop instead of
                // failing with a NullPointerException below.
                System.err.println("No div.breadcrumb element found at " + url);
                return;
            }
            System.out.println(masthead);

            Elements links = masthead.select("a");
            for (Element link : links) {
                String linkHref = link.attr("href");
                String linkText = link.text();
                System.out.println("------------------------------------------------------");
                System.out.println("linkHref :" + headUrl + linkHref);
                System.out.println("linkText : " + linkText);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}