Java jsoup 网络爬虫学习例子（四）：抓取网页链接并插入 MySQL 数据库
package com.iteye.injavawetrust.jsoup;
import java.io.IOException;
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches one web page with jsoup and stores every link-like reference found on it
 * (anchors, images, select options and {@code <link>} tags) as a row in the database.
 *
 * @author InJavaWeTrust
 */
public class GetLink {

    private final JsoupUtil ju = JsoupUtil.getInstance();
    private final DBUtil du = DBUtil.getInstance();

    /**
     * Downloads the page at {@code url} and inserts one row per matching element.
     *
     * <p>Four element groups are processed in order: {@code a[href]}, {@code img[src]},
     * {@code option[value]} and {@code link[href]}. An {@link IOException} raised while
     * fetching the page is reported on standard error and nothing is stored.
     *
     * @param url address of the page to crawl
     */
    public void getLink(String url) {
        try {
            // 5000 is the jsoup request timeout in milliseconds.
            Document document = Jsoup.connect(url).timeout(5000).get();
            saveAll(document.select("a[href]"), "href", null);
            // Images have no text content, so their alt text is used as the name.
            saveAll(document.select("img[src]"), "src", "alt");
            saveAll(document.select("option[value]"), "value", null);
            saveAll(document.select("link[href]"), "href", null);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one row for each element of {@code elements}.
     *
     * <p>A fresh {@link Link} is built for every element so that no state from a
     * previous row can leak into the next one.
     *
     * @param elements      elements selected from the page
     * @param urlAttribute  attribute whose value is stored as the link URL
     * @param nameAttribute attribute whose value is stored as the link name, or
     *                      {@code null} to use the element's text instead
     */
    private void saveAll(Elements elements, String urlAttribute, String nameAttribute) {
        for (Element element : elements) {
            Link link = new Link();
            link.setId(ju.getUUID());
            link.setUrlName(nameAttribute == null ? element.text() : element.attr(nameAttribute));
            link.setUrl(element.attr(urlAttribute));
            du.insert(ju.getInsertSql(link));
        }
    }

    /**
     * Runs the crawler once against {@code Constants.URL}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new GetLink().getLink(Constants.URL);
    }
}
package com.iteye.injavawetrust.jsoup;
import java.io.Serializable;
import java.util.Date;
/**
 * Plain data holder for one link row: an id, a display name, a URL and a date.
 *
 * @author InJavaWeTrust
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 1165098694307553167L;

    /** Row identifier. */
    private String id;

    /** Display name of the link. */
    private String urlName;

    /** Link target. */
    private String url;

    /** Date associated with the database insert. */
    private Date date;

    /** @return the row identifier, possibly {@code null} */
    public String getId() {
        return id;
    }

    /** @param id the row identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the display name, possibly {@code null} */
    public String getUrlName() {
        return urlName;
    }

    /** @param urlName the display name */
    public void setUrlName(String urlName) {
        this.urlName = urlName;
    }

    /** @return the link target, possibly {@code null} */
    public String getUrl() {
        return url;
    }

    /** @param url the link target */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Returns a copy of the stored date.
     *
     * <p>{@link Date} is mutable, so a copy is handed out to keep callers from
     * changing this object's state through the returned reference.
     *
     * @return a copy of the date, or {@code null} if none is set
     */
    public Date getDate() {
        return date == null ? null : new Date(date.getTime());
    }

    /**
     * Stores a copy of the given date, so later changes to the argument do not
     * affect this object.
     *
     * @param date the date to store, or {@code null} to clear it
     */
    public void setDate(Date date) {
        this.date = date == null ? null : new Date(date.getTime());
    }
}
package com.iteye.injavawetrust.jsoup;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Small JDBC helper singleton: opens connections using the settings in
 * {@code Constants}, closes JDBC resources, and executes single SQL statements.
 *
 * <p>The class holds no connection state; every {@link #insert(String)} call opens
 * its own connection and closes it again.
 *
 * @author InJavaWeTrust
 */
public class DBUtil {

    private static final DBUtil instance = new DBUtil();

    private DBUtil() {
    }

    /**
     * @return the shared {@code DBUtil} instance
     */
    public static DBUtil getInstance() {
        return instance;
    }

    /**
     * Opens a new database connection.
     *
     * <p>A failure to load the driver class is reported but is not fatal on its own:
     * {@link DriverManager} is still asked for a connection in case a suitable driver
     * is already registered.
     *
     * @return a new connection, or {@code null} if it could not be opened (the cause
     *         is printed to standard error)
     */
    public Connection connection() {
        try {
            Class.forName(Constants.DRIVER);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        try {
            return DriverManager.getConnection(Constants.DBURL, Constants.USER,
                    Constants.PASSWORD);
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Closes the given JDBC resources in result-set, statement, connection order.
     *
     * <p>Each argument may be {@code null}. A failure to close one resource is
     * reported and does not prevent the remaining ones from being closed.
     *
     * @param rs   result set to close, or {@code null}
     * @param st   statement to close, or {@code null}
     * @param conn connection to close, or {@code null}
     */
    public void release(ResultSet rs, Statement st, Connection conn) {
        closeQuietly(rs);
        closeQuietly(st);
        closeQuietly(conn);
    }

    /**
     * Closes one resource, reporting but not propagating any failure.
     */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes one SQL statement on a freshly opened connection.
     *
     * <p>Errors are reported on standard error and are not propagated. The statement
     * and the connection are always closed, including when execution fails.
     *
     * @param sql the SQL statement to execute
     */
    public void insert(String sql) {
        Connection connection = connection();
        if (connection == null) {
            // connection() has already printed the cause.
            System.err.println("DBUtil.insert: no database connection, statement skipped");
            return;
        }
        Statement statement = null;
        try {
            statement = connection.createStatement();
            statement.execute(sql);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            release(null, statement, connection);
        }
    }
}
package com.iteye.injavawetrust.jsoup;
/**
 * Configuration constants for the crawler example: JDBC settings and the page to crawl.
 *
 * <p>This class only holds constants and is not meant to be instantiated or extended.
 *
 * <p>NOTE(review): the database credentials are hard-coded here; for anything beyond a
 * local demo they should come from external configuration.
 *
 * @author InJavaWeTrust
 */
public final class Constants {

    /** MySQL JDBC driver class name. */
    public static final String DRIVER = "com.mysql.jdbc.Driver";

    /** JDBC connection URL. */
    public static final String DBURL = "jdbc:mysql://localhost:3306/jsoupdb?useUnicode=true&characterEncoding=utf-8";

    /** Database user name. */
    public static final String USER = "root";

    /** Database password. */
    public static final String PASSWORD = "root";

    /** An arbitrarily chosen URL used as the crawl target. */
    public static final String URL = "http://www.hrbhuade.net/html/main/index.htm";

    private Constants() {
        // constants holder, no instances
    }
}
package com.iteye.injavawetrust.jsoup;
import java.util.UUID;
/**
 * Helper singleton for the crawler example: generates row ids and builds the
 * {@code insert} statement for a {@link Link}.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    /**
     * @return the shared {@code JsoupUtil} instance
     */
    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Generates a random UUID with the dashes removed.
     *
     * @return a 32-character UUID string
     */
    public String getUUID() {
        return UUID.randomUUID().toString().replace("-", "");
    }

    /**
     * Builds the SQL statement that inserts the given link into the {@code link} table.
     *
     * <p>The id, name and URL are written as escaped string literals, because the name
     * and URL come from scraped page content and may contain quotes or backslashes.
     * A {@code null} field is written as SQL {@code NULL}. The {@code date} column is
     * always set to {@code NOW()}; the link's own date is not used.
     *
     * <p>NOTE(review): a {@code PreparedStatement} with bound parameters would be the
     * more robust fix, but this method's contract is to return a SQL string.
     *
     * @param link the link to insert
     * @return the complete {@code insert} statement
     */
    public String getInsertSql(Link link) {
        return "insert into link (id, urlname, url, date) values ("
                + quote(link.getId()) + ","
                + quote(link.getUrlName()) + ","
                + quote(link.getUrl()) + ",NOW())";
    }

    /**
     * Renders a value as a single-quoted SQL string literal, or {@code NULL}.
     *
     * <p>Backslashes are doubled first and single quotes are doubled afterwards, so
     * the value cannot terminate the literal early. This assumes the server treats
     * backslash as an escape character (MySQL's default) - TODO confirm the server
     * does not run with {@code NO_BACKSLASH_ESCAPES}.
     */
    private static String quote(String value) {
        if (value == null) {
            return "NULL";
        }
        return "'" + value.replace("\\", "\\\\").replace("'", "''") + "'";
    }
}
link 表
-- Table that receives one row per link found by the crawler.
-- WARNING: drops any existing `link` table, including its data.
DROP TABLE IF EXISTS `link`;
CREATE TABLE `link` (
  -- 32-character UUID without dashes
  `id` varchar(32) NOT NULL,
  -- link text or image alt text; values longer than 200 characters do not fit
  `urlname` varchar(200) DEFAULT NULL,
  -- widened from 200: real-world URLs are frequently longer than 200 characters
  `url` varchar(2048) DEFAULT NULL,
  -- insert timestamp
  `date` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
-- utf8mb4 instead of utf8: MySQL's `utf8` is the 3-byte utf8mb3 subset and
-- cannot store 4-byte characters such as emoji that may appear in link text.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;