Main
package com.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Command-line entry point of the crawler.
 *
 * <p>Runs a site search for each entry of a fixed list of novels and hands the search URL,
 * together with the expected title, to {@link StoreMysql#operateMysql(String, String)}.
 */
public class Main {
    /** Search endpoint of the site; the book title is appended as the query value. */
    private static final String SEARCH_URL_PREFIX = "https://www.ddbiquge.com/s.php?q=";

    /**
     * Processes every configured novel in list order. The first {@link IOException} ends the
     * run: its stack trace is printed and the remaining entries are not processed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            for (String entry : searchEntries()) {
                String title = titleOf(entry);
                StoreMysql.operateMysql(SEARCH_URL_PREFIX + title, title);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns the novels to crawl, each written as {@code "title,author"}. */
    private static List<String> searchEntries() {
        List<String> entries = new ArrayList<String>();
        entries.add("元尊,天蚕土豆");
        entries.add("大主宰,天蚕土豆");
        entries.add("圣墟,辰东");
        entries.add("我是至尊,风凌天下");
        return entries;
    }

    /** Returns the part of {@code entry} before the first comma; the author part is not used. */
    private static String titleOf(String entry) {
        return entry.split(",")[0];
    }
}
StoreMysql
package com.crawl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawl.dao.HttpClientUtil;
import static com.crawl.dao.dao.*;
/**
 * Crawls one search-result page of the novel site and stores every result whose title equals
 * the requested book name, together with all of its chapters, through the static helpers
 * imported from {@code com.crawl.dao.dao}.
 */
public class StoreMysql {
    /** Scheme and host prepended to the site-relative links found in the pages. */
    private static final String SITE_ROOT = "https://www.ddbiquge.com";

    /** Captures the numeric book id from a book index URL. Compiled once instead of per book. */
    private static final Pattern BOOK_ID_PATTERN = Pattern.compile("/book/([\\d]+).html");

    /** Captures the number after the underscore in a chapter URL; it is used as the chapter id. */
    private static final Pattern CHAPTER_ID_PATTERN = Pattern.compile("/chapter/[\\d]+_([\\d]+).html");

    /** Site notice that is removed from every chapter body before it is stored. */
    private static final String SITE_NOTICE =
            "天才一秒记住本站地址:www.ddbiquge.com。顶点笔趣阁手机版阅读网址:m.ddbiquge.com";

    /**
     * Downloads the search page at {@code url}, makes sure the tables exist, and stores every
     * listed book whose title equals {@code bookname}.
     *
     * @param url      search-result page to download
     * @param bookname exact title a result must have to be stored
     * @throws IOException declared for callers; propagated from the page downloads if they raise it
     */
    public static void operateMysql(String url, String bookname) throws IOException {
        String content = HttpClientUtil.getContent(url);
        Document doc = Jsoup.parse(content);
        dbTablesInit();
        Elements hrefElements = doc.getElementsByClass("bookname");
        for (Element e : hrefElements) {
            String urlIndex = SITE_ROOT + e.getElementsByTag("a").attr("href");
            System.out.println(urlIndex);
            storeBookIfMatching(urlIndex, bookname);
        }
    }

    /**
     * Downloads one book index page and, when its title equals {@code bookname}, inserts the
     * book row followed by all of its chapters. Pages that lack the expected elements are
     * reported on stdout and skipped instead of aborting the whole crawl with an unchecked
     * exception.
     */
    private static void storeBookIfMatching(String urlIndex, String bookname) throws IOException {
        Document docIndex = Jsoup.parse(HttpClientUtil.getContent(urlIndex));

        Elements h2Elements = docIndex.getElementsByTag("h2");
        if (h2Elements.isEmpty()) {
            System.out.println("notitle");
            return;
        }
        String h2 = h2Elements.get(0).text();
        System.out.println("题目:" + h2);
        if (!h2.equals(bookname)) {
            return;
        }

        // Without an id the book row (book_id is NOT NULL) and every chapter row would be
        // rejected, so stop before downloading anything else.
        Matcher m = BOOK_ID_PATTERN.matcher(urlIndex);
        if (!m.find()) {
            System.out.println("nobookid");
            return;
        }
        String bookId = m.group(1);

        // The first four spans are read as author, type, serial status and word count.
        Elements summarizeElements = docIndex.select(".small span");
        if (summarizeElements.size() < 4) {
            System.out.println("nosummary");
            return;
        }
        // Each value drops its first three characters - presumably a label such as
        // "作者:"; verify against the live page.
        String author = dropPrefix(summarizeElements.get(0).text(), 3);
        System.out.println(author);
        String type = dropPrefix(summarizeElements.get(1).text(), 3);
        System.out.println(type);
        String serialstatus = dropPrefix(summarizeElements.get(2).text(), 3);
        System.out.println(serialstatus);
        String wordNumber = dropPrefix(summarizeElements.get(3).text(), 3);
        System.out.println(wordNumber);

        String intro = dropPrefix(docIndex.getElementsByClass("intro").text(), 3);
        System.out.println(intro);

        // A missing "last" element is stored as an empty update time rather than failing.
        Elements lastElements = docIndex.getElementsByClass("last");
        String updateTime = lastElements.isEmpty() ? "" : dropPrefix(lastElements.get(0).text(), 5);
        System.out.println(updateTime);

        DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
        String book_createdate = df.format(new Date());
        insertBook(bookId, h2, type, serialstatus, author, wordNumber, intro, urlIndex, updateTime, book_createdate);

        // The TreeSet removes duplicate links and fixes the visiting order (string order of the URL).
        Set<String> chapter_urls = new TreeSet<>();
        for (Element el : docIndex.select(".listmain dl dd a")) {
            chapter_urls.add(SITE_ROOT + el.attr("href"));
        }
        for (String chapter_url : chapter_urls) {
            storeChapter(chapter_url, bookId);
        }
    }

    /**
     * Downloads one chapter page and inserts its title and cleaned text. Chapters whose URL
     * yields no id, or whose page has no {@code h1}, are reported on stdout and skipped.
     */
    private static void storeChapter(String chapter_url, String bookId) throws IOException {
        // Resolve the id first: chapter_id is NOT NULL, so a page without one is not worth fetching.
        Matcher ma = CHAPTER_ID_PATTERN.matcher(chapter_url);
        if (!ma.find()) {
            System.out.println("nochapterid");
            return;
        }
        String chapter_id = ma.group(1);

        Document docIndex1 = Jsoup.parse(HttpClientUtil.getContent(chapter_url));
        Elements chapterElements = docIndex1.getElementsByTag("h1");
        if (chapterElements.isEmpty()) {
            System.out.println("nochaptertitle");
            return;
        }
        String chapter = chapterElements.get(0).text();
        System.out.println(chapter);

        String ct = docIndex1.select("#content").text();
        ct = ct.replace(chapter_url, "");
        ct = ct.replace(SITE_NOTICE, "");
        // Every run of whitespace becomes one CRLF line break.
        ct = ct.replaceAll("\\s+", "\r\n");

        // "HH" is the 24-hour field; "hh" would write 12-hour values with no AM/PM marker.
        DateFormat dfc = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String chapter_createdate = dfc.format(new Date());
        insertChapter(chapter_id, bookId, chapter, ct, chapter_url, chapter_createdate);
    }

    /**
     * Returns {@code text} without its first {@code prefixLength} characters, or an empty
     * string when the text is not longer than that.
     */
    private static String dropPrefix(String text, int prefixLength) {
        return text.length() > prefixLength ? text.substring(prefixLength) : "";
    }
}
dao
package com.crawl.dao;
import java.sql.*;
import static com.crawl.dao.DruidJdbc.getConnection;
//import static com.crawl.dbcp.DBCPTest.getConnection;
public class dao {
public static void dbTablesInit() {
ResultSet rs = null;
// Properties p = new Properties();
Connection cn = getConnection();
try {
rs = cn.getMetaData().getTables(null, null, "book", null);
// p.load(dao.class.getResourceAsStream("/dbconfig.properties"));
Statement st = cn.createStatement();
//不存在url表
if (!rs.next()) {
//创建book表
// st.execute(p.getProperty("createZhouLinTable"));
// book bookid bookname type serialstatus author desc url createdate(yyyy-MM-dd)
String sql = "CREATE TABLE `book` (" +
"`book_id` int NOT NULL ," +
"`book_name` varchar(255) NOT NULL ," +
"`type` varchar(255) NOT NULL ," +
"`serialstatus` varchar(255) NOT NULL ," +
"`author` varchar(255) NOT NULL," +
"`word_number` int NOT NULL," +
"`desc` text," +
"`book_url`varchar(255) NOT NULL," +
"`updateTime` varchar(255) NOT NULL," +
"`book_createdate` varchar(255) NOT NULL," +
"KEY `index1_book_id` (`book_id`)," +
"PRIMARY KEY (book_id,book_name,author))ENGINE=InnoDB DEFAULT CHARSET=utf8";
st.executeUpdate(sql);
System.out.println("book表创建成功");
} else {
System.out.println("book表已存在");
}
rs = cn.getMetaData().getTables(null, null, "chapter", null);
if (!rs.next()) {
//创建chapter表
// chapter id chapterid name content url createdate(yyyy-mm-dd hh:mm:ss )
String sql = "CREATE TABLE `chapter` (" +
"`chapter_id` int NOT NULL," +
"`book_id` int NOT NULL," +
"`name` varchar(255) NOT NULL," +
"`content` text," +
"`chapter_url`varchar(255) NOT NULL," +
"`chapter_createdate` varchar(255) NOT NULL," +
"KEY `index1_chapter_id` (`chapter_id`)," +
"PRIMARY KEY (chapter_id))ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8";
st.executeUpdate(sql);
System.out.println("chapter表创建成功");
// st.executeUpdate("CREATE INDEX index_book_id ON chapter (book_id)");
// System.out.println("chapter表索引创建成功");
} else {
System.out.println("chapter表已存在");
}
rs.close();
st.close();
cn.close();
} catch (SQLException e) {
e.printStackTrace();
}
// catch (IOException e) {
// e.printStackTrace();
// }
}
// String sql = "CREATE TABLE `book` (" +
// "`book_id` int NOT NULL ," +
// "`book_name` varchar(255) NOT NULL ," +
// "`type` varchar(255) NOT NULL ," +
// "`serialstatus` varchar(255) NOT NULL ," +
// "`author` varchar(255) NOT NULL," +
// "`word_number` int NOT NULL," +
// "`desc` text," +
// "`book_url`varchar(255) NOT NULL," +
// "`updateTime` varchar(255) NOT NULL," +
// "`book_createdate` varchar(255) NOT NULL," +
"`INDEX`[indexName] (book_id(length)) " +
// "PRIMARY KEY (book_id,book_name,author))ENGINE=InnoDB DEFAULT CHARSET=utf8";
public static void insertBook(String id, String book_name, String type, String serialstatus, String author,String wordnumber, String desc, String book_url, String updateTime,String book_createdate) {
Connection cn = getConnection();
String sql = " INSERT INTO `book` (`book_id`,`book_name`, `type`,`serialstatus`,`author`,`word_number`,`desc`,`book_url`,`updateTime`,`book_createdate`) VALUES (?,?,?,?,?,?,?,?,?,?)";
try {
//表示预编译的sql对象
PreparedStatement preparedStatement = cn.prepareStatement(sql);
preparedStatement.setString(1, id);
preparedStatement.setString(2, book_name);
preparedStatement.setString(3, type);
preparedStatement.setString(4, serialstatus);
preparedStatement.setString(5, author);
preparedStatement.setString(6, wordnumber);
preparedStatement.setString(7, desc);
preparedStatement.setString(8, book_url);
preparedStatement.setString(9, updateTime);
preparedStatement.setString(10, book_createdate);
preparedStatement.executeUpdate();
preparedStatement.close();
cn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
// String sql = "CREATE TABLE `chapter` (" +
// "`chapter_id` int NOT NULL," +
// "`book_id` int NOT NULL," +
// "`name` varchar(255) NOT NULL," +
// "`content` text," +
// "`chapter_url`varchar(255) NOT NULL," +
// "`chapter_createdate` varchar(255) NOT NULL," +
public static void insertChapter(String chapter_id,String bookId, String name, String content,String chapter_url,String chapter_createdate) {
Connection cn = getConnection();
String sql = " INSERT INTO `chapter` (`chapter_id`,`book_id`,`name`,`content`,`chapter_url`,`chapter_createdate`) VALUES (?,?,?,?,?,?)";
try {
//表示预编译的sql对象
PreparedStatement preparedStatement = cn.prepareStatement(sql);
preparedStatement.setString(1, chapter_id);
preparedStatement.setString(2, bookId);
preparedStatement.setString(3, name);
preparedStatement.setString(4, content);
preparedStatement.setString(5, chapter_url);
preparedStatement.setString(6, chapter_createdate);
preparedStatement.executeUpdate();
preparedStatement.close();
cn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
问题:一开始找不到合适的网站来爬取。
解决:多试几个网站,总能找到合适的。
最大的问题还是代码的层次结构。