// 初学者,还望见谅 (Beginner's code; please bear with any mistakes.)
package com.peter.demon_02;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
/**
 * Static helpers that scrape poem data from gushiwen.org pages with jsoup and
 * store the results in {@link Poem} objects.
 *
 * <p>All network failures ({@link IOException}) are handled on a best-effort
 * basis: the stack trace is printed and the method returns whatever it has
 * collected so far. Markup that does not match the expected selectors is
 * skipped instead of raising an exception.
 */
public class CrawlerlUtil {
    /** Browser-like User-Agent header sent with every request made by this class. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            + " (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36";

    /** Host that serves the search, poem-detail and audio-player pages. */
    private static final String SEARCH_HOST = "https://so.gushiwen.org";

    /** Timeout, in milliseconds, for search and poem-detail requests. */
    private static final int DEFAULT_TIMEOUT_MS = 10000;

    /** Timeout, in milliseconds, for the category index page. */
    private static final int TYPE_INDEX_TIMEOUT_MS = 30000;

    /** Timeout, in milliseconds, for the audio-player pages. */
    private static final int AUDIO_TIMEOUT_MS = 1000;

    /**
     * Upper bound on the number of result pages {@link #search} downloads.
     * Raising it makes {@code search} follow the pager up to that many pages.
     */
    private static final int MAX_SEARCH_PAGES = 1;

    /**
     * Demo entry point: searches by title, then prints every hit before and
     * after its detail page has been loaded into it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Poem> results = search("title", "出师表");
        for (Poem poem : results) {
            System.out.println(poem);
            poems(poem);
            System.out.println(poem);
        }
    }

    /**
     * Runs a site search and returns one {@link Poem} per hit.
     *
     * <p>The keywords are joined with {@code '+'} and placed in the query
     * string as given (no extra escaping is applied here). Each returned poem
     * carries the text of the element preceding {@code .source} and the
     * absolute URL of its detail page.
     *
     * @param type     value of the {@code type} query parameter (for example {@code "title"} or {@code "author"})
     * @param keywords search terms
     * @return the hits found; empty when nothing matched or the request failed
     */
    public static List<Poem> search(String type, String... keywords) {
        List<Poem> lists = new ArrayList<Poem>();
        StringBuilder query = new StringBuilder();
        for (int i = 0; i < keywords.length; i++) {
            if (i > 0) {
                query.append('+');
            }
            query.append(keywords[i]);
        }
        try {
            int lastPage = MAX_SEARCH_PAGES;
            for (int page = 1; page <= lastPage; page++) {
                String url = SEARCH_HOST + "/search.aspx?page=" + page + "&type=" + type + "&value=" + query;
                Document document = fetch(url, DEFAULT_TIMEOUT_MS);
                // Never request more pages than the site says exist.
                lastPage = Math.min(lastPage, readPageCount(document));
                for (Element cont : document.select(".cont")) {
                    for (Element heading : cont.select(".source").prev()) {
                        lists.add(new Poem(heading.text(), SEARCH_HOST + heading.select("a").attr("href")));
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was collected.
            e.printStackTrace();
        }
        return lists;
    }

    /**
     * Loads the detail page at {@code poem.url} and fills in the poem's title,
     * author, content and explanation. Fields whose elements are missing from
     * the page are left unchanged.
     *
     * @param poem poem whose {@code url} points at a detail page; updated in place
     */
    public static void poems(Poem poem) {
        try {
            Document document = fetch(poem.url, DEFAULT_TIMEOUT_MS);

            // Poem title
            Element title = document.selectFirst(".cont h1");
            if (title != null) {
                poem.title = title.text();
            }

            // Author
            Element author = document.selectFirst(".cont .source");
            if (author != null) {
                poem.author = author.text();
            }

            // Content: prefer the <p> children when they carry text, otherwise
            // fall back to the whole container's text.
            Element body = document.selectFirst(".contson");
            if (body != null) {
                Elements paragraphs = body.select("p");
                if (!paragraphs.text().equals("")) {
                    StringBuilder content = new StringBuilder();
                    for (Element paragraph : paragraphs) {
                        content.append(paragraph.text());
                    }
                    poem.content = content.toString();
                } else {
                    poem.content = body.text();
                }
            }

            // Explanation / annotations
            Element notes = document.selectFirst(".contyishang");
            if (notes != null) {
                poem.explanation = readExplanation(notes);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Lists the poems linked from a category page.
     *
     * @param url absolute URL of the category page (for example a value from {@link #getType})
     * @return one poem per entry, with link targets resolved to absolute URLs;
     *         empty when the page could not be fetched
     */
    public static List<Poem> searchByType(String url) {
        List<Poem> lists = new ArrayList<Poem>();
        Document document;
        try {
            document = fetch(url, DEFAULT_TIMEOUT_MS);
        } catch (IOException e) {
            e.printStackTrace();
            // Without a document there is nothing to parse.
            return lists;
        }
        for (Element cont : document.select(".cont")) {
            for (Element heading : cont.select(".source").prev()) {
                // "abs:href" resolves relative links against the page URL so the
                // result can be handed straight to poems(Poem).
                lists.add(new Poem(heading.text(), heading.select("a").attr("abs:href")));
            }
        }
        return lists;
    }

    /**
     * Reads one group of category links from the poem index page.
     *
     * @param type group to read: {@code "类型"}, {@code "作者"}, {@code "朝代"} or
     *             {@code "形式"}; any other value is treated like {@code "类型"}
     * @return map from link text to absolute URL; empty when the request failed
     */
    public static Map<String, String> getType(String type) {
        String selector;
        switch (type) {
        case "作者":
            selector = ".titletype #type2 a";
            break;
        case "朝代":
            selector = ".titletype #type3 a";
            break;
        case "形式":
            selector = ".titletype > div:eq(4) a";
            break;
        case "类型":
        default:
            selector = ".titletype #type1 a";
            break;
        }
        Map<String, String> map = new HashMap<String, String>();
        String host = "https://www.gushiwen.org";
        try {
            Document doc = fetch("https://www.gushiwen.org/shiwen/", TYPE_INDEX_TIMEOUT_MS);
            for (Element link : doc.select(selector)) {
                map.put(link.text(), host + link.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Looks up the audio URL for the "译文及注释" section of a poem page and
     * stores it in {@code poem.explannationSound}. Does nothing when the page
     * has no such section or the player id cannot be derived.
     *
     * @param poem     poem to update
     * @param document already-loaded poem detail page
     */
    public static void attrExplannationSound(Poem poem, Document document) {
        Element heading = document.selectFirst(".contyishang h2");
        if (heading == null || !heading.text().equals("译文及注释")) {
            return;
        }
        // The player id is the part of the <img> id that follows "Fanyi".
        String[] parts = document.select(".contyishang").select("img").attr("id").split("Fanyi");
        if (parts.length < 2) {
            return;
        }
        try {
            Document player = fetch(SEARCH_HOST + "/fanyiplay.aspx?id=" + parts[1], AUDIO_TIMEOUT_MS);
            poem.explannationSound = player.select("audio").attr("src");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Looks up the audio URL for the poem text and stores it in
     * {@code poem.contentSound}. Does nothing when the toolbar is missing or
     * the player id cannot be derived.
     *
     * @param poem     poem to update
     * @param document already-loaded poem detail page
     */
    public static void attrContentSound(Poem poem, Document document) {
        Element toolbar = document.selectFirst(".tool");
        if (toolbar == null) {
            return;
        }
        // The player id is the part of the <img> id that follows "img".
        String[] parts = toolbar.select(".toolpinglun:eq(2)").select("img").attr("id").split("img");
        if (parts.length < 2) {
            return;
        }
        try {
            Document player = fetch(SEARCH_HOST + "/viewplay.aspx?id=" + parts[1], AUDIO_TIMEOUT_MS);
            poem.contentSound = player.select("audio").attr("src");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Downloads and parses {@code url} with the shared User-Agent and the given timeout. */
    private static Document fetch(String url, int timeoutMs) throws IOException {
        return Jsoup.connect(url).userAgent(USER_AGENT).timeout(timeoutMs).get();
    }

    /** Returns the page count shown in the search pager, or 1 when it is absent or not a number. */
    private static int readPageCount(Document document) {
        String raw = document.select(".pagesright #sumPage").text().trim();
        try {
            return Math.max(1, Integer.parseInt(raw));
        } catch (NumberFormatException e) {
            return 1;
        }
    }

    /**
     * Returns the explanation text for a poem page: the on-page text when it is
     * complete, otherwise the full text fetched from the expansion endpoint.
     */
    private static String readExplanation(Element notes) throws IOException {
        // "展开阅读全文" is the page's "expand to read all" marker: the on-page text is truncated.
        if (!notes.select("div").text().contains("展开阅读全文")) {
            return notes.text();
        }
        String spanId = notes.select("div:eq(0)").select("div:eq(1)").select("span").attr("id");
        // The id needed by the endpoint is the part of the span id that follows "Play".
        String[] parts = spanId.split("Play");
        if (parts.length < 2) {
            // Id not found: keep the truncated on-page text rather than failing.
            return notes.text();
        }
        Document full = fetch(SEARCH_HOST + "/shiwen2017/ajaxfanyi.aspx?id=" + parts[1], DEFAULT_TIMEOUT_MS);
        StringBuilder text = new StringBuilder();
        for (Element paragraph : full.select(".contyishang p")) {
            // Append every paragraph, one per line.
            text.append(paragraph.text()).append("\n");
        }
        return text.toString();
    }
}
package com.peter.demon_02;
/**
 * Plain mutable record of one poem. Only {@code author} and {@code url} are
 * set by the constructor; every other field starts as {@code null} and is
 * assigned directly by whoever fills the poem in.
 */
public class Poem {
    /** Author text; initialised from the first constructor argument. */
    public String author;
    /** Address of the poem's page; initialised from the second constructor argument. */
    public String url;
    /** Poem title. */
    public String title;
    /** Poem body text. */
    public String content;
    /** Explanation / annotation text. */
    public String explanation;
    /** Audio source for the explanation. */
    public String explannationSound;
    /** Audio source for the poem body. */
    public String contentSound;

    /**
     * Creates a poem with only its author and URL known.
     *
     * @param author value stored in {@link #author}
     * @param url    value stored in {@link #url}
     */
    public Poem(String author, String url) {
        this.url = url;
        this.author = author;
    }

    /**
     * Renders all seven fields as {@code Poem [name=value, ...]}; unset fields
     * appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Poem [");
        text.append("author=").append(author);
        text.append(", url=").append(url);
        text.append(", title=").append(title);
        text.append(", content=").append(content);
        text.append(", explanation=").append(explanation);
        text.append(", explannationSound=").append(explannationSound);
        text.append(", contentSound=").append(contentSound);
        text.append("]");
        return text.toString();
    }
}