jsoup爬虫

最新推荐文章于 2023-07-17 16:58:52 发布

Peter Chan

最新推荐文章于 2023-07-17 16:58:52 发布

阅读量143

点赞数

分类专栏： Java

本文链接：https://blog.csdn.net/qq_40663787/article/details/88999028

版权

Java 专栏收录该内容

42 篇文章 0 订阅

订阅专栏

爬取 https://www.gushiwen.org/

初学者，还望见谅

package com.peter.demon_02;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;

/**
 * Static helpers that scrape poem listings and poem detail pages from
 * gushiwen.org with jsoup and store the results in {@link Poem} objects.
 *
 * <p>Every method is best-effort: an {@link IOException} raised while fetching
 * a page is printed to standard error and the method returns whatever it had
 * collected up to that point.
 */
public class CrawlerlUtil {

	/** User-Agent header sent with every request. */
	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
			+ " (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36";

	/** Host that serves the search, detail and audio pages. */
	private static final String SEARCH_HOST = "https://so.gushiwen.org";

	/**
	 * Demo entry point: searches by title, then prints every hit before and
	 * after its detail page has been loaded.
	 */
	public static void main(String[] args) {
		List<Poem> list = search("title", "出师表");
		for (Poem poem : list) {
			System.out.println(poem);
			poems(poem);
			System.out.println(poem);
		}
	}

	/**
	 * Runs a site search and returns one {@link Poem} per result link.
	 *
	 * <p>Only the first result page is requested. The total page count that the
	 * site reports is deliberately not parsed: it was never used to drive the
	 * request loop, and parsing it threw {@link NumberFormatException} whenever
	 * the page carried no page counter.
	 *
	 * @param type     value of the {@code type} query parameter (the demo uses
	 *                 {@code "title"})
	 * @param keywords search terms; each one is URL-encoded and they are joined
	 *                 with {@code +}
	 * @return the poems found, each holding the link text and an absolute URL;
	 *         empty when the request fails
	 */
	public static List<Poem> search(String type, String... keywords) {
		List<Poem> lists = new ArrayList<Poem>();
		try {
			StringBuilder value = new StringBuilder();
			for (int i = 0; i < keywords.length; i++) {
				if (i > 0) {
					value.append('+');
				}
				// Encode each term so characters such as '&', '#' or a space
				// cannot break the query string.
				value.append(encode(keywords[i]));
			}
			String url = SEARCH_HOST + "/search.aspx?page=1&type=" + encode(type) + "&value=" + value;
			collectPoemLinks(fetch(url, 10000), SEARCH_HOST, lists);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return lists;
	}

	/**
	 * Loads the detail page at {@code poem.url} and fills in the poem's title,
	 * author, content and explanation.
	 *
	 * <p>A field is left untouched when the page has no matching element.
	 *
	 * @param poem the poem to complete; its {@code url} must be an absolute URL
	 */
	public static void poems(Poem poem) {
		try {
			Document document = fetch(poem.url, 10000);

			// Poem title.
			Element title = document.selectFirst(".cont h1");
			if (title != null) {
				poem.title = title.text();
			}

			// Author line.
			Element author = document.selectFirst(".cont .source");
			if (author != null) {
				poem.author = author.text();
			}

			// Poem body.
			Element body = document.selectFirst(".contson");
			if (body != null) {
				poem.content = extractContent(body);
			}

			// Translation / annotation section.
			Element notes = document.selectFirst(".contyishang");
			if (notes != null) {
				poem.explanation = extractExplanation(notes);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Collects the poem links of a listing page, for example one of the
	 * category pages returned by {@link #getType(String)}.
	 *
	 * <p>NOTE(review): unlike {@link #search}, the stored URL is the raw
	 * {@code href} with no host prefixed. Confirm whether the listing pages
	 * emit absolute links before passing the result to {@link #poems}.
	 *
	 * @param url absolute URL of the listing page
	 * @return the poems found; empty when the page cannot be fetched
	 */
	public static List<Poem> searchByType(String url) {
		List<Poem> lists = new ArrayList<Poem>();
		try {
			// The links are read directly on the calling thread; starting one
			// thread per link and immediately waiting for it added no
			// parallelism.
			collectPoemLinks(fetch(url, 10000), "", lists);
		} catch (IOException e) {
			// Return the empty list instead of dereferencing a null document.
			e.printStackTrace();
		}
		return lists;
	}

	/**
	 * Reads one group of category links from the site's poem index page.
	 *
	 * @param type group to read: {@code "类型"}, {@code "作者"}, {@code "朝代"}
	 *             or {@code "形式"}; any other value, including {@code null},
	 *             selects the same group as {@code "类型"}
	 * @return map from link text to absolute link URL; empty when the request
	 *         fails
	 */
	public static Map<String, String> getType(String type) {
		String select;
		// switch on a null String throws, so map null to the default branch.
		switch (type == null ? "" : type) {
		case "作者":
			select = ".titletype #type2 a";
			break;
		case "朝代":
			select = ".titletype #type3 a";
			break;
		case "形式":
			select = ".titletype > div:eq(4) a";
			break;
		case "类型":
		default:
			select = ".titletype #type1 a";
			break;
		}

		Map<String, String> map = new HashMap<String, String>();
		String url = "https://www.gushiwen.org";
		try {
			Document doc = fetch("https://www.gushiwen.org/shiwen/", 30000);
			for (Element ele : doc.select(select)) {
				map.put(ele.text(), url + ele.attr("href"));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return map;
	}

	/**
	 * Stores the audio URL of the translation section in
	 * {@code poem.explannationSound}.
	 *
	 * <p>Does nothing unless the first {@code .contyishang h2} heading reads
	 * {@code 译文及注释} and an image id containing {@code Fanyi} is present.
	 *
	 * @param poem     poem that receives the audio URL
	 * @param document already-loaded detail page of the poem
	 */
	public static void attrExplannationSound(Poem poem, Document document) {
		// Check for the heading before reading its text.
		Element heading = document.selectFirst(".contyishang h2");
		if (heading == null || !heading.text().equals("译文及注释")) {
			return;
		}
		String[] idParts = document.select(".contyishang").select("img").attr("id").split("Fanyi");
		if (idParts.length < 2) {
			return;
		}
		try {
			// NOTE(review): 1000 ms is much shorter than the other timeouts in
			// this class; kept as is, confirm it is intended.
			Document docs = fetch(SEARCH_HOST + "/fanyiplay.aspx?id=" + idParts[1], 1000);
			poem.explannationSound = docs.select("audio").attr("src");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Stores the audio URL of the poem recital in {@code poem.contentSound}.
	 *
	 * <p>Does nothing when the page has no {@code .tool} element or the image
	 * id does not have the {@code img<id>} form.
	 *
	 * @param poem     poem that receives the audio URL
	 * @param document already-loaded detail page of the poem
	 */
	public static void attrContentSound(Poem poem, Document document) {
		Element tool = document.selectFirst(".tool");
		if (tool == null) {
			return;
		}
		String[] idParts = tool.select(".toolpinglun:eq(2)").select("img").attr("id").split("img");
		if (idParts.length < 2) {
			return;
		}
		try {
			Document docs = fetch(SEARCH_HOST + "/viewplay.aspx?id=" + idParts[1], 1000);
			poem.contentSound = docs.select("audio").attr("src");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Fetches and parses {@code url} with the shared User-Agent. */
	private static Document fetch(String url, int timeoutMillis) throws IOException {
		return Jsoup.connect(url).userAgent(USER_AGENT).timeout(timeoutMillis).get();
	}

	/** URL-encodes a query component as UTF-8; {@code null} becomes {@code "null"}. */
	private static String encode(String raw) throws IOException {
		return URLEncoder.encode(String.valueOf(raw), "UTF-8");
	}

	/**
	 * Adds one poem to {@code out} for every element that directly precedes a
	 * {@code .source} element inside a {@code .cont} block of {@code page}.
	 */
	private static void collectPoemLinks(Document page, String urlPrefix, List<Poem> out) {
		for (Element cont : page.select(".cont")) {
			for (Element heading : cont.select(".source").prev()) {
				out.add(new Poem(heading.text(), urlPrefix + heading.select("a").attr("href")));
			}
		}
	}

	/**
	 * Returns the concatenated text of the {@code <p>} elements in {@code body},
	 * or the whole text of {@code body} when those paragraphs hold no text.
	 */
	private static String extractContent(Element body) {
		Elements paragraphs = body.select("p");
		if (paragraphs.text().equals("")) {
			return body.text();
		}
		StringBuilder text = new StringBuilder();
		for (Element paragraph : paragraphs) {
			text.append(paragraph.text());
		}
		return text.toString();
	}

	/**
	 * Returns the explanation text for a {@code .contyishang} section. When the
	 * section is collapsed behind a {@code 展开阅读全文} link, the full text is
	 * fetched from the site's ajax endpoint, one paragraph per line.
	 */
	private static String extractExplanation(Element notes) throws IOException {
		if (!notes.select("div").text().contains("展开阅读全文")) {
			return notes.text();
		}
		String spanId = notes.select("div:eq(0)").select("div:eq(1)").select("span").attr("id");
		String[] idParts = spanId.split("Play");
		if (idParts.length < 2) {
			// No usable id on the page: fall back to the collapsed text.
			return notes.text();
		}
		Document full = fetch(SEARCH_HOST + "/shiwen2017/ajaxfanyi.aspx?id=" + idParts[1], 10000);
		StringBuilder text = new StringBuilder();
		for (Element paragraph : full.select(".contyishang p")) {
			// Append every paragraph; plain assignment kept only the last one.
			text.append(paragraph.text()).append("\n");
		}
		return text.toString();
	}

}

package com.peter.demon_02;

/**
 * Plain mutable holder for the data scraped about one poem. All fields are
 * public and are {@code null} until assigned.
 */
public class Poem {
	/** Author text; initialised from the first constructor argument. */
	public String author;
	/** Address of the poem's page. */
	public String url;
	/** Poem title. */
	public String title;
	/** Poem body text. */
	public String content;
	/** Translation / annotation text. */
	public String explanation;
	/** Address of the audio for the explanation. */
	public String explannationSound;
	/** Address of the audio for the poem body. */
	public String contentSound;

	/**
	 * Creates a poem with only {@code author} and {@code url} set.
	 *
	 * @param author initial value of {@link #author}
	 * @param url    initial value of {@link #url}
	 */
	public Poem(String author, String url) {
		this.url = url;
		this.author = author;
	}

	/** Lists every field as {@code name=value} inside {@code Poem [...]}. */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Poem [");
		text.append("author=").append(author);
		text.append(", url=").append(url);
		text.append(", title=").append(title);
		text.append(", content=").append(content);
		text.append(", explanation=").append(explanation);
		text.append(", explannationSound=").append(explannationSound);
		text.append(", contentSound=").append(contentSound);
		return text.append("]").toString();
	}

}

Peter Chan

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
jsoup爬虫

package com.peter.demon_02;import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;im...
复制链接

扫一扫

专栏目录