jsoup爬虫

爬取 https://www.gushiwen.org/

初学者,还望见谅

package com.peter.demon_02;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;

/**
 * Static helpers that scrape poem listings and poem detail pages from
 * gushiwen.org with jsoup.
 *
 * <p>All methods are best-effort: an {@link IOException} raised while fetching a
 * page is printed to standard error and the method returns whatever it has
 * collected so far. Page elements that cannot be found are skipped instead of
 * raising {@link NullPointerException}.
 *
 * <p>The CSS selectors below reflect the page markup the original author
 * targeted; NOTE(review): verify them against the live site before relying on
 * the results.
 */
public class CrawlerlUtil {

	/** User-Agent header sent with every request. */
	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
			+ " (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36";

	/** Host that serves search results, poem pages and the ajax/audio endpoints. */
	private static final String SEARCH_HOST = "https://so.gushiwen.org";

	/** Number of search result pages fetched by {@link #search}. */
	private static final int SEARCH_PAGES = 1;

	/** Timeout for listing and poem pages, in milliseconds. */
	private static final int PAGE_TIMEOUT_MILLIS = 10000;

	/** Timeout for the audio player pages, in milliseconds. */
	private static final int AUDIO_TIMEOUT_MILLIS = 1000;

	/**
	 * Demo entry point: searches by title for "出师表", then prints every hit
	 * before and after its detail page has been loaded.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		for (Poem poem : search("title", "出师表")) {
			System.out.println(poem);
			poems(poem);
			System.out.println(poem);
		}
	}

	/**
	 * Runs a search and returns one {@link Poem} per result entry.
	 *
	 * <p>Each returned poem carries the entry's heading text (stored in
	 * {@code author} until {@link #poems} overwrites it) and the absolute URL of
	 * its detail page.
	 *
	 * @param type     value of the {@code type} query parameter, e.g. {@code "title"}
	 *                 or {@code "author"}
	 * @param keywords search terms; they are joined with {@code "+"} to form the
	 *                 {@code value} query parameter
	 * @return the poems found; empty when nothing matched, partial or empty when a
	 *         request failed
	 */
	public static List<Poem> search(String type, String... keywords) {
		List<Poem> results = new ArrayList<Poem>();
		String query = String.join("+", keywords);
		try {
			for (int page = 1; page <= SEARCH_PAGES; page++) {
				String url = SEARCH_HOST + "/search.aspx?page=" + page + "&type=" + type + "&value=" + query;
				Document document = fetch(url, PAGE_TIMEOUT_MILLIS);
				for (Element entry : document.select(".cont")) {
					// The element right before ".source" holds the link to the poem page.
					for (Element heading : entry.select(".source").prev()) {
						results.add(new Poem(heading.text(), SEARCH_HOST + heading.select("a").attr("href")));
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return results;
	}

	/**
	 * Loads {@code poem.url} and fills in the poem's title, author, content and
	 * explanation. Fields whose page element is missing are left unchanged.
	 *
	 * @param poem the poem to complete; its {@code url} must point at a detail page
	 */
	public static void poems(Poem poem) {
		try {
			Document document = fetch(poem.url, PAGE_TIMEOUT_MILLIS);

			Element title = document.selectFirst(".cont h1");
			if (title != null) {
				poem.title = title.text();
			}

			Element author = document.selectFirst(".cont .source");
			if (author != null) {
				poem.author = author.text();
			}

			Element body = document.selectFirst(".contson");
			if (body != null) {
				poem.content = extractContent(body);
			}

			Element notes = document.selectFirst(".contyishang");
			if (notes != null) {
				poem.explanation = extractExplanation(notes);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the poem text held by the given ".contson" element: the texts of its
	 * {@code <p>} children concatenated without a separator, or the element's whole
	 * text when it has no non-empty paragraphs.
	 */
	private static String extractContent(Element body) {
		Elements paragraphs = body.select("p");
		if (paragraphs.text().isEmpty()) {
			return body.text();
		}
		StringBuilder content = new StringBuilder();
		for (Element paragraph : paragraphs) {
			content.append(paragraph.text());
		}
		return content.toString();
	}

	/**
	 * Returns the explanation text for the given ".contyishang" element. When the
	 * block contains the "展开阅读全文" marker, the full text is requested from the
	 * ajax endpoint and every paragraph is kept, one per line.
	 *
	 * @throws IOException if the ajax request fails
	 */
	private static String extractExplanation(Element notes) throws IOException {
		if (!notes.select("div").text().contains("展开阅读全文")) {
			return notes.text();
		}
		// The id of the full text is the part after "Play" in the span's id attribute.
		String[] idParts = notes.select("div:eq(0)").select("div:eq(1)").select("span").attr("id").split("Play");
		if (idParts.length < 2) {
			// No usable id on this page: keep the text shown on the page itself.
			return notes.text();
		}
		Document full = fetch(SEARCH_HOST + "/shiwen2017/ajaxfanyi.aspx?id=" + idParts[1], PAGE_TIMEOUT_MILLIS);
		StringBuilder explanation = new StringBuilder();
		for (Element paragraph : full.select(".contyishang p")) {
			explanation.append(paragraph.text()).append("\n");
		}
		return explanation.toString();
	}

	/**
	 * Reads a category listing page (see {@link #getType}) and returns one
	 * {@link Poem} per entry.
	 *
	 * <p>The poem URL is the link's {@code href} exactly as it appears in the page;
	 * no host is prepended.
	 *
	 * @param url address of the listing page
	 * @return the poems found; empty when the page could not be fetched
	 */
	public static List<Poem> searchByType(String url) {
		List<Poem> results = new ArrayList<Poem>();
		Document document;
		try {
			document = fetch(url, PAGE_TIMEOUT_MILLIS);
		} catch (IOException e) {
			e.printStackTrace();
			return results;
		}
		for (Element entry : document.select(".cont")) {
			for (Element heading : entry.select(".source").prev()) {
				results.add(new Poem(heading.text(), heading.select("a").attr("href")));
			}
		}
		return results;
	}

	/**
	 * Collects the category links of one filter group from the poem index page.
	 *
	 * @param type filter group: "类型", "作者", "朝代" or "形式"; any other value is
	 *             treated as "类型"
	 * @return map from link text to absolute listing URL; empty when the page could
	 *         not be fetched
	 */
	public static Map<String, String> getType(String type) {
		String selector;
		switch (type) {
		case "作者":
			selector = ".titletype #type2 a";
			break;
		case "朝代":
			selector = ".titletype #type3 a";
			break;
		case "形式":
			selector = ".titletype > div:eq(4) a";
			break;
		case "类型":
		default:
			selector = ".titletype #type1 a";
			break;
		}

		Map<String, String> links = new HashMap<String, String>();
		String host = "https://www.gushiwen.org";
		try {
			Document document = fetch("https://www.gushiwen.org/shiwen/", 30000);
			for (Element link : document.select(selector)) {
				links.put(link.text(), host + link.attr("href"));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return links;
	}

	/**
	 * Stores the audio URL of the explanation in {@code poem.explannationSound}.
	 * Does nothing unless the page's first ".contyishang h2" heading reads
	 * "译文及注释" and an audio id can be extracted.
	 *
	 * @param poem     the poem to update
	 * @param document the already-loaded detail page of the poem
	 */
	public static void attrExplannationSound(Poem poem, Document document) {
		Element heading = document.selectFirst(".contyishang h2");
		if (heading == null || !heading.text().equals("译文及注释")) {
			return;
		}
		// The audio id is the part after "Fanyi" in the image's id attribute.
		String[] idParts = document.select(".contyishang").select("img").attr("id").split("Fanyi");
		if (idParts.length < 2) {
			return;
		}
		try {
			Document player = fetch(SEARCH_HOST + "/fanyiplay.aspx?id=" + idParts[1], AUDIO_TIMEOUT_MILLIS);
			poem.explannationSound = player.select("audio").attr("src");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Stores the audio URL of the poem text in {@code poem.contentSound}. Does
	 * nothing when the page has no ".tool" element or no audio id can be extracted.
	 *
	 * @param poem     the poem to update
	 * @param document the already-loaded detail page of the poem
	 */
	public static void attrContentSound(Poem poem, Document document) {
		Element toolbar = document.selectFirst(".tool");
		if (toolbar == null) {
			return;
		}
		// The audio id is the part after "img" in the image's id attribute.
		String[] idParts = toolbar.select(".toolpinglun:eq(2)").select("img").attr("id").split("img");
		if (idParts.length < 2) {
			return;
		}
		try {
			Document player = fetch(SEARCH_HOST + "/viewplay.aspx?id=" + idParts[1], AUDIO_TIMEOUT_MILLIS);
			poem.contentSound = player.select("audio").attr("src");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Downloads and parses {@code url} using the shared User-Agent and the given timeout. */
	private static Document fetch(String url, int timeoutMillis) throws IOException {
		return Jsoup.connect(url).userAgent(USER_AGENT).timeout(timeoutMillis).get();
	}

}
package com.peter.demon_02;

/**
 * Mutable record of one poem. Only {@code author} and {@code url} are set by the
 * constructor; every other field starts out {@code null} and is assigned
 * directly by whoever fills the poem in.
 */
public class Poem {
	/** Author text (initially whatever was passed as the first constructor argument). */
	public String author;
	/** Address of the poem's page. */
	public String url;
	/** Title of the poem. */
	public String title;
	/** Text of the poem. */
	public String content;
	/** Explanation / notes for the poem. */
	public String explanation;
	/** Audio source URL for the explanation. */
	public String explannationSound;
	/** Audio source URL for the poem text. */
	public String contentSound;

	/**
	 * Creates a poem with the two values known up front.
	 *
	 * @param author value stored in {@link #author}
	 * @param url    value stored in {@link #url}
	 */
	public Poem(String author, String url) {
		this.author = author;
		this.url = url;
	}

	/**
	 * Renders all fields as {@code Poem [name=value, ...]}; unset fields print as
	 * {@code null}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Poem [");
		text.append("author=").append(author);
		text.append(", url=").append(url);
		text.append(", title=").append(title);
		text.append(", content=").append(content);
		text.append(", explanation=").append(explanation);
		text.append(", explannationSound=").append(explannationSound);
		text.append(", contentSound=").append(contentSound);
		return text.append("]").toString();
	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Java Jsoup 是一个开源的 HTML 解析器库,可以用来实现简单的 Web 页面爬取。以下是一个简单的 Java Jsoup 爬虫实现:

1. 导入 Jsoup 库

首先需要在项目中导入 Jsoup 库,可以在 Maven 中添加以下依赖:

```
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
```

2. 获取页面内容

通过 Jsoup 的 connect() 方法连接指定的 URL,并使用 get() 方法获取页面内容,如下所示:

```
String url = "https://www.example.com";
Document doc = Jsoup.connect(url).get();
```

3. 解析页面内容

使用 Jsoup 的 select() 方法选择页面中需要抓取的元素,并使用 text() 或者 attr() 方法获取其文本内容或属性值,如下所示:

```
Elements links = doc.select("a[href]");
for (Element link : links) {
    String href = link.attr("href");
    String text = link.text();
    System.out.println(text + " -> " + href);
}
```

以上代码会抓取页面中所有的链接,然后输出链接的文本和地址。

4. 完整代码

```
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyCrawler {
    public static void main(String[] args) {
        String url = "https://www.example.com";
        try {
            Document doc = Jsoup.connect(url).get();
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String href = link.attr("href");
                String text = link.text();
                System.out.println(text + " -> " + href);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
```

以上代码可以抓取指定页面中的所有链接,并输出链接的文本和地址。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值