jsoup解析HTML，爬取小说实例

最新推荐文章于 2022-04-11 10:06:10 发布

三事足矣

最新推荐文章于 2022-04-11 10:06:10 发布

阅读量578

点赞数

分类专栏： java 文章标签： java 爬虫 html jsoup

本文链接：https://blog.csdn.net/qq_34199125/article/details/54837260

版权

java 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

1. Java 的 File.separator：与平台相关的路径分隔符（Windows 下为反斜杠 \，Linux/macOS 下为斜杠 /），拼接路径时应使用它而不是写死分隔符

2. jsoup 解析标签：Element 的 text() 方法可以直接取出该元素起始标签与结束标签之间的文本内容

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small jsoup demo: downloads one chapter page of a novel, prints the
 * "previous chapter" / "next chapter" links found on it, and appends the
 * chapter text to a local text file, one space-separated fragment per line.
 */
public class Test {

	/** Chapter page that is downloaded and parsed. */
	private static final String CHAPTER_URL = "http://www.biquge5.com/2_2975/1388243.html";

	/** Directory the chapter text is saved into. */
	private static final String BOOK_DIR = "F:" + File.separator + "book";

	/** Name of the file (inside {@link #BOOK_DIR}) the text is appended to. */
	private static final String BOOK_FILE_NAME = "text.txt";

	/**
	 * Fetches the chapter page, prints its navigation links and saves its text.
	 *
	 * @param args ignored
	 * @throws Exception if the page cannot be fetched, or if the page has no
	 *                   element with id {@code content}
	 */
	public static void main(String[] args) throws Exception {
		Document doc = Jsoup.connect(CHAPTER_URL).get();
		printChapterLinks(doc);

		// getElementById returns null when the id is absent; fail with a clear
		// message instead of a bare NullPointerException on the next line.
		Element content = doc.getElementById("content");
		if (content == null) {
			throw new IllegalStateException(
					"No element with id \"content\" found at " + CHAPTER_URL);
		}

		appendSentences(content.text().split(" "));
	}

	/** Prints the absolute URL and label of every previous/next chapter link. */
	private static void printChapterLinks(Document doc) {
		Elements links = doc.select("a[href]");
		for (Element link : links) {
			String label = link.text();
			if (label.equals("上一章") || label.equals("下一章")) {
				System.out.println(link.attr("abs:href").trim() + "---" + label);
			}
		}
	}

	/**
	 * Appends each fragment, trimmed and terminated by CRLF, to the book file.
	 * The file is opened once for the whole batch and is always closed, even
	 * when a write fails. I/O failures are reported on stderr and do not
	 * propagate, matching the original best-effort behaviour.
	 */
	private static void appendSentences(String[] sentences) {
		File dir = new File(BOOK_DIR);
		if (!dir.exists()) {
			dir.mkdirs();
			System.out.println("小说" + BOOK_DIR + "目录下");
		}

		File file = new File(dir, BOOK_FILE_NAME);
		// try-with-resources guarantees the stream is closed on every path.
		try (FileOutputStream os = new FileOutputStream(file, true)) {
			for (String sen : sentences) {
				String line = sen.trim() + "\r\n";
				// Explicit UTF-8 so the saved text does not depend on the
				// platform default charset.
				os.write(line.getBytes(StandardCharsets.UTF_8));
			}
		} catch (IOException e) {
			// FileNotFoundException is an IOException, so this also covers
			// failure to open/create the file.
			System.err.println("Failed to write " + file + ": " + e);
		}
	}
}