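/*
 * A home-grown main-content (article body) extraction algorithm; it worked correctly when
 * tested against three websites.
 * Requires the third-party jsoup jar on the classpath (this file also imports FileUtils from
 * Apache Commons IO).
 */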
package com.extract;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Main-content extractor for HTML pages, built on jsoup.
 *
 * <p>The page is first stripped of elements treated as noise (scripts, styles, form controls,
 * unordered lists, images, anchors, footer/settings/hidden blocks). The remaining tree is then
 * walked from the root, descending at each level into the child whose text is more than
 * {@code DOMINANT_TEXT_RATIO} of the whole document's text. The deepest element reached this way
 * is reported as the content block.
 */
public class ExtractNovel {

    /** Share of the total text length a child must exceed for the walk to descend into it. */
    private static final double DOMINANT_TEXT_RATIO = 0.75;

    /** Input file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C://Users//Administrator//Desktop//sina.com";

    /** Tags whose elements are removed wholesale before the text-ratio walk. */
    private static final String[] NOISE_TAGS = {
        "script", "style", "select", "link", "input", "object", "textarea", "ul", "img", "a"
    };

    /**
     * Parses an HTML file, removes noise and prints the detected content element.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to process. When absent,
     *     {@code DEFAULT_INPUT_PATH} is used.
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Hand the file to jsoup with a null charset so the encoding is taken from the page
        // itself (BOM / meta tag, UTF-8 as fallback) rather than from the platform default.
        Document doc = Jsoup.parse(new File(path), null);
        doc = denoiseElementForDoc(doc);

        // Total text length of the cleaned page; every ratio below is relative to this value.
        int size = doc.text().length();
        check(doc, size);
    }

    /**
     * Locates the content element below {@code e} and prints its HTML and then its text to
     * standard output. If no element qualifies, a notice is written to standard error instead.
     *
     * @param e root element to search under
     * @param size total text length that child text lengths are compared against
     */
    public static void check(Element e, float size) {
        Element son = findRealSon(e, size);
        if (son == null) {
            // Happens e.g. for an empty document, where no child can exceed the ratio.
            System.err.println("No dominant content element found");
            return;
        }
        System.out.println(son.toString());
        System.out.println(son.text());
    }

    /**
     * Finds the deepest descendant of {@code e} reachable by repeatedly stepping into a child
     * whose text length divided by {@code size} is greater than {@code DOMINANT_TEXT_RATIO}.
     *
     * @param e element whose children are examined; {@code e} itself is never returned
     * @param size total text length used as the denominator of the ratio
     * @return the deepest qualifying descendant, or {@code null} if {@code e} is {@code null},
     *     {@code size} is not positive, or no child of {@code e} exceeds the ratio
     */
    public static Element findRealSon(Element e, float size) {
        // !(size > 0) also rejects NaN; a non-positive total makes the ratio meaningless.
        if (e == null || !(size > 0)) {
            return null;
        }
        for (Element child : e.children()) {
            float length = child.text().length();
            if (length / size > DOMINANT_TEXT_RATIO) {
                // The first child above the threshold decides the result, so return right away:
                // the deeper match if there is one, otherwise this child.
                Element deeper = findRealSon(child, size);
                return deeper != null ? deeper : child;
            }
        }
        return null;
    }

    /**
     * Removes noise elements from the given document in place.
     *
     * <p>Removed: every element with a tag listed in {@code NOISE_TAGS} (note this drops all
     * {@code <ul>} lists and all {@code <a>} anchors together with their text), elements with a
     * {@code display="none"} attribute, elements whose {@code class} starts with {@code foot} or
     * equals {@code settings}, and elements whose inline {@code style} hides them or contains
     * {@code overflow: hidden}.
     *
     * @param document the document to clean; it is modified
     * @return the same {@code document} instance, for chaining
     */
    public static Document denoiseElementForDoc(Document document) {
        for (String tag : NOISE_TAGS) {
            document.getElementsByTag(tag).remove();
        }

        document.getElementsByAttributeValue("display", "none").remove();
        document.getElementsByAttributeValueStarting("class", "foot").remove();
        document.getElementsByAttributeValue("class", "settings").remove();

        // Regex instead of a fixed substring so "display:none" and "display: none" both match;
        // (?i) keeps the match case-insensitive.
        document.getElementsByAttributeValueMatching("style", "(?i)display\\s*:\\s*none").remove();

        // Left as an exact substring on purpose: overflow:hidden is common on ordinary layout
        // containers, so matching it more broadly could remove the content itself.
        document.getElementsByAttributeValueContaining("style", "overflow: hidden").remove();
        return document;
    }
}