// Level 1: Clean meaningless data out of an HTML document (第1关:清洗HTML文档中无意义数据)
package step1;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
public class Task {
// Obtain a Document object from the file located at filePath
/**
 * Parses the file at the given path into a jsoup {@link Document}.
 *
 * <p>The file is decoded as UTF-8 and {@code http://www.educoder.net/} is
 * passed to jsoup as the base URI.
 *
 * @param filePath path of the HTML file to parse
 * @return the parsed document
 * @throws IOException if jsoup reports an I/O failure while reading the file
 */
public Document getDoc(String filePath) throws IOException{
    /********** Begin **********/
    return Jsoup.parse(new File(filePath), "UTF-8", "http://www.educoder.net/");
    /********** End **********/
}
/**
* 获取清理后的信息
* @param doc
* @return
*/
public List<String> cleanHTML(Document doc){
/********** Begin **********/
List<String> list=new ArrayList<>();
list.add(Jsoup.clean(doc.toString(), Whitelist.basic()));
list.add(Jsoup.clean(doc.toString(), Whitelist.