网页中的脚本一般位于 <script> 标签中,例如:
<script type="text/javascript">
F.use(["/static/common/ui/tangram/base/base.js","/static/widget/common/searchbox/searchbox.js","/static/common/ui/vs/suggestion/suggestion.js"], function(baidu,searchbox,suggestion){
baidu.dom.ready(function(){
searchbox();
if (navigator.cookieEnabled && !/sug?=0/.test(document.cookie)){
suggestion();
}
});
});
</script>
代码:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class html
{
/**
 * Program entry point: downloads the page at http://news.baidu.com with jsoup and
 * hands the parsed document to {@link #getTag(Document)}.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if fetching the page fails
 */
public static void main(String[] args) throws IOException
{
    String url = "http://news.baidu.com";
    // Fetch and parse the page content.
    Document page = Jsoup.connect(url).get();
    // Extract the non-script information from the page content.
    getTag(page);
}
/**
 * Collects the text of every element in the document whose tag name does not end
 * with "script", then prints the collected list to standard output.
 *
 * <p>Each element contributes only the text it directly owns ({@code ownText()}).
 * Using the combined {@code text()} of every selected element would repeat the same
 * text once for each ancestor (html, body, every wrapping container, ...).
 *
 * @param doc the parsed HTML document to extract text from
 */
public static void getTag(Document doc)
{
    Elements el = doc.select("*"); // select every element in the document
    List<String> list = new ArrayList<String>();
    for (Element element : el) {
        // Skip script elements; note that endsWith also matches "noscript".
        if (element.tagName().endsWith("script")) {
            continue;
        }
        // Only the text directly owned by this element, so descendants' text is
        // added once (by the descendant itself) instead of once per ancestor.
        String ownText = element.ownText();
        if (ownText.trim().length() > 0) {
            list.add(ownText + '\n');
        }
    }
    System.out.println(list);
}