/*
 * NOTE(review): this copy of the file looks damaged; compare it with version control
 * before changing any code. Whole method bodies are collapsed onto single physical
 * lines, so the first double-slash comment on a line comments out everything after it.
 * Text also appears to be missing: the fragment for(int i=0;i is followed directly by
 * list = getIntfaceData( in main and by part of an HTML string in getIntfaceData, and
 * the HTML string literals contain no tags. Presumably everything between a less-than
 * sign and the following greater-than sign was stripped - TODO confirm.
 */
package com.xy.xmweb.Controller; /** * Utility class for scraping web pages. */ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.xy.entity.INewsData; public class JsoupFirstExtract { /** * Demo entry point: fetches news listing pages from www1.xy.com with clawer2 and jsoup and prints the links and news fields it extracts. * @param args command-line arguments; not referenced in the visible code */ public static void main(String[] args) { //parseHtml(); //parseBody(); //parseUrl(); System.out.println("========================================="); System.out.println("========================================="); System.out.println("========================================="); System.out.println("========================================="); //navigation(); //extractElement(); // navigation(); try { String httpCount = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10"); //Use jsoup to parse the fetched content //Work with the page elements just like manipulating an HTML document object Document doc = Jsoup.parse(httpCount, "http://www1.xy.com/"); Element body = doc.body(); Element span = body.select("td").first(); Elements links = span.getElementsByTag("a"); for (Element element : links) { String linkAbsHref = element.absUrl("href"); String linkText = element.text(); System.out.println("linkAbsHref=:"+linkAbsHref); System.out.println(""+linkText+""); } } catch (Exception e) { e.printStackTrace(); } int pageSize = 10; try { //http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10 Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get(); Elements as = doc.select("a[href]"); System.out.println(as.size()); if(pageSize > as.size()){ pageSize = as.size(); } // for (Element a : as) { // System.out.println(a.attr("href") + "###" + a.html()); // } Elements tds = doc.select("td:not([title])"); // for (Element td : tds) { // System.out.println(td.html()); // } 
for(int i=0;i list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=",10); if (list != null && list.size() > 0) { for (int i = 0; i < list.size(); i++) { INewsData newsData = list.get(i); System.out.println("=============newDate----getAhref-----:"+newsData.getAhref()); System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime()); System.out.println("=============newDate----getTitle-----:"+newsData.getTitle()); } } } /** * Fetches the page at url + pageSize with jsoup using timeout(10000), selects the a[href] and td:not([title]) elements, and lowers pageSize to the number of links when fewer links are found. * NOTE(review): the loop that fills the returned list is cut off in this copy, so how each item is built cannot be confirmed here. * @param url request URL prefix; pageSize is appended to it * @param pageSize requested number of entries * @return presumably the parsed news items (main reads them as INewsData) - TODO confirm */ public static List getIntfaceData(String url, int pageSize) { List list = new ArrayList(); try { //Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get(); Document doc = Jsoup.connect(url+pageSize).timeout(10000).get(); // Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/"); Elements as = doc.select("a[href]"); //System.out.println("======条数====="+as.size()); if(pageSize > as.size()){ pageSize = as.size(); } Elements tds = doc.select("td:not([title])"); for(int i=0;i
Parsed HTML into a doc.
"; Document doc = Jsoup.parse(html); System.out.println(doc); System.out.println("Print the html head --------------------"); System.out.println(doc.head()); System.out.println("Print the html body --------------------"); System.out.println(doc.body()); System.out.println("Print the html title --------------------"); System.out.println(doc.title()); } public static void parseBody() { String html = "Lorem ipsum.
"; Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); System.out.println("Print the body --------------------"); System.out.println(body); } public static void parseUrl() { try { Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10").get(); System.out.println("Print the Url --------------------"); System.out.println(doc); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void navigation() { String html="First parse" + "Parsed HTML into a doc.
"; Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html"); Element content = doc.getElementById("content"); Elements links = content.getElementsByTag("a"); for (Element link : links) { String linkHref = link.attr("href"); String linkAbsHref = link.absUrl("href"); String linkText = link.text(); System.out.println(linkHref); System.out.println(linkAbsHref); System.out.println(linkText); } } public static void extractElement() { String html = "An example link.
"; Document doc = Jsoup.parse(html); Element link = doc.select("a").first(); String text = doc.body().text(); // "An example link" String linkHref = link.attr("href"); // "http://example.com/" String linkText = link.text(); // "example"" String linkOuterH = link.outerHtml(); // " example" String linkInnerH = link.html(); // " example" System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkOuterH); System.out.println(linkInnerH); } /** * Fetches the content of a single URL without following redirects on this connection. * When a page contains nested redirect links, fetching it can fail with a Server redirected too many times error, * because code inside the page keeps redirecting to other pages and the redirect chain grows too long. If you only want the content of this URL itself * and do not want any further page redirects to be followed, use the code below. * The response is read line by line as UTF-8 and CRLF is appended after every line. * NOTE(review): setFollowRedirects is the static, class-wide setting (hence the static-access suppression) and is set to true here, while setInstanceFollowRedirects(false) turns redirects off for this connection only; the reader and the connection are not closed in this method. * @param myurl URL to fetch * @return the response text with CRLF after each line * @throws Exception if the URL cannot be parsed or the connection or read fails */ @SuppressWarnings("static-access") public static String clawer2(String myurl) throws Exception { URL urlmy = new URL(myurl); HttpURLConnection con = (HttpURLConnection) urlmy.openConnection(); con.setFollowRedirects(true); con.setInstanceFollowRedirects(false); con.connect(); BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(),"UTF-8")); String s = ""; StringBuffer sb = new StringBuffer(""); while ((s = br.readLine()) != null) { sb.append(s+"\r\n"); } return sb.toString(); } }