话不多说,先看代码!
/**
* Created by david on 2017-7-5.
* 爬取网易新闻页面
*/
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Minimal jsoup demo: downloads one NetEase (163.com) news article and prints
 * its headline, time/source line, body paragraphs, image URLs and hyperlinks
 * to standard output.
 */
public class JsoupDemo {

    /** Maximum number of characters of an image's {@code alt} text that is printed. */
    private static final int ALT_WIDTH = 20;

    /**
     * Shortens {@code s} for display. A string longer than {@code width} is cut
     * to its first {@code width - 1} characters followed by a single ".";
     * shorter strings are returned unchanged.
     *
     * @param s     text to shorten; must not be null
     * @param width maximum length of the result; expected to be at least 1
     * @return the original or the shortened string
     */
    private static String trim(String s, int width) {
        if (s.length() > width) {
            return s.substring(0, width - 1) + ".";
        }
        return s;
    }

    /**
     * Fetches {@code url}, then prints the article parts, every image that has
     * a {@code src} attribute, and every anchor that has an {@code href}.
     *
     * <p>Article parts that are absent from the page are reported on
     * {@code System.err} and skipped instead of aborting with a
     * {@link NullPointerException}, so the image and link listings are still
     * produced for pages with a different layout.
     *
     * @param url address of the article page to download
     */
    public static void getContent(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            printArticle(doc, url);
            printImages(doc);
            printLinks(doc);
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
    }

    /** Prints headline, time/source line and body text found inside the "post_content_main" container. */
    private static void printArticle(Document doc, String url) {
        // Elements.first() yields null for an empty match, so every lookup is checked.
        Element main = doc.getElementsByClass("post_content_main").first();
        if (main == null) {
            System.err.println("No 'post_content_main' element found in " + url);
            return;
        }

        Element title = main.getElementsByTag("h1").first();
        if (title != null) {
            System.out.println(title.text());
        } else {
            System.err.println("No <h1> headline found in " + url);
        }

        Element timeSource = main.getElementsByClass("post_time_source").first();
        if (timeSource != null) {
            System.out.println(timeSource.text());
        } else {
            System.err.println("No 'post_time_source' element found in " + url);
        }

        Element body = main.getElementById("endText");
        if (body == null) {
            System.err.println("No 'endText' element found in " + url);
            return;
        }
        // One line of output per <p> element of the article body.
        StringBuilder text = new StringBuilder();
        for (Element paragraph : body.select("p")) {
            text.append(paragraph.text()).append('\n');
        }
        System.out.println(text.toString());
    }

    /** Prints tag name, absolute URL, width * height and shortened alt text of each image. */
    private static void printImages(Document doc) {
        Elements images = doc.select("img[src]");
        for (Element img : images) {
            System.out.println(img.tagName() + " " + img.attr("abs:src") + " " + img.attr("width") + " * "
                    + img.attr("height") + " " + trim(img.attr("alt"), ALT_WIDTH));
        }
    }

    /** Prints tag name, absolute target URL and rel attribute of each hyperlink. */
    private static void printLinks(Document doc) {
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println(link.tagName() + " " + link.attr("abs:href") + " " + link.attr("rel"));
        }
    }

    /** Runs the demo against a fixed sample article. */
    public static void main(String[] args) {
        // Alternative sample articles:
        // getContent("http://tech.163.com/17/0705/19/COJSA0OJ00097U7T.html");
        // getContent("http://tech.163.com/17/0703/17/COEF89D700097U7R.html");
        getContent("http://news.163.com/17/0705/11/COIVM1LK000189FH.html");
    }
}
输出:
网易新闻的文章页面结构比较清晰:正文位于 class 为 post_content_main 的容器中,段落位于 id 为 endText 的元素内。只要页面采用这一结构,就可以用本代码爬取,只需要把URL网址作为参数传入 getContent 即可。
用到了Jsoup-1.10.3.jar包
参考链接:https://jsoup.org/cookbook/extracting-data/example-list-links
Jsoup API : https://jsoup.org/apidocs/overview-summary.html