以下代码是一个使用Jsoup爬取新闻网页的简单示例,可直接运行。
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class newCrawl {
// Collected image URLs; main prints each element of this list after calling getImgurl(doc).
// NOTE(review): raw LinkedList, so the element type is not declared here — presumably String URLs; confirm against getImgurl.
public LinkedList ImgUrls=new LinkedList();// holds image URLs
// Collected page link URLs.
// NOTE(review): presumably filled by getlinkurl(doc), which main calls — confirm; element type is likewise undeclared.
public LinkedList linkurls=new LinkedList();// holds URL links
public static void main(String[] args) throws IOException {
newCrawl newCl = new newCrawl();
String url="http://world.huanqiu.com/article/2016-01/8412590.html?from=bdwz";
Document doc = Jsoup.connect(url).get();
newCl.downloadPage(url);//下载网页
String title=newCl.getnewTitle(doc);//获取新闻标题
String time=newCl.getTime(doc);//获取新闻发布时间
String text=newCl.getNewtext(doc);//获取新闻内容
System.out.println("新闻Url:"+url);
System.out.println("新闻标题:"+title);
//System.out.println("newsTime:"+time);
System.out.println("新闻内容:"+text);
System.out.println("******************************************************************");
newCl.getImgurl(doc);//获取图片链接
newCl.getlinkurl(doc);//获取网页链接
System.out.println("图片url链接");
for (Object IU : newCl.ImgUrls) {
System.out.println(IU);
}