先上代码
package tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small scraper demo: downloads a web page over HTTP and prints the link,
 * preview image and title of every story card found in the returned HTML.
 */
public class download {

    /** Desktop-Chrome User-Agent string sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            + "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

    /**
     * Matches one story card. Group 1 is the link, group 2 the image URL and
     * group 3 the title. {@code \s*} between the tags lets the pattern match
     * both minified markup and markup that keeps indentation between tags.
     */
    // Alternative pattern the author tried (title first, then link):
    // Pattern.compile("<span class=\"title\">(.*?)</span>.*?<a href=\"(.*?)\" class=\"link-button\">");
    private static final Pattern STORY_PATTERN = Pattern.compile(
            "<div class=\"box\">\\s*<a href=\"(.*?)\" class=\"link-button\">"
            + "\\s*<img src=\"(.*?)\" class=\"preview-image\">"
            + "\\s*<span class=\"title\">(.*?)</span>"
            + "\\s*</a>\\s*</div>");

    /**
     * Entry point: scrapes the hard-coded page {@code http://daily.zhihu.com/}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched or read
     */
    public static void main(String[] args) throws IOException {
        dl("http://daily.zhihu.com/");
    }

    /**
     * Fetches {@code url} with an HTTP GET, prints the whole response body to
     * standard output and then hands it to {@link #parser(String)}.
     *
     * <p>The body is decoded as UTF-8 and its lines are concatenated without
     * line separators, so the story pattern (whose {@code .} does not match
     * line terminators) can match across the page's original line breaks.
     *
     * @param url address of the page to download
     * @throws IOException if the URL is malformed, the connection fails or
     *                     the response cannot be read
     */
    public static void dl(String url) throws IOException {
        URL u = new URL(url);
        HttpURLConnection uc = (HttpURLConnection) u.openConnection();
        try {
            uc.setRequestMethod("GET");
            uc.setRequestProperty("User-Agent", USER_AGENT);
            uc.connect();

            StringBuffer sb = new StringBuffer();
            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way through.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(uc.getInputStream(), StandardCharsets.UTF_8))) {
                String s = br.readLine();
                while (s != null) {
                    sb.append(s);
                    s = br.readLine();
                }
            }

            String page = sb.toString();
            System.out.println(page);
            parser(page);
        } finally {
            uc.disconnect();
        }
    }

    /**
     * Finds every story card in {@code content} and prints one line per card
     * with its link, image URL and title.
     *
     * <p>A card is expected to look like this (whitespace between the tags is
     * optional):
     * <pre>{@code
     * <div class="box">
     *   <a href="/story/9505539" class="link-button">
     *     <img src="https://pic2.zhimg.com/v2-38d1c9828eab1e3843ca303f1beff3b5.jpg" class="preview-image">
     *     <span class="title">story title</span>
     *   </a>
     * </div>
     * }</pre>
     *
     * @param content HTML text to scan
     */
    public static void parser(String content) {
        Matcher matcher = STORY_PATTERN.matcher(content);
        // Debug output kept from the original listing: prints the matcher's state.
        System.out.println(matcher.toString());
        while (matcher.find()) {
            System.out.println("源链接:" + matcher.group(1)
                    + "--源图片:" + matcher.group(2)
                    + "--源标题:" + matcher.group(3));
        }
    }
}
设置请求头
// Open an HTTP connection to the target address.
URL u = new URL(url);
HttpURLConnection uc = (HttpURLConnection) u.openConnection();

// Configure a GET request that carries a desktop-Chrome User-Agent header,
// then open the connection.
final String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
uc.setRequestMethod("GET");
uc.setRequestProperty("User-Agent", userAgent);
uc.connect();
获取流,并放在StringBuffer里面方便后面解析
// Collect the whole response body in a StringBuffer so it can be parsed later.
StringBuffer sb = new StringBuffer();
// Decode explicitly as UTF-8 instead of the platform default charset, and let
// try-with-resources close the reader even if reading fails part-way through.
try (BufferedReader br = new BufferedReader(
        new InputStreamReader(uc.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
    String s = br.readLine();
    while (s != null) {
        // Lines are joined without separators, so the text becomes one long line.
        sb.append(s);
        s = br.readLine();
    }
}
System.out.println(sb.toString());
利用正则表达式匹配文本
// One story card: group 1 = link, group 2 = image URL, group 3 = title.
// \s* between the tags accepts both minified and indented markup.
Pattern pattern = Pattern.compile(
        "<div class=\"box\">\\s*<a href=\"(.*?)\" class=\"link-button\">"
        + "\\s*<img src=\"(.*?)\" class=\"preview-image\">"
        + "\\s*<span class=\"title\">(.*?)</span>"
        + "\\s*</a>\\s*</div>");
// Alternative pattern the author tried (title first, then link):
// Pattern pattern=Pattern.compile("<span class=\"title\">(.*?)</span>.*?<a href=\"(.*?)\" class=\"link-button\">");
Matcher matcher = pattern.matcher(content);
运行结果