先上代码
package tool;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal page scraper: downloads an HTML page over HTTP and prints the
 * link, image and title of every entry found in it.
 */
public class download {

    /** Browser-like User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

    /** Connect/read timeout so a stalled server cannot hang the program forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Matches one entry: group 1 = link, group 2 = image URL, group 3 = title.
     *
     * NOTE(review): the original pattern literal was lost (it was an empty,
     * line-broken string that did not compile). This pattern is a
     * reconstruction that assumes entries look like
     * {@code <a href="..."><img src="..."><span>title</span>} — confirm
     * against the real page markup.
     */
    private static final Pattern ENTRY_PATTERN = Pattern.compile(
            "<a\\s+href=\"([^\"]*)\"[^>]*>\\s*<img\\s+src=\"([^\"]*)\"[^>]*>\\s*<span[^>]*>(.*?)</span>");

    /**
     * Entry point: downloads and parses the hard-coded page.
     *
     * @param args ignored
     * @throws IOException if the download fails
     */
    public static void main(String[] args) throws IOException {
        dl("http://daily.zhihu.com/");
    }

    /**
     * Downloads {@code url} with a GET request, prints the page text to
     * standard output and passes it to {@link #parser(String)}.
     *
     * @param url address of the page to fetch; must be an HTTP(S) URL
     * @throws IOException if the URL is malformed or the request or read fails
     */
    public static void dl(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setRequestMethod("GET");
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder page = new StringBuilder();
        try {
            connection.connect();
            // Decode explicitly as UTF-8 instead of the platform default charset,
            // which would garble Chinese text on e.g. GBK systems.
            // NOTE(review): assumes the server sends UTF-8 — confirm if other sites are fetched.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() drops line terminators; lines are deliberately joined
                    // without a separator so that '.' in the pattern can span them.
                    page.append(line);
                }
            }
        } finally {
            // Always release the connection, even when the request or read fails.
            connection.disconnect();
        }

        String html = page.toString();
        System.out.println(html);
        parser(html);
    }

    /**
     * Scans {@code content} for entries and prints the link, image and title
     * of each one on its own line. The matcher's state is printed first.
     *
     * @param content page text to scan
     */
    public static void parser(String content) {
        Matcher matcher = ENTRY_PATTERN.matcher(content);
        System.out.println(matcher.toString());
        while (matcher.find()) {
            System.out.println("源链接:"+matcher.group(1)+"--源图片:"+matcher.group(2)+"--源标题:"+matcher.group(3));
        }
    }
}
设置请求头
// Excerpt from dl(String url): open an HTTP connection to the target URL.
URL u=new URL(url);
HttpURLConnection uc=(HttpURLConnection)u.openConnection();
// Use a GET request.
uc.setRequestMethod("GET");
// Send a browser-like User-Agent header with the request.
uc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");
// Open the connection with the settings above.
uc.connect();
获取输入流,并将内容放入 StringBuffer 中,方便后面解析
// Excerpt from dl(String url): wrap the response stream in a line-oriented reader.
// No charset is given, so the platform default charset is used for decoding.
BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream()));
StringBuffer sb=new StringBuffer();
// Read until end of stream (readLine() returns null). readLine() drops line
// terminators, so the lines are joined with no separator between them.
String s=br.readLine();
while(s!=null){
sb.append(s);
s=br.readLine();
}
// Echo the collected page text to standard output.
System.out.println(sb.toString());
利用正则表达式匹配文本
// NOTE(review): the regex literal below is empty and split across two lines, so it
// does not compile as shown — the pattern text appears to have been lost from this
// listing. The full program reads groups 1-3 from each match, so the original
// pattern presumably had three capture groups; restore it before use.
Pattern pattern=Pattern.compile("
");//Pattern pattern=Pattern.compile("(.*?).*?");
// Create a matcher over the page text held in content.
Matcher matcher=pattern.matcher(content);
运行结果