import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal web-spider demo: downloads a single web page and prints, for every
 * hyperlink whose {@code href} starts with "http", the matched {@code <a ...>}
 * tag followed by the href value itself.
 */
public class WebSpiderTest {

    /**
     * Matches an opening {@code <a ...>} tag that carries a double-quoted
     * {@code href} attribute whose value starts with "http". Capture group 1
     * is the href value.
     *
     * <p>The previous pattern was the bare string {@code "?"}, which is not a
     * valid regular expression (dangling quantifier) and has no capture group.
     * This expression is a reconstruction based on the original comments, which
     * describe a group capturing an "http..." href value.
     *
     * <p>{@code [^>]} keeps the match inside a single tag while still allowing
     * attributes to be spread over several lines. Matching is case-insensitive
     * because HTML tag and attribute names are. Package-private so the pattern
     * can be exercised without network access.
     */
    static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*\"(http[^\"]+)\"[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the page and prints two lines per hyperlink to standard output:
     * the matched tag with whitespace runs collapsed, then the href value.
     *
     * @param args unused
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.qq.com");
        String html = readPage(url, Charset.forName("gbk"));

        Matcher m = LINK_PATTERN.matcher(html);
        while (m.find()) {
            // Collapse runs of spaces, tabs and line breaks into one space so
            // that a tag spanning several lines prints on a single line.
            System.out.println(m.group().replaceAll("\\s+", " "));
            // Capture group 1 is the value of the href attribute.
            System.out.println(m.group(1));
        }
    }

    /**
     * Reads the whole resource behind {@code url} into a string, decoding it
     * with {@code charset}. Each line read is terminated with {@code '\n'}.
     * The underlying stream is closed before returning, even on failure.
     */
    private static String readPage(URL url, Charset charset) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br =
                new BufferedReader(new InputStreamReader(url.openStream(), charset))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }
}