package Regex;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal web-spider demo: downloads the source of a web page and extracts
 * substrings (for example link targets) from it with a regular expression.
 */
public class WebSpiderTest {

    /** Page downloaded by {@link #main(String[])}. */
    private static final String DEMO_URL = "https://www.163.com/";

    /** Charset used by {@link #main(String[])} to decode the demo page. */
    private static final String DEMO_CHARSET = "gbk";

    /** Captures the value of an {@code href="..."} attribute in group 1. */
    private static final String HREF_REGEX = "href=\"([\\w\\s./:]+?)\"";

    /**
     * Downloads the content behind {@code urlStr} and returns it as one string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators, so the result never contains
     * line breaks.
     *
     * <p>This method is best-effort: any failure (malformed URL, unknown
     * charset, I/O error, ...) is reported with a stack trace on standard error
     * and whatever was read up to that point (possibly the empty string) is
     * returned. It never returns {@code null}.
     *
     * @param urlStr  the URL to read from
     * @param charset name of the charset used to decode the response bytes
     * @return the decoded content with line terminators removed
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader (and the underlying URL stream)
        // on every path; previously the stream was never closed.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), Charset.forName(charset)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting a (possibly partial)
            // string back instead of an exception.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Finds every match of {@code regexStr} in {@code destStr} and collects one
     * string per match, in order of occurrence.
     *
     * <p>If the pattern declares at least one capturing group, the text
     * captured by group 1 is collected (this may be {@code null} when group 1
     * did not take part in a match). If the pattern has no capturing group,
     * e.g. {@code <a[\s\S]+?</a>}, the whole match is collected instead of
     * failing with an {@link IndexOutOfBoundsException}.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return a new mutable list of the extracted substrings; empty if nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regexStr} is not a valid pattern
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        Matcher m = Pattern.compile(regexStr).matcher(destStr);
        // Group 0 is the whole match and always exists; group 1 only exists if
        // the pattern declares a capturing group.
        int group = m.groupCount() >= 1 ? 1 : 0;
        List<String> result = new ArrayList<>();
        while (m.find()) {
            result.add(m.group(group));
        }
        return result;
    }

    /**
     * Downloads the demo page and prints every {@code href} value found in it,
     * one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String destStr = getURLContent(DEMO_URL, DEMO_CHARSET);
        for (String link : getMatherSubstrs(destStr, HREF_REGEX)) {
            System.out.println(link);
        }
    }
}