从百姓网上爬取一些地名和对应的链接(转载)
利用正则表达式匹配地名和对应的url链接
package test0903;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.Buffer;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo that downloads a page over HTTP, extracts every anchor element from it with a
 * regular expression, and prints trace lines showing the try / catch / finally flow.
 */
public class FinallyDemo {

    /**
     * Matches one complete anchor element: an opening tag carrying an href attribute
     * (double-quoted, single-quoted or unquoted), the element content, and the closing tag.
     *
     * NOTE(review): the unquoted-href alternative compiles to a character class that excludes
     * a backslash, the letter 's' and '>' rather than whitespace and '>'; this looks
     * over-escaped. It is left as-is because only the whole match is consumed below.
     */
    private static final Pattern ANCHOR = Pattern.compile(
            "<a[^>]*href=(\\\"([^\\\"]*)\\\"|\\'([^\\']*)\\'|([^\\\\s>]*))[^>]*>(.*?)</a>");

    /**
     * Applied to a single anchor element. Group 1 is the text between the leading "//" and
     * the next "/'" of a single-quoted href; group 2 is the run of CJK characters that
     * immediately precedes the closing tag.
     */
    private static final Pattern PLACE_LINK = Pattern.compile(
            "^<a href='//(.*?)/'.*?([\\u4e00-\\u9fa5]*)</a>$");

    /**
     * Fetches the page and prints what {@link #getRegex(String)} extracts from it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String buf = getBuf();
        System.out.println("main-----------");
        getRegex(buf);
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * <p>Lines are concatenated without their line terminators. Trace lines are printed from
     * the try, catch and finally paths.
     *
     * @return the response body, or {@code null} when the status code is not 200 or an
     *         {@link IOException} occurs
     */
    public static String getBuf() {
        HttpURLConnection connect = null;
        try {
            // NOTE(review): the address is blank in this source, so this constructor throws
            // MalformedURLException and the method returns null until a real URL is supplied.
            URL url = new URL("");
            connect = (HttpURLConnection) url.openConnection();
            connect.setRequestMethod("GET");
            connect.setConnectTimeout(3000);
            connect.connect();
            if (connect.getResponseCode() == 200) {
                // try-with-resources closes the reader (and the underlying stream) on every path.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connect.getInputStream(), "UTF-8"))) {
                    StringBuilder body = new StringBuilder();
                    String line;
                    while ((line = reader.readLine()) != null) {
                        body.append(line);
                    }
                    System.out.println("try--------------------");
                    return body.toString();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("catch---------------------");
        } finally {
            if (connect != null) {
                connect.disconnect();
            }
            System.out.println("finally-------------------");
        }
        return null;
    }

    /**
     * Collects every anchor element found in {@code s}.
     *
     * <p>First prints whether the whole input is matched by the anchor pattern. Then, for each
     * anchor that also fits the single-quoted "//host/" form, prints
     * {@code <CJK text> = <host>}.
     *
     * @param s the text to scan; {@code null} is treated as containing no anchors
     * @return the {@code toString()} form of the list of matched anchor elements, in order of
     *         appearance ({@code "[]"} when there are none)
     */
    public static String getRegex(String s) {
        ArrayList<String> anchors = new ArrayList<>();
        if (s == null) {
            // getBuf() returns null on failure; report "no anchors" instead of throwing.
            return anchors.toString();
        }
        Matcher m = ANCHOR.matcher(s);
        System.out.println(m.matches());
        // A successful matches() leaves the matcher positioned at the end of the input, so
        // find() would return nothing; reset so scanning always starts from the beginning.
        m.reset();
        while (m.find()) {
            String anchor = m.group();
            anchors.add(anchor);
            Matcher m1 = PLACE_LINK.matcher(anchor);
            if (m1.find()) {
                System.out.println(m1.group(2) + " = " + m1.group(1));
            }
        }
        return anchors.toString();
    }
}