// Example using NetEase (www.163.com): extract every URL contained in the NetEase page and write the URLs to local txt files.
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the page at {@code http://www.163.com}, pulls the {@code href} value out of the
 * first {@code <a href=...>...</a>} anchor found on each line, and keeps the values that start
 * with {@code http}.
 *
 * <p>Two files are written on drive {@code F:}:
 * <ul>
 *   <li>{@code 网址1.txt} - every extracted URL, in page order, duplicates included;</li>
 *   <li>{@code 网址3.txt} - the distinct URLs, in {@link HashSet} iteration order
 *       (i.e. no guaranteed order).</li>
 * </ul>
 */
public class URlduqu {

    /** Page whose anchors are scanned. */
    private static final String PAGE_URL = "http://www.163.com";

    /** Charset used to decode the downloaded page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Literal text that opens an anchor; the attribute value starts right after it. */
    private static final String ANCHOR_OPEN = "<a href=";

    /** A line qualifies only if it contains an anchor that is also closed on the same line. */
    private static final Pattern ANCHOR = Pattern.compile("<a href=.*</a>");

    /** Only href values that start with "http" are kept (relative links are dropped). */
    private static final Pattern HTTP_URL = Pattern.compile("http.*");

    /**
     * Program entry point: fetches the page and writes the two URL files.
     *
     * <p>I/O failures (bad URL, network error, unwritable output file, failed write) are reported
     * with a stack trace on standard error; the method then returns normally.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File allUrlsFile = new File("F:" + File.separator + "网址1.txt");
        File uniqueUrlsFile = new File("F:" + File.separator + "网址3.txt");
        try {
            URLConnection conn = new URL(PAGE_URL).openConnection();
            // try-with-resources closes (and therefore flushes) all three streams even when
            // reading or writing fails part-way through.
            try (BufferedReader page = new BufferedReader(
                         new InputStreamReader(conn.getInputStream(), PAGE_CHARSET));
                 PrintWriter allUrls = new PrintWriter(allUrlsFile);
                 PrintWriter uniqueUrls = new PrintWriter(uniqueUrlsFile)) {

                // The set grows as needed, so the number of links on the page is not limited.
                Set<String> seen = new HashSet<String>();
                String line;
                while ((line = page.readLine()) != null) {
                    String href = extractHref(line);
                    if (href != null) {
                        allUrls.println(href);
                        seen.add(href);
                    }
                }
                for (String url : seen) {
                    uniqueUrls.println(url);
                }

                // PrintWriter never throws on write errors; surface them explicitly.
                if (allUrls.checkError() || uniqueUrls.checkError()) {
                    throw new IOException("Failed writing URL list to " + allUrlsFile
                            + " or " + uniqueUrlsFile);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so this also covers a bad PAGE_URL.
            e.printStackTrace();
        }
    }

    /**
     * Extracts the quoted {@code href} value of the first {@code <a href=...>...</a>} anchor on
     * the given line.
     *
     * <p>The value is read from the matched anchor itself (not from an earlier {@code href=}
     * elsewhere on the line) and may be delimited by double or single quotes. Unquoted or
     * unterminated values are skipped instead of raising an exception.
     *
     * @param line one line of HTML, without its line terminator
     * @return the href value if it starts with {@code http}; {@code null} if the line has no
     *         closed anchor, the value is not properly quoted, or it does not start with
     *         {@code http}
     */
    static String extractHref(String line) {
        Matcher anchor = ANCHOR.matcher(line);
        if (!anchor.find()) {
            return null;
        }
        // The match begins with ANCHOR_OPEN, so the opening quote sits right after it.
        int quoteAt = anchor.start() + ANCHOR_OPEN.length();
        char quote = line.charAt(quoteAt);
        if (quote != '"' && quote != '\'') {
            return null;
        }
        int closeAt = line.indexOf(quote, quoteAt + 1);
        if (closeAt < 0) {
            return null;
        }
        String href = line.substring(quoteAt + 1, closeAt);
        return HTTP_URL.matcher(href).matches() ? href : null;
    }
}