// Techniques: regular expressions + network programming (URL)
package cn.hncu.br;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
/**
 * Small e-mail harvesting demo built on regular expressions and {@link URL}.
 *
 * <p>A page (or a local text file) is read completely, its lines are joined into a single
 * string, and regular expressions pull e-mail addresses and absolute links out of that text.
 * {@link #getAllHTMLMail(URL)} chains these steps into a simple breadth-first spider.
 *
 * <p>Streams are decoded with the platform default charset, because the readers are created
 * without an explicit charset.
 */
public class SpiderDemo {

    /**
     * Absolute link: an http, https or ftp scheme, a host made of at least two dot-separated
     * labels, an optional port, and an optional path that stops at whitespace, quotes, angle
     * brackets or parentheses (the characters that usually delimit a URL inside markup).
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "(https?|ftp)://[\\w-]+(\\.[\\w-]+)+(:\\d+)?(/[^\\s\"'<>()]*)?");

    /**
     * E-mail address in page text: word characters, an at sign, then word characters followed by
     * one or more ".label" groups. The dot is escaped so that only a literal dot separates the
     * labels, and every label may be longer than one character.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /** Prints whether a sample address is accepted by the link pattern used by the spider. */
    @Test
    public void Ahelf() {
        System.out.println(LINK_PATTERN.matcher("http://sina.com.cn").matches());
    }

    /**
     * Prints every e-mail address found in the local file {@code .\net\mail.txt}.
     *
     * <p>The expression used here accepts a local part of word characters, hyphens and dots,
     * followed by either a bracketed numeric prefix or dotted domain labels, a 2-4 letter or
     * 1-3 digit final group, and an optional closing bracket.
     */
    @Test
    public void analily() {
        String regex = "([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)";
        Pattern filePattern = Pattern.compile(regex);
        try {
            String text = readAll(new BufferedReader(new FileReader(".\\net\\mail.txt")));
            Matcher matcher = filePattern.matcher(text);
            while (matcher.find()) {
                System.out.println(matcher.group());
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the file is missing.
            e.printStackTrace();
        }
    }

    /** Starts the spider at a fixed seed page. */
    public static void main(String[] args) {
        try {
            URL url = new URL("http://www.sina.com");
            getAllHTMLMail(url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls outward from {@code url} without a page limit, printing the harvested addresses.
     *
     * @param url the seed page
     * @see #getAllHTMLMail(URL, int)
     */
    public static void getAllHTMLMail(URL url) {
        getAllHTMLMail(url, Integer.MAX_VALUE);
    }

    /**
     * Crawls outward from {@code url}, visiting at most {@code maxPages} pages.
     *
     * <p>Each page is downloaded once; its addresses are appended to the result list and its
     * links are queued. A link is queued only the first time its string form is seen, so pages
     * that link to each other are not visited again and again. After every page the running
     * count and the list of addresses are printed, and once the crawl ends every address is
     * printed on its own line. Addresses themselves are not de-duplicated.
     *
     * @param url the seed page
     * @param maxPages upper bound on the number of pages to visit; zero or less visits nothing
     */
    public static void getAllHTMLMail(URL url, int maxPages) {
        ArrayList<URL> urls = new ArrayList<URL>();
        ArrayList<String> mails = new ArrayList<String>();
        // Track visited pages by string form: URL.equals may resolve host names over the network.
        Set<String> seen = new HashSet<String>();
        urls.add(url);
        seen.add(url.toString());
        for (int i = 0; i < urls.size() && i < maxPages; i++) {
            URL current = urls.get(i);
            try {
                String html = fetch(current);
                collectMails(html, mails);
                ArrayList<URL> found = new ArrayList<URL>();
                collectLinks(html, found);
                for (URL link : found) {
                    if (seen.add(link.toString())) {
                        urls.add(link);
                    }
                }
            } catch (IOException e) {
                // An unreachable or forbidden page must not stop the rest of the crawl.
                e.printStackTrace();
            }
            System.out.println(mails.size());
            System.out.println(mails);
        }
        for (String mail : mails) {
            System.out.println(mail);
        }
    }

    /**
     * Downloads {@code url} and appends every absolute link found in it to {@code urls}.
     *
     * <p>A download failure is reported on standard error and leaves {@code urls} unchanged.
     *
     * @param url the page to scan
     * @param urls the list that receives the links
     * @return the same {@code urls} instance
     */
    public static ArrayList<URL> getHTMLaHref(URL url, ArrayList<URL> urls) {
        if (url == null || urls == null) {
            // Nothing to scan or nowhere to store results; stay a no-op instead of throwing.
            return urls;
        }
        try {
            collectLinks(fetch(url), urls);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return urls;
    }

    /**
     * Downloads {@code url} and appends every e-mail address found in it to {@code mail}.
     *
     * @param url the page to scan
     * @param mail the list that receives the addresses
     * @return the same {@code mail} instance
     */
    public static ArrayList<String> getCurrentHTMLMail(URL url, ArrayList<String> mail) {
        try {
            collectMails(fetch(url), mail);
        } catch (IOException e) {
            // For example HTTP 403: the site refuses this kind of access, so nothing can be read.
            e.printStackTrace();
        }
        return mail;
    }

    /** Prints the page served at {@code http://127.0.0.1/} followed by the addresses in it. */
    @Test
    public void getMail() {
        try {
            URL url = new URL("http://127.0.0.1/");
            String html = fetch(url);
            System.out.println(html);
            ArrayList<String> mails = new ArrayList<String>();
            collectMails(html, mails);
            for (String mail : mails) {
                System.out.println(mail);
            }
        } catch (IOException e) {
            // Report the failure instead of running into a null reader later on.
            e.printStackTrace();
        }
    }

    /** Downloads the content behind {@code url} as one string and closes the stream. */
    private static String fetch(URL url) throws IOException {
        return readAll(new BufferedReader(new InputStreamReader(url.openStream())));
    }

    /**
     * Reads {@code reader} to the end and always closes it.
     *
     * <p>Lines are joined without a separator so that the whole text can be searched in one
     * pass, including an address that was wrapped onto the next line.
     */
    private static String readAll(BufferedReader reader) throws IOException {
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /** Appends every e-mail address in {@code text} to {@code mails}, in order of appearance. */
    private static void collectMails(CharSequence text, ArrayList<String> mails) {
        Matcher matcher = MAIL_PATTERN.matcher(text);
        while (matcher.find()) {
            mails.add(matcher.group());
        }
    }

    /** Appends every absolute link in {@code text} to {@code urls}, in order of appearance. */
    private static void collectLinks(CharSequence text, ArrayList<URL> urls) {
        Matcher matcher = LINK_PATTERN.matcher(text);
        while (matcher.find()) {
            try {
                urls.add(new URL(matcher.group()));
            } catch (MalformedURLException ignored) {
                // Skip a link the URL class rejects so the remaining links are still collected.
            }
        }
    }
}