网络蜘蛛--抓取一个网页的邮箱

技术:正则表达式+网络编程(URL)

package cn.hncu.br;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

/**
 * Small "web spider" demo: downloads pages, pulls e-mail addresses out of them
 * with regular expressions and follows the absolute links it finds.
 *
 * <p>Pages are decoded with the platform default charset, exactly as the
 * no-charset {@link InputStreamReader} constructor does.
 */
public class SpiderDemo {
    /** Page limit used by {@link #getAllHTMLMail(URL)} so a crawl always terminates. */
    private static final int DEFAULT_MAX_PAGES = 100;

    /**
     * E-mail address: word characters, {@code @}, then a dotted domain.
     * The dot is escaped and every label may be longer than one character,
     * so {@code user@sina.cn} is matched in full and {@code user@example i}
     * is not matched at all.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Absolute http/https/ftp link: scheme, host (dot-separated labels),
     * optional port, optional path that stops at whitespace, quotes or angle
     * brackets so surrounding HTML is never swallowed into the URL.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "(https?|ftp)://[\\w-]+(\\.[\\w-]+)*(:\\d+)?(/[^\\s\"'<>]*)?");

    /**
     * Scratch check of a URL regular expression: prints whether
     * {@code http://sina.com.cn} matches it.
     */
    @Test
    public void Ahelf(){
        System.out.println("http://sina.com.cn".matches("[a-zA-Z]+://(\\w+)(.\\w+)+(/\\w[^ ])*"));
    }

    /**
     * Prints every e-mail address found in the local file {@code .\net\mail.txt}.
     *
     * <p>The file's lines are joined without any separator before matching, so
     * an address that was wrapped across two lines is still found.
     */
    @Test
    public void analily(){
        String regex="([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)";
        Pattern p=Pattern.compile(regex);
        try {
            // Load the whole file first so the search runs over all of it at once.
            String result=readAll(new BufferedReader(new FileReader(".\\net\\mail.txt")), "");
            Matcher m =p.matcher(result);
            while(m.find()){
                System.out.println(m.group());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Starts a crawl at {@code http://www.sina.com}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            URL url=new URL("http://www.sina.com");
            getAllHTMLMail(url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls from {@code url}, visiting at most {@value #DEFAULT_MAX_PAGES}
     * pages, and prints the e-mail addresses collected.
     *
     * @param url page to start from
     */
    public static void getAllHTMLMail(URL url) {
        getAllHTMLMail(url, DEFAULT_MAX_PAGES);
    }

    /**
     * Breadth-first crawl starting at {@code url}. Each page is downloaded
     * once; its e-mail addresses are collected and its links are queued.
     * After every page the running count and list of addresses are printed,
     * and when the crawl ends each address is printed on its own line.
     *
     * @param url      page to start from
     * @param maxPages maximum number of pages to download; values below 1
     *                 mean no page is fetched
     */
    public static void getAllHTMLMail(URL url, int maxPages) {
        ArrayList<URL> urls=new ArrayList<URL>();
        ArrayList<String> mails=new ArrayList<String>();
        // Keyed by the URL's text: URL.equals/hashCode may resolve host names.
        Set<String> queued=new HashSet<String>();
        Set<String> seenMails=new HashSet<String>();

        urls.add(url);
        queued.add(url.toString());
        for(int i=0;i<urls.size() && i<maxPages;i++){
            URL u=urls.get(i);
            try {
                String html=fetch(u);

                ArrayList<String> pageMails=new ArrayList<String>();
                collectMails(html, pageMails);
                for(String mail:pageMails){
                    if(seenMails.add(mail)){
                        mails.add(mail);
                    }
                }

                ArrayList<URL> pageLinks=new ArrayList<URL>();
                collectLinks(html, pageLinks);
                for(URL link:pageLinks){
                    // Queue each distinct link once so pages are not re-fetched.
                    if(queued.add(link.toString())){
                        urls.add(link);
                    }
                }
            } catch (IOException e) {
                // An unreachable page must not stop the rest of the crawl.
                e.printStackTrace();
            }
            System.out.println(mails.size());
            System.out.println(mails);
        }
        for(String str:mails){
            System.out.println(str);
        }
    }

    /**
     * Downloads {@code url} and appends every absolute http/https/ftp link
     * found in it to {@code urls}.
     *
     * @param url  page to scan
     * @param urls list the links are appended to
     * @return the same {@code urls} list; unchanged if the page could not be read
     */
    public static ArrayList<URL> getHTMLaHref(URL url,ArrayList<URL> urls){
        try {
            collectLinks(fetch(url), urls);
        } catch (Exception e) {
            // Best effort: this method never throws, but the failure is
            // reported instead of being dropped silently.
            e.printStackTrace();
        }
        return urls;
    }

    /**
     * Downloads {@code url} and appends every e-mail address found in it to
     * {@code mail}.
     *
     * @param url  page to scan
     * @param mail list the addresses are appended to
     * @return the same {@code mail} list; unchanged if the page could not be read
     */
    public static ArrayList<String> getCurrentHTMLMail(URL url,ArrayList<String> mail) {
        try {
            collectMails(fetch(url), mail);
        } catch (IOException e) {
            // An HTTP 403 here means the site refuses this kind of access,
            // so nothing can be scraped from it.
            e.printStackTrace();
        }
        return mail;
    }

    /**
     * Downloads {@code http://127.0.0.1/}, prints the page text and then every
     * e-mail address found in it. A page that cannot be read is reported on
     * standard error.
     */
    @Test
    public void getMail()  {
        try {
            URL url = new URL("http://127.0.0.1/");
            String html = fetch(url);
            System.out.println(html);
            ArrayList<String> found = new ArrayList<String>();
            collectMails(html, found);
            for (String mail : found) {
                System.out.println(mail);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Downloads the whole page at {@code url} as text, one {@code '\n'} after each line. */
    private static String fetch(URL url) throws IOException {
        return readAll(new BufferedReader(new InputStreamReader(url.openStream())), "\n");
    }

    /** Reads {@code reader} to the end, appending {@code separator} after each line, then closes it. */
    private static String readAll(BufferedReader reader, String separator) throws IOException {
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append(separator);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /** Appends every {@link #MAIL_PATTERN} match in {@code text} to {@code mails}. */
    private static void collectMails(CharSequence text, ArrayList<String> mails) {
        Matcher m = MAIL_PATTERN.matcher(text);
        while (m.find()) {
            mails.add(m.group());
        }
    }

    /** Appends every {@link #LINK_PATTERN} match in {@code text} to {@code urls}, skipping unparsable ones. */
    private static void collectLinks(CharSequence text, ArrayList<URL> urls) {
        Matcher m = LINK_PATTERN.matcher(text);
        while (m.find()) {
            try {
                urls.add(new URL(m.group()));
            } catch (MalformedURLException ignored) {
                // Skip just this link; the remaining matches are still collected.
            }
        }
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值