从网页源文件中得到链接

最新推荐文章于 2021-06-17 11:17:54 发布

iteye_14258

最新推荐文章于 2021-06-17 11:17:54 发布

阅读量158

点赞数

分类专栏： java 程序开发　　文章标签： HTML OS ASP.net IE ASP

本文链接：https://blog.csdn.net/iteye_14258/article/details/81614521

版权

java 程序开发专栏收录该内容

28 篇文章 0 订阅

订阅专栏

import java.net.*;
import java.io.*;
import java.util.*;

/**
 * Downloads a web page to a local file and extracts the attribute values that
 * follow a given tag prefix such as {@code <a href=} or {@code <img src=}.
 *
 * <p>Expected call order: {@link #setURL(String)}, {@link #setFileName()},
 * {@link #downFile()}, {@link #readFile()}, then
 * {@link #scanLabel(ArrayList, String, String)} as often as needed.
 * Instances are not synchronized.
 */
class ScanPage {
    /** Size in bytes of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /** Local file name used when the URL ends with '/' and therefore names no file. */
    private static final String DEFAULT_FILE_NAME = "index.html";

    /** Text of the page loaded by {@link #readFile()}; {@code null} until then. */
    private String strPage;
    private String strUrl;
    private String fileName;

    /**
     * Sets the address of the page to download.
     *
     * @param strUrl absolute URL of the page
     */
    public void setURL(String strUrl) {
        this.strUrl = strUrl;
    }

    /**
     * Derives the local file name from the URL: everything after the last '/'.
     * The separator itself is excluded so the file is created relative to the
     * working directory rather than at the filesystem root. A URL that ends
     * with '/' falls back to {@code index.html}.
     *
     * @throws IllegalStateException if no URL has been set
     */
    public void setFileName() {
        if (strUrl == null) {
            throw new IllegalStateException("setURL() must be called before setFileName()");
        }
        // lastIndexOf returns -1 when there is no '/', so the whole string is used.
        String lastSegment = strUrl.substring(strUrl.lastIndexOf('/') + 1);
        fileName = lastSegment.isEmpty() ? DEFAULT_FILE_NAME : lastSegment;
    }

    /**
     * Downloads the page at the configured URL into the local file.
     *
     * @throws IOException if the URL is malformed, or reading or writing fails
     * @throws IllegalStateException if {@link #setFileName()} has not been called
     */
    public void downFile() throws IOException {
        requireFileName();
        URL url = new URL(strUrl);
        // try-with-resources closes both streams even when the copy fails midway.
        try (InputStream is = url.openStream();
             OutputStream os = new FileOutputStream(fileName)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = is.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
        }
    }

    /**
     * Loads the local file into memory as text. Every line is terminated with
     * a single '\n'. The bytes are decoded the way {@link FileReader} decodes
     * them, i.e. with the JVM's default charset.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if {@link #setFileName()} has not been called
     */
    public void readFile() throws IOException {
        requireFileName();
        StringBuilder sb = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        strPage = sb.toString();
    }

    /**
     * Placeholder: title extraction is not implemented.
     *
     * @return always the empty string
     */
    public String getTitle() {
        return "";
    }

    /**
     * Scans the loaded page for every occurrence of {@code strLabel} and
     * appends the attribute value that follows it to {@code al}.
     *
     * <p>Scanning starts at the {@code <body} tag, or at the beginning of the
     * page when there is none. Both the label and the type suffix are matched
     * case-insensitively; the collected values keep the case they have in the
     * page. A value wrapped in double or single quotes ends at the closing
     * quote (or at '>' if the quote is never closed); an unquoted value ends
     * at the first whitespace or '>'. The end of the page also ends a value.
     * Values are trimmed, and empty values are skipped.
     *
     * @param al       list that receives the matching values, in page order
     * @param strLabel tag prefix to look for, e.g. {@code "<a href="}
     * @param strType  {@code "*"} to accept every value, otherwise a suffix
     *                 such as {@code ".html"} the value must end with
     * @throws IllegalStateException    if {@link #readFile()} has not been called
     * @throws IllegalArgumentException if {@code strLabel} is empty
     */
    public void scanLabel(ArrayList<String> al, String strLabel, String strType) {
        if (strPage == null) {
            throw new IllegalStateException("readFile() must be called before scanLabel()");
        }
        if (strLabel.isEmpty()) {
            // An empty label matches everywhere and could never advance the scan.
            throw new IllegalArgumentException("strLabel must not be empty");
        }

        int idx = indexOfIgnoreCase(strPage, "<body", 0);
        if (idx == -1) {
            idx = 0; // fragment without <body>: scan the whole text
        }

        while ((idx = indexOfIgnoreCase(strPage, strLabel, idx)) != -1) {
            int start = idx + strLabel.length();

            // Remember which quote (if any) opens the value and step past it.
            char quote = 0;
            if (start < strPage.length()) {
                char first = strPage.charAt(start);
                if (first == '"' || first == '\'') {
                    quote = first;
                    start++;
                }
            }

            int stop = valueEnd(strPage, start, quote);
            // stop >= start > previous idx, so the scan always moves forward.
            idx = stop;

            String value = strPage.substring(start, stop).trim();
            if (!value.isEmpty() && matchesType(value, strType)) {
                al.add(value);
            }
        }
    }

    /** Fails fast when a file operation is attempted before the file name is known. */
    private void requireFileName() {
        if (fileName == null) {
            throw new IllegalStateException("setFileName() must be called first");
        }
    }

    /** Returns the index of the first case-insensitive match of needle at or after from, or -1. */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** Returns the exclusive end index of the attribute value starting at start; quote is 0 when unquoted. */
    private static int valueEnd(String text, int start, char quote) {
        int pos = start;
        while (pos < text.length()) {
            char ch = text.charAt(pos);
            boolean terminator = (quote != 0) ? ch == quote : Character.isWhitespace(ch);
            if (ch == '>' || terminator) {
                break;
            }
            pos++;
        }
        return pos;
    }

    /** Returns true when strType is "*" or value ends with strType, ignoring case. */
    private static boolean matchesType(String value, String strType) {
        if (strType.equals("*")) {
            return true;
        }
        int offset = value.length() - strType.length();
        return offset >= 0 && value.regionMatches(true, offset, strType, 0, strType.length());
    }
}

/**
 * Command-line entry point: downloads the page whose URL is given as the first
 * argument and prints every {@code <a href=...>} target that ends in ".html",
 * one per line.
 */
class ScanApp {
    /**
     * Runs the scan.
     *
     * @param args {@code args[0]} is the URL of the page to scan
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("用法: java ScanApp <网页地址>");
            System.exit(1);
        }

        ArrayList<String> al = new ArrayList<>();
        ScanPage sp = new ScanPage();
        sp.setURL(args[0]);
        sp.setFileName();
        try {
            sp.downFile();
            sp.readFile();
        } catch (IOException ie) {
            System.out.println("文件操作出错");
            // Report the cause and stop: without a loaded page there is nothing to scan.
            System.err.println(ie);
            System.exit(1);
        }

        sp.scanLabel(al, "<a href=", ".html");
        for (String link : al) {
            System.out.println(link);
        }
    }
}

E:\javawork>java ScanApp http://www.17kyk.com/Html/Book/16/2431/list.html
list.html
429400.html
429401.html
439789.html
429403.html
429404.html
429405.html
429406.html
429407.html
429408.html
429409.html
429411.html
429412.html
说明：
scanLabel 用于扫描 <a href= 和 <img src= 这两种标签，并把标签后面的地址存入 ArrayList。
第三个参数传 "*" 时，得到所有链接：

scanLabel(al,"<a href=","*");
scanLabel(al,"<img src=","*");
第三个参数传扩展名（如 ".html"）时，只得到以该扩展名结尾的特定链接：
scanLabel(al,"<a href=",".html");
scanLabel(al,"<a href=",".asp");
scanLabel(al,"<img src=",".gif");