- import java.net.*;
- import java.io.*;
- import java.util.*;
/**
 * Downloads a web page to a local file and extracts attribute values
 * (for example link targets) from tags found inside the page body.
 *
 * <p>Expected call order: {@code setURL}, {@code setFileName}, {@code downFile},
 * {@code readFile}, then {@code scanLabel} as often as needed. {@code readFile} can
 * also be used on its own for a file that already exists locally.
 */
class ScanPage{
    /** File name used when the URL ends in '/' and therefore names no file. */
    private static final String DEFAULT_FILE_NAME = "index.html";

    /** Text of the page as loaded by {@code readFile}; {@code null} until then. */
    private String strPage;
    private String strUrl;
    private String fileName;

    /**
     * Sets the address of the page to download.
     *
     * @param strUrl absolute URL of the page
     */
    public void setURL(String strUrl){
        this.strUrl = strUrl;
    }

    /**
     * Derives the local file name from the URL set by {@code setURL}: the text after
     * the last '/'. The name is relative, so the file lands in the working directory.
     * A URL without any '/' is used whole; a URL ending in '/' falls back to
     * {@code index.html} because an empty file name cannot be opened.
     */
    public void setFileName(){
        String name = strUrl.substring(strUrl.lastIndexOf('/') + 1);
        fileName = name.isEmpty() ? DEFAULT_FILE_NAME : name;
    }

    /**
     * Downloads the page at the configured URL into the file chosen by
     * {@code setFileName}.
     *
     * @throws IOException if the URL is malformed, cannot be opened, or the file
     *         cannot be written
     */
    public void downFile()throws IOException{
        URL url = new URL(strUrl);
        // try-with-resources closes both streams even when a read or write fails.
        try (InputStream in = url.openStream();
             OutputStream out = new FileOutputStream(fileName)) {
            byte[] buffer = new byte[4096];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

    /**
     * Loads the local file into memory, joining its lines with '\n'.
     *
     * <p>NOTE: {@code FileReader} decodes with the platform default charset, so
     * non-ASCII text may be garbled when the page uses a different encoding.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public void readFile() throws IOException {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        strPage = sb.toString();
    }

    /**
     * Placeholder: title extraction is not implemented.
     *
     * @return always the empty string
     */
    public String getTitle(){
        return "";
    }

    /**
     * Scans the loaded page for every occurrence of {@code strLabel} after the first
     * {@code <body} tag and collects the attribute value that follows it.
     *
     * <p>Tag and suffix matching ignore case; the collected values keep the case
     * they have in the page. A value wrapped in single or double quotes runs to the
     * closing quote (so it may contain spaces); an unquoted value runs to the next
     * whitespace or '>'. Empty values are skipped. Nothing is collected when no page
     * has been loaded, when the page has no {@code <body} tag, or when
     * {@code strLabel} is empty.
     *
     * @param al       list that receives the matching values (as {@code String}s)
     * @param strLabel tag prefix to look for, e.g. {@code "<a href="}
     * @param strType  required suffix such as {@code ".html"}, or {@code "*"} to
     *                 accept every value
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw ArrayList kept so existing callers compile unchanged
    public void scanLabel(ArrayList al,String strLabel,String strType){
        // An empty label would match at every position without ever advancing.
        if (strPage == null || strLabel.isEmpty()) {
            return;
        }
        int idx = indexOfIgnoreCase(strPage, "<body", 0);
        while (idx != -1) {
            idx = indexOfIgnoreCase(strPage, strLabel, idx);
            if (idx == -1) {
                break;
            }
            idx += strLabel.length();
            int end = valueEnd(strPage, idx);
            String value = stripQuotes(strPage.substring(idx, end));
            idx = end;
            if (value.isEmpty()) {
                continue;
            }
            if (strType.equals("*") || endsWithIgnoreCase(value, strType)) {
                al.add(value);
            }
        }
    }

    /** Returns the index just past the attribute value that starts at {@code start}. */
    private static int valueEnd(String page, int start) {
        int len = page.length();
        if (start >= len) {
            return len;
        }
        char first = page.charAt(start);
        if (isQuote(first)) {
            int close = page.indexOf(first, start + 1);
            if (close != -1) {
                return close + 1;
            }
            // Unterminated quote: fall through and treat the value as unquoted.
        }
        int end = start;
        while (end < len && page.charAt(end) != '>' && !Character.isWhitespace(page.charAt(end))) {
            end++;
        }
        return end;
    }

    /** Removes one leading and one trailing quote character, if present. */
    private static String stripQuotes(String raw) {
        int from = 0;
        int to = raw.length();
        if (to > from && isQuote(raw.charAt(from))) {
            from++;
        }
        if (to > from && isQuote(raw.charAt(to - 1))) {
            to--;
        }
        return raw.substring(from, to);
    }

    /** Tells whether {@code c} is a single or double quote. */
    private static boolean isQuote(char c) {
        return c == '"' || c == '\'';
    }

    /** Case-insensitive {@code endsWith}. */
    private static boolean endsWithIgnoreCase(String value, String suffix) {
        int offset = value.length() - suffix.length();
        return offset >= 0 && value.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Case-insensitive {@code indexOf}. Works on the original text rather than a
     * lower-cased copy so that returned indices always line up with {@code text}.
     */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }
}
- class ScanApp{
- public static void main(String[] args){
- ArrayList al=new ArrayList();
- ScanPage sp=new ScanPage();
- sp.setURL(args[0]);
- sp.setFileName();
- try{
- sp.downFile();
- sp.readFile();
- }catch(IOException ie){System.out.println("文件操作出错");};
- sp.scanLabel(al,"<a href=",".html");
- for(int i=0;i<al.size();i++){
- System.out.println(al.get(i));
- }
- }
- }
- E:\javawork>java ScanApp http://www.17kyk.com/Html/Book/16/2431/list.html
- list.html
- 429400.html
- 429401.html
- 439789.html
- 429403.html
- 429404.html
- 429405.html
- 429406.html
- 429407.html
- 429408.html
- 429409.html
- 429411.html
- 429412.html
- 说明:
- scanLabel 针对 <a href= 和 <img src= 这两种标签进行扫描。
- 得到所有链接并存储在 ArrayList 中:
- scanLabel(al,"<a href=","*");
- scanLabel(al,"<img src=","*");
- 得到特定类型的链接:
- scanLabel(al,"<a href=",".html");
- scanLabel(al,"<a href=",".asp");
- scanLabel(al,"<img src=",".gif");
从网页源文件中得到链接
最新推荐文章于 2021-06-05 13:54:36 发布