简单网络爬虫代码示例

//自己写的网络爬虫,希望大家共同探讨交流

package Crawl;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class Indepent {
 
 public static void main(String args[])
 {
  
  Set<String>  visitedurl = new HashSet<String>();
  LinkedList  unvisitedurl = new LinkedList();
  String seed = "http://www.xyz.abc";
  int i = 1;
  String visiturl;
  unvisitedurl.add(seed);
   visiturl = seed;
  while(visiturl!=null)
  {
   Set<String> link = new HashSet<String>();
   
   visiturl = (String) unvisitedurl.removeFirst();
   if(visitedurl.contains(visiturl))
    continue;
   catchallseed(visiturl,i);
  
   link = extractMain("HTMLDOWNLOAD\\"+i+".html");
   i++;
  
  
  unvisitedurl.addAll(link);
  
  visitedurl.add(visiturl);
  System.out.println(visiturl);
  
  
 
  
  }

  
 }
 public static Set<String> extractMain(String filename)
 {
   Set<String> links = new HashSet<String>();
  File file = new File(filename);
  Reader reader = null;          //创建一个读对象
  
  try{
   reader =new InputStreamReader(new FileInputStream(file));  
    StringBuffer input = new StringBuffer();
             int ch;
             int i=1;
             while ((ch = reader.read()) != -1)
                      input.append((char)ch);

          
            
             String patternString = "(http:|https:)//[^[A-Za-z0-9\\._\\?%&+\\-=/#]]*";
             Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
             Matcher matcher = pattern.matcher(input);
      reader.close();
  
   while (matcher.find())
         {
   
            int start = matcher.start();
            int end = matcher.end();
            String match = input.substring(start, end);
            //System.out.println(++i + " : " + match);
            links.add(match);
         }
  
  }
  catch (Exception e) {
            e.printStackTrace();
        }
return links;
 }

 
  public static void catchallseed(String url,int number)
     {
         HttpClient httpClient = new HttpClient();  
        GetMethod getMethod = new GetMethod(url);    //初始化连接函数
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, 
         new DefaultHttpMethodRetryHandler());    //设置默认的异常恢复策略
         try { 
               int statusCode = httpClient.executeMethod(getMethod);  //建立连接
               if (statusCode != HttpStatus.SC_OK)
               { 
                   System.err.println("Method failed: " 
                           + getMethod.getStatusLine()); 
               }                                                   //获取连接建立状态结果
               byte[] responseBody = getMethod.getResponseBody();   //获取网页内容
               FileOutputStream fos=new FileOutputStream("HTMLDOWNLOAD\\"+number+".html");  //打开文件进行写入
               fos.write(responseBody); //写入
               //System.out.println(new String(responseBody)); 
               }
         catch (HttpException e)
             { 
               System.out.println("Please check your provided http address!"); 
               e.printStackTrace();    //打印错误原因
              }
         catch (IOException e)
             { 
               e.printStackTrace(); 
             } finally
             {  
               getMethod.releaseConnection();  //释放连接
             } 
     }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值