// A web crawler I wrote myself; hoping to discuss and exchange ideas with everyone
package Crawl;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class Indepent {
    public static void main(String[] args) {
        Set<String> visitedurl = new HashSet<String>();             // pages already crawled
        LinkedList<String> unvisitedurl = new LinkedList<String>(); // frontier of pages still to crawl
        String seed = "http://www.xyz.abc";
        int i = 1;
        unvisitedurl.add(seed);
        // Breadth-first crawl: stop when the frontier is empty rather than
        // looping on visiturl != null, which never becomes false and lets
        // removeFirst() throw on an empty list.
        while (!unvisitedurl.isEmpty()) {
            String visiturl = unvisitedurl.removeFirst();
            if (visitedurl.contains(visiturl))
                continue;                                           // already seen, skip it
            catchallseed(visiturl, i);                              // download the page to disk
            Set<String> link = extractMain("HTMLDOWNLOAD\\" + i + ".html"); // harvest its links
            i++;
            unvisitedurl.addAll(link);                              // enqueue the newly found links
            visitedurl.add(visiturl);
            System.out.println(visiturl);
        }
    }
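    /**
     * Reads a previously downloaded HTML file and returns the set of
     * absolute http/https URLs found in it. Extraction is a plain
     * regular-expression scan over the file contents.
     */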
    public static Set<String> extractMain(String filename) {
        Set<String> links = new HashSet<String>();
        File file = new File(filename);
        Reader reader = null;                                       // reader for the downloaded page
        try {
            reader = new InputStreamReader(new FileInputStream(file));
            StringBuffer input = new StringBuffer();
            int ch;
            while ((ch = reader.read()) != -1)
                input.append((char) ch);
            // Match absolute http/https URLs. The original class
            // [^[A-Za-z0-9...]] negated the URL character set, so it matched
            // everything *except* URL characters; the corrected class below
            // matches the URL characters themselves.
            String patternString = "(http|https)://[A-Za-z0-9._?%&+\\-=/#]*";
            Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(input);
            while (matcher.find()) {
                links.add(matcher.group());                         // collect each matched URL
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            try {
                if (reader != null)
                    reader.close();                                 // always release the file handle
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return links;
    }
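    // A minimal sketch (not part of the original crawler) showing the corrected
    // URL pattern against an inline snippet; the method name and sample markup
    // below are illustrative only.
    public static void regexSmokeTest() {
        String sample = "<a href=\"http://www.xyz.abc/page1.html\">one</a>"
                + " <a href=\"https://www.xyz.abc/page2.html?id=3\">two</a>";
        Pattern pattern = Pattern.compile("(http|https)://[A-Za-z0-9._?%&+\\-=/#]*",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(sample);
        while (matcher.find())
            System.out.println(matcher.group());                    // prints the two sample URLs
    }

    /**
     * Downloads the page at the given URL with Commons HttpClient 3.x and
     * saves the raw response bytes under the HTMLDOWNLOAD directory as
     * number.html.
     */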
    public static void catchallseed(String url, int number) {
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);                   // prepare the GET request
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());               // install the default retry strategy
        FileOutputStream fos = null;
        try {
            int statusCode = httpClient.executeMethod(getMethod);   // execute the request
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
                return;                                             // do not save pages that failed to load
            }
            byte[] responseBody = getMethod.getResponseBody();      // read the page body
            new File("HTMLDOWNLOAD").mkdirs();                      // make sure the target directory exists
            fos = new FileOutputStream("HTMLDOWNLOAD\\" + number + ".html");
            fos.write(responseBody);                                // save the page to disk
        }
        catch (HttpException e) {
            System.out.println("Please check the HTTP address you provided!");
            e.printStackTrace();                                    // print the failure cause
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            if (fos != null) {
                try {
                    fos.close();                                    // release the file handle
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            getMethod.releaseConnection();                          // release the HTTP connection
        }
    }
}
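/*
 * Usage sketch (assumptions: Commons HttpClient 3.x plus its commons-codec and
 * commons-logging dependencies are on the classpath; the jar versions below
 * are illustrative):
 *
 *   javac -cp commons-httpclient-3.1.jar Crawl/Indepent.java
 *   java  -cp .;commons-httpclient-3.1.jar;commons-codec-1.3.jar;commons-logging-1.1.jar Crawl.Indepent
 *
 * The seed URL in main() is a placeholder; replace it with a real page before
 * running.
 */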