Java Crawler (Part 1)

package com.maintain.crawler;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
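//A simple breadth-first crawler: it repeatedly takes a URL off a FIFO queue,
//downloads the page, extracts its <a> links, counts the .jar files it finds
//and enqueues every other link that survives the suffix filter.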
public class StartCrawler {
 
 //log4j logger
 private static Logger logger = Logger.getLogger(StartCrawler.class);
 //counts how many target files have been found so far
 private static int total = 0;
 //URLs ending with the following suffixes do not need further parsing
 public static String[] excludeUrl = new String[]{".pom", ".xml", ".md5", ".sha1", ".asc", ".gz", ".zip", "../"};//suffixes to filter out
 //FIFO queue holding the URLs waiting to be crawled
 public static Queue<String> waitForCrawlerUrls = new LinkedList<String>();
 
 //add a URL to the crawl queue, skipping nulls, empty strings and duplicates
 private static void addUrl(String url, String info){
  if(url == null || "".equals(url)){
   return;
  }
  if(!waitForCrawlerUrls.contains(url)){
   waitForCrawlerUrls.add(url);
   logger.info("[" + info + "] " + url + " added to the crawl queue");
  }
 }
 //read the initial (seed) URLs to crawl from a text file
 private static void init(){
  FileInputStream fis = null;
  InputStreamReader isr = null;
  BufferedReader br = null;
  try {
   fis = new FileInputStream("D:\\urlPath.txt");
   isr = new InputStreamReader(fis);
   br = new BufferedReader(isr);
   String str = null;
   while((str = br.readLine()) != null){
    addUrl(str, "init");
    logger.info("seed url added: " + str);
   }
  } catch (FileNotFoundException e) {
   logger.error("FileNotFoundException", e);
  } catch (IOException e) {
   logger.error("IOException", e);
  } finally {
   //guard against nulls: any of the streams may have failed to open
   try {
    if(br != null) br.close();
    if(isr != null) isr.close();
    if(fis != null) fis.close();
   } catch (IOException e) {
    logger.error("IOException", e);
   }
  }
  
  parseUrl();
 }
 
 public static void parseUrl(){
  while(!waitForCrawlerUrls.isEmpty()){
   String url = waitForCrawlerUrls.poll(); //take the next URL off the queue
   CloseableHttpClient httpClient = HttpClients.createDefault();
   HttpGet httpGet = new HttpGet(url);
   CloseableHttpResponse response = null;
   try {
    response = httpClient.execute(httpGet);
    HttpEntity entity = response.getEntity();
    //the Content-Type header may carry a charset suffix such as
    //"text/html;charset=UTF-8", so match on the prefix, not the exact value
    if(entity != null && entity.getContentType() != null
      && entity.getContentType().getValue().startsWith("text/html")){
     String webPageContent = EntityUtils.toString(entity, "utf-8"); //the page content
     parseWebPage(webPageContent, url);
    }
   } catch (ClientProtocolException e) {
    logger.error("ClientProtocolException", e);
   } catch (IOException e) {
    logger.error("IOException", e);
   } finally {
    if(response != null){
     try {
      response.close();
     } catch (IOException e) {
      logger.error("IOException", e);
     }
    }
    try {
     httpClient.close();
    } catch (IOException e) {
     logger.error("IOException", e);
    }
   }
   
   try {
    Thread.sleep(3000); //pause 3 seconds between requests to be polite to the server
    logger.info("sleeping for 3 seconds");
   } catch (InterruptedException e) {
    logger.error("InterruptedException", e);
   }
  }
 }
 private static void parseWebPage(String webPageContent, String realPath) {
  if(webPageContent == null || "".equals(webPageContent)){
   return;
  }
  
  Document doc = Jsoup.parse(webPageContent);
  Elements links = doc.select("a");
  for(Element link : links){
   //hrefs on directory-listing pages are relative, so the absolute URL is
   //the current page path plus the href
   String url = link.attr("href");
   logger.info("extracted url: " + (realPath + url));
   boolean keep = true;
   for(String suffix : excludeUrl){
    if(url.endsWith(suffix)){
     keep = false;
     break;
    }
   }
   if(keep){
    if(url.endsWith(".jar")){
     total++;
     logger.info("found target #" + total + ": " + (realPath + url));
    }else{
     logger.info("new url added: " + (realPath + url));
     addUrl(realPath + url, "page parse");
    }
   }
  }
 }
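 //entry point: load the seed URLs, then start crawling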
 public static void main(String[] args) {
  init();
 }
}
The above is a simple implementation of a Java crawler.
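To run it, D:\urlPath.txt should contain one seed URL per line. Judging by the .jar target and the Maven-style exclude suffixes (.pom, .md5, .sha1, .asc), the intended seeds are repository index pages, whose directory listings use relative hrefs; that is what makes the realPath + url concatenation in parseWebPage work. A hypothetical one-line seed file:

https://repo1.maven.org/maven2/

One refinement worth noting: since HttpClient 4.3 (which the CloseableHttpClient import already implies), both CloseableHttpClient and CloseableHttpResponse implement Closeable, so the fetch step can be written with try-with-resources instead of the explicit finally blocks. A minimal sketch of that variant; the FetchSketch and fetchHtml names are illustrative, not part of the original code:

import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class FetchSketch {

 //fetch a URL and return its body if it is an HTML page, or null otherwise;
 //try-with-resources closes the response first, then the client
 static String fetchHtml(String url) throws IOException {
  try (CloseableHttpClient httpClient = HttpClients.createDefault();
    CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
   HttpEntity entity = response.getEntity();
   if (entity != null && entity.getContentType() != null
     && entity.getContentType().getValue().startsWith("text/html")) {
    return EntityUtils.toString(entity, "utf-8");
   }
   return null;
  }
 }
}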