package com.maintain.crawler;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class StartCrawler {
    // log4j logger for this class
    private static Logger logger = Logger.getLogger(StartCrawler.class);
    // counts how many target files have been found so far
    private static int total = 0;
    // URLs ending with the following suffixes do not need further parsing
    public static String[] excludeUrl = new String[]{".pom", ".xml", ".md5", ".sha1", ".asc", ".gz", ".zip", "../"}; // suffixes to filter out
    // queue holding the URLs that are waiting to be parsed
    public static Queue<String> waitForCrawlerUrls = new LinkedList<String>();
    // add a URL to the crawl queue (skips null, empty, and already-queued URLs)
    private static void addUrl(String url, String info) {
        if (url == null || "".equals(url)) {
            return;
        }
        if (!waitForCrawlerUrls.contains(url)) {
            waitForCrawlerUrls.add(url);
            logger.info("[" + info + "] " + url + " added to the crawl queue");
        }
    }
    // read the seed URLs (the initial URLs to parse) from a text file, one per line
    private static void init() {
        // try-with-resources closes the streams automatically; the original
        // manual close() calls in finally could throw a NullPointerException
        // when the file was not found and the readers were never created
        try (FileInputStream fis = new FileInputStream("D:\\urlPath.txt");
             InputStreamReader isr = new InputStreamReader(fis);
             BufferedReader br = new BufferedReader(isr)) {
            String str = null;
            while ((str = br.readLine()) != null) {
                addUrl(str, "init");
                logger.info("seed urlPath: " + str);
            }
        } catch (FileNotFoundException e) {
            logger.error("FileNotFoundException", e);
        } catch (IOException e) {
            logger.error("IOException", e);
        }
        parseUrl();
    }
    public static void parseUrl() {
        while (waitForCrawlerUrls.size() > 0) {
            String url = waitForCrawlerUrls.poll(); // take the next URL from the queue
            CloseableHttpClient httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(url);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                HttpEntity entity = response.getEntity();
                // the Content-Type header may be null or carry a charset suffix
                // (e.g. "text/html;charset=UTF-8"), so match on the prefix
                // instead of comparing for strict equality
                if (entity.getContentType() != null
                        && entity.getContentType().getValue().startsWith("text/html")) {
                    String webPageContent = EntityUtils.toString(entity, "utf-8"); // full page content
                    parseWebPage(webPageContent, url);
                }
            } catch (ClientProtocolException e) {
                logger.error("ClientProtocolException", e);
            } catch (IOException e) {
                logger.error("IOException", e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        logger.error("IOException", e);
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    logger.error("IOException", e);
                }
            }
            try {
                Thread.sleep(3000); // sleep 3 seconds between requests to go easy on the server
                System.out.println("sleeping for 3 seconds");
            } catch (InterruptedException e) {
                logger.error("InterruptedException", e);
            }
        }
    }
    private static void parseWebPage(String webPageContent, String realPath) {
        if ("".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("a");
        for (int j = 0; j < links.size(); j++) {
            Element link = links.get(j);
            String url = link.attr("href");
            // note: plain concatenation assumes the hrefs are relative entries,
            // as in a directory-listing index page
            logger.info("extracted url: " + (realPath + url));
            boolean tag = true;
            for (int i = 0; i < excludeUrl.length; i++) {
                if (url.endsWith(excludeUrl[i])) {
                    tag = false;
                    break;
                }
            }
            if (tag) {
                if (url.endsWith(".jar")) {
                    total++;
                    logger.info("found target #" + total + ": " + (realPath + url));
                } else {
                    logger.info("new URL added: " + (realPath + url));
                    addUrl(realPath + url, "page parsing");
                }
            }
        }
    }
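    // Overall flow: init() seeds the queue from D:\urlPath.txt, parseUrl()
    // fetches each queued page over HTTP, and parseWebPage() extracts the
    // links, counting .jar files as targets and re-queueing everything that
    // is not filtered out by excludeUrl.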
    public static void main(String[] args) {
        init();
    }
}
The above is a simple implementation of a Java web crawler.
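To run it, the class needs a log4j configuration on the classpath and the seed file D:\urlPath.txt that init() reads. Below is a minimal sketch, assuming log4j 1.x with a console appender; the seed URL is only an illustration, not something the code above prescribes.

# log4j.properties (on the classpath root)
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1} - %m%n

# D:\urlPath.txt: one seed URL per line, e.g. a repository index page such as
# https://repo1.maven.org/maven2/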