package com.maintain.crawler;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class StartCrawler {
    // log4j logger for this class
    private static Logger logger = Logger.getLogger(StartCrawler.class);
    // counts how many target files have been found so far
    private static int total = 0;
    // URLs ending with the following suffixes do not need further parsing
    public static String[] excludeUrl = new String[]{".pom", ".xml", ".md5", ".sha1", ".asc", ".gz", ".zip", "../"}; // suffixes to filter out
    // queue holding the URLs that are waiting to be parsed
    public static Queue<String> waitForCrawlerUrls = new LinkedList<String>();
    // add a URL to the crawl queue (skips null, empty, and already-queued URLs)
    private static void addUrl(String url, String info) {
        if (url == null || "".equals(url)) {
            return;
        }
        if (!waitForCrawlerUrls.contains(url)) {
            waitForCrawlerUrls.add(url);
            logger.info("[" + info + "] " + url + " added to the crawl queue");
        }
    }
    // read the seed URLs (the initial URLs to parse) from a text file, one per line
    private static void init() {
        // try-with-resources closes the streams automatically; the original
        // manual close() calls in finally could throw a NullPointerException
        // when the file was not found and the readers were never created
        try (FileInputStream fis = new FileInputStream("D:\\urlPath.txt");
             InputStreamReader isr = new InputStreamReader(fis);
             BufferedReader br = new BufferedReader(isr)) {
            String str = null;
            while ((str = br.readLine()) != null) {
                addUrl(str, "init");
                logger.info("seed urlPath: " + str);
            }
        } catch (FileNotFoundException e) {
            logger.error("FileNotFoundException", e);
        } catch (IOException e) {
            logger.error("IOException", e);
        }
        parseUrl();
    }
    public static void parseUrl() {
        while (waitForCrawlerUrls.size() > 0) {
            String url = waitForCrawlerUrls.poll(); // take the next URL from the queue
            CloseableHttpClient httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(url);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                HttpEntity entity = response.getEntity();
                // the Content-Type header may be null or carry a charset suffix
                // (e.g. "text/html;charset=UTF-8"), so match on the prefix
                // instead of comparing for strict equality
                if (entity.getContentType() != null
                        && entity.getContentType().getValue().startsWith("text/html")) {
                    String webPageContent = EntityUtils.toString(entity, "utf-8"); // full page content
                    parseWebPage(webPageContent, url);
                }
            } catch (ClientProtocolException e) {
                logger.error("ClientProtocolException", e);
            } catch (IOException e) {
                logger.error("IOException", e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        logger.error("IOException", e);
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    logger.error("IOException", e);
                }
            }
            try {
                Thread.sleep(3000); // sleep 3 seconds between requests to go easy on the server
                System.out.println("sleeping for 3 seconds");
            } catch (InterruptedException e) {
                logger.error("InterruptedException", e);
            }
        }
    }
    private static void parseWebPage(String webPageContent, String realPath) {
        if ("".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("a");
        for (int j = 0; j < links.size(); j++) {
            Element link = links.get(j);
            String url = link.attr("href");
            // note: plain concatenation assumes the hrefs are relative entries,
            // as in a directory-listing index page
            logger.info("extracted url: " + (realPath + url));
            boolean tag = true;
            for (int i = 0; i < excludeUrl.length; i++) {
                if (url.endsWith(excludeUrl[i])) {
                    tag = false;
                    break;
                }
            }
            if (tag) {
                if (url.endsWith(".jar")) {
                    total++;
                    logger.info("found target #" + total + ": " + (realPath + url));
                } else {
                    logger.info("new URL added: " + (realPath + url));
                    addUrl(realPath + url, "page parsing");
                }
            }
        }
    }
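    // Overall flow: init() seeds the queue from D:\urlPath.txt, parseUrl()
    // fetches each queued page over HTTP, and parseWebPage() extracts the
    // links, counting .jar files as targets and re-queueing everything that
    // is not filtered out by excludeUrl.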
    public static void main(String[] args) {
        init();
    }
}
The above is a simple implementation of a Java web crawler.
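To run it, the class needs a log4j configuration on the classpath and the seed file D:\urlPath.txt that init() reads. Below is a minimal sketch, assuming log4j 1.x with a console appender; the seed URL is only an illustration, not something the code above prescribes.

# log4j.properties (on the classpath root)
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1} - %m%n

# D:\urlPath.txt: one seed URL per line, e.g. a repository index page such as
# https://repo1.maven.org/maven2/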