基于 Apache HttpClient 的 PDF 网络爬虫

最新推荐文章于 2023-06-08 21:46:11 发布

chenweishaoxing

最新推荐文章于 2023-06-08 21:46:11 发布

阅读量144

点赞数

分类专栏：爬虫　文章标签：爬虫 java

本文链接：https://blog.csdn.net/chenweishaoxing/article/details/84124165

版权

爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class Crawler implements Runnable{

public static String SAVE="C:/Users/Administrator/Downloads";//下载保存路径

private String url="";//要抓取的网页地址

public Crawler(String url){

this.url=url;

}

public Crawler(){}

/**

* @param url 要抓取的网页的地址

* @return 这个对应的内容

* @throws ClientProtocolException

* @throws IOException

private String crawl(String url) throws ClientProtocolException, IOException{

System.out.println("[INFO] Crawl From : "+url);

HttpClient httpClient = new DefaultHttpClient();

HttpGet httpGet=new HttpGet(url);

HttpResponse httpResponse = httpClient.execute(httpGet);

HttpEntity httpEntity=httpResponse.getEntity();

InputStream inStream=httpEntity.getContent();

String content="";

while(true){

byte[] bytes=new byte[1024*1000];

int k=inStream.read(bytes);

if(k>=0)content=content+new String(bytes,0,k);

else break;

System.out.println(content);

System.out.println("=========================================================================================");

}

return content;

}

public void run(){

try {

String prefix=this.url.substring(0,this.url.lastIndexOf("/"));

String content=this.crawl(this.url);//抓取网页内容

Parser parser=new Parser(content); //使用HTMLParser对网页内容进行解析

NodeFilter filter;

NodeList list;

filter=new NodeClassFilter(LinkTag.class);

filter=new AndFilter(filter,new NodeFilter(){

public boolean accept(Node node) {

return ((LinkTag)node).isHTTPLink();

}});

list=parser.extractAllNodesThatMatch(filter);

List<String> urlsList =new ArrayList<String>();

for(int i=0;i<list.size();i++){

String[] array=list.elementAt(i).getText().split("\"");

if(array[1].endsWith(".pdf")||array[1].endsWith(".PDF")){//只下载pdf

String downloadUrl=new String(prefix+"/"+array[1]);

urlsList.add(downloadUrl);//生成需要下载的地址

}

//从这里开始是进行下载，使用了多线程执行请求

HttpParams params=new BasicHttpParams();

//ConnManagerParams.setTimeout(params, 60000*3); //设置连接最大等待时间

ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50));//设置并发数

//HttpConnectionParams.setConnectionTimeout(params, 60000*2); //设置连接超时时间

HttpConnectionParams.setSoTimeout(params, 60000*10);//设置读取超时时间

SchemeRegistry schemeRegistry=new SchemeRegistry();

schemeRegistry.register(new Scheme("http",PlainSocketFactory.getSocketFactory(),80));

schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));

ThreadSafeClientConnManager cm=new ThreadSafeClientConnManager(params,schemeRegistry);

HttpClient httpClient=new DefaultHttpClient(cm,params);

Thread[] threads=new Thread[urlsList.size()];

int n=0;

for(String url:urlsList){

String path=Crawler.SAVE+url.substring(url.lastIndexOf("/"), url.length());

url=url.substring(0, url.lastIndexOf("/"))+"/"+URLEncoder.encode(url.substring(url.lastIndexOf("/")+1,url.length()),"UTF-8");

HttpGet httpGet=new HttpGet(url);

threads[n]=new Thread(new Downloader(httpClient,httpGet,url,path));

n++;

}

for(Thread thread:threads)thread.start();

for(Thread thread:threads)if(thread.isAlive())thread.join();

}catch (InterruptedException e) {

System.out.println("[ERROR] Download InterruptedException : "+e.toString());

//e.printStackTrace();

} catch (ParserException e) {

System.out.println("[ERROR] Parse ParserException : "+e.toString());

//e.printStackTrace();

}catch (ClientProtocolException e) {

System.out.println("[ERROR] Crawl ClientProtocolException : "+e.toString());

//e.printStackTrace();

} catch (IOException e) {

System.out.println("[ERROR] Crawl IOException : "+e.toString());

//e.printStackTrace();

}

public static void main(String[] args) {

//入口程序

Crawler crawler=new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");//这里设定网页地址

Thread thread=new Thread(crawler);

thread.start();

}

//类Downloader真正的执行了写入网络数据到文件的步骤

class Downloader implements Runnable{

private String url="";

private String path="";

private final HttpClient httpClient;

private final HttpContext httpContext;

private final HttpGet httpGet;

/**

* @param httpClient 多个线程共享的HtppClient

* @param httpGet 要下载的HttpGet

* @param url 资源网络地址

* @param path 资源下载之后本地的保存路径

public Downloader(HttpClient httpClient,HttpGet httpGet,String url,String path){

this.httpClient=httpClient;

this.httpGet=httpGet;

this.httpContext=new BasicHttpContext();

this.path=path;

this.url=url;

}

public void run() {

System.out.println("[INFO] Download From : "+this.url);

File file=new File(this.path);

if(file.exists())file.delete();

try {

//使用file来写入本地数据

file.createNewFile();

FileOutputStream outStream = new FileOutputStream(this.path);

//执行请求，获得响应

HttpResponse httpResponse = this.httpClient.execute(this.httpGet,this.httpContext);

System.out.println("[STATUS] Download : "+httpResponse.getStatusLine()+" [FROM] "+this.path);

HttpEntity httpEntity=httpResponse.getEntity();

InputStream inStream=httpEntity.getContent();

while(true){//这个循环读取网络数据，写入本地文件

byte[] bytes=new byte[1024*1000];

int k=inStream.read(bytes);

if(k>=0){

outStream.write(bytes,0,k);

outStream.flush();

}

else break;

}

inStream.close();

outStream.close();

} catch (IOException e){

this.httpGet.abort();

System.out.println("[ERROR] Download IOException : "+e.toString()+" [FROM] : "+this.path);

//e.printStackTrace();

}

chenweishaoxing

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
apache httpclient PDF的网络爬虫

import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Li...
复制链接

扫一扫

专栏目录