java爬虫抓取下载文件_Java下载文件 爬虫 超时处理解决方案

import java.util.List;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.SocketTimeoutException;

import java.net.URL;

import java.util.ArrayList;

import java.util.logging.Logger;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Runs blocking network operations on worker threads so the caller can stop
 * waiting after {@link #timeOutMs} milliseconds and try again.
 *
 * <p>Each attempt starts a fresh worker thread. A worker that is still alive
 * when the wait expires is interrupted and abandoned; a worker that is no
 * longer alive is treated as having completed.
 */
public class Main {

    /** Delay in milliseconds that {@code GetHtmlThread} sleeps before it opens its connection. */
    public static final int sleepMsPerConnection = 1000;

    /** Maximum time in milliseconds to wait for a single worker thread to finish. */
    public static final int timeOutMs = 20000;

    /** Number of additional attempts made after the first attempt has timed out. */
    public static final int retry = 2;

    /**
     * Downloads {@code urlStr} into {@code filePath}, retrying when an attempt
     * does not finish within {@link #timeOutMs}.
     *
     * @param urlStr   URL to download
     * @param filePath destination file path
     * @throws RuntimeException if every attempt timed out, or if the calling
     *                          thread was interrupted while waiting
     */
    private static void download(String urlStr, String filePath) {
        for (int attempt = 0; attempt <= retry; attempt++) {
            if (attempt > 0) {
                System.out.println("retry");
            }
            DownloadThread worker = new DownloadThread(urlStr, filePath);
            if (finishedInTime(worker)) {
                return;
            }
        }
        throw new RuntimeException("still timeout after retry " + retry + " times");
    }

    /**
     * Fetches the content of {@code urlStr} as text, retrying when an attempt
     * does not finish within {@link #timeOutMs}.
     *
     * @param urlStr URL to fetch
     * @return the value of the worker's {@code html} field once it has finished
     * @throws RuntimeException if every attempt timed out, or if the calling
     *                          thread was interrupted while waiting
     */
    private static String getHtml(String urlStr) {
        for (int attempt = 0; attempt <= retry; attempt++) {
            if (attempt > 0) {
                System.out.println("retry");
            }
            GetHtmlThread worker = new GetHtmlThread(urlStr);
            if (finishedInTime(worker)) {
                return worker.html;
            }
        }
        throw new RuntimeException("still timeout after retry " + retry + " times");
    }

    /**
     * Starts {@code worker} and waits up to {@link #timeOutMs} for it to end.
     *
     * @param worker a thread that has not been started yet
     * @return {@code true} if the worker is no longer alive after the wait;
     *         {@code false} if it was still running and has been interrupted
     * @throws RuntimeException if the calling thread is interrupted while waiting
     */
    private static boolean finishedInTime(Thread worker) {
        worker.start();
        try {
            worker.join(timeOutMs);
        } catch (InterruptedException e) {
            // The caller was asked to stop: do not leave the worker running,
            // keep the interrupt status visible, and stop retrying.
            worker.interrupt();
            Thread.currentThread().interrupt();
            throw new RuntimeException("interrupted while waiting for " + worker.getName(), e);
        }
        if (!worker.isAlive()) {
            return true;
        }
        // interrupt() is only a request: in testing it did not actually end a
        // worker that was blocked, so the worker may keep running in the background.
        worker.interrupt();
        return false;
    }
}

import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.net.URL;

/**
 * Worker thread that reads the content of a URL as text and publishes it in
 * {@link #html}.
 *
 * <p>The thread first sleeps for {@code Main.sleepMsPerConnection} milliseconds,
 * then reads the URL line by line. Any failure other than interruption prints
 * the stack trace and terminates the JVM with exit status 1.
 */
public class GetHtmlThread extends Thread {

    /**
     * Fetched text with every line terminated by {@code '\n'}. Remains
     * {@code null} when the thread stopped before the whole page was read.
     */
    public String html;

    private final String urlStr;

    /**
     * @param urlStr URL whose content should be fetched
     */
    public GetHtmlThread(String urlStr) {
        this.urlStr = urlStr;
    }

    /**
     * Sleeps, reads the URL line by line and stores the result in {@link #html}.
     * Stops without setting {@link #html} once the thread has been interrupted.
     */
    @Override
    public void run() {
        try {
            Thread.sleep(Main.sleepMsPerConnection);
            URL url = new URL(urlStr);
            StringBuilder sb = new StringBuilder();
            // NOTE(review): InputStreamReader without a charset decodes with the
            // platform default; confirm that matches the pages being fetched.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (Thread.currentThread().isInterrupted()) {
                        // Interrupted by whoever was waiting: abandon the fetch
                        // instead of reading on in the background.
                        return;
                    }
                    sb.append(line);
                    sb.append('\n');
                }
            }
            this.html = sb.toString();
        } catch (InterruptedException e) {
            // Interrupted during the initial sleep: stop quietly but keep the
            // interrupt status set rather than dropping it.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.net.URL;

/**
 * Worker thread that copies the content of a URL into a local file.
 *
 * <p>Any failure prints the stack trace and terminates the JVM with exit
 * status 1. If the thread is interrupted while copying, it stops after the
 * current chunk and leaves the partially written file in place.
 */
public class DownloadThread extends Thread {

    private final String urlStr;

    private final String filePath;

    /**
     * @param urlStr   URL to read from
     * @param filePath path of the file to write; existing content is overwritten
     */
    public DownloadThread(String urlStr, String filePath) {
        this.urlStr = urlStr;
        this.filePath = filePath;
    }

    /**
     * Opens the URL and the target file, copies all bytes, and closes both
     * streams even when the copy fails.
     */
    @Override
    public void run() {
        try (InputStream is = new URL(urlStr).openStream();
             OutputStream os = new FileOutputStream(new File(filePath))) {
            copyStream(is, os);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies {@code inputStream} to {@code outputStream} in 1024-byte chunks
     * and flushes the output. The caller still has to close both streams.
     *
     * <p>The copy stops early, without flushing, when the current thread has
     * been interrupted, so an abandoned download does not keep writing.
     *
     * @param inputStream  stream to read from
     * @param outputStream stream to write to
     * @throws IOException if reading or writing fails
     */
    private void copyStream(InputStream inputStream, OutputStream outputStream)
            throws IOException {
        byte[] buffer = new byte[1024];
        int len;
        // read() signals end of stream with -1.
        while ((len = inputStream.read(buffer)) != -1) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            outputStream.write(buffer, 0, len);
        }
        outputStream.flush();
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值