java爬虫抓取下载文件_Java下载文件爬虫超时处理解决方案

最新推荐文章于 2023-12-01 11:10:09 发布

张野野

最新推荐文章于 2023-12-01 11:10:09 发布

阅读量423

点赞数

文章标签： java爬虫抓取下载文件

本文链接：https://blog.csdn.net/weixin_33119123/article/details/114568013

版权

import java.util.List;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.SocketTimeoutException;

import java.net.URL;

import java.util.ArrayList;

import java.util.logging.Logger;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

public class Main {

public static final int sleepMsPerConnection = 1000;

public static final int timeOutMs = 20000;

public static final int retry = 2;

private static void download(String urlStr, String filePath) {

int retryCount = 0;

while(true){

try {

DownloadThread thread = new DownloadThread(urlStr, filePath);

thread.start();

thread.join(timeOutMs);

if(!thread.isAlive()){

return;

}else{

thread.interrupt();//实测并不能结束线程，请参考如何中断JAVA线程一文

}

} catch (InterruptedException e) {

e.printStackTrace();

}

retryCount++;

if(retryCount > retry){

throw new RuntimeException("still timeout after retry " + (retry - 1) + " times");

}

System.out.println("retry");

}

private static String getHtml(String urlStr) {

int retryCount = 0;

while(true){

try {

GetHtmlThread thread = new GetHtmlThread(urlStr);

thread.start();

thread.join(timeOutMs);

if(!thread.isAlive()){

return thread.html;

}else{

thread.interrupt();

}

} catch (InterruptedException e) {

e.printStackTrace();

}

retryCount++;

if(retryCount > retry){

throw new RuntimeException("still timeout after retry " + (retry - 1) + " times");

}

System.out.println("retry");

}

import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.net.URL;

public class GetHtmlThread extends Thread {

public String html;

private String urlStr;

public GetHtmlThread(String urlStr) {

this.urlStr = urlStr;

}

public void run() {

try {

Thread.sleep(Main.sleepMsPerConnection);

URL url = new URL(urlStr);

StringBuilder sb = new StringBuilder();

BufferedReader br = new BufferedReader(new InputStreamReader(url

.openStream()));

String line = null;

while ((line = br.readLine()) != null) {

sb.append(line);

sb.append('\n');

}

br.close();

this.html = sb.toString();

} catch (InterruptedException e) {

// do nothing?

} catch (Exception e) {

e.printStackTrace();

System.exit(1);

}

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.net.URL;

/**
 * Worker thread that copies the content behind a URL into a local file.
 */
public class DownloadThread extends Thread {

    private String urlStr;

    private String filePath;

    /**
     * @param urlStr   URL of the resource to download
     * @param filePath path of the local file to write
     */
    public DownloadThread(String urlStr, String filePath) {
        this.urlStr = urlStr;
        this.filePath = filePath;
    }

    /**
     * Opens the URL, creates (or truncates) the target file and copies all bytes into it.
     *
     * <p>Any failure prints the stack trace and terminates the JVM with exit status 1.
     */
    @Override
    public void run() {
        try {
            URL url = new URL(urlStr);
            File pdfFile = new File(filePath);
            // try-with-resources closes both streams even when the copy fails part-way.
            try (InputStream is = url.openStream();
                 OutputStream os = new FileOutputStream(pdfFile)) {
                copyStream(is, os);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies every remaining byte of {@code inputStream} to {@code outputStream} and flushes
     * the output. The caller still needs to close both streams after calling this method.
     *
     * @param inputStream  source to read until end of stream
     * @param outputStream destination to write to
     * @throws IOException if reading or writing fails
     */
    private void copyStream(InputStream inputStream, OutputStream outputStream)
            throws IOException {
        byte[] b = new byte[1024];
        int len;
        // read() signals end of stream with -1; compare against that sentinel explicitly.
        while ((len = inputStream.read(b)) != -1) {
            outputStream.write(b, 0, len);
        }
        outputStream.flush();
    }
}

张野野

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java爬虫抓取下载文件_Java下载文件爬虫超时处理解决方案

import java.util.List;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.FileReader;imp...
复制链接

扫一扫