PicsDownload主类程序:
public class PicsDownload {
    static String path;// root folder for downloads (the drive with the most free space)
    static int count;// number of completed crawl rounds; also names the per-round folders

    public static void main(String[] args) {
        path = URLFileDownloadUtils.makeFileInMaxSpace();// creates a "pics" folder on the emptiest drive
        Set<String> webSet = new HashSet<String>();
        Scanner input = new Scanner(System.in);
        System.out.println("请输入网址:");
        webSet.add(input.nextLine());
        patu(webSet);
    }

    /**
     * Downloads every image URL in the set into a fresh numbered folder.
     * Every 30 rounds one more "meinv" sub-level is appended to the base path.
     */
    public static void mainDownload(Set<String> imgSet) {
        if (imgSet == null || imgSet.isEmpty()) {
            return;// nothing to download this round (original crashed on null here)
        }
        if (count % 30 == 0) {
            path += File.separator + "meinv";// descend one folder level every 30 rounds
        }
        String roundDir = path + count;// one folder per crawl round
        System.out.println(roundDir);
        File dir = new File(roundDir);
        if (!dir.exists()) {
            dir.mkdirs();// create once, not per image as the original did
        }
        ExecutorService pool = Executors.newSingleThreadExecutor();// worker pool for the downloads
        for (String imgSrc : imgSet) {
            // UUID avoids name clashes; keep the source URL's file extension
            String filename = URLFileDownloadUtils.getUUID() + imgSrc.substring(imgSrc.lastIndexOf("."));
            String filepath = roundDir + File.separator + filename;
            pool.execute(new ThreadDownloadTask(imgSrc, filepath));// one task per image
        }
        pool.shutdown();
    }

    /**
     * Crawls the given pages: extracts image and page links from each one,
     * downloads the images, then recurses into the newly discovered pages.
     */
    public static void patu(Set<String> webSet) {
        Set<String> htmlSet = new HashSet<String>();
        Set<String> imgSet = new HashSet<String>();
        String[] htmls = URLFileDownloadUtils.getHtml(webSet);// page source of every URL
        for (int i = 0; i < htmls.length; i++) {
            if (htmls[i] == null) {
                continue;// fetching this page failed
            }
            // BUG FIX: getSrcMap used to overwrite the shared map's value sets on
            // every call, losing all but the last page's results (and the original
            // code then added each set to itself, a no-op). Use a fresh map per
            // page and accumulate into local sets instead.
            Map<String, Set<String>> fileMap =
                    URLFileDownloadUtils.getSrcMap(htmls[i], new HashMap<String, Set<String>>());
            Set<String> pageHtml = fileMap.get("html");
            Set<String> pageImg = fileMap.get("img");
            if (pageHtml != null) {
                htmlSet.addAll(pageHtml);
            }
            if (pageImg != null) {
                imgSet.addAll(pageImg);// null-guard: keys may be absent when nothing matched
            }
        }
        mainDownload(imgSet);
        count++;
        System.out.println("第" + count + "次完成");
        if (!htmlSet.isEmpty()) {
            patu(htmlSet);// recurse into newly found pages; stops when none remain
        }
    }
}
URLFileDownloadUtils 工具类:
public class URLFileDownloadUtils {
final static String LINE_SEPARATOR=System.getProperty("line.separator");
public static String[] getHtml(Set<String> htmlSet){
/*
* 获取集合中所有页面字符串源码
*/
String[] htmls=new String[htmlSet.size()];
int i=0;
ExecutorService newSingleThreadExecutor = Executors.newSingleThreadExecutor();
for (Iterator<String> it = htmlSet.iterator(); it.hasNext();i++) {
/*
* 可以弄一个有返回值的线程
*/
Future<String> future=newSingleThreadExecutor.submit(new BackHtmlThread(it.next()));
try {
htmls[i]=future.get();
} catch (InterruptedException | ExecutionException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// htmls[i]=getHtml(it.next());
}
newSingleThreadExecutor.shutdown();
return htmls;
}
public static String getHtml(String webAddress){//获取页面字符串源码
StringBuilder sb=new StringBuilder();
try {
InputStream in=new URL(webAddress).openStream();
BufferedReader br=new BufferedReader(new InputStreamReader(in,"utf-8"));
String len=null;
while((len=br.readLine())!=null){
sb.append(len);
sb.append(LINE_SEPARATOR);//换行符
}
in.close();
br.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return sb.toString();
}
public static Map<String, Set<String>> getSrcMap(String html,Map<String, Set<String>> fileMap){
/*
* 获取符合正则的html网页中的所有类容并存储到fileMap中
* 这里只获取两类,一个html网页,另一个是图片,分别存入两个set集合,然后用html,img做key,其集合做value
* 存入map中并返回
*/
Set<String> imgSet=new HashSet<String>();//防止重复
Set<String> htmlSet=new HashSet<String>();
Pattern p1=Pattern.compile("http[^\"^>]*\\.(JPG|BMP|PNG|HTML|jpg|bmp|png|html)");//爬虫
Matcher m1=p1.matcher(html);
while(m1.find()){//找图片
String s=m1.group();
System.out.println(s);
if(s.endsWith("html")){
htmlSet.add(s);
}else{
imgSet.add(m1.group());
}
fileMap.put("html", htmlSet);
fileMap.put("img", imgSet);
}
System.out.println(fileMap);
return fileMap;
}
public static void download(String imgSrc,String path) throws IOException{//下载网址,与存储路径
URL u=new URL(imgSrc);//获取网络文件流
HttpURLConnection uc=(HttpURLConnection)u.openConnection();
uc.setRequestProperty("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
InputStream in=uc.getInputStream();//获取流
BufferedInputStream bis=new BufferedInputStream(in);//加缓冲
BufferedOutputStream bos=new BufferedOutputStream(new FileOutputStream(new File(path)));
byte [] b=new byte[8096];
int len=0;
while((len=bis.read(b))!=-1){
bos.write(b,0,len);
bos.flush();
}
in.close();
bos.close();
bis.close();
}
public static String getUUID(){//获取随机uuid
UUID u=UUID.randomUUID();
return u.toString().replace("-", "");
}
public static String makeFileInMaxSpace() {
/*
* 获取最大可用空间盘符,并在盘符下创建pics文件夹
*/
long max=0;
String rootpath=null;
File[] listFiles = File.listRoots();
for (File file2 : listFiles) {
if(max<file2.getUsableSpace()){
max=file2.getUsableSpace();
rootpath=file2.getAbsolutePath();
// System.out.println(file2.getAbsolutePath()+"可用空间:"+file2.getUsableSpace());
}
}
System.out.println();
File file=new File(rootpath+"pics");
if(!file.exists()){
file.mkdirs();
System.out.println("注意了!!!!!!!!!!!!!!!!");
System.out.println("创建了文件夹:"+rootpath+"pics");
}
// file.delete();
return file.getAbsolutePath();
}
}
ThreadDownloadTask 线程类:
public class ThreadDownloadTask implements Runnable {
    // source URL of the image, and the local file it is written to
    String imgSrc;
    String path;

    ThreadDownloadTask(String imgSrc, String path) {
        this.imgSrc = imgSrc;
        this.path = path;
    }

    /**
     * Downloads a single image. An I/O failure is reported to stderr but does
     * not propagate, so the executor's worker thread keeps running.
     */
    @Override
    public void run() {
        System.out.println("当前下载文件的线程为:" + Thread.currentThread().getName());
        try {
            URLFileDownloadUtils.download(imgSrc, path);// source URL -> destination file
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
BackHtmlThread 线程类:
public class BackHtmlThread implements Callable<String> {
    // URL whose page source this task fetches; callers read the result via Future.get()
    String webAddress = null;

    BackHtmlThread(String webAddress) {
        this.webAddress = webAddress;
    }

    /**
     * Fetches and returns the page source of {@code webAddress}.
     * Implements Callable so the result can be collected through a Future.
     */
    @Override
    public String call() throws Exception {
        System.out.println("当前获取html文件的线程为:" + Thread.currentThread().getName());
        return URLFileDownloadUtils.getHtml(webAddress);
    }
}