图片爬虫程序 JAVA

最新推荐文章于 2022-04-14 11:19:55 发布

李意文

最新推荐文章于 2022-04-14 11:19:55 发布

阅读量904

点赞数

分类专栏：java代码 | 文章标签：爬虫、图片、java

本文链接：https://blog.csdn.net/u014698348/article/details/48847643

版权

java代码专栏收录该内容

12 篇文章 0 订阅

订阅专栏

package com.liyiwen.Crawler;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.SplittableRandom;
import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by dell on 2015/7/18.
 */

/**
 * A small multi-threaded image crawler.
 *
 * <p>Worker threads repeatedly take a page URL from a shared queue, download the page line by
 * line, collect every {@code <img ... src="http...">} tag found on it, and queue every
 * {@code <a href="...">} target for a later visit. {@link #main(String[])} lets the workers run
 * for a fixed amount of time, interrupts them, and writes the collected image tags to
 * {@code crawler.html} in the working directory.
 *
 * <p>Note on the regular expressions below: a backslash is an escape character both in regular
 * expressions and in Java string literals, so quotes inside the patterns are escaped for Java.
 *
 * <p>A single instance is shared by all worker threads; the only shared mutable state is the two
 * {@link LinkedBlockingQueue}s. Pages are not de-duplicated, so the same page may be visited
 * (and the same image tag recorded) more than once.
 */
public class ImgCrawler implements Runnable {
    /** Number of worker threads started by {@link #main(String[])}: one per available processor. */
    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();

    /** Upper bound on the number of pages a single worker processes before it stops on its own. */
    private static final int MAX_PAGES_PER_WORKER = 1000;

    /** How long {@link #main(String[])} lets the workers crawl before interrupting them. */
    private static final long CRAWL_DURATION_MS = 10000L;

    /** How long {@link #main(String[])} waits for each interrupted worker to finish. */
    private static final long JOIN_TIMEOUT_MS = 1000L;

    /** Connect/read timeout so a worker stuck on a slow server cannot block forever. */
    private static final int NETWORK_TIMEOUT_MS = 5000;

    /** Matches a whole {@code <img>} tag whose {@code src} attribute starts with "http". */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img .*?src=\"http.*?\".*?>");

    /** Captures the target of an {@code <a href="...">} attribute in group 1. */
    private static final Pattern HREF_PATTERN = Pattern.compile("<a href=\"(.+?)\"");

    /** Image tags found so far (the crawl result). */
    private final BlockingQueue<String> wantedUrls;

    /** Page URLs waiting to be crawled. */
    private final BlockingQueue<String> handledUrls;

    /**
     * Starts one worker per processor, lets them crawl for {@link #CRAWL_DURATION_MS}, interrupts
     * them, then prints the collected image tags and writes them to {@code crawler.html}.
     *
     * @param urls command-line arguments (unused)
     * @throws InterruptedException if the main thread is interrupted while sleeping or joining
     * @throws IOException if the result file cannot be written
     */
    public static void main(String[] urls) throws FileNotFoundException, InterruptedException, IOException, ExecutionException{
        ImgCrawler imgCrawler = new ImgCrawler();
        ArrayList<Thread> workers = new ArrayList<Thread>(THREAD_COUNT);
        for (int i = 0; i < THREAD_COUNT; ++i){
            Thread worker = new Thread(imgCrawler);
            worker.start();
            workers.add(worker);
        }

        Thread.sleep(CRAWL_DURATION_MS);

        for (Thread worker : workers){
            worker.interrupt();
        }
        // Give the workers a bounded chance to wind down so the result is not still changing
        // while it is being reported.
        for (Thread worker : workers){
            worker.join(JOIN_TIMEOUT_MS);
        }

        // Take one snapshot so the console output and the file contain the same entries even if
        // a worker is still finishing a page.
        ArrayList<String> found = new ArrayList<String>(imgCrawler.getWantedUrls());
        System.out.println(found.toString() + "李意文");
        printToFile(found);
        System.out.println("succeed");
    }

    /**
     * Writes each string on its own CRLF-terminated line to {@code crawler.html} in the working
     * directory, encoded as UTF-8. An existing file is overwritten.
     *
     * @param strs the lines to write
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException if writing fails
     */
    public static void printToFile(Collection<String> strs) throws FileNotFoundException, IOException{
        File file = new File("crawler.html");
        // FileOutputStream creates the file when needed; try-with-resources flushes and closes it
        // even when a write fails.
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"))) {
            for (String str : strs){
                writer.write(str);
                writer.write("\r\n");
            }
        }
    }

    /**
     * Returns the live queue of image tags collected so far.
     *
     * @return the shared result queue (not a copy)
     */
    public BlockingQueue<String> getWantedUrls() {
        return wantedUrls;
    }

    /** Creates a crawler whose work queue is seeded with a single start page. */
    public ImgCrawler(){
        wantedUrls = new LinkedBlockingQueue<String>();
        handledUrls = new LinkedBlockingQueue<String>();
        handledUrls.add("http://www.dedeshe.com/html/article/2015-7/index26785.html");
    }

    /**
     * Worker loop: takes page URLs from the work queue and crawls them until the thread is
     * interrupted or {@link #MAX_PAGES_PER_WORKER} pages have been processed.
     */
    @Override
    public void run() {
        try{
            for (int i = 0; i < MAX_PAGES_PER_WORKER && !Thread.currentThread().isInterrupted(); i++){
                crawl(handledUrls.take());
            }
        }
        catch (InterruptedException e){
            // Normal shutdown path: restore the flag so code further up the stack can see it.
            Thread.currentThread().interrupt();
        }
        catch (RuntimeException e){
            System.out.println(e.getMessage());
        }
    }

    /**
     * Downloads one page and feeds both queues: image tags go to the result queue, link targets
     * go to the work queue. Malformed URLs and I/O failures are reported on stdout and the page
     * is skipped.
     *
     * @param url the page to download
     * @throws InterruptedException if the thread is interrupted while adding to a queue
     */
    private void crawl(String url) throws InterruptedException{
        try {
            URL pageUrl = new URL(url);
            System.out.println("根目录： " + pageUrl.getProtocol() + "://" + pageUrl.getAuthority());

            URLConnection connection = pageUrl.openConnection();
            connection.setConnectTimeout(NETWORK_TIMEOUT_MS);
            connection.setReadTimeout(NETWORK_TIMEOUT_MS);

            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    for (String imgTag : matchWantedUrl(line)) {
                        wantedUrls.put(imgTag);
                    }
                    for (String href : matchHref(line, pageUrl)) {
                        handledUrls.put(href);
                    }
                }
            }
        }
        catch (MalformedURLException e){
            System.out.println(e.getMessage());
            System.out.println("url 错误");
        }
        catch (IOException e){
            System.out.println("不能打开连接");
            System.out.println(e.getMessage());
        }
    }

    /**
     * Finds every {@code <img>} tag with an absolute {@code src} in one line of HTML.
     *
     * @param line one line of page source
     * @return the complete matched tags (never {@code null}, possibly empty)
     */
    private ArrayList<String> matchWantedUrl(String line){
        ArrayList<String> imgTags = new ArrayList<String>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(line);
        while (matcher.find()){
            imgTags.add(matcher.group(0));
        }
        return imgTags;
    }

    /**
     * Finds every {@code <a href="...">} target in one line of HTML and resolves it against the
     * page it was found on. Targets that are not http/https (or cannot be parsed) are dropped.
     *
     * @param line one line of page source
     * @param pageUrl the URL of the page being read, used as the base for relative links
     * @return absolute link targets (never {@code null}, possibly empty)
     */
    private ArrayList<String> matchHref(String line, URL pageUrl){
        ArrayList<String> hrefs = new ArrayList<String>();
        Matcher matcher = HREF_PATTERN.matcher(line);
        while (matcher.find()){
            String resolved = resolveLink(pageUrl, matcher.group(1));
            if (resolved != null){
                hrefs.add(resolved);
                System.out.println(resolved);
            }
        }
        return hrefs;
    }

    /**
     * Resolves a possibly relative link against its page.
     *
     * @param pageUrl the page the link appeared on
     * @param href the raw attribute value
     * @return the absolute http/https URL, or {@code null} if the link is not crawlable
     */
    private static String resolveLink(URL pageUrl, String href){
        try {
            URL target = new URL(pageUrl, href);
            String protocol = target.getProtocol();
            if ("http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol)){
                return target.toString();
            }
            return null;
        }
        catch (MalformedURLException ignored){
            // Unsupported scheme such as "javascript:" - nothing to crawl.
            return null;
        }
    }
}

李意文

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
图片爬虫程序 JAVA

package com.liyiwen.Crawler;import java.io.*;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.Collection;impo
复制链接

扫一扫

专栏目录