爬虫编写DownEngine

最新推荐文章于 2024-10-18 18:47:56 发布

一起学IT技能

最新推荐文章于 2024-10-18 18:47:56 发布

阅读量487

点赞数

分类专栏：一起写爬虫文章标签： html 爬虫数据

本文链接：https://blog.csdn.net/ququhouse/article/details/52080964

版权

一起写爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

编写DownEngine

downEngine主要负责下载HTML页面，以供解析引擎（parseEngine）解析。
下载页面的目的就是为了解析其中的内容，如果不是目标页，需要解析其中的link，然后放到未解析的队列里，如果是目标页面，则需要解析其中的元数据，执行持久化操作。

编写engine接口

/**
 * Download engine contract: fetches a page by URL and returns it as a string
 * so that it can be handed to a parsing stage.
 *
 * <p>The three overloads differ only in how much of the request the caller
 * wants to control. The layout of the {@code headers} array is not fixed by
 * this interface; each implementation defines how it interprets the entries.
 */
public interface DownEngine {

    /**
     * Downloads a page using the implementation's default settings.
     *
     * @param pageUrl address of the page to download
     * @return the downloaded page as text
     */
    String downPage(String pageUrl);

    /**
     * Downloads a page, sending caller-supplied HTTP headers with the request.
     *
     * @param pageUrl address of the page to download
     * @param headers HTTP headers to configure on the request
     * @return the downloaded page as text
     */
    String downPage(String pageUrl, String[] headers);

    /**
     * Downloads a page with caller-supplied HTTP headers and a switch that
     * says whether JavaScript should be loaded.
     *
     * @param pageUrl address of the page to download
     * @param headers HTTP headers to configure on the request
     * @param loadJs  {@code true} if JavaScript should be loaded
     * @return the downloaded page as text
     */
    String downPage(String pageUrl, String[] headers, boolean loadJs);
}

爬虫介绍中已经提到过几种操作HTTP的类库，根据自己的使用习惯，可以实现不同的DownEngine。

/**
 * {@code DownEngine} implementation that relies only on the JDK's
 * {@code HttpURLConnection}.
 *
 * <p>Pages are fetched with a plain HTTP request and decoded to a string.
 * JavaScript is never executed, so the {@code loadJs} flag is accepted for
 * interface compatibility and ignored.
 *
 * <p>Header convention used by this implementation: every element of the
 * {@code String[] headers} argument is one header line in the form
 * {@code "Name: Value"}. Elements that are {@code null}, have no colon, or
 * have an empty name are skipped.
 */
public class CommonDownEngine implements DownEngine {

    /** Charset used when the response does not declare a usable one. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Connect and read timeout, in milliseconds, used by the {@code downPage} overloads. */
    private static final int DEFAULT_TIMEOUT_MILLIS = 10000;

    /**
     * Fraction of gzip-enabled requests that actually advertise
     * {@code Accept-Encoding: gzip}; 1.0 means every such request does.
     */
    private static final double GZIP_RATIO = 1.0;

    /** Marker that precedes the charset name inside a Content-Type value. */
    private static final String CHARSET_KEY = "charset=";

    /**
     * Downloads a page without any extra request headers.
     *
     * @param pageUrl address of the page to download
     * @return the decoded page, or {@code null} if the download failed
     */
    @Override
    public String downPage(String pageUrl) {
        return downPage(pageUrl, (String[]) null);
    }

    /**
     * Downloads a page, sending the given request headers.
     *
     * @param pageUrl address of the page to download
     * @param headers header lines in {@code "Name: Value"} form, or {@code null}
     * @return the decoded page, or {@code null} if the download failed
     */
    @Override
    public String downPage(String pageUrl, String[] headers) {
        return downPage(pageUrl, headers, false);
    }

    /**
     * Downloads a page, sending the given request headers. The JDK classes
     * cannot run JavaScript, so {@code loadJs} has no effect here.
     *
     * @param pageUrl address of the page to download
     * @param headers header lines in {@code "Name: Value"} form, or {@code null}
     * @param loadJs  ignored by this implementation
     * @return the decoded page, or {@code null} if the download failed
     */
    @Override
    public String downPage(String pageUrl, String[] headers, boolean loadJs) {
        return getUrlString(pageUrl, DEFAULT_ENCODING, toHeaderPairs(headers), true, DEFAULT_TIMEOUT_MILLIS);
    }

    /**
     * Fetches a URL over HTTP(S) and returns the response body as a string.
     *
     * <p>The body is decoded with the charset named in the response's
     * {@code Content-Type} header. If the header is missing, names no charset,
     * or names one the JVM does not support, {@code defaultEncoding} is used;
     * if that is {@code null} or unsupported too, the platform default charset
     * is used.
     *
     * <p>Failures are reported on the standard streams and signalled to the
     * caller by a {@code null} return value; no I/O exception is propagated.
     *
     * @param urlString       address to fetch
     * @param defaultEncoding charset name to fall back on; may be {@code null}
     * @param headers         request headers as {@code {name, value}} rows; may be
     *                        {@code null}; rows that are {@code null}, shorter than
     *                        two elements, or have a {@code null} name are skipped
     * @param gzip            whether to request and transparently decode gzip content
     * @param timeout         connect and read timeout in milliseconds
     * @return the decoded response body, or {@code null} on any failure
     */
    public static final String getUrlString(String urlString, String defaultEncoding, String[][] headers,
            boolean gzip, int timeout) {
        InputStream in = null;
        HttpURLConnection con = null;
        try {
            URL url = new URL(urlString);
            Object connection = url.openConnection();
            if (!(connection instanceof HttpURLConnection)) {
                // file:, ftp:, jar: ... would otherwise fail with a ClassCastException.
                System.out.println(urlString + " is not an HTTP(S) URL");
                return null;
            }
            con = (HttpURLConnection) connection;
            con.setReadTimeout(timeout);
            con.setConnectTimeout(timeout);
            if (gzip && Math.random() < GZIP_RATIO) {
                con.setRequestProperty("Accept-Encoding", "gzip");
            }
            applyHeaders(con, headers);
            con.connect();

            String charset = resolveCharset(con.getHeaderField("Content-Type"), defaultEncoding);

            // getInputStream() throws for HTTP error statuses; the IOException
            // handler below then drains the error stream.
            in = new BufferedInputStream(con.getInputStream());
            String contentEncoding = con.getHeaderField("Content-Encoding");
            if (gzip && "gzip".equalsIgnoreCase(contentEncoding)) {
                System.out.println("gzipped");
                in = new GZIPInputStream(in);
            }

            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] buffer = new byte[4096];
            int read;
            while ((read = in.read(buffer)) >= 0) {
                body.write(buffer, 0, read);
            }
            return decode(body, charset, defaultEncoding);
        } catch (SocketTimeoutException e) {
            System.out.println(urlString + " timeout");
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
            drainErrorStream(con);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /** Converts {@code "Name: Value"} lines into {@code {name, value}} rows; unusable lines become {@code null} rows. */
    private static String[][] toHeaderPairs(String[] headers) {
        if (headers == null) {
            return null;
        }
        String[][] pairs = new String[headers.length][];
        for (int i = 0; i < headers.length; i++) {
            String line = headers[i];
            if (line == null) {
                continue;
            }
            int colon = line.indexOf(':');
            if (colon <= 0) {
                continue;
            }
            String name = line.substring(0, colon).trim();
            if (name.length() == 0) {
                continue;
            }
            pairs[i] = new String[] {name, line.substring(colon + 1).trim()};
        }
        return pairs;
    }

    /** Sets every well-formed {@code {name, value}} row as a request property on the connection. */
    private static void applyHeaders(HttpURLConnection con, String[][] headers) {
        if (headers == null) {
            return;
        }
        for (int i = 0; i < headers.length; i++) {
            String[] pair = headers[i];
            if (pair != null && pair.length >= 2 && pair[0] != null) {
                con.setRequestProperty(pair[0], pair[1]);
            }
        }
    }

    /** Extracts the charset name from a Content-Type value, or returns {@code defaultEncoding} if none is present. */
    private static String resolveCharset(String contentType, String defaultEncoding) {
        if (contentType == null) {
            return defaultEncoding;
        }
        int start = -1;
        for (int i = 0; i + CHARSET_KEY.length() <= contentType.length(); i++) {
            if (contentType.regionMatches(true, i, CHARSET_KEY, 0, CHARSET_KEY.length())) {
                start = i + CHARSET_KEY.length();
                break;
            }
        }
        if (start < 0) {
            return defaultEncoding;
        }
        String charset = contentType.substring(start);
        int end = charset.indexOf(';');
        if (end >= 0) {
            // Drop any parameters that follow the charset, e.g. "; boundary=...".
            charset = charset.substring(0, end);
        }
        charset = charset.replace("\"", "").replace("'", "").trim();
        return charset.length() == 0 ? defaultEncoding : charset;
    }

    /** Decodes the body with {@code charset}, then {@code defaultEncoding}, then the platform default charset. */
    private static String decode(ByteArrayOutputStream body, String charset, String defaultEncoding) {
        if (charset != null) {
            try {
                return body.toString(charset);
            } catch (UnsupportedEncodingException e) {
                System.out.println("UnsupportedEncodingException detected: " + e.getMessage());
            }
        }
        if (defaultEncoding != null && !defaultEncoding.equals(charset)) {
            try {
                return body.toString(defaultEncoding);
            } catch (UnsupportedEncodingException e) {
                System.out.println("UnsupportedEncodingException detected: " + e.getMessage());
            }
        }
        return body.toString();
    }

    /** Reads the connection's error stream to its end and closes it; does nothing if there is none. */
    private static void drainErrorStream(HttpURLConnection con) {
        if (con == null) {
            return;
        }
        InputStream err = con.getErrorStream();
        if (err == null) {
            return;
        }
        try {
            byte[] scratch = new byte[1024];
            while (err.read(scratch) >= 0) {
                // Discard: the content is not needed, only the stream must be consumed.
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                err.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}