1. 网络爬虫
- 网络爬虫(又被称为网页蜘蛛,网络机器人),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
- 另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫.
2. 流程
- 网络爬虫是做什么的? 它的主要工作就是根据指定的 URL 地址去发送请求, 获得响应, 然后解析响应: 一方面从响应中查找出想要的数据, 另一方面从响应中解析出新的 URL 路径, 然后继续访问、继续解析, 继续查找需要的数据和继续解析出新的 URL 路径。
- 这就是网络爬虫主要干的工作. 下面是流程图:
3.代码实现
pom文件
<dependencies>
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<!-- 文件下载 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
</dependencies>
代码
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
public class Pa {
    /** Site being scraped; relative img/link paths are resolved against this. */
    private static final String BASE_URL = "http://www.ktbdqn.com/";

    /**
     * Downloads the raw HTML source of a web page.
     *
     * @param url      address of the page to fetch
     * @param encoding character encoding used to decode the response body
     * @return the page source with lines separated by '\n'; whatever was read
     *         so far (possibly empty) if an I/O error occurs
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder buffer = new StringBuilder();
        try {
            URLConnection uc = new URL(url).openConnection();
            // try-with-resources closes the reader (and underlying stream) even
            // on error; the original leaked the BufferedReader
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(uc.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            // best-effort, matching the original: report and return partial content
            e.printStackTrace();
        }
        return buffer.toString();
    }

    /**
     * Fetches a page and saves its HTML source to E:/cskt/cskt.html.
     *
     * @param url      page to download
     * @param encoding encoding used both to decode the page and to encode the file
     */
    public static void getJobInfo(String url, String encoding) {
        String html = getHtmlResourceByUrl(url, encoding);
        File fp = new File("E:/cskt/cskt.html");
        // make sure the target directory exists so FileOutputStream doesn't fail
        File dir = fp.getParentFile();
        if (dir != null) {
            dir.mkdirs();
        }
        try (OutputStream os = new FileOutputStream(fp)) {
            // BUG FIX: the original used html.getBytes() (platform default charset),
            // which corrupts the file when it differs from the page encoding.
            os.write(html.getBytes(encoding));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads every image ({@code <img src>}) and stylesheet ({@code <link href>})
     * referenced by the page at {@code uri} into E:/cskt/, preserving the site-relative
     * path as the file name.
     *
     * @param uri page whose resources are downloaded
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection or file-write error
     */
    public static void getImg(String uri) throws ClientProtocolException, IOException {
        // try-with-resources guarantees client/response are closed even when a
        // download throws (the original only closed them on the success path)
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            String content;
            try (CloseableHttpResponse response = httpclient.execute(new HttpGet(uri))) {
                // page body is GBK-encoded on this site
                content = EntityUtils.toString(response.getEntity(), "gbk");
            }
            Document doc = Jsoup.parse(content);

            // images: <img src="...">
            Elements images = doc.select("img");
            for (int i = 0; i < images.size(); i++) {
                downloadResource(httpclient, i + 1, images.get(i).attr("src"));
            }

            // stylesheets: <link href="...">
            Elements links = doc.select("link");
            for (int i = 0; i < links.size(); i++) {
                downloadResource(httpclient, i + 1, links.get(i).attr("href"));
            }
        }
    }

    /**
     * Fetches one site-relative resource and saves it under E:/cskt/.
     * The links on the page lack a scheme/host, so BASE_URL is prepended.
     */
    private static void downloadResource(CloseableHttpClient httpclient, int index,
                                         String relative) throws IOException {
        System.out.println(relative);
        String absolute = BASE_URL + relative;
        System.out.println(index + ":" + absolute);
        try (CloseableHttpResponse response = httpclient.execute(new HttpGet(absolute));
             InputStream inputStream = response.getEntity().getContent()) {
            // commons-io 2.5 FileUtils.copyToFile does NOT close the input stream,
            // so it is managed by try-with-resources above
            FileUtils.copyToFile(inputStream, new File("E://cskt//" + relative));
        }
    }

    public static void main(String[] args) throws ClientProtocolException, IOException {
        getJobInfo(BASE_URL, "utf-8");
        getImg(BASE_URL);
    }
}
实现效果