Java使用Jsoup实现简单爬虫

1、在IDEA中创建一个maven项目

(此处原为配图,应为在 IDEA 中新建 Maven 项目的截图,图片未能保留。)

2、导入三个maven包

(此处原为配图,应为 pom.xml 中依赖配置的截图,图片未能保留。)

3、完整爬虫代码

package jsoup;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.sql.*;


import java.io.*;

/**
 * Minimal Jsoup-based crawler for the office-building listings on
 * {@code sz.diandianzu.com}.
 *
 * <p>For every listing page it collects the {@code data-id} of each building,
 * opens the building's detail page, appends "id + name" to a local text file
 * and downloads the first carousel image, naming the file after the id.
 */
public class pchong {
    /** Timeout, in milliseconds, applied to every Jsoup page request. */
    private static final int PAGE_TIMEOUT_MS = 500000;

    /** Connect/read timeout, in milliseconds, used when downloading an image. */
    private static final int IMAGE_TIMEOUT_MS = 10 * 1000;

    /**
     * Downloads one image and stores it as {@code <filePath>/<dataID>.jpg}.
     *
     * <p>Failures are reported with a stack trace and otherwise ignored, so a
     * single broken image never stops the caller.
     *
     * @param filePath directory to store the image in; created if missing
     * @param imgUrl   absolute URL of the image
     * @param dataID   building id, used as the file name (without extension)
     */
    public static void downImages(String filePath, String imgUrl, String dataID) {
        // Create the target directory first if it does not exist yet.
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // Output path: the image is named after the building id.
        File file = new File(filePath + File.separator + dataID + ".jpg");

        try {
            URL url = new URL(imgUrl);
            URLConnection connection = url.openConnection();
            connection.setConnectTimeout(IMAGE_TIMEOUT_MS);
            // Without a read timeout a stalled server would block the crawl forever.
            connection.setReadTimeout(IMAGE_TIMEOUT_MS);

            // try-with-resources closes both streams even when the copy fails midway.
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the detail page of one building, records its name and downloads
     * its first carousel image (if the page has one).
     *
     * @param dataId building id taken from the listing page
     * @param w      writer that receives one "id + name" line per building
     * @return the building name read from the detail page
     * @throws IOException if the detail page cannot be fetched or the line cannot be written
     */
    private static String crawlBuilding(String dataId, Writer w) throws IOException {
        System.out.print("写字楼id:" + dataId + "  ");
        String newUrl = "https://sz.diandianzu.com/listing/detail-i" + dataId + ".html";
        Document newDoc = Jsoup.connect(newUrl)
                .timeout(PAGE_TIMEOUT_MS)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
                .get();
        String buildingName = newDoc.getElementsByClass("top-title").text();
        System.out.println(buildingName);
        w.write("写字楼id:" + dataId + "  " + buildingName + "\r\n");

        // Image part: only the first <img> inside the "swiper-slide" elements is downloaded.
        Element img = newDoc.getElementsByClass("swiper-slide").select("img").first();
        if (img == null) {
            // first() returns null when nothing matched; skip instead of aborting the whole crawl.
            System.out.println("未检测到图片,跳过下载");
            return buildingName;
        }
        System.out.println("共检测到下列图片URL:");
        System.out.println("开始下载");
        // The "abs:" prefix asks Jsoup for the absolute URL of the attribute.
        String imgSrc = img.attr("abs:src");
        System.out.println(imgSrc);
        pchong.downImages("G:/爬虫/img1", imgSrc, dataId);
        System.out.println("下载完成");
        return buildingName;
    }

    /**
     * Entry point: opens the (currently optional) MySQL connection, then walks
     * the listing pages {@code p1, p2, ...} until a page contains no buildings.
     *
     * @param args unused
     * @throws IOException declared for compatibility; crawl errors are caught and printed
     */
    public static void main(String[] args) throws IOException {
        Connection sqlcon = null;
        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            System.out.println("找不到驱动程序类,加载驱动失败");
            e.printStackTrace();
        }
        String docurl = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2B8";
        String username = "root";
        String password = "123456";
        try {
            sqlcon = DriverManager.getConnection(docurl, username, password);
        } catch (SQLException e) {
            // The crawl itself does not need the database, so keep going without it.
            System.out.println("数据库连接失败");
            e.printStackTrace();
        }

        long startTime = System.currentTimeMillis();
        int pageNum = 0;
        // try-with-resources guarantees the text file is flushed and closed on every exit path.
        try (Writer w = new FileWriter("G:/爬虫/pcongtext.txt", true)) {
            while (true) {
                pageNum++;

                String url = "https://sz.diandianzu.com/listing/p" + pageNum;
                Document doc = Jsoup.connect(url)
                        .timeout(PAGE_TIMEOUT_MS)
                        .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31")
                        .get();
                Elements dataIdList = doc.getElementsByClass("list-main").select("[data-id]");
                if (dataIdList.isEmpty()) {
                    // A page without any building marks the end of the listing.
                    break;
                }
                for (Element dataIdElement : dataIdList) {
                    String dataId = dataIdElement.attr("data-id");
                    String buildingName = crawlBuilding(dataId, w);

                    /*
                     * Write to the database (disabled in the original article).
                     * NOTE: sqlcon is null when the connection failed; check it before enabling.
                     *
                    String sql = "INSERT INTO building(id,name) VALUES(?,?)";
                    PreparedStatement pstm = sqlcon.prepareStatement(sql);
                    pstm.setString(1,dataId);
                    pstm.setString(2,buildingName);
                    int rows = pstm.executeUpdate();
                    if (rows > 0) {
                        System.out.println("successfully");
                    }
                    */
                }
            }
            long endTime = System.currentTimeMillis();
            System.out.println("共访问网页数量:" + pageNum);
            System.out.println("共耗时:" + (endTime - startTime) / 1000 + "s");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the JDBC connection that was opened above.
            if (sqlcon != null) {
                try {
                    sqlcon.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

4、GitHub地址

Small_Pang:https://github.com/BFFat/Jsoup

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值