jsoup爬虫代码

1、添加依赖

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>
<dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
</dependency>

2、具体实现

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.*;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Reads search terms from a text file (one per line), queries Baidu for each term,
 * extracts the text of the {@code nums_text} element from the result page, and
 * overwrites the same file with the extracted figures, one per line.
 */
@SpringBootTest
class SpiderTests {

    /** Search URL up to and including the {@code wd=} parameter that carries the search term. */
    private static final String SEARCH_URL_PREFIX =
            "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=62095104_41_oem_dg&wd=";

    /** Fixed query parameters appended after the search term. */
    private static final String SEARCH_URL_SUFFIX =
            "&oq=%25E6%259E%2597%25E6%25BD%2598%25E6%25AD%25A6&rsv_pq=c04cd2e0002906e5&rsv_t=0a47IpZ7ERpQPcIxllylZwrPOhgeiYs2DLK5Zm%2BB%2Ffe7BOM5ioHXJqKjvKdDPm8KSHdlWt6w%2BkJ9&rqlang=cn&rsv_dl=tb&rsv_enter=0&rsv_btype=t&inputT=2336&rsv_sug3=1582&rsv_n=2&rsv_sug1=1407&rsv_sug7=100&rsv_sug4=2680";

    /** Value of the {@code class} attribute on the element that holds the result figure. */
    private static final String RESULT_COUNT_CLASS = "nums_text";

    /**
     * Number of leading characters dropped from the element text before the figure starts.
     * NOTE(review): presumably the length of the label printed in front of the number;
     * confirm against a live result page.
     */
    private static final int COUNT_PREFIX_LENGTH = 11;

    /**
     * Runs the lookup against a fixed file on the local desktop.
     *
     * @throws IOException if the file cannot be read or written, or a lookup fails
     */
    @Test
    void contextLoads() throws IOException {
        File nameFile = new File("C:\\Users\\86139\\Desktop\\姓名 - 副本.txt");
        getData(nameFile);
    }

    /**
     * Replaces the contents of {@code txtFile} with one extracted figure per non-empty input line.
     *
     * <p>The file is read completely and closed before it is reopened for writing, so a
     * failed lookup leaves the input untouched. Empty lines are skipped and therefore
     * produce no output line.
     *
     * @param txtFile file holding one search term per line; read and written as UTF-8
     *                (re-save the file as UTF-8 if it was created in another encoding)
     * @throws IOException if reading, fetching, parsing, or writing fails
     */
    private void getData(File txtFile) throws IOException {
        Map<String, String> headers = browserHeaders();
        List<String> counts = new ArrayList<>();

        // Read phase: resolve every term before touching the file for output.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                counts.add(fetchResultCount(line, headers));
            }
        }

        // Write phase: lines are separated, not terminated, so there is no trailing newline.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(txtFile), StandardCharsets.UTF_8))) {
            writer.write(String.join(System.lineSeparator(), counts));
        }
    }

    /**
     * Fetches the result page for one search term and returns the extracted figure.
     *
     * @param keyword search term; percent-encoded before being placed in the URL
     * @param headers request headers sent with the query
     * @return the figure with thousands separators removed
     * @throws IOException if the request fails or the page lacks the expected element
     */
    private static String fetchResultCount(String keyword, Map<String, String> headers) throws IOException {
        // Encode the term so characters such as '&', '#', '%' or spaces cannot break the query string.
        String encodedKeyword = URLEncoder.encode(keyword, StandardCharsets.UTF_8.name());
        String url = SEARCH_URL_PREFIX + encodedKeyword + SEARCH_URL_SUFFIX;

        Document doc = Jsoup.connect(url).headers(headers).get();

        Element countElement = doc.getElementsByAttributeValue("class", RESULT_COUNT_CLASS).first();
        if (countElement == null) {
            throw new IOException(
                    "No element with class \"" + RESULT_COUNT_CLASS + "\" in the response for: " + keyword);
        }
        return parseResultCount(countElement.text());
    }

    /**
     * Cuts the figure out of the element text: drops the first {@link #COUNT_PREFIX_LENGTH}
     * characters and the last character, then removes commas.
     *
     * @param text text of the result-count element
     * @return the remaining characters without commas
     * @throws IOException if the text is too short to contain the prefix and the trailing character
     */
    private static String parseResultCount(String text) throws IOException {
        if (text.length() <= COUNT_PREFIX_LENGTH) {
            throw new IOException("Unexpected result-count text: \"" + text + "\"");
        }
        return text.substring(COUNT_PREFIX_LENGTH, text.length() - 1).replace(",", "");
    }

    /**
     * Builds the request headers sent with every query.
     *
     * @return a new mutable map of header names to values
     */
    private static Map<String, String> browserHeaders() {
        Map<String, String> headers = new HashMap<>();
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // NOTE(review): this advertises deflate and br; confirm the jsoup version in use can
        // decode every encoding listed here, otherwise the response body may be unparseable.
        headers.put("Accept-Encoding", "gzip, deflate, br");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        headers.put("Cache-Control", "max-age=0");
        headers.put("Connection", "keep-alive");
        headers.put("sec-ch-ua", "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
        return headers;
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值