爬取中国各银行对应的 SWIFT Code 数据

SWIFT/BIC Codes for Banks in China

主要是把这个网站上的数据抓取下来。列表数据抓取的大体结构是通义灵码写的,我改到能跑通,并用 jsoup 来解析网页。按 F12 查看页面结构,本质上就是获取 HTML 再去解析。

结果一开始把详情页(里面有地址信息)给忘了,而代码部署到 Linux 上又跑不起来,WebFlux 不兼容,我也不打算去解决兼容性问题了,反正这部分数据万年不变。于是改为从数据库里遍历出已入库的数据,用 SWIFT 代码拼接请求 URL,拿到详情页里的地址信息。总之做个记录:列表数据用 WebFlux 抓取,详情页用 OkHttp3 抓取。

  <jsoup.version>1.13.1</jsoup.version>
  <springboot.version>2.6.13</springboot.version>

 <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-webflux</artifactId>
                <version>${springboot.version}</version>
            </dependency>

  <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>${jsoup.version}</version>
            </dependency>

(本地跑没问题,服务器上有问题)


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

import java.util.List;
import java.util.stream.Collectors;
@Service
public class ScraperService {

    /** Mapper used to persist the scraped rows; injected by Spring. */
    @Autowired
    private ChinabankSwiftCodeMapper chinabankSwiftCodeMapper;

    /** HTTP client used to download the listing pages. */
    private final WebClient webClient;

    /** Creates the service with a {@link WebClient} built from default settings. */
    public ScraperService() {
        webClient = WebClient.builder().build();
    }

    /**
     * Fetches one listing page and parses its table rows.
     *
     * <p>The call sleeps for one second on the calling thread before building the request, which
     * spaces out successive page requests. The page number is appended as a {@code page} query
     * parameter, using {@code &} when {@code baseUrl} already contains a {@code ?} and {@code ?}
     * otherwise.
     *
     * @param baseUrl listing URL without the {@code page} parameter
     * @param page page number to request
     * @return a {@link Mono} emitting the rows parsed from the page; any failure in the request or
     *     in parsing is signalled as a {@link RuntimeException} wrapping the original cause
     * @throws RuntimeException if the thread is interrupted during the one-second pause
     */
    public Mono<List<SwiftCodeVo>> fetchPage(String baseUrl, int page) {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        System.out.println("处理数据,页数: " + page);
        String separator = baseUrl.contains("?") ? "&" : "?";
        String urlWithPage = baseUrl + separator + "page=" + page;
        return webClient.get()
                .uri(urlWithPage)
                .retrieve()
                .bodyToMono(String.class)
                .map(this::parseTableData)
                // onErrorMap translates the error directly; no need to resume with Mono.error(...).
                .onErrorMap(e -> new RuntimeException("Error fetching page " + page, e));
    }

    /**
     * Parses the listing table out of a page's HTML and stores the rows through the mapper.
     *
     * <p>Each {@code tbody} row of the {@code table.swift-country} element is read by column
     * position (id, name, city, branch, swift). The parsed rows are copied to
     * {@code ChinabankSwiftCode} entities and inserted in one batch.
     *
     * @param html raw HTML of a listing page
     * @return the parsed rows; empty when the page has no matching table or the table has no rows,
     *     in which case nothing is inserted
     */
    private List<SwiftCodeVo> parseTableData(String html) {
        Document doc = Jsoup.parse(html);
        // Replace the selector with the actual table class name or ID if the page layout differs.
        Element table = doc.selectFirst("table.swift-country");
        if (table == null) {
            // selectFirst returns null when no element matches the selector.
            return new java.util.ArrayList<>();
        }

        List<SwiftCodeVo> dataRows = table.select("tbody tr").stream()
                .map(this::toSwiftCodeVo)
                .collect(Collectors.toList());
        if (dataRows.isEmpty()) {
            // Nothing to persist; skip the mapper call instead of sending an empty batch.
            return dataRows;
        }

        List<ChinabankSwiftCode> entities = BeanCopyUtils.copyList(dataRows, ChinabankSwiftCode.class);
        chinabankSwiftCodeMapper.insertBatch(entities);
        return dataRows;
    }

    /** Maps one table row to a {@code SwiftCodeVo}, reading the cells by column position. */
    private SwiftCodeVo toSwiftCodeVo(Element row) {
        SwiftCodeVo swiftCodeVo = new SwiftCodeVo();
        swiftCodeVo.setId(row.select("td:nth-child(1)").text());
        swiftCodeVo.setName(row.select("td:nth-child(2)").text());
        swiftCodeVo.setCity(row.select("td:nth-child(3)").text());
        swiftCodeVo.setBranch(row.select("td:nth-child(4)").text());
        swiftCodeVo.setSwift(row.select("td:nth-child(5)").text());
        // TODO: look up the detail page by swift code.
        return swiftCodeVo;
    }

    // NOTE: an unfinished method declaration was left here. It is commented out so the class
    // parses; the detail-page address lookup exists as getAddress(...) in the OkHttp version
    // of this service.
    // public String getAdress

//    private List<String> parseTableData(String html) {
//        Document doc = Jsoup.parse(html);
//        Element table = doc.selectFirst("table.swift-country");  // 替换为实际的表格类名或ID
//        String header = table.select("thead tr th").stream()
//                .map(Element::text)
//                .collect(Collectors.joining("\t"));  // 分隔符可以按需更改
//        Elements rows = table.select("tbody tr");
//
//        List<String> dataRows = rows.stream()
//                .map(row -> row.select("td").stream()
//                        .map(Element::text)
//                        .collect(Collectors.joining("\t")))  // 分隔符可以按需更改
//                .collect(Collectors.toList());
//
//        List<String> result = new ArrayList<>();
//        result.add(header);
//        result.addAll(dataRows);
//        return result;
//    }


    /**
     * Fetches pages {@code 1..totalPages} and prints each page's rows as it arrives.
     *
     * <p>The method blocks until every page has been fetched, so callers that rely only on the
     * side effects (printing, and the inserts done while parsing) behave as before. Instead of
     * {@code null}, it now returns a {@link Flux} over the pages that were already fetched.
     * Pages are merged with {@code flatMap}, so they are emitted in arrival order, which is not
     * necessarily page order.
     *
     * @param baseUrl listing URL without the {@code page} parameter
     * @param totalPages number of pages to fetch, starting at page 1
     * @return a non-null {@link Flux} emitting the rows of each fetched page
     */
    public Flux<List<SwiftCodeVo>> fetchAllPages(String baseUrl, int totalPages) {
        List<List<SwiftCodeVo>> pages = Flux.range(1, totalPages)
                .flatMap(page -> fetchPage(baseUrl, page))
                .doOnNext(System.out::println)
                .collectList()
                .block();
        return Flux.fromIterable(pages);
    }

//    public Flux<List<SwiftCodeVo>> fetchAllPages1(String baseUrl, int totalPages) {
//        return (Flux<List<SwiftCodeVo>>) Flux.range(1, totalPages)
//                .flatMap(page -> fetchPage(baseUrl, page)).subscribe(swiftcode -> {
//                    for (SwiftCodeVo swiftCodeVo : swiftcode)
//                    {
//                        System.out.println(swiftCodeVo);
//
//                    }
//                }); // 并发限制,根据实际情况调整
//
//    }
}

以上就是 WebFlux 版本的完整代码。

然后我删除了 WebFlux。本来想用自带的客户端,结果打印出一坨东西,也没具体研究怎么用,最后改成了 OkHttp3。代码记录如下。


import com.xhj.framework.web.common.StringUtils;
import com.xhj.server.sys.dao.mapper.ChinabankSwiftCodeMapper;
import com.xhj.server.sys.model.entity.ChinabankSwiftCode;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;

@Service
public class ScraperService {

    /** Mapper used to read and update the stored SWIFT code records; injected by Spring. */
    @Autowired
    private ChinabankSwiftCodeMapper chinabankSwiftCodeMapper;

    /**
     * Updates the address of every record in the table from its detail page.
     *
     * <p>For each record returned by the mapper, the detail page is requested with the record's
     * swift code (passed through {@code StringUtils.lowerCase}) appended to the base URL, the
     * address is stored on the record, and the record is updated by id. The loop pauses one
     * second after each record to space out the requests.
     *
     * @throws RuntimeException if a detail page cannot be fetched, or if the thread is
     *     interrupted while pausing; processing stops at the first failure
     */
    public void updateAddress() {
        List<ChinabankSwiftCode> swiftCodes = chinabankSwiftCodeMapper.findChinabankSwiftCodeList();

        for (ChinabankSwiftCode swiftObj : swiftCodes) {
            try {
                String address = getAddress(
                        "https://www.theswiftcodes.com/china/", StringUtils.lowerCase(swiftObj.getSwift()));
                swiftObj.setAddress(address);
                chinabankSwiftCodeMapper.updateById(swiftObj);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }

            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so code further up the stack can still observe it.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }
    }
//    private String getAddress(String baseUrl, String swift) throws IOException {
//        String url = baseUrl + swift;
//        CloseableHttpResponse response = null;
//        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
//            HttpGet httpGet = new HttpGet(url);
//            httpGet.setConfig(RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build());
//            response = httpClient.execute(httpGet);
//            if (response.getStatusLine().getStatusCode() == 200) {
//                String htmlContent = EntityUtils.toString(response.getEntity());
//                Document doc = Jsoup.parse(htmlContent);
//                Elements rows = doc.select("table.modern tbody tr");
//                Element thirdRow = rows.get(2);
//                String tdValue = thirdRow.select("td").text();
                // Element table = doc.selectFirst("table.modern");  // replace with the actual table class name or ID
//                String address = tdValue;
//                System.out.println("地址:++++++++"+swift+"++++++++++++++++++" + address);
//                httpClient.close();
//                return address;
//            } else {
//                throw new IOException("Failed to fetch detailed page " + url);
//            }
//
//        } finally {
//            if (response != null) {
//                response.close();
//            }
//        }
//    }
    /**
     * HTTP client used for the detail-page requests, with a 20-second connect timeout and a
     * 1000-second read timeout. Declared {@code static final} to match its constant-style name
     * and so that a single client instance is reused.
     *
     * <p>NOTE(review): a 1000-second read timeout looks unusually long; confirm it is intended.
     */
    private static final OkHttpClient OKHTTP_CLIENT = new OkHttpClient.Builder()
            .connectTimeout(20, TimeUnit.SECONDS)
            .readTimeout(1000, TimeUnit.SECONDS)
            .build();

    /**
     * Fetches the detail page for a swift code and extracts the address text.
     *
     * <p>The URL is {@code baseUrl + swift}. The address is taken from the cells of the third
     * row of {@code table.modern tbody}.
     *
     * @param baseUrl detail-page URL prefix, expected to end with {@code /}
     * @param swift swift code appended to {@code baseUrl}
     * @return the text of the third row's cells
     * @throws IOException if the request fails, the response is not successful, or the page does
     *     not contain the expected third table row
     */
    private String getAddress(String baseUrl, String swift) throws IOException {
        String url = baseUrl + swift;
        Request request = new Request.Builder()
                .url(url)
                .build();

        // try-with-resources closes the response on every exit path.
        try (Response response = OKHTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new IOException("请求失败 " + url);
            }

            String htmlContent = response.body().string();
            Document doc = Jsoup.parse(htmlContent);
            Elements rows = doc.select("table.modern tbody tr");

            // The address sits in the third row; fail with a checked, descriptive error instead of
            // an IndexOutOfBoundsException when the page has fewer rows than expected.
            final int addressRowIndex = 2;
            if (rows.size() <= addressRowIndex) {
                throw new IOException("Address row not found on detail page " + url);
            }

            Element addressRow = rows.get(addressRowIndex);
            String address = addressRow.select("td").text();
            System.out.println("地址:++++++++" + swift + "++++++++++++++++++" + address);
            return address;
        }
    }
//    public List<SwiftCodeVo> fetchPage(String baseUrl, int page) throws IOException {
//        System.out.println("处理数据,页数:" + page);
//        String urlWithPage = baseUrl.contains("?") ? baseUrl + "&page=" + page : baseUrl + "?page=" + page;
//        CloseableHttpClient httpClient = HttpClients.createDefault();
//        HttpGet httpGet = new HttpGet(urlWithPage);
//        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build();
//        httpGet.setConfig(requestConfig);
//
//        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
//            if (response.getStatusLine().getStatusCode() == 200) {
//                String htmlContent = EntityUtils.toString(response.getEntity());
//                return parseTableData(baseUrl,htmlContent);
//            } else {
//                throw new IOException("Failed to fetch page " + page);
//            }
//        } finally {
//            httpClient.close();
//        }
//    }
//
//    private List<SwiftCodeVo> parseTableData(String baseUrl,String html) throws IOException {
//        Document doc = Jsoup.parse(html);
//        Element table = doc.selectFirst("table.swift-country");  // 替换为实际的表格类名或ID
//        Elements rows = table.select("tbody tr");
//
//        List<SwiftCodeVo> dataRows = new ArrayList<>();
//        for (Element row : rows) {
//            SwiftCodeVo swiftCodeVo = new SwiftCodeVo();
//            swiftCodeVo.setId(row.select("td:nth-child(1)").text());
//            swiftCodeVo.setName(row.select("td:nth-child(2)").text());
//            swiftCodeVo.setCity(row.select("td:nth-child(3)").text());
//            swiftCodeVo.setBranch(row.select("td:nth-child(4)").text());
//            swiftCodeVo.setSwift(row.select("td:nth-child(5)").text());
//            swiftCodeVo.setAddress(getAddress(baseUrl, swiftCodeVo.getSwift()));
//            dataRows.add(swiftCodeVo);
//        }
//        return dataRows;
//    }
//
//
//    public List<List<SwiftCodeVo>> fetchAllPages(String baseUrl, int totalPages) throws IOException {
//        List<List<SwiftCodeVo>> allPagesData = new ArrayList<>();
//        for (int i = 1; i <= totalPages; i++) {
//            List<SwiftCodeVo> pageData = fetchPage(baseUrl, i);
//            allPagesData.add(pageData);
//        }
//        return allPagesData;
//    }
}
       <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>${jsoup.version}</version>
            </dependency>
            <!-- 在你的pom.xml中添加这个依赖 -->
            <dependency>
                <groupId>com.squareup.okhttp3</groupId>
                <artifactId>okhttp</artifactId>
                <version>4.12.0</version>
            </dependency>

            <dependency>
                <groupId>org.jetbrains.kotlin</groupId>
                <artifactId>kotlin-stdlib-jdk8</artifactId>
                <version>1.5.21</version>
            </dependency>

好了 开始把工程里这部分代码删掉。webflux好用

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值