SWIFT/BIC Codes for Banks in China
主要是把这个网站上的数据抓取下来。列表数据抓取的大体结构是通义灵码写的,我改到能跑通,并用 jsoup 来解析网页。按 F12 查看页面结构,本质上就是获取 HTML 再去解析。
结果列表抓完才发现漏了详情页;而且部署到 Linux 后不行,WebFlux 不兼容,也不打算去解决兼容性问题了,反正这部分数据万年不变。于是改为从数据库遍历出数据,再用 swift 码拼接请求 URL,拿到详情页里的地址信息。具体网址记不清了,总之在此做个记录。列表数据用 WebFlux,详情页用 okhttp3。
<jsoup.version>1.13.1</jsoup.version>
<springboot.version>2.6.13</springboot.version>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-webflux</artifactId>
<version>${springboot.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
(本地跑没问题,服务器上有问题)
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import java.util.List;
import java.util.stream.Collectors;
@Service
public class ScraperService {
// HTTP client used by fetchPage to download list pages; created once in the constructor.
private final WebClient webClient;
// Mapper used by parseTableData to batch-insert the scraped rows; field-injected by Spring.
@Autowired
private ChinabankSwiftCodeMapper chinabankSwiftCodeMapper;
/**
 * Creates the service with a WebClient obtained from {@code WebClient.create()}.
 */
public ScraperService() {
this.webClient = WebClient.create();
}
/**
 * Fetches one page of the SWIFT-code list and parses it into rows.
 *
 * <p>The calling thread sleeps for one second before the request is assembled, so
 * successive calls made from the same thread are spaced at least one second apart.
 * Parsing is delegated to {@code parseTableData}, which also persists the rows.
 *
 * @param baseUrl list URL; the page number is appended as a {@code page} query parameter
 *                (using {@code &} when the URL already contains a {@code ?})
 * @param page    page number to request
 * @return a Mono emitting the parsed rows; any failure is surfaced as a
 *         {@link RuntimeException} whose cause is the original error
 * @throws RuntimeException if the thread is interrupted during the pause
 */
public Mono<List<SwiftCodeVo>> fetchPage(String baseUrl, int page) {
    try {
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
    System.out.println("处理数据,页数: " + page);
    String separator = baseUrl.contains("?") ? "&" : "?";
    String urlWithPage = baseUrl + separator + "page=" + page;
    return webClient.get()
            .uri(urlWithPage)
            .retrieve()
            .bodyToMono(String.class)
            .map(this::parseTableData)
            // onErrorMap is the direct form of onErrorResume(e -> Mono.error(...)).
            .onErrorMap(e -> new RuntimeException("Error fetching page " + page, e));
}
/**
 * Parses the list-page HTML into {@code SwiftCodeVo} rows and batch-inserts them.
 *
 * <p>Rows are read from {@code table.swift-country tbody tr}; the first five cells of
 * each row map to id, name, city, branch and swift, in that order.
 *
 * @param html raw HTML of one list page
 * @return the parsed rows (possibly empty); the same rows are inserted through
 *         {@code chinabankSwiftCodeMapper} when there is at least one
 * @throws IllegalStateException if the page contains no {@code table.swift-country}
 */
private List<SwiftCodeVo> parseTableData(String html) {
    Document doc = Jsoup.parse(html);
    // The selector must match the actual class name or ID of the list table.
    Element table = doc.selectFirst("table.swift-country");
    if (table == null) {
        // selectFirst returns null on no match; fail with a clear message instead of an NPE.
        throw new IllegalStateException("table.swift-country not found in page HTML");
    }
    List<SwiftCodeVo> dataRows = table.select("tbody tr").stream()
            .map(this::toSwiftCodeVo)
            .collect(Collectors.toList());
    if (dataRows.isEmpty()) {
        // Nothing to persist; avoid issuing a batch insert with an empty list.
        return dataRows;
    }
    List<ChinabankSwiftCode> chinabankSwiftCodeList = BeanCopyUtils.copyList(dataRows, ChinabankSwiftCode.class);
    chinabankSwiftCodeMapper.insertBatch(chinabankSwiftCodeList);
    return dataRows;
}

/** Maps one table row to a {@code SwiftCodeVo} using the text of its first five cells. */
private SwiftCodeVo toSwiftCodeVo(Element row) {
    SwiftCodeVo swiftCodeVo = new SwiftCodeVo();
    swiftCodeVo.setId(row.select("td:nth-child(1)").text());
    swiftCodeVo.setName(row.select("td:nth-child(2)").text());
    swiftCodeVo.setCity(row.select("td:nth-child(3)").text());
    swiftCodeVo.setBranch(row.select("td:nth-child(4)").text());
    swiftCodeVo.setSwift(row.select("td:nth-child(5)").text());
    return swiftCodeVo;
}
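// TODO: unfinished stub left in the WebFlux version: "public String getAdress" (the detail-page address lookup was never implemented here).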
// private List<String> parseTableData(String html) {
// Document doc = Jsoup.parse(html);
// Element table = doc.selectFirst("table.swift-country"); // 替换为实际的表格类名或ID
// String header = table.select("thead tr th").stream()
// .map(Element::text)
// .collect(Collectors.joining("\t")); // 分隔符可以按需更改
// Elements rows = table.select("tbody tr");
//
// List<String> dataRows = rows.stream()
// .map(row -> row.select("td").stream()
// .map(Element::text)
// .collect(Collectors.joining("\t"))) // 分隔符可以按需更改
// .collect(Collectors.toList());
//
// List<String> result = new ArrayList<>();
// result.add(header);
// result.addAll(dataRows);
// return result;
// }
/**
 * Fetches pages {@code 1..totalPages}, printing each page's rows as it arrives.
 *
 * <p>The crawl runs eagerly: this method blocks until every page has been fetched
 * (and persisted by {@code fetchPage}), exactly as before. Instead of returning
 * {@code null}, it now returns the collected pages as a Flux, so callers that use the
 * result do not hit a NullPointerException and re-subscribing does not re-run the crawl.
 * Pages are emitted in arrival order, which may differ from page order because
 * {@code flatMap} does not preserve ordering.
 *
 * @param baseUrl    list URL passed through to {@code fetchPage}
 * @param totalPages number of pages to fetch, starting at page 1
 * @return a Flux replaying the already-fetched pages; never {@code null}
 */
public Flux<List<SwiftCodeVo>> fetchAllPages(String baseUrl, int totalPages) {
    List<List<SwiftCodeVo>> pages = Flux.range(1, totalPages)
            .flatMap(page -> fetchPage(baseUrl, page))
            .doOnNext(System.out::println)
            .collectList()
            .block();
    return Flux.fromIterable(pages);
}
// public Flux<List<SwiftCodeVo>> fetchAllPages1(String baseUrl, int totalPages) {
// return (Flux<List<SwiftCodeVo>>) Flux.range(1, totalPages)
// .flatMap(page -> fetchPage(baseUrl, page)).subscribe(swiftcode -> {
// for (SwiftCodeVo swiftCodeVo : swiftcode)
// {
// System.out.println(swiftCodeVo);
//
// }
// }); // 并发限制,根据实际情况调整
//
// }
}
webflux代码全了。
然后我删除了 webflux。本来用的是自带的客户端,结果打印出一坨东西,也没具体研究怎么用,最后改成了 okhttp3。代码记录如下。
import com.xhj.framework.web.common.StringUtils;
import com.xhj.server.sys.dao.mapper.ChinabankSwiftCodeMapper;
import com.xhj.server.sys.model.entity.ChinabankSwiftCode;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Service
public class ScraperService {
@Autowired
private ChinabankSwiftCodeMapper chinabankSwiftCodeMapper;
/**
 * Updates the address of every record returned by
 * {@code chinabankSwiftCodeMapper.findChinabankSwiftCodeList()}.
 *
 * <p>For each record the lower-cased SWIFT code is appended to the detail-page base URL,
 * the address is read from that page via {@code getAddress}, and the record is written
 * back with {@code updateById}. The thread pauses one second after each request.
 * Records whose SWIFT code is null or blank are skipped, since no valid detail URL
 * can be built for them.
 *
 * @throws RuntimeException if a detail page cannot be fetched (wrapping the
 *                          {@link IOException}) or the thread is interrupted while pausing
 */
public void updateAddress() {
    final String detailBaseUrl = "https://www.theswiftcodes.com/china/";
    List<ChinabankSwiftCode> banks = chinabankSwiftCodeMapper.findChinabankSwiftCodeList();
    for (ChinabankSwiftCode bank : banks) {
        String swift = StringUtils.lowerCase(bank.getSwift());
        if (swift == null || swift.trim().isEmpty()) {
            // Without a SWIFT code the request would target a meaningless URL.
            continue;
        }
        try {
            bank.setAddress(getAddress(detailBaseUrl, swift));
            chinabankSwiftCodeMapper.updateById(bank);
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch address for SWIFT code " + swift, e);
        }
        try {
            // Throttle requests to one per second.
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }
    // chinabankSwiftCodeMapper.updateBatch(list);
}
// private String getAddress(String baseUrl, String swift) throws IOException {
// String url = baseUrl + swift;
// CloseableHttpResponse response = null;
// try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
// HttpGet httpGet = new HttpGet(url);
// httpGet.setConfig(RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build());
// response = httpClient.execute(httpGet);
// if (response.getStatusLine().getStatusCode() == 200) {
// String htmlContent = EntityUtils.toString(response.getEntity());
// Document doc = Jsoup.parse(htmlContent);
// Elements rows = doc.select("table.modern tbody tr");
// Element thirdRow = rows.get(2);
// String tdValue = thirdRow.select("td").text();
// Element table = doc.selectFirst("table.modern"); // replace with the actual table class name or ID
// String address = tdValue;
// System.out.println("地址:++++++++"+swift+"++++++++++++++++++" + address);
// httpClient.close();
// return address;
// } else {
// throw new IOException("Failed to fetch detailed page " + url);
// }
//
// } finally {
// if (response != null) {
// response.close();
// }
// }
// }
/**
 * HTTP client used by {@code getAddress} for detail-page requests.
 * Declared {@code static final} to match its constant-style name and so that a single
 * client is shared rather than one per service instance.
 */
// NOTE(review): the read timeout is 1000 seconds (about 17 minutes); this may have been
// meant as milliseconds or a smaller value. Left unchanged, confirm the intended limit.
private static final OkHttpClient OKHTTP_CLIENT = new OkHttpClient.Builder()
        .connectTimeout(20, TimeUnit.SECONDS)
        .readTimeout(1000, TimeUnit.SECONDS)
        .build();
/**
 * Downloads the detail page for one SWIFT code and extracts the address text.
 *
 * <p>The address is taken as the combined {@code td} text of the third row of
 * {@code table.modern tbody}.
 *
 * @param baseUrl detail-page base URL; {@code swift} is appended to it as-is
 * @param swift   SWIFT code path segment
 * @return the text of the third row's cells
 * @throws IOException if the request fails, the response status is not successful,
 *                     or the page has fewer than three rows in {@code table.modern}
 */
private String getAddress(String baseUrl, String swift) throws IOException {
    String url = baseUrl + swift;
    Request request = new Request.Builder()
            .url(url)
            .build();
    // try-with-resources closes the response (and its body) on every exit path.
    try (Response response = OKHTTP_CLIENT.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            throw new IOException("请求失败 " + url);
        }
        Document doc = Jsoup.parse(response.body().string());
        Elements rows = doc.select("table.modern tbody tr");
        if (rows.size() < 3) {
            // Report a missing row as a declared IOException rather than an
            // unchecked IndexOutOfBoundsException from rows.get(2).
            throw new IOException("Address row not found on detail page " + url);
        }
        String address = rows.get(2).select("td").text();
        System.out.println("地址:++++++++" + swift + "++++++++++++++++++" + address);
        return address;
    }
}
// public List<SwiftCodeVo> fetchPage(String baseUrl, int page) throws IOException {
// System.out.println("处理数据,页数:" + page);
// String urlWithPage = baseUrl.contains("?") ? baseUrl + "&page=" + page : baseUrl + "?page=" + page;
// CloseableHttpClient httpClient = HttpClients.createDefault();
// HttpGet httpGet = new HttpGet(urlWithPage);
// RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build();
// httpGet.setConfig(requestConfig);
//
// try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
// if (response.getStatusLine().getStatusCode() == 200) {
// String htmlContent = EntityUtils.toString(response.getEntity());
// return parseTableData(baseUrl,htmlContent);
// } else {
// throw new IOException("Failed to fetch page " + page);
// }
// } finally {
// httpClient.close();
// }
// }
//
// private List<SwiftCodeVo> parseTableData(String baseUrl,String html) throws IOException {
// Document doc = Jsoup.parse(html);
// Element table = doc.selectFirst("table.swift-country"); // 替换为实际的表格类名或ID
// Elements rows = table.select("tbody tr");
//
// List<SwiftCodeVo> dataRows = new ArrayList<>();
// for (Element row : rows) {
// SwiftCodeVo swiftCodeVo = new SwiftCodeVo();
// swiftCodeVo.setId(row.select("td:nth-child(1)").text());
// swiftCodeVo.setName(row.select("td:nth-child(2)").text());
// swiftCodeVo.setCity(row.select("td:nth-child(3)").text());
// swiftCodeVo.setBranch(row.select("td:nth-child(4)").text());
// swiftCodeVo.setSwift(row.select("td:nth-child(5)").text());
// swiftCodeVo.setAddress(getAddress(baseUrl, swiftCodeVo.getSwift()));
// dataRows.add(swiftCodeVo);
// }
// return dataRows;
// }
//
//
// public List<List<SwiftCodeVo>> fetchAllPages(String baseUrl, int totalPages) throws IOException {
// List<List<SwiftCodeVo>> allPagesData = new ArrayList<>();
// for (int i = 1; i <= totalPages; i++) {
// List<SwiftCodeVo> pageData = fetchPage(baseUrl, i);
// allPagesData.add(pageData);
// }
// return allPagesData;
// }
}
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- 在你的pom.xml中添加这个依赖 -->
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.12.0</version>
</dependency>
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-stdlib-jdk8</artifactId>
<version>1.5.21</version>
</dependency>
好了 开始把工程里这部分代码删掉。webflux好用