2020.06.26
https://srh.bankofchina.com/search/whpj/search_cn.jsp
因为中行汇率页面改版(改为了https请求,页面也有很大变化),需要重新爬取中行汇率。原代码请求到的数据总与页面返回的不同,最后判断应该是模拟的POST请求失败了:打断点发现实际发出的是GET请求,可能是HTTPS的影响。
试了网上各种方法,测试的结果都不是POST请求,还是GET,比如:
HttpURLConnection , HttpsURLConnection
// 发送POST请求必须设置如下两行
conn.setDoOutput(true);
conn.setDoInput(true);
httpURLConnection.setRequestMethod("POST");
或ssl
javax.net.ssl.X509TrustManager
SSLContext sslContext = SSLContext.getInstance("SSL");
TrustManager[] tm = { new MyX509TrustManager() };
// 设置当前实例使用的SSLSocketFactory
conn.setSSLSocketFactory(ssf);
以上设置都没效果!!!!!!!
如果是http大家可以试网上的通用方法。
----------------------------------------------------------------
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Crawls exchange-rate quotations from the Bank of China Chinese-language
 * search page ({@code search_cn.jsp}) and converts the HTML rate table into
 * {@link TerstEntity} rows.
 *
 * @author lm
 */
@Service
public class CrawlingExchangeRateCNService {

    /** Address of the Chinese-language rate search page, ready to take query parameters. */
    private static final String SEARCH_URL = "https://srh.bankofchina.com/search/whpj/search_cn.jsp?";

    /** Maximum number of result pages requested for a single query. */
    private static final int MAX_PAGES = 10;

    /** Header text used to recognise the table that holds the rate rows. */
    private static final String RATE_TABLE_MARKER = "货币名称";

    /** Only rows with exactly this many cells are treated as rate rows. */
    private static final int RATE_ROW_CELLS = 7;

    /**
     * Manual entry point: runs one crawl and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CrawlingExchangeRateCNService crawlingExchangeRateService = new CrawlingExchangeRateCNService();
        crawlingExchangeRateService.execute();
    }

    /**
     * Fetches today's rows for the currency named "美元" and prints the row
     * count followed by the rows themselves.
     */
    public void execute() {
        List<TerstEntity> queryList = getExchangeRate("美元", "");
        System.out.println("长度:" + queryList.size());
        System.out.println("汇总:" + queryList);
    }

    /**
     * Collects the rate rows for one currency on one day, walking the result
     * pages until no rate table is returned, the page number reported by the
     * server falls behind the requested one, or {@link #MAX_PAGES} is reached.
     *
     * @param sourceCurrency currency name sent as the {@code pjname} parameter
     * @param date           day to query as {@code yyyy-MM-dd}; when empty or
     *                       {@code null} the current date is used
     * @return all parsed rows in page order; never {@code null}, possibly empty
     */
    private List<TerstEntity> getExchangeRate(String sourceCurrency, String date) {
        // Fall back to the current date when the caller did not supply one.
        String lsToday = StringUtils.isEmpty(date) ? new DateTime().toString("yyyy-MM-dd") : date;
        List<TerstEntity> rates = new ArrayList<>();
        for (int page = 1; page <= MAX_PAGES; page++) {
            String searchEnHtml = getSearchEnHtml(lsToday, sourceCurrency, String.valueOf(page));
            ParsedPage parsed = assembleObjByHtml(searchEnHtml, sourceCurrency, lsToday);
            // No rate table (including a failed request, which yields empty HTML): stop paging.
            if (!parsed.tableFound) {
                break;
            }
            // Append the rows themselves so the result is one flat list of entities.
            rates.addAll(parsed.rows);
            // A missing or non-numeric page field is treated as "same page" so it cannot abort the crawl.
            if (parsePageNumber(parsed.pageNumber, page) < page) {
                break;
            }
        }
        return rates;
    }

    /**
     * Parses the page number taken from the hidden form field.
     *
     * @param raw      text of the field, possibly empty or {@code null}
     * @param fallback value returned when {@code raw} is blank or not an integer
     * @return the parsed page number, or {@code fallback}
     */
    private static int parsePageNumber(String raw, int fallback) {
        if (StringUtils.isBlank(raw)) {
            return fallback;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException notANumber) {
            return fallback;
        }
    }

    /**
     * Requests one result page and returns its HTML.
     *
     * <p>The parameters are appended to the query string as given (no URL
     * encoding is applied here) and the request is sent with the POST method.
     *
     * @param lsToday          day to query, used for both {@code erectDate} and {@code nothing}
     * @param lsSourceCurrency currency name sent as {@code pjname}
     * @param liPage           page number to request
     * @return the response body when the status is 200; otherwise an empty string
     */
    private String getSearchEnHtml(String lsToday, String lsSourceCurrency, String liPage) {
        StringBuilder url = new StringBuilder(SEARCH_URL);
        url.append("erectDate=").append(lsToday);
        // The separator must be a literal "&nothing="; the garbled "¬hing=" dropped the parameter.
        url.append("&nothing=").append(lsToday);
        url.append("&pjname=").append(lsSourceCurrency);
        url.append("&page=").append(liPage);
        System.out.println("拼接好的url:" + url);

        HttpPost httpPost = new HttpPost(url.toString());
        httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        // Header value only; the header name must not be repeated inside the value.
        httpPost.setHeader("Accept", "text/plain, */*");
        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36");
        httpPost.addHeader("x-amazon-user-agent", "AmazonJavascriptScratchpad/1.0 (Language=Javascript)");
        httpPost.addHeader("X-Requested-With", "XMLHttpRequest");

        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        String html = "";
        try {
            response = httpClient.execute(httpPost);
            HttpEntity httpEntity = response.getEntity();
            // A response may carry no entity; EntityUtils.toString rejects null.
            String body = httpEntity == null ? "" : EntityUtils.toString(httpEntity, "utf-8");
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                html = body;
            } else {
                // Non-200: print the body for diagnosis and return the empty string.
                System.out.println(body);
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            e.printStackTrace();
        } finally {
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return html;
    }

    /**
     * Extracts the page number and the rate rows from one result page. No
     * business filtering is applied; every matching row is returned.
     *
     * @param html             HTML of the result page; an empty string yields "no table"
     * @param lsSourceCurrency currency name (currently not used by the parsing)
     * @param lsToday          queried day (currently not used by the parsing)
     * @return the parsed page; {@code tableFound} is {@code false} when no
     *         table containing {@link #RATE_TABLE_MARKER} exists
     */
    private ParsedPage assembleObjByHtml(String html, String lsSourceCurrency, String lsToday) {
        Document document = Jsoup.parse(html);
        // Value of the hidden input named "page"; an empty string when the input is absent.
        String pageItemValue = document.select("input[name=page]").val();

        // The page contains several tables; take the first whose text includes the marker.
        Element rateTable = null;
        for (Element candidate : document.getElementsByTag("table")) {
            if (candidate.text().contains(RATE_TABLE_MARKER)) {
                rateTable = candidate;
                break;
            }
        }
        if (rateTable == null) {
            return new ParsedPage(false, pageItemValue, new ArrayList<TerstEntity>());
        }

        List<TerstEntity> rows = new ArrayList<>();
        Elements trs = rateTable.select("tr");
        // Start at 1 to skip the first row of the table.
        for (int i = 1; i < trs.size(); ++i) {
            Elements tds = trs.get(i).select("td");
            // Skip spacer rows such as <td colspan="11" style="height:30px;"> </td>.
            if (tds.size() != RATE_ROW_CELLS) {
                continue;
            }
            System.out.println(tds.get(0).text() + " " + i);
            TerstEntity terstEntity = new TerstEntity();
            terstEntity.setCurrencyName(tds.get(0).text());
            terstEntity.setBuyingRate(tds.get(1).text());
            terstEntity.setCashBuyingRate(tds.get(2).text());
            terstEntity.setSellingRate(tds.get(3).text());
            terstEntity.setCashSellingRate(tds.get(4).text());
            terstEntity.setMiddleRate(tds.get(5).text());
            terstEntity.setPubTime(tds.get(6).text());
            rows.add(terstEntity);
        }
        return new ParsedPage(true, pageItemValue, rows);
    }

    /** Result of parsing one HTML result page. */
    private static final class ParsedPage {
        /** Whether a table containing the rate header text was present. */
        private final boolean tableFound;
        /** Raw text of the hidden page-number field; may be empty. */
        private final String pageNumber;
        /** Rows parsed from the rate table; empty when no table was found. */
        private final List<TerstEntity> rows;

        private ParsedPage(boolean tableFound, String pageNumber, List<TerstEntity> rows) {
            this.tableFound = tableFound;
            this.pageNumber = pageNumber;
            this.rows = rows;
        }
    }
}
import lombok.Data;
/**
 * Holder for one row of the exchange-rate table, used for testing.
 *
 * <p>All values are kept as the raw cell text. Lombok's {@code @Data}
 * generates the getters, setters, {@code equals}, {@code hashCode} and
 * {@code toString}.
 */
@Data
public class TerstEntity {
    /** Text of the 1st cell: currency name. */
    private String currencyName;
    /** Text of the 2nd cell: buying rate. */
    private String buyingRate;
    /** Text of the 3rd cell: cash buying rate. */
    private String cashBuyingRate;
    /** Text of the 4th cell: selling rate. */
    private String sellingRate;
    /** Text of the 5th cell: cash selling rate. */
    private String cashSellingRate;
    /** Text of the 6th cell: middle rate. */
    private String middleRate;
    /**
     * Text of the 7th cell: publication time. Named in lowerCamelCase; the
     * generated accessors are still {@code getPubTime()} / {@code setPubTime(String)}.
     */
    private String pubTime;
}
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>