最近公司有个系统需要重构,其中一个需求是获取中国农药信息网的农药数据。旧系统是利用爬虫定时爬取数据,比较费时、麻烦,所以想到用 Java 直接请求并解析页面数据,再渲染到前端。
1.这是农药信息网数据
2.这是我们系统利用Java解析出来的数据
/**
*解析农药列表数据
* @Parms 页码,条数,登记证号,农药名称,登记持有人
**/
public static Map<String,Object> queryList(Integer pageNo,Integer pageSize,String djzh,String nymc,String cjmc) {
try {
//页码,条数,登记证号,农药名称,登记持有人
String address = “https://www.icama.cn/BasicdataSystem/pesticideRegistration/queryselect.do?pageNo=”+pageNo+
“&pageSize=”+pageSize+“&djzh=”+djzh+“&nymc=”+nymc+“&cjmc=”+cjmc+“&sf=&nylb=&zhl=&jx=&zwmc=&fzdx=&syff=&dx=&yxcf=&yxcf_en=&yxcfhl=&yxcf2=&yxcf2_en=&yxcf2hl=&yxcf3=&yxcf3_en=&yxcf3hl=&yxqs_start=&yxqs_end=&yxjz_start=&yxjz_end=”;
//请求参数
Map<String, Object> params = new HashMap<>();
Map<String, String> map = new HashMap<>();
// 调⽤ get 请求
String res = get(address, map, null);
// System.out.println(res);//打印返回参数
Document doc = Jsoup.parse(res);
Elements rows = doc.select(“table[id=tab]”).get(0).select(“tr”);
int count=0;
List<NzDrugs> drugsList= new ArrayList<>();
for (Element headline : rows) {
count++;
System.out.println("第"+count+"行");
Elements tdList = headline.select("td");
//System.out.println(headline.select("td"));
Integer sum=0;
NzDrugs drugs =new NzDrugs();
for (Element td:tdList){
Elements spanList = td.select("span");
//System.out.println(td.select("span"));
for (Element span:spanList){
switch (sum){
case 0:
String onclick="<a[^>]*onclick=(\\\"([^\\\"]*)\\\"|\\'([^\\']*)\\'|([^\\\\s>]*))[^>]*>(.*?)</a>";
Pattern pattern_onclick=Pattern.compile(onclick);
Matcher matcher_onclick = pattern_onclick.matcher(span.select("span").toString());
//System.out.println(span.select("span").select("a"));
//获取药品id
while (matcher_onclick.find()){
for (int i = 2; i < 3; i++) {
String id = matcher_onclick.group(i).replace("_viewpd('", "").replace("')", "");
System.out.println("药品id:"+id);
drugs.setId(id);
}
}
//获取药品编号
String a="<[^>]+>";
String code = span.select("span").toString().replaceAll(a, "").trim();
System.out.println("获取药品编号:"+code);
drugs.setCode(code);
break;
case 1:
//获取农药名称
String name = span.select("span").toString().replace("<span>", "").replace("</span>", "");
System.out.println("农药名称:"+name);
drugs.setName(name);
break;
case 2:
//获取农药类别
String type = span.select("span").toString().replace("<span>", "").replace("</span>", "");
System.out.println("农药类别:"+type);
drugs.setType(type);
break;
case 3:
//获取农药剂型
String category = span.select("span").toString().replace("<span>", "").replace("</span>", "");
System.out.println("剂型:"+category);
drugs.setCategory(category);
break;
case 4:
//获取农药总含量
String total = span.select("span").toString().replace("<span>", "").replace("</span>", "");
System.out.println("总含量:"+total);
drugs.setTotalcontent(total);
break;
case 5:
//农药有效期
String times = span.select("span").toString().replace("<span>", "").replace("</span>", "");
System.out.println("有效期:"+times);
drugs.setTimes(times);
break;
case 6:
//登记证持有人
String ac="<[^>]+>";
String company = span.select("span").toString().replaceAll(ac, "").trim();
System.out.println("登记证持有人:"+company);
drugs.setCompany(company);
drugsList.add(drugs);
break;
}
//System.out.println(span.select("span"));
sum++;
}
}
}
//数据总条数
String ac="<[^>]+>";
Elements fenye = doc.select("li[class=disabled controls]").get(0).select("a");
//获取分页数据
Integer totalSize = Integer.parseInt(fenye.toString().replaceAll(ac, "")
.replace("当前 / 条,共 ", "").replace(" 条", "").trim());
System.out.println("数据总条数:"+totalSize);
params.put("list",drugsList);
params.put("pages",pageNo);
params.put("size",pageSize);
params.put("total",totalSize);
return params;
} catch (Exception e) {
// TODO 异常
e.printStackTrace();
}
return null;
}