处理前的xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<product_set>
<product>
<product_id>614944313705</product_id>
<click_sku_id>614944313705</click_sku_id>
<product_name><![CDATA[oppo钢化水凝膜a3全屏覆盖a3s无白边a1防摔爆护眼抗蓝光膜oppoa3手机屏幕oqqo贴膜软膜oppoa3s保护膜oppoa1]]></product_name>
<expiration_time>2022-12-14 01:12:14</expiration_time>
<image_url><![CDATA[https://img.alicdn.com/imgextra/i1/2206948455447/O1CN01kkILjC1q6lxhX85iX_!!0-item_pic.jpg]]></image_url>
<target_url_mobile><![CDATA[https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz=tm&force_no_smb=true&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&spm=2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_mobile>
<target_url_android><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D614944313705%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D614944313705%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_android>
<target_url_ios><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D614944313705%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D614944313705%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_ios>
<product_short_name><![CDATA[总副 oppo a3水凝膜]]></product_short_name>
<third_category_id>50012587</third_category_id>
<third_category_name><![CDATA[手机贴膜]]></third_category_name>
<price>10</price>
<sale_price>10</sale_price>
</product>
<product>
<product_id>615130411096</product_id>
<click_sku_id>615130411096</click_sku_id>
<product_name><![CDATA[小白嘴山药铁棍山药新鲜蔬菜铁杆淮山药5斤装非河南焦作温县垆土]]></product_name>
<expiration_time>2022-12-14 01:12:14</expiration_time>
<image_url><![CDATA[https://img.alicdn.com/imgextra/i2/2207816494036/O1CN01FQQp1Q1fgX74GLegt_!!0-item_pic.jpg]]></image_url>
<target_url_mobile><![CDATA[https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz=tm&force_no_smb=true&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&dpa_material_id=615130411096&dpa_material_type=1&dpa_source_code=10082&itemIds=615130411096&spm=2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_mobile>
<target_url_android><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=615130411096&dpa_material_type=1&dpa_source_code=10082&itemIds=615130411096&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D615130411096%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D615130411096%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_android>
<target_url_ios><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=615130411096&dpa_material_type=1&dpa_source_code=10082&itemIds=615130411096&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D615130411096%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D615130411096%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_ios>
<product_short_name><![CDATA[小白嘴山药铁棍山药新鲜蔬菜铁杆淮山药5斤]]></product_short_name>
<third_category_id>50050721</third_category_id>
<third_category_name><![CDATA[新鲜山药]]></third_category_name>
<price>66</price>
<sale_price>66</sale_price>
</product>
<product>
<product_id>615818020475</product_id>
<click_sku_id>615818020475</click_sku_id>
<product_name><![CDATA[发光掏耳神器挖耳勺带灯宝宝儿童采耳掏耳朵淘扣工具套装可视耳屎]]></product_name>
<expiration_time>2022-12-14 01:12:14</expiration_time>
<image_url><![CDATA[https://img.alicdn.com/imgextra/i3/4185085665/O1CN017n2fTh1ricLAKCRgr_!!0-item_pic.jpg]]></image_url>
<target_url_mobile><![CDATA[https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz=tm&force_no_smb=true&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&dpa_material_id=615818020475&dpa_material_type=1&dpa_source_code=10082&itemIds=615818020475&spm=2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_mobile>
<target_url_android><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=615818020475&dpa_material_type=1&dpa_source_code=10082&itemIds=615818020475&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D615818020475%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D615818020475%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_android>
<target_url_ios><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=615818020475&dpa_material_type=1&dpa_source_code=10082&itemIds=615818020475&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D615818020475%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D615818020475%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_ios>
<product_short_name><![CDATA[PROMISE/承诺 008]]></product_short_name>
<third_category_id>50007005</third_category_id>
<third_category_name><![CDATA[耳勺]]></third_category_name>
<price>38</price>
<sale_price>38</sale_price>
</product>
</product_set>
处理后的xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<product_set>
<product>
<product_id>614944313705</product_id>
<click_sku_id>614944313705</click_sku_id>
<product_name><![CDATA[oppo钢化水凝膜a3全屏覆盖a3s无白边a1防摔爆护眼抗蓝光膜oppoa3手机屏幕oqqo贴膜软膜oppoa3s保护膜oppoa1]]></product_name>
<expiration_time>2022-12-14 01:12:14</expiration_time>
<image_url><![CDATA[https://img.alicdn.com/imgextra/i1/2206948455447/O1CN01kkILjC1q6lxhX85iX_!!0-item_pic.jpg]]></image_url>
<target_url_mobile><![CDATA[https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz=tm&force_no_smb=true&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&spm=2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_mobile>
<target_url_android><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D614944313705%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D614944313705%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_android>
<target_url_ios><![CDATA[tbopen://m.taobao.com/tbopen/index.html?action=ali.open.nav&bc_fl_src=growth_dhh_3843640202_100-450957-2089633783&bootImage=0&dpa_material_id=614944313705&dpa_material_type=1&dpa_source_code=10082&itemIds=614944313705&module=h5&source=auto&spm=2014.ugdhh.3843640202.100-450957-2089633783&h5Url=https://huodong.taobao.com/wow/z/usergrowth/outside/dpa_module?wh_biz%3Dtm%26force_no_smb%3Dtrue%26bc_fl_src%3Dgrowth_dhh_3843640202_100-450957-2089633783%26dpa_material_id%3D614944313705%26dpa_material_type%3D1%26dpa_source_code%3D10082%26itemIds%3D614944313705%26spm%3D2014.ugdhh.3843640202.100-450957-2089633783]]></target_url_ios>
<product_short_name><![CDATA[总副 oppo a3水凝膜]]></product_short_name>
<third_category_id>50012587</third_category_id>
<third_category_name><![CDATA[手机贴膜]]></third_category_name>
<price>10</price>
<sale_price>10</sale_price>
</product>
</product_set>
具体步骤
1、引入jar
<!-- dom4j provides the SAXReader / XMLWriter classes used by the Java code in step 2. -->
<!-- NOTE(review): "test" scope keeps this jar off the main classpath; confirm MyTest is compiled as test code. -->
<dependency>
    <groupId>dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>1.1</version>
    <scope>test</scope>
</dependency>
2、核心代码
package com.shiyu.test;
import com.alibaba.fastjson.JSON;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import java.io.*;
import java.util.*;
/**
 * Filters an XML feed down to the records whose field values contain one of a fixed set of keywords.
 *
 * <p>Reads {@code resources/test.xml}, copies every matching direct child of the root element into a
 * new document with the same root name, and writes that document to a time-stamped file next to the
 * source.
 *
 * @author shiyu
 * @since 2020/12/14
 */
public class MyTest {

    /** Comma-separated keywords; a record is kept when any of its field values contains one of them. */
    private static final String LABEL_LIST = "其他配件,剃须刀,口罩,台历,咸鸭蛋,围裙,地垫,地毯,垃圾桶,垃圾袋,定制窗帘,居家鞋,山核桃,成品窗帘,扇子,手机贴膜,护发素,护目镜,挂钟,松子,柠檬,椅垫,榴莲,沐浴球,沐浴露,沙发垫,洗发水,洗手液,洗护套装,浴巾,浴帘杆,海参,海绵擦,湿巾,漱口水,牙膏,牛排,番茄,百洁布,眼罩,眼霜,移动电源,管道疏通器,糖果,纽扣,芒果,芒果干,苹果,隔音耳塞,面膜,鞋垫,飘窗垫,餐垫,香皂";

    /** Maximum number of records kept for a single keyword. */
    private static final int MAX_PER_KEYWORD = 20;

    /** Maximum number of records written to the output file. */
    private static final int MAX_TOTAL = 200;

    /**
     * Entry point: runs {@link #test()} and prints the stack trace of any failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            test();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the source feed, keeps the records that match a keyword, and writes them to a new file.
     *
     * <p>Each record is copied at most once, even when several of its fields or several keywords
     * match; the hit is attributed to the first keyword that still has quota. Scanning stops as soon
     * as {@link #MAX_TOTAL} records have been kept. A summary is printed to standard output.
     *
     * @throws Exception if the source cannot be parsed or the target cannot be written
     */
    public static void test() throws Exception {
        File source = new File("resources/test.xml");
        File target = new File("resources/test_" + System.currentTimeMillis() + ".xml");
        String[] keywords = LABEL_LIST.split(",");
        Map<String, Integer> map = new HashMap<>();
        int sum = 0, available = 0;

        // Parse the source and prepare an empty output document with the same root name.
        Document doc = new SAXReader().read(source);
        Element root = doc.getRootElement();
        Document newDoc = DocumentHelper.createDocument();
        Element newRoot = newDoc.addElement(root.getQName());

        // Walk the root's direct children until the overall cap is reached.
        Iterator<?> records = root.elementIterator();
        while (available < MAX_TOTAL && records.hasNext()) {
            Element record = (Element) records.next();
            sum++;

            String keyword = findKeyword(record, keywords, map);
            if (keyword == null) {
                continue;
            }

            Integer used = map.get(keyword);
            map.put(keyword, used == null ? 1 : used + 1);
            // Clone so the record stays attached to the source document as well.
            newRoot.add((Element) record.clone());
            available++;
        }

        // Pretty-printed output with a 4-space indent.
        OutputFormat format = OutputFormat.createPrettyPrint();
        format.setIndentSize(4);

        // try-with-resources guarantees the file handle is released even if writing fails.
        try (OutputStream out = new FileOutputStream(target)) {
            XMLWriter writer = new XMLWriter(out, format);
            writer.write(newDoc);
            // Push any buffered characters to the stream before it is closed.
            writer.flush();
        }

        System.out.println("共:" + sum + ", 符合:" + available);
        System.out.println("匹配详情: " + JSON.toJSONString(map));
    }

    /**
     * Finds the first keyword that occurs in a child value of {@code record} and still has quota.
     *
     * @param record   element whose direct children are scanned
     * @param keywords candidate keywords, checked in order for each child
     * @param counts   number of records already kept per keyword
     * @return the matching keyword, or {@code null} when nothing matches or every match is at its cap
     */
    private static String findKeyword(Element record, String[] keywords, Map<String, Integer> counts) {
        Iterator<?> fields = record.elementIterator();
        while (fields.hasNext()) {
            String value = ((Element) fields.next()).getStringValue();
            // Values of length 0 or 1 are never matched.
            if (value.length() <= 1) {
                continue;
            }
            for (String keyword : keywords) {
                if (!value.contains(keyword)) {
                    continue;
                }
                Integer used = counts.get(keyword);
                if (used == null || used < MAX_PER_KEYWORD) {
                    return keyword;
                }
            }
        }
        return null;
    }
}