在需要大批量解析 FTP 上的 XML 文件时,如果先下载到本地再处理,会额外增加两次磁盘 IO 操作(下载时写入一次,解析时再读取一次)。
所以本文尝试以流的方式,直接读取并解析 FTP 上的 XML 文件内容。
1, 准备 XML 文件
文件名: USER_20200822.xml (上传到 FTP 之前,需先用 gzip 压缩为 USER_20200822.xml.gz,后面的代码读取的是压缩后的文件)
<?xml version="1.0" encoding="UTF-8"?>
<Migration>
<Session Name="张三" Type="Online">
<Access>
<Url ReportTime="2020-08-22T10:45:02.080">/computer/dell/1031</Url>
<Url ReportTime="2020-08-22T10:46:04.133">/computer/lenovo/2080</Url>
</Access>
</Session>
<Session Name="李四" Type="Offline">
<Access>
<Url ReportTime="2020-08-22T10:33:35.013">/phone/huawei/1031</Url>
<Url ReportTime="2020-08-22T10:41:56.898">/phone/xiaomi/2080</Url>
</Access>
</Session>
</Migration>
2, MAVEN 引入依赖
用来连接 FTP
<!-- https://mvnrepository.com/artifact/commons-net/commons-net -->
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.7</version>
</dependency>
用来解析 XML
<!-- https://mvnrepository.com/artifact/jaxen/jaxen -->
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
<version>1.2.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/dom4j/dom4j -->
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
3, 编码
文件名: FtpStreamApp.java
package com.dosrain.trial;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
 * Demo application that streams a gzip-compressed XML file straight from an
 * FTP server into a dom4j parser, without writing it to the local disk first,
 * and prints every {@code Session} element together with its {@code Url}
 * children.
 */
public class FtpStreamApp {
    // Connection settings and remote file location used by this demo.
    private final String server = "127.0.0.1";
    private final int port = 21;
    private final String username = "ftpuser";
    // NOTE(review): hard-coded credential; acceptable for a demo only.
    private final String password = "1qt5!QT%";
    private final String remoteFullFileName = "./USER_20200822.xml.gz";

    /**
     * Connects to the FTP server, streams the remote {@code .gz} file through a
     * {@link GZIPInputStream} into a dom4j {@link SAXReader}, and prints the
     * parsed sessions to standard output.
     *
     * @throws IOException if connecting, logging in, opening the remote file,
     *         or completing the transfer fails
     * @throws DocumentException if the XML cannot be parsed or the parser
     *         cannot be configured
     */
    public void parseXml() throws IOException, DocumentException {
        FTPClient ftp = new FTPClient();
        try {
            ftp.connect(server, port);
            // login() reports failure through its return value, not an exception.
            if (!ftp.login(username, password)) {
                throw new IOException("FTP login failed: " + ftp.getReplyString());
            }
            System.out.println("Remote system is " + ftp.getSystemType());

            // The target is a gz file, i.e. binary data, so switch to binary transfer.
            ftp.setFileType(FTP.BINARY_FILE_TYPE);

            // Passive mode: the client asks the server for a data port and then
            // connects to it itself (in active mode the server would connect back
            // to a port on the client instead).
            ftp.enterLocalPassiveMode();

            // Key step: read the remote file as a stream instead of downloading it.
            // retrieveFileStream() returns null when the transfer cannot be started.
            InputStream inputStream = ftp.retrieveFileStream(remoteFullFileName);
            if (inputStream == null) {
                throw new IOException("Cannot open remote file " + remoteFullFileName
                        + ": " + ftp.getReplyString());
            }

            Document document;
            // try-with-resources closes both streams even when parsing fails.
            try (InputStream rawStream = inputStream;
                 GZIPInputStream gzipInputStream = new GZIPInputStream(rawStream)) {
                document = newHardenedReader().read(gzipInputStream);
            }

            // A stream-based transfer must be finalized after the stream is closed;
            // otherwise later commands on this connection may misbehave.
            if (!ftp.completePendingCommand()) {
                throw new IOException("FTP transfer did not complete: " + ftp.getReplyString());
            }

            // The document is fully loaded in memory at this point, so printing
            // no longer needs the streams.
            printSessions(document);
        } finally {
            if (ftp.isConnected()) {
                try {
                    ftp.disconnect();
                } catch (IOException ignored) {
                    // Best-effort cleanup: a failure here must not mask the
                    // exception (if any) thrown by the main flow.
                }
            }
        }
    }

    /**
     * Creates a SAXReader that rejects DOCTYPE declarations, because the XML
     * comes from a remote server and should not be allowed to pull in external
     * entities.
     */
    private static SAXReader newHardenedReader() throws DocumentException {
        SAXReader reader = new SAXReader();
        try {
            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        } catch (org.xml.sax.SAXException e) {
            throw new DocumentException("Cannot configure XML parser securely", e);
        }
        return reader;
    }

    /** Prints every Session element under the root and the Url elements it contains. */
    private static void printSessions(Document document) {
        // Root element; "Migration" in the sample file.
        Element root = document.getRootElement();
        List<Element> sessions = root.elements("Session");

        int i = 0;
        for (Element session : sessions) {
            System.out.println("");
            i++;
            System.out.println("第 " + i + " 个 Session");
            // Print the Name and Type attributes.
            System.out.println("Name: " + session.attributeValue("Name") + ", Type: " + session.attributeValue("Type"));

            // A Session without an Access child has no URLs to print.
            Element access = session.element("Access");
            if (access == null) {
                continue;
            }
            List<Element> urls = access.elements("Url");

            // Print the visited URLs.
            int j = 0;
            for (Element url : urls) {
                j++;
                System.out.println("第 " + j + " 个 Url");
                System.out.println("ReportTime: " + url.attributeValue("ReportTime"));
                System.out.println("Url: " + url.getText());
            }
        }
    }

    /** Entry point: runs the demo once with the built-in settings. */
    public static void main(String[] args) throws IOException, DocumentException {
        new FtpStreamApp().parseXml();
    }
}
4, 编译运行
Remote system is UNIX emulated by FileZilla
第 1 个 Session
Name: 张三, Type: Online
第 1 个 Url
ReportTime: 2020-08-22T10:45:02.080
Url: /computer/dell/1031
第 2 个 Url
ReportTime: 2020-08-22T10:46:04.133
Url: /computer/lenovo/2080
第 2 个 Session
Name: 李四, Type: Offline
第 1 个 Url
ReportTime: 2020-08-22T10:33:35.013
Url: /phone/huawei/1031
第 2 个 Url
ReportTime: 2020-08-22T10:41:56.898
Url: /phone/xiaomi/2080
Process finished with exit code -1