增强的功能有:
自由指定要转换的RTF文件和输出文件,输出文件可以不指定
支持HTML与TXT两种格式,默认为TXT
可以指定转换编码,默认为UTF-8
提高HTML字符反转义效率(循环代替递归)
显示执行时间,5M的RTF转换在4s内完成
代码如下,展开查看!
import java.io.File;
import java.io.FileWriter;
import java.util.Date;
import java.util.HashMap;
import pt.tumba.parser.rtf.RTF2HTML;
/**
* Convert RTF to HTML, RTF to TXT
*
* @author KNIGHTRCOM(rcom10002@163.com)
* {@link http://blog.csdn.net/rcom10002}
*/
public class Main {
private static String sourceFilename;
private static String outputFilename;
private static boolean isForced = false; // indicate to overwrite the existing file
private static boolean isSystemListed = false;
private static String type = "txt";
private static String encoding = "UTF-8";
/**
* -i System information (optional)
* -s Source RTF file (mandatory)
* -o Output file name (optional)
* --force Overwrite output file if it exists
*
* @param args
* @throws Exception
*/
public static void main(String[] args) {
try {
if (args != null && args.length > 0) {
for (String arg : args) {
if (arg.equals("-i")) {
isSystemListed = true;
} else if (arg.startsWith("-s")) {
sourceFilename = arg.substring(2);
} else if (arg.startsWith("-o")) {
outputFilename = arg.substring(2);
} else if (arg.equals("--force")) {
isForced = true;
} else if (arg.startsWith("-t")) {
type = arg.substring(2);
if (!"txt".equals(type) && !"html".equals(type)) {
isSystemListed = false;
sourceFilename = null;
break;
}
} else if (arg.startsWith("-e")) {
encoding = arg.substring(2);
} else {
isSystemListed = false;
sourceFilename = null;
break;
}
}
}
if (((sourceFilename == null || !new File(sourceFilename).exists()) && !isSystemListed) ||
args != null && args.length == 1 && args[0].equals("--help")){
System.out.println("usage: java Main [-t] [-eEncodingName] [--force] -sSourceFileName [-oOutputFileName]");
System.out.println(" java Main [-i,--help]");
return;
}
if (isSystemListed) {
listSystemInfo();
}
executeConvertion();
} catch (Exception e) {
System.err.print("Errmsg: " + e.getMessage());
System.exit(1);
}
}
/**
* List the system info you may concern
*/
public static void listSystemInfo() {
System.getProperties().list(System.out);
}
/**
* Convert RTF to required format
*
* @throws Exception
*/
private static void executeConvertion() throws Exception {
long duration = new Date().getTime();
// Convert rtf to HTML
String result = new RTF2HTML().convertRTFToHTML(new File(sourceFilename));
// This step is important for rendering the text with proper encoding
result = new String(result.getBytes(System.getProperty("sun.jnu.encoding")), encoding);
if ("txt".equals(type)) {
// Extract plain text from HTML
result = result.replaceAll("(?i)", "
/n").replaceAll("<.>", "");
result = StringUtils.unescapeHTML(result, 0);
}
// Write the result to the file
if (outputFilename == null) {
outputFilename = sourceFilename.concat(".txt");
}
File outputFile = new File(outputFilename);
if (outputFile.exists() && !isForced) {
System.out.print("Warning: Output file already exists! Try execute with --force.");
System.exit(-1);
}
FileWriter w = new FileWriter(outputFile);
w.write(StringUtils.trimThroughLines(result));
w.close();
duration -= new Date().getTime();
System.out.print("Complete!(" + (duration / 1000 * -1) + "s)");
}
}
/**
* http://www.rgagnon.com/javadetails/java-0307.html
*
*/
class StringUtils {
private StringUtils() {
}
private static HashMap htmlEntities;
static {
htmlEntities = new HashMap();
htmlEntities.put("
htmlEntities.put(">", ">");
htmlEntities.put("&", "&");
htmlEntities.put(""", "/"");
htmlEntities.put("à", "à");
htmlEntities.put("À", "À");
htmlEntities.put("â", "â");
htmlEntities.put("ä", "ä");
htmlEntities.put("Ä", "Ä");
htmlEntities.put("Â", "Â");
htmlEntities.put("å", "å");
htmlEntities.put("Å", "Å");
htmlEntities.put("æ", "æ");
htmlEntities.put("Æ", "Æ");
htmlEntities.put("ç", "ç");
htmlEntities.put("Ç", "Ç");
htmlEntities.put("é", "é");
htmlEntities.put("É", "É");
htmlEntities.put("è", "è");
htmlEntities.put("È", "È");
htmlEntities.put("ê", "ê");
htmlEntities.put("Ê", "Ê");
htmlEntities.put("ë", "ë");
htmlEntities.put("Ë", "Ë");
htmlEntities.put("ï", "ï");
htmlEntities.put("Ï", "Ï");
htmlEntities.put("ô", "ô");
htmlEntities.put("Ô", "Ô");
htmlEntities.put("ö", "ö");
htmlEntities.put("Ö", "Ö");
htmlEntities.put("ø", "ø");
htmlEntities.put("Ø", "Ø");
htmlEntities.put("ß", "ß");
htmlEntities.put("ù", "ù");
htmlEntities.put("Ù", "Ù");
htmlEntities.put("û", "û");
htmlEntities.put("Û", "Û");
htmlEntities.put("ü", "ü");
htmlEntities.put("Ü", "Ü");
htmlEntities.put(" ", " ");
htmlEntities.put("©", "/u00a9");
htmlEntities.put("®", "/u00ae");
htmlEntities.put("€", "/u20a0");
}
public static final String unescapeHTML(String source, int start) {
int i, j;
// 将递归
java -Dfile.encoding=UTF-8 -cp "C:/Documents and Settings/Administrator/My Documents/Workspace/eclipse/RTF/Document Parser;" Main -s"C:/Documents and Settings/Administrator/My Documents/Workspace/php eclipse/QAR Tool/questions/sample.rtf" -o"C:/my.txt" --force -eGB2312 -thtml