RTF转HTML,HTML转TXT(Java版)之威力加强版

增强的功能有:

自由指定要转换的RTF文件和输出文件,输出文件可以不指定

支持HTML与TXT两种格式,默认为TXT

可以指定转换编码,默认为UTF-8

提高HTML字符反转义效率(循环代替递归)

显示执行时间,5MB 的 RTF 文件转换可在 4 秒内完成

代码如下,展开查看!

import java.io.File; import java.io.FileWriter; import java.util.Date; import java.util.HashMap; import pt.tumba.parser.rtf.RTF2HTML; /** * Convert RTF to HTML, RTF to TXT * * @author KNIGHTRCOM(rcom10002@163.com) * {@link http://blog.csdn.net/rcom10002} */ public class Main { private static String sourceFilename; private static String outputFilename; private static boolean isForced = false; // indicate to overwrite the existing file private static boolean isSystemListed = false; private static String type = "txt"; private static String encoding = "UTF-8"; /** * -i System information (optional) * -s Source RTF file (mandatory) * -o Output file name (optional) * --force Overwrite output file if it exists * * @param args * @throws Exception */ public static void main(String[] args) { try { if (args != null && args.length > 0) { for (String arg : args) { if (arg.equals("-i")) { isSystemListed = true; } else if (arg.startsWith("-s")) { sourceFilename = arg.substring(2); } else if (arg.startsWith("-o")) { outputFilename = arg.substring(2); } else if (arg.equals("--force")) { isForced = true; } else if (arg.startsWith("-t")) { type = arg.substring(2); if (!"txt".equals(type) && !"html".equals(type)) { isSystemListed = false; sourceFilename = null; break; } } else if (arg.startsWith("-e")) { encoding = arg.substring(2); } else { isSystemListed = false; sourceFilename = null; break; } } } if (((sourceFilename == null || !new File(sourceFilename).exists()) && !isSystemListed) || args != null && args.length == 1 && args[0].equals("--help")){ System.out.println("usage: java Main [-t<html|txt>] [-eEncodingName] [--force] -sSourceFileName [-oOutputFileName]"); System.out.println(" java Main [-i,--help]"); return; } if (isSystemListed) { listSystemInfo(); } executeConvertion(); } catch (Exception e) { System.err.print("Errmsg: " + e.getMessage()); System.exit(1); } } /** * List the system info you may concern */ public static void listSystemInfo() { 
System.getProperties().list(System.out); } /** * Convert RTF to required format * * @throws Exception */ private static void executeConvertion() throws Exception { long duration = new Date().getTime(); // Convert rtf to HTML String result = new RTF2HTML().convertRTFToHTML(new File(sourceFilename)); // This step is important for rendering the text with proper encoding result = new String(result.getBytes(System.getProperty("sun.jnu.encoding")), encoding); if ("txt".equals(type)) { // Extract plain text from HTML result = result.replaceAll("(?i)<br.*?/?>", "<br />/n").replaceAll("<.+?>", ""); result = StringUtils.unescapeHTML(result, 0); } // Write the result to the file if (outputFilename == null) { outputFilename = sourceFilename.concat(".txt"); } File outputFile = new File(outputFilename); if (outputFile.exists() && !isForced) { System.out.print("Warning: Output file already exists! Try execute with --force."); System.exit(-1); } FileWriter w = new FileWriter(outputFile); w.write(StringUtils.trimThroughLines(result)); w.close(); duration -= new Date().getTime(); System.out.print("Complete!(" + (duration / 1000 * -1) + "s)"); } } /** * http://www.rgagnon.com/javadetails/java-0307.html * */ class StringUtils { private StringUtils() { } private static HashMap<String, String> htmlEntities; static { htmlEntities = new HashMap<String, String>(); htmlEntities.put("<", "<"); htmlEntities.put(">", ">"); htmlEntities.put("&", "&"); htmlEntities.put(""", "/""); htmlEntities.put("à", "à"); htmlEntities.put("À", "À"); htmlEntities.put("â", "â"); htmlEntities.put("ä", "ä"); htmlEntities.put("Ä", "Ä"); htmlEntities.put("Â", "Â"); htmlEntities.put("å", "å"); htmlEntities.put("Å", "Å"); htmlEntities.put("æ", "æ"); htmlEntities.put("Æ", "Æ"); htmlEntities.put("ç", "ç"); htmlEntities.put("Ç", "Ç"); htmlEntities.put("é", "é"); htmlEntities.put("É", "É"); htmlEntities.put("è", "è"); htmlEntities.put("È", "È"); htmlEntities.put("ê", "ê"); htmlEntities.put("Ê", "Ê"); 
htmlEntities.put("ë", "ë"); htmlEntities.put("Ë", "Ë"); htmlEntities.put("ï", "ï"); htmlEntities.put("Ï", "Ï"); htmlEntities.put("ô", "ô"); htmlEntities.put("Ô", "Ô"); htmlEntities.put("ö", "ö"); htmlEntities.put("Ö", "Ö"); htmlEntities.put("ø", "ø"); htmlEntities.put("Ø", "Ø"); htmlEntities.put("ß", "ß"); htmlEntities.put("ù", "ù"); htmlEntities.put("Ù", "Ù"); htmlEntities.put("û", "û"); htmlEntities.put("Û", "Û"); htmlEntities.put("ü", "ü"); htmlEntities.put("Ü", "Ü"); htmlEntities.put("", " "); htmlEntities.put("©", "/u00a9"); htmlEntities.put("®", "/u00ae"); htmlEntities.put("€", "/u20a0"); } public static final String unescapeHTML(String source, int start) { int i, j; // 将递归算法转换成循环可以防止处理大数据时内存不足的缺陷 // i = source.indexOf("&", start); // if (i > -1) { // j = source.indexOf(";", i); // if (j > i) { // String entityToLookFor = source.substring(i, j + 1); // String value = (String) htmlEntities.get(entityToLookFor); // if (value != null) { // source = new StringBuilder().append( // source.substring(0, i)).append(value).append( // source.substring(j + 1)).toString(); // return unescapeHTML(source, i + 1); // recursive call // } // } // } while (true) { i = source.indexOf("&", start); if (i > -1) { j = source.indexOf(";", i); if (j > i) { String entityToLookFor = source.substring(i, j + 1); String value = (String) htmlEntities.get(entityToLookFor); if (value != null) { source = new StringBuilder().append( source.substring(0, i)).append(value).append( source.substring(j + 1)).toString(); start = i + 1; } } else { break; } } else { break; } } return source; } /** * Remove heading and tailing space or tab of all lines * * @param source * @return */ public static final String trimThroughLines(String source) { return source.replaceAll("(?ms)^//s+|//s+$", ""); } public static void main(String args[]) throws Exception { // to see accented character to the console java.io.PrintStream ps = new java.io.PrintStream(System.out, true, "Cp850"); String test = "© 2007 Réal Gagnon 
<www.rgagnon.com>"; ps.println(test + "/n-->/n" + unescapeHTML(test, 0)); /* * output : © 2007 Réal Gagnon <www.rgagnon.com> * --> © 2007 Réal Gagnon <www.rgagnon.com> */ } }

java -Dfile.encoding=UTF-8 -cp "C:/Documents and Settings/Administrator/My Documents/Workspace/eclipse/RTF/Document Parser;" Main -s"C:/Documents and Settings/Administrator/My Documents/Workspace/php eclipse/QAR Tool/questions/sample.rtf" -o"C:/my.txt" --force -eGB2312 -thtml

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值