最近在做一个将 HTML 转 PDF 的功能。在网上查阅了相关文档后,采用的实现方案是 html → xhtml → xsl-fo → pdf,分三步生成 PDF。
本部分是第一步(html → xhtml)的代码,使用了 jtidy.jar(参考了其他博客)。
/** Maximum number of leading bytes inspected when looking for a charset declaration. */
private static final int CHARSET_SNIFF_LIMIT = 1024;

/**
 * Matches a charset declaration inside a single {@code <meta ...>} tag, e.g.
 * {@code <meta charset="utf-8">} or
 * {@code <meta http-equiv="Content-Type" content="text/html; charset=gb2312">}.
 * The value may be double-quoted, single-quoted or unquoted; group 1 is the bare charset name.
 */
private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
        "<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)",
        Pattern.CASE_INSENSITIVE);

/**
 * Sniffs the character encoding declared in the first bytes of an HTML stream.
 *
 * <p>At most {@value #CHARSET_SNIFF_LIMIT} bytes are read (a single {@code read} call) and
 * searched for a {@code <meta>} charset declaration. The stream is always rewound to the
 * position it had on entry, whether or not a declaration was found, so the caller can parse
 * the document from the start afterwards.
 *
 * @param bin the HTML input; a {@link BufferedInputStream} is required because the method
 *            relies on {@code mark()} / {@code reset()}
 * @return the declared charset name exactly as written in the document, or
 *         {@code "iso8859-1"} when the stream is empty or no declaration is found
 * @throws IOException if reading from or resetting the stream fails
 */
public String getEncodingOfStream1(BufferedInputStream bin) throws IOException {
    byte[] head = new byte[CHARSET_SNIFF_LIMIT];

    // The mark limit equals the buffer size, so the reset below is always valid.
    bin.mark(CHARSET_SNIFF_LIMIT);
    int len = bin.read(head);
    bin.reset();

    // read() returns -1 on an empty stream; there is nothing to inspect then.
    if (len <= 0) {
        return "iso8859-1";
    }

    // Decode as ISO-8859-1: every byte maps to exactly one char, so nothing is lost
    // and the ASCII markup we search for is preserved regardless of the real encoding.
    String detector = new String(head, 0, len, "iso8859-1");
    Matcher m = META_CHARSET_PATTERN.matcher(detector);
    return m.find() ? m.group(1) : "iso8859-1";
}
/** Output location used by {@link #doTidy(String)} when the caller does not supply one. */
private static final String DEFAULT_XHTML_OUTPUT = "C:\\Users\\Administrator\\Desktop\\abcd.xhtml";

/**
 * Converts an HTML file to XHTML with JTidy and writes the result to the default
 * output location.
 *
 * @param f_in path of the HTML file to convert
 */
public void doTidy(String f_in) {
    doTidy(f_in, DEFAULT_XHTML_OUTPUT);
}

/**
 * Converts an HTML file to XHTML with JTidy.
 *
 * <p>The file is read once into memory, its declared charset is detected with
 * {@code getEncodingOfStream1}, the text is re-encoded as UTF-8 and handed to Tidy,
 * and the resulting XHTML is written to {@code f_out} and echoed to standard output.
 * Any failure is reported on the console rather than thrown.
 *
 * @param f_in  path of the HTML file to convert
 * @param f_out path of the XHTML file to create or overwrite
 */
public void doTidy(String f_in, String f_out) {
    try {
        byte[] raw = readHtmlBytes(f_in);

        // Detect the charset from the bytes already in memory instead of reopening the file.
        String declaredCharset =
                getEncodingOfStream1(new BufferedInputStream(new ByteArrayInputStream(raw)));
        String html = decodeHtml(raw, declaredCharset);

        // Tidy is configured for UTF-8 input below, so encode explicitly as UTF-8
        // rather than with the platform default charset.
        ByteArrayInputStream tidyIn = new ByteArrayInputStream(html.getBytes("UTF-8"));
        ByteArrayOutputStream tidyOut = new ByteArrayOutputStream();
        PrintWriter errout = new PrintWriter(System.out);

        Tidy tidy = new Tidy();
        tidy.setInputEncoding("UTF-8");
        tidy.setQuiet(true);
        tidy.setOutputEncoding("UTF-8");
        tidy.setShowWarnings(false); // do not display warnings
        tidy.setIndentContent(true);
        tidy.setSmartIndent(true);
        tidy.setIndentAttributes(false);
        tidy.setWraplen(1024); // line length at which output is wrapped
        tidy.setXHTML(true); // emit XHTML
        tidy.setErrout(errout);
        tidy.parse(tidyIn, tidyOut);
        // The writer is buffered; flush so diagnostics are not lost. System.out stays open.
        errout.flush();

        try (FileOutputStream out = new FileOutputStream(f_out)) {
            tidyOut.writeTo(out);
        }
        // Tidy produced UTF-8 bytes, so decode them as UTF-8 for the console echo.
        System.out.println(tidyOut.toString("UTF-8"));
    } catch (Exception ex) {
        // Best-effort tool: report the problem and return instead of propagating.
        System.out.println(ex.toString());
        ex.printStackTrace();
    }
}

/** Reads the whole file at {@code path} into a byte array, closing the file in all cases. */
private static byte[] readHtmlBytes(String path) throws IOException {
    try (FileInputStream in = new FileInputStream(path)) {
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            sink.write(chunk, 0, n);
        }
        return sink.toByteArray();
    }
}

/**
 * Decodes {@code raw} with {@code charsetName}, falling back to ISO-8859-1 when the JVM
 * does not support the name found in the document.
 */
private static String decodeHtml(byte[] raw, String charsetName)
        throws java.io.UnsupportedEncodingException {
    try {
        return new String(raw, charsetName);
    } catch (java.io.UnsupportedEncodingException unsupported) {
        return new String(raw, "iso8859-1");
    }
}
/**
 * Manual test entry point.
 *
 * @param args optional; when a first argument is given it is used as the path of the HTML
 *             file to convert, otherwise the built-in sample path is used
 */
public static void main(String[] args) {
    String inputPath = (args != null && args.length > 0)
            ? args[0]
            : "C:\\Users\\Administrator\\Desktop\\open(1)\\index.html";
    HtmlToXhtml converter = new HtmlToXhtml();
    converter.doTidy(inputPath); // start the conversion
}