如何将doc、docx、txt、srt、lrc格式的文件转成HTML呢?
doc 转 HTML 需要调用用 Java 编写的 POIOfficeTool.exe;docx 则直接在 C# 中转换(见下文的 ConvertToHtml 方法)。
直接上代码:
doc、docx转HTML
/// <summary>
/// Converts a Word document to an HTML file.
/// <c>.doc</c> files are converted by the external <c>Java\POIOfficeTool.exe</c> into
/// <paramref name="targetFilePath"/> + ".temp", whose content is then post-processed by
/// <c>ConvertHtmlCss</c>; <c>.docx</c> files are handled by <c>ConvertToHtml</c>.
/// </summary>
/// <param name="srcFilePath">Path of the source .doc/.docx file.</param>
/// <param name="targetFilePath">Path of the HTML file to write.</param>
/// <exception cref="NotSupportedException">The file extension is neither .doc nor .docx.</exception>
public void Doc2Html(string srcFilePath, string targetFilePath)
{
    // No try/catch here: the former "catch (Exception e) { throw e; }" only destroyed the stack trace.
    string ext = Path.GetExtension(srcFilePath);
    if (string.Equals(ext, ".doc", StringComparison.OrdinalIgnoreCase))
    {
        string tempFilePath = targetFilePath + ".temp";
        using (System.Diagnostics.Process proc = new System.Diagnostics.Process())
        {
            proc.StartInfo.WorkingDirectory = CommonHelper.BaseDirectory;
            proc.StartInfo.FileName = Path.Combine(CommonHelper.BaseDirectory, @"Java\POIOfficeTool.exe");
            proc.StartInfo.Arguments = " -word2html \"" + srcFilePath + "\" \"" + tempFilePath + "\"";
            proc.StartInfo.UseShellExecute = false;
            proc.StartInfo.RedirectStandardError = true;
            proc.StartInfo.CreateNoWindow = true;
            proc.Start();
            // stderr is redirected, so it must be drained before waiting: otherwise the child
            // blocks once the pipe buffer fills up and WaitForExit() never returns.
            proc.StandardError.ReadToEnd();
            proc.WaitForExit();
            // The using statement disposes the process; explicit Close()/Dispose() is unnecessary.
        }

        // NOTE(review): the ".temp" file is left on disk, as in the original code — confirm
        // whether it can be deleted after conversion.
        using (StreamReader sr = new StreamReader(tempFilePath))
        {
            ConvertHtmlCss(targetFilePath, sr.ReadToEnd());
        }
    }
    else if (string.Equals(ext, ".docx", StringComparison.OrdinalIgnoreCase))
    {
        ConvertToHtml(srcFilePath, targetFilePath);
    }
    else
    {
        throw new NotSupportedException("不支持的文件类型");
    }
}
/// <summary>
/// Default CSS rules added to the generated HTML (page background/margins, fonts,
/// line height and dotted table borders).
/// </summary>
private const string DefaultStyle = @"
body{background-color:#fff !important; margin:17px 0px 18px 0px !important;}*{font-family: 微软雅黑,Cambria,'sans serif';}div,p,span{font-size:16px;line-height:1.5;}br{line-height:normal;font-size:1px;}table {border-width: 1px;border-style: dotted;border-color: #C6C6C6;} table td, table th {border-width: 1px;border-style: dotted;border-color: #C6C6C6; }";
/// <summary>
/// Default HTML prologue: doctype, <c>&lt;head&gt;</c>, the opening <c>&lt;body&gt;</c> tag and a
/// <c>&lt;style&gt;</c> element containing <c>DefaultStyle</c>, with all CR/LF characters removed.
/// Callers append the body content followed by the closing <c>&lt;/body&gt;&lt;/html&gt;</c> tags.
/// </summary>
public static readonly string DefaultHeadHtmlTemplate =
// The parentheses matter: without them the Replace calls bind only to the "</style>"
// literal (member access has higher precedence than +) and no line break is stripped.
(@"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
<html xmlns=""http://www.w3.org/1999/xhtml"">
<head>
<meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"" />
<title></title>
</head>
<body>
<style type=""text/css"">" + DefaultStyle + "</style>").Replace("\r", string.Empty).Replace("\n", string.Empty);
/// <summary>
/// Replaces selected inline CSS declarations of <paramref name="node"/> with equivalent
/// HTML markup. The handled declaration is removed from the <c>style</c> attribute and:
/// <list type="bullet">
/// <item><c>color</c> wraps the content in <c>&lt;font color="…"&gt;</c>;</item>
/// <item><c>font-style</c> italic / oblique wraps it in <c>&lt;em&gt;</c> / <c>&lt;i&gt;</c>;</item>
/// <item><c>font-weight</c> containing "bold" wraps it in <c>&lt;strong&gt;</c>;</item>
/// <item><c>text-decoration</c> underline / line-through wraps it in <c>&lt;u&gt;</c> / <c>&lt;del&gt;</c>;</item>
/// <item><c>text-align</c> becomes an <c>align</c> attribute.</item>
/// </list>
/// <c>br</c> elements and nodes without a <c>style</c> attribute are left untouched.
/// </summary>
/// <param name="node">Element whose style attribute and children are rewritten in place.</param>
private static void ReplaceSpecialStyle(HtmlAgilityPack.HtmlNode node)
{
    if (node == null || string.Equals(node.Name, "br", StringComparison.OrdinalIgnoreCase)) return;

    HtmlAgilityPack.HtmlAttribute styleAttribute = node.Attributes["style"];
    if (styleAttribute == null) return;

    int start, length;
    string value;

    // Font colour. Only a stand-alone "color" property is matched, so "background-color"
    // or "border-color" are no longer mistaken for the text colour.
    if (TryFindStyleDeclaration(styleAttribute.Value, "color", out start, out length, out value))
    {
        styleAttribute.Value = styleAttribute.Value.Remove(start, length);
        WrapInnerHtml(node, "<font color=\"" + value.Replace("\"", "&quot;") + "\">", "</font>");
    }

    // Font style.
    if (TryFindStyleDeclaration(styleAttribute.Value, "font-style", out start, out length, out value))
    {
        if (value.Contains("italic"))
        {
            styleAttribute.Value = styleAttribute.Value.Remove(start, length);
            WrapInnerHtml(node, "<em>", "</em>");
        }
        else if (value.Contains("oblique"))
        {
            styleAttribute.Value = styleAttribute.Value.Remove(start, length);
            WrapInnerHtml(node, "<i>", "</i>");
        }
    }

    // Font weight ("bold" also matches "bolder", as before).
    if (TryFindStyleDeclaration(styleAttribute.Value, "font-weight", out start, out length, out value)
        && value.Contains("bold"))
    {
        styleAttribute.Value = styleAttribute.Value.Remove(start, length);
        WrapInnerHtml(node, "<strong>", "</strong>");
    }

    // Text decoration: underline takes precedence over line-through.
    if (TryFindStyleDeclaration(styleAttribute.Value, "text-decoration", out start, out length, out value))
    {
        if (value.Contains("underline"))
        {
            styleAttribute.Value = styleAttribute.Value.Remove(start, length);
            WrapInnerHtml(node, "<u>", "</u>");
        }
        else if (value.Contains("line-through"))
        {
            styleAttribute.Value = styleAttribute.Value.Remove(start, length);
            WrapInnerHtml(node, "<del>", "</del>");
        }
    }

    // Text alignment becomes an "align" attribute.
    if (TryFindStyleDeclaration(styleAttribute.Value, "text-align", out start, out length, out value))
    {
        styleAttribute.Value = styleAttribute.Value.Remove(start, length);
        node.Attributes.Add("align", value);
    }
}

/// <summary>
/// Locates the last declaration of <paramref name="property"/> in an inline style string.
/// A match counts only when the name starts a declaration (start of string, or preceded by
/// ';' or whitespace) and is followed, after optional whitespace, by ':'.
/// </summary>
/// <param name="style">Inline style text to search.</param>
/// <param name="property">CSS property name, compared case-insensitively.</param>
/// <param name="start">Index of the first character of the declaration, or -1.</param>
/// <param name="length">Length of the declaration including its terminating ';' if present.</param>
/// <param name="value">Declaration value, trimmed and lower-cased (invariant culture).</param>
/// <returns><c>true</c> when a declaration was found.</returns>
private static bool TryFindStyleDeclaration(string style, string property, out int start, out int length, out string value)
{
    start = -1;
    length = 0;
    value = null;
    if (string.IsNullOrEmpty(style)) return false;

    int searchFrom = style.Length - 1;
    while (searchFrom >= 0)
    {
        int index = style.LastIndexOf(property, searchFrom, StringComparison.OrdinalIgnoreCase);
        if (index < 0) return false;

        bool startsDeclaration = index == 0
            || style[index - 1] == ';'
            || char.IsWhiteSpace(style[index - 1]);

        int colon = index + property.Length;
        while (colon < style.Length && char.IsWhiteSpace(style[colon])) colon++;

        if (startsDeclaration && colon < style.Length && style[colon] == ':')
        {
            int end = style.IndexOf(';', colon);
            if (end < 0) end = style.Length - 1; // last declaration without a trailing ';'

            start = index;
            length = end - index + 1;
            value = style.Substring(colon + 1, end - colon).TrimEnd(';').Trim().ToLowerInvariant();
            return true;
        }

        // Continue with occurrences that begin before the rejected one.
        searchFrom = index + property.Length - 2;
    }
    return false;
}

/// <summary>
/// Replaces the children of <paramref name="node"/> with a single element built from
/// <paramref name="openTag"/> + the node's former inner HTML + <paramref name="closeTag"/>.
/// </summary>
private static void WrapInnerHtml(HtmlAgilityPack.HtmlNode node, string openTag, string closeTag)
{
    string content = node.InnerHtml;
    node.RemoveAllChildren();
    node.AppendChild(HtmlAgilityPack.HtmlNode.CreateNode(openTag + content + closeTag));
}
/// <summary>
/// Post-processes converted HTML and saves it as UTF-8 to <paramref name="htmlPath"/>:
/// moves the document's <c>&lt;style&gt;</c> rules (plus <c>DefaultStyle</c>) to the top of
/// <c>&lt;body&gt;</c>, makes whitespace-only spans visible, rewrites inline styles through
/// <c>ReplaceSpecialStyle</c>, and inlines CSS class rules into <c>style</c> attributes
/// (class selectors only). Nothing is written when the document has no <c>&lt;body&gt;</c>.
/// </summary>
/// <param name="htmlPath">Path of the HTML file to write.</param>
/// <param name="content">HTML markup to process.</param>
private void ConvertHtmlCss(string htmlPath, string content)
{
    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.LoadHtml(content);
    HtmlAgilityPack.HtmlNode bodyNode = html.DocumentNode.SelectSingleNode("//body");
    if (bodyNode == null) return;

    // "//style" is an absolute path, so it selects every style element in the document.
    // Querying from the document node avoids a NullReferenceException when <head> is missing.
    HtmlAgilityPack.HtmlNodeCollection styleList = html.DocumentNode.SelectNodes("//style");
    StringBuilder classCss = new StringBuilder();

    // Move the style rules into the body.
    if (styleList != null)
    {
        StringBuilder cssText = new StringBuilder(@"<style type=""text/css"">");
        // Singleline lets '.' cross line breaks so multi-line CSS comments are stripped too.
        Regex commentRegex = new Regex(@"/\*.*?\*/", RegexOptions.Compiled | RegexOptions.Singleline);
        foreach (HtmlAgilityPack.HtmlNode node in styleList)
        {
            string innerText = node.InnerHtml;
            if (string.IsNullOrEmpty(innerText)) continue;

            ZetaHtmlEditControl.Code.Css.CssParse parse = new ZetaHtmlEditControl.Code.Css.CssParse();
            parse.Source = commentRegex.Replace(innerText, string.Empty);
            string parsedCss = parse.ParseToHtml();
            classCss.Append(parsedCss);
            cssText.Append(parsedCss);

            // Remove from the actual parent: the style element is not necessarily a direct
            // child of <head>, in which case head.RemoveChild(node) would throw.
            if (node.ParentNode != null)
                node.ParentNode.RemoveChild(node);
        }
        cssText.Append(DefaultStyle).Append("</style>");
        bodyNode.PrependChild(HtmlAgilityPack.HtmlNode.CreateNode(cssText.ToString()));
    }
    else
    {
        bodyNode.PrependChild(HtmlAgilityPack.HtmlNode.CreateNode(@"<style type=""text/css"">" + DefaultStyle + "</style>"));
    }

    // Whitespace-only spans collapse when rendered; turn their spaces into non-breaking
    // spaces so the spacing survives.
    var nodes = bodyNode.SelectNodes(@"//span");
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            string innerHtml = node.InnerHtml;
            if (Regex.IsMatch(innerHtml, @"^\s+$"))
                node.InnerHtml = innerHtml.Replace(" ", "&nbsp;");
        }
    }

    // Inline style fix-up (images are skipped).
    nodes = bodyNode.SelectNodes(@"//*[@style]");
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            if (string.Equals(node.Name, "img", StringComparison.OrdinalIgnoreCase))
                continue;
            ReplaceSpecialStyle(node);
            node.Attributes["style"].Value = ZetaHtmlEditControl.Code.Html.InlineCssParser.ParseToHtml(node.Attributes["style"].Value);
        }
    }

    // Convert class rules to inline styles.
    nodes = bodyNode.SelectNodes(@"//*[@class]");
    if (nodes != null && classCss.Length > 0)
    {
        ZetaHtmlEditControl.Code.Css.CssDocument cssDoc = new ZetaHtmlEditControl.Code.Css.CssDocument();
        cssDoc.Load(classCss.ToString());
        const string style = "style";
        const string dot = ".";
        const string cls = "class";
        char[] classSeparators = { ' ', '\t', '\r', '\n' };
        StringBuilder remainingClasses = new StringBuilder();
        foreach (var node in nodes)
        {
            if (string.Equals(node.Name, "img", StringComparison.OrdinalIgnoreCase))
                continue;

            string classValues = node.Attributes[cls].Value.Trim().ToLower();
            remainingClasses.Length = 0;
            foreach (string clsValue in classValues.Split(classSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                ZetaHtmlEditControl.Code.Css.CssElement cssElement = cssDoc[dot + clsValue];
                if (cssElement == null)
                {
                    // Unresolved classes are kept as whole tokens. The previous
                    // string.Replace(clsValue, "") also deleted the name wherever it occurred
                    // as a substring of another class name.
                    if (remainingClasses.Length > 0) remainingClasses.Append(' ');
                    remainingClasses.Append(clsValue);
                    continue;
                }

                // Class rules go first so the element's own inline declarations follow them.
                if (node.Attributes.Contains(style))
                    node.Attributes[style].Value = cssElement.Value + node.Attributes[style].Value;
                else
                    node.Attributes.Add(style, cssElement.Value);
                ReplaceSpecialStyle(node);
            }

            if (remainingClasses.Length < 1)
                node.Attributes.Remove(cls);
            else
                node.Attributes[cls].Value = remainingClasses.ToString();
        }
        cssDoc.Elements.Clear();
    }

    html.Save(htmlPath, Encoding.UTF8);
}
/// <summary>
/// Converts a .docx file to HTML: the document is loaded into memory, converted with
/// <c>HtmlConverter.ConvertToHtml</c>, and the resulting markup is handed to
/// <c>ConvertHtmlCss</c>, which writes <paramref name="targetFilePath"/>.
/// Embedded images are saved into a "&lt;target name&gt;_images" directory next to the target file.
/// </summary>
/// <param name="srcFilePath">Path of the source .docx file.</param>
/// <param name="targetFilePath">Path of the HTML file to produce.</param>
private void ConvertToHtml(string srcFilePath, string targetFilePath)
{
    FileInfo sourceInfo = new FileInfo(srcFilePath);
    byte[] documentBytes = File.ReadAllBytes(sourceInfo.FullName);
    using (MemoryStream documentStream = new MemoryStream())
    {
        // The document is opened editable on an in-memory copy, so the source file itself is never modified.
        documentStream.Write(documentBytes, 0, documentBytes.Length);
        OpenSettings openSettings = new OpenSettings { AutoSave = false };
        using (WordprocessingDocument wDoc = WordprocessingDocument.Open(documentStream, true, openSettings))
        {
            string imageDirectoryName = Path.Combine(
                Path.GetDirectoryName(targetFilePath),
                Path.GetFileNameWithoutExtension(targetFilePath) + "_images");
            if (!Directory.Exists(imageDirectoryName))
            {
                Directory.CreateDirectory(imageDirectoryName);
            }

            int imageCounter = 0;

            // Page title: the document's dc:title when available, otherwise the source path.
            string pageTitle = sourceInfo.FullName;
            var part = wDoc.CoreFilePropertiesPart;
            if (part != null)
            {
                pageTitle = (string)part.GetXDocument().Descendants(DC.title).FirstOrDefault() ?? sourceInfo.FullName;
            }

            HtmlConverterSettings settings = new HtmlConverterSettings();
            settings.PageTitle = pageTitle;
            settings.FabricateCssClasses = true;
            settings.CssClassPrefix = "pt-";
            settings.RestrictToSupportedLanguages = false;
            settings.RestrictToSupportedNumberingFormats = false;
            settings.ImageHandler = imageInfo =>
            {
                DirectoryInfo localDirInfo = new DirectoryInfo(imageDirectoryName);
                if (!localDirInfo.Exists)
                {
                    localDirInfo.Create();
                }
                imageCounter++;

                // Map the MIME subtype to an output format; unsupported types yield no image.
                string extension = imageInfo.ContentType.Split('/')[1].ToLower();
                ImageFormat imageFormat;
                switch (extension)
                {
                    case "png":
                        imageFormat = ImageFormat.Png;
                        break;
                    case "gif":
                        imageFormat = ImageFormat.Gif;
                        break;
                    case "bmp":
                        imageFormat = ImageFormat.Bmp;
                        break;
                    case "jpeg":
                    case "jpg":
                        imageFormat = ImageFormat.Jpeg;
                        break;
                    case "tiff":
                        // TIFF images are written out as GIF.
                        extension = "gif";
                        imageFormat = ImageFormat.Gif;
                        break;
                    case "x-wmf":
                        extension = "wmf";
                        imageFormat = ImageFormat.Wmf;
                        break;
                    default:
                        return null;
                }

                string relativeName = "/image" + imageCounter.ToString() + "." + extension;
                try
                {
                    imageInfo.Bitmap.Save(imageDirectoryName + relativeName, imageFormat);
                }
                catch (System.Runtime.InteropServices.ExternalException)
                {
                    // The image could not be written; drop it from the output.
                    return null;
                }

                // src is relative to the HTML file: "<image directory name>/imageN.ext".
                return new XElement(Xhtml.img,
                    new XAttribute(NoNamespace.src, localDirInfo.Name + relativeName),
                    imageInfo.ImgStyleAttribute,
                    imageInfo.AltText != null ? new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
            };

            XElement htmlElement = HtmlConverter.ConvertToHtml(wDoc, settings);
            XDocument htmlDocument = new XDocument(
                new XDocumentType("html", null, null, null),
                htmlElement);
            ConvertHtmlCss(targetFilePath, htmlDocument.ToString(SaveOptions.DisableFormatting));
        }
    }
}
txt、srt、lrc转HTML
// Plain-text formats (.txt/.srt/.lrc): write the default HTML head, then one <p> per
// non-blank line and a <br> per blank line, and finally close body and html.
// NOTE(review): the line text is written without HTML encoding, so '<', '>' and '&' in the
// source end up in the markup verbatim — confirm whether that is intended.
if (extension == ".txt" || extension == ".srt" || extension == ".lrc")
{
    Regex regex = new Regex(@"^(\s)*$", RegexOptions.Compiled);
    using (StreamWriter sw = new StreamWriter(tempDir2 + tempFileName, true, System.Text.Encoding.UTF8))
    {
        sw.Write(DefaultHeadHtmlTemplate);
        StringBuilder lines = new StringBuilder(800);
        using (StreamReader sr = new StreamReader(filePath, CommonHelper.GetEncoding(filePath, System.Text.Encoding.Default)))
        {
            while (!sr.EndOfStream)
            {
                str = sr.ReadLine();
                if (regex.IsMatch(str))
                {
                    // Blank (empty or whitespace-only) line.
                    lines.Append("<br>");
                }
                else
                {
                    lines.Append("<p>").Append(str).Append("</p>");
                }

                // Flush to the writer in batches of roughly 500 characters.
                if (lines.Length >= 500)
                {
                    sw.Write(lines.ToString());
                    lines = new StringBuilder(800);
                }
            }
        }

        // Write whatever remains after the last batch.
        if (lines.Length > 0)
        {
            sw.WriteLine(lines.ToString());
        }
        sw.WriteLine("</body>");
        sw.WriteLine("</html>");
    }
}