Doc、Docx转成HTML

如何将doc、docx、txt、srt、lrc格式的文件转成HTML呢?
doc转HTML需要调用Java编写的外部工具 POIOfficeTool.exe;docx则直接在进程内通过 WordprocessingDocument 和 HtmlConverter 转换,不依赖该工具。
直接上代码:
doc、docx转HTML

 /// <summary>
        /// Converts a Word document to an HTML file, choosing the converter by the source file's extension.
        /// </summary>
        /// <remarks>
        /// <c>.doc</c> files are converted by the external <c>Java\POIOfficeTool.exe</c> into
        /// <c>targetFilePath + ".temp"</c>; that intermediate file is then read and post-processed by
        /// <c>ConvertHtmlCss</c>, which writes <paramref name="targetFilePath"/>.
        /// <c>.docx</c> files are handed to <c>ConvertToHtml</c>.
        /// Exceptions from the converters propagate unchanged, with their original stack trace.
        /// </remarks>
        /// <param name="srcFilePath">Path of the .doc or .docx file to convert.</param>
        /// <param name="targetFilePath">Path of the HTML file to produce.</param>
        /// <exception cref="NotSupportedException">The extension is neither .doc nor .docx.</exception>
        public void Doc2Html(string srcFilePath, string targetFilePath)
        {
            // Ordinal, case-insensitive comparison: no culture-dependent lower-casing and no
            // NullReferenceException when the path has no extension.
            string ext = Path.GetExtension(srcFilePath);

            if (string.Equals(ext, ".doc", StringComparison.OrdinalIgnoreCase))
            {
                string tempFilePath = targetFilePath + ".temp";

                using (System.Diagnostics.Process proc = new System.Diagnostics.Process())
                {
                    proc.StartInfo.WorkingDirectory = CommonHelper.BaseDirectory;
                    proc.StartInfo.FileName = Path.Combine(CommonHelper.BaseDirectory, @"Java\POIOfficeTool.exe");
                    proc.StartInfo.Arguments = " -word2html \"" + srcFilePath + "\" \"" + tempFilePath + "\"";
                    proc.StartInfo.UseShellExecute = false;
                    proc.StartInfo.RedirectStandardError = true;
                    proc.StartInfo.CreateNoWindow = true;
                    proc.Start();

                    // stderr is redirected, so it must be drained before waiting: otherwise the
                    // child blocks once the pipe buffer fills and WaitForExit never returns.
                    proc.StandardError.ReadToEnd();
                    proc.WaitForExit();
                    // The using block disposes the process; no explicit Close/Dispose needed.
                }

                using (StreamReader sr = new StreamReader(tempFilePath))
                {
                    ConvertHtmlCss(targetFilePath, sr.ReadToEnd());
                }
            }
            else if (string.Equals(ext, ".docx", StringComparison.OrdinalIgnoreCase))
            {
                ConvertToHtml(srcFilePath, targetFilePath);
            }
            else
            {
                // NotSupportedException derives from Exception, so existing catch blocks still match.
                throw new NotSupportedException("不支持的文件类型");
            }
        }
/// <summary>
        /// Default CSS rules for generated HTML. The text is placed inside a
        /// <c>style type="text/css"</c> element by the code that uses it.
        /// </summary>
        private const string DefaultStyle = @"
        body{background-color:#fff !important; margin:17px 0px 18px 0px !important;}*{font-family: 微软雅黑,Cambria,'sans serif';}div,p,span{font-size:16px;line-height:1.5;}br{line-height:normal;font-size:1px;}table {border-width: 1px;border-style: dotted;border-color: #C6C6C6;} table td, table th {border-width: 1px;border-style: dotted;border-color: #C6C6C6; }";
        /// <summary>
        /// Default HTML preamble: doctype, <c>html</c>/<c>head</c>, the opening <c>body</c> tag and a
        /// <c>style</c> element holding <c>DefaultStyle</c>, with all CR/LF characters removed.
        /// The <c>body</c> and <c>html</c> elements are left open; the writer of the document is
        /// expected to append the content and the closing tags.
        /// </summary>
        public static readonly string DefaultHeadHtmlTemplate =
        // The parentheses are required: member access binds tighter than '+', so without them
        // the Replace calls would only be applied to the "</style>" literal.
        (@"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
                <html xmlns=""http://www.w3.org/1999/xhtml"">
                    <head>
                        <meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"" />
                        <title></title>
                    </head>
                <body>
                <style type=""text/css"">" + DefaultStyle + "</style>").Replace("\r", string.Empty).Replace("\n", string.Empty);
/// <summary>
        /// Rewrites selected inline CSS declarations of <paramref name="node"/> as legacy HTML markup.
        /// </summary>
        /// <remarks>
        /// Processed in this order, each one using the last matching declaration in the style attribute:
        /// <c>color</c> (children wrapped in a <c>font</c> element with a <c>color</c> attribute),
        /// <c>font-style</c> (<c>italic</c> gives <c>em</c>, <c>oblique</c> gives <c>i</c>),
        /// <c>font-weight</c> (values containing <c>bold</c> give <c>strong</c>),
        /// <c>text-decoration</c> (<c>underline</c> gives <c>u</c>, otherwise <c>line-through</c> gives <c>del</c>),
        /// <c>text-align</c> (moved to an <c>align</c> attribute).
        /// A declaration that is converted is removed from the style attribute.
        /// Property names are matched exactly (case-sensitively), so <c>background-color</c> or
        /// <c>border-color</c> are not mistaken for <c>color</c>. <c>br</c> elements and nodes
        /// without a style attribute are left untouched.
        /// </remarks>
        /// <param name="node">Element whose style attribute and children are rewritten in place.</param>
        private static void ReplaceSpecialStyle(HtmlAgilityPack.HtmlNode node)
        {
            if (string.Equals(node.Name, "br", StringComparison.OrdinalIgnoreCase)) return;

            HtmlAgilityPack.HtmlAttribute styleAttribute = node.Attributes["style"];
            if (styleAttribute == null) return;

            int start, length;
            string value;

            // Font colour
            if (TryFindStyleDeclaration(styleAttribute.Value, "color", out start, out length, out value))
            {
                styleAttribute.Value = styleAttribute.Value.Remove(start, length);
                WrapInnerHtml(node, "<font color=\"" + value + "\">", "</font>");
            }

            // Font style
            if (TryFindStyleDeclaration(styleAttribute.Value, "font-style", out start, out length, out value))
            {
                string tag = null;
                if (value.Contains("italic"))
                    tag = "em";
                else if (value.Contains("oblique"))
                    tag = "i";

                if (tag != null)
                {
                    styleAttribute.Value = styleAttribute.Value.Remove(start, length);
                    WrapInnerHtml(node, "<" + tag + ">", "</" + tag + ">");
                }
            }

            // Bold
            if (TryFindStyleDeclaration(styleAttribute.Value, "font-weight", out start, out length, out value)
                && value.Contains("bold"))
            {
                styleAttribute.Value = styleAttribute.Value.Remove(start, length);
                WrapInnerHtml(node, "<strong>", "</strong>");
            }

            // Underline / strike-through (underline wins when both are present)
            if (TryFindStyleDeclaration(styleAttribute.Value, "text-decoration", out start, out length, out value))
            {
                string tag = null;
                if (value.Contains("underline"))
                    tag = "u";
                else if (value.Contains("line-through"))
                    tag = "del";

                if (tag != null)
                {
                    styleAttribute.Value = styleAttribute.Value.Remove(start, length);
                    WrapInnerHtml(node, "<" + tag + ">", "</" + tag + ">");
                }
            }

            // Horizontal alignment
            if (TryFindStyleDeclaration(styleAttribute.Value, "text-align", out start, out length, out value))
            {
                styleAttribute.Value = styleAttribute.Value.Remove(start, length);
                node.Attributes.Add("align", value);
            }
        }

        /// <summary>
        /// Locates the last declaration in an inline style string whose property name is exactly
        /// <paramref name="property"/>.
        /// </summary>
        /// <param name="style">Inline style text, declarations separated by ';'.</param>
        /// <param name="property">Property name to look for (ordinal, case-sensitive).</param>
        /// <param name="start">Index of the first character of the property name, or -1.</param>
        /// <param name="length">Number of characters from <paramref name="start"/> through the terminating ';' (or the end of the string).</param>
        /// <param name="value">The declaration's value, trimmed and lower-cased with the invariant culture.</param>
        /// <returns><c>true</c> when a matching declaration was found.</returns>
        private static bool TryFindStyleDeclaration(string style, string property, out int start, out int length, out string value)
        {
            start = -1;
            length = 0;
            value = null;
            if (string.IsNullOrEmpty(style)) return false;

            int segmentStart = 0;
            while (segmentStart < style.Length)
            {
                int semicolon = style.IndexOf(';', segmentStart);
                // Exclusive end of the declaration text (the ';' itself is not part of it).
                int segmentEnd = semicolon == -1 ? style.Length : semicolon;
                int colon = style.IndexOf(':', segmentStart, segmentEnd - segmentStart);

                if (colon > -1)
                {
                    string name = style.Substring(segmentStart, colon - segmentStart).Trim();
                    if (string.Equals(name, property, StringComparison.Ordinal))
                    {
                        // Skip leading whitespace so it stays in the style string after removal.
                        int nameStart = segmentStart;
                        while (char.IsWhiteSpace(style[nameStart])) nameStart++;

                        // Keep scanning: a later declaration of the same property overrides this one.
                        start = nameStart;
                        length = (semicolon == -1 ? segmentEnd : semicolon + 1) - nameStart;
                        value = style.Substring(colon + 1, segmentEnd - colon - 1).Trim().ToLowerInvariant();
                    }
                }

                segmentStart = segmentEnd + 1;
            }

            return start > -1;
        }

        /// <summary>
        /// Replaces the children of <paramref name="node"/> with a single element built from
        /// <paramref name="openTag"/> + the node's current inner HTML + <paramref name="closeTag"/>.
        /// </summary>
        private static void WrapInnerHtml(HtmlAgilityPack.HtmlNode node, string openTag, string closeTag)
        {
            string inner = node.InnerHtml;
            node.RemoveAllChildren();
            node.AppendChild(HtmlAgilityPack.HtmlNode.CreateNode(openTag + inner + closeTag));
        }
/// <summary>
        /// Post-processes converted HTML and saves it as UTF-8 to <paramref name="htmlPath"/>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. every non-empty <c>style</c> element is parsed (CSS comments stripped first), removed from
        ///    the document and re-emitted, together with <c>DefaultStyle</c>, as one <c>style</c> element at the start of <c>body</c>;
        /// 2. <c>span</c> elements that contain only whitespace get their spaces replaced by <c>&amp;nbsp;</c>;
        /// 3. elements with a <c>style</c> attribute (except <c>img</c>) are passed through <c>ReplaceSpecialStyle</c> and the inline CSS parser;
        /// 4. for elements with a <c>class</c> attribute (except <c>img</c>), every class that has a rule in the parsed CSS
        ///    is merged into the <c>style</c> attribute and dropped from the class list.
        /// Nothing is saved when the markup has no <c>body</c> element.
        /// </remarks>
        /// <param name="htmlPath">Path of the HTML file to write.</param>
        /// <param name="content">HTML markup to process.</param>
        private void ConvertHtmlCss(string htmlPath, string content)
        {
            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.LoadHtml(content);

            HtmlAgilityPack.HtmlNode bodyNode = html.DocumentNode.SelectSingleNode("//body");
            if (bodyNode == null) return;

            // "//style" is an absolute path, so it is evaluated from the document node; this
            // also works for markup that has no <head> element.
            HtmlAgilityPack.HtmlNodeCollection styleList = html.DocumentNode.SelectNodes("//style");
            StringBuilder classCss = new StringBuilder();

            // Move the style sheets into the body.
            if (styleList != null)
            {
                StringBuilder cssText = new StringBuilder(@"<style type=""text/css"">");
                // Singleline so that comments spanning several lines are stripped as well.
                Regex cssComment = new Regex(@"/\*.*?\*/", RegexOptions.Singleline);
                foreach (HtmlAgilityPack.HtmlNode node in styleList)
                {
                    string innerText = node.InnerHtml;
                    if (string.IsNullOrEmpty(innerText)) continue;

                    ZetaHtmlEditControl.Code.Css.CssParse parse = new ZetaHtmlEditControl.Code.Css.CssParse();
                    parse.Source = cssComment.Replace(innerText, string.Empty);
                    string parsedCss = parse.ParseToHtml();
                    classCss.Append(parsedCss);
                    cssText.Append(parsedCss);

                    // Remove from the actual parent: the style element is not necessarily a
                    // direct child of <head>.
                    node.ParentNode.RemoveChild(node);
                }

                cssText.Append(DefaultStyle).Append("</style>");
                bodyNode.PrependChild(HtmlAgilityPack.HtmlNode.CreateNode(cssText.ToString()));
            }
            else
            {
                bodyNode.PrependChild(HtmlAgilityPack.HtmlNode.CreateNode(@"<style type=""text/css"">" + DefaultStyle + "</style>"));
            }

            // Whitespace-only spans: turn spaces into non-breaking spaces.
            var nodes = bodyNode.SelectNodes(@"//span");
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    string innerHtml = node.InnerHtml;
                    // The pattern requires at least one character, so innerHtml is never empty here.
                    if (Regex.IsMatch(innerHtml, @"^\s+$"))
                    {
                        node.InnerHtml = innerHtml.Replace(" ", "&nbsp;");
                    }
                }
            }

            // Normalise existing inline styles.
            nodes = bodyNode.SelectNodes(@"//*[@style]");
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    if (string.Equals(node.Name, "img", StringComparison.OrdinalIgnoreCase))
                        continue;
                    ReplaceSpecialStyle(node);
                    node.Attributes["style"].Value = ZetaHtmlEditControl.Code.Html.InlineCssParser.ParseToHtml(node.Attributes["style"].Value);
                }
            }

            // Convert class rules to inline styles.
            nodes = bodyNode.SelectNodes(@"//*[@class]");
            if (nodes != null && classCss.Length > 0)
            {
                ZetaHtmlEditControl.Code.Css.CssDocument cssDoc = new ZetaHtmlEditControl.Code.Css.CssDocument();
                cssDoc.Load(classCss.ToString());

                const string style = "style";
                const string dot = ".";
                const string cls = "class";

                foreach (var node in nodes)
                {
                    if (string.Equals(node.Name, "img", StringComparison.OrdinalIgnoreCase))
                        continue;

                    string[] classNames = node.Attributes[cls].Value.Trim().ToLower()
                        .Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

                    // Classes without a CSS rule are collected token by token. Removing resolved
                    // names with string.Replace would also cut them out of longer class names
                    // that merely contain them.
                    StringBuilder unresolved = new StringBuilder();

                    foreach (string clsValue in classNames)
                    {
                        ZetaHtmlEditControl.Code.Css.CssElement cssElement = cssDoc[dot + clsValue];
                        if (cssElement == null)
                        {
                            if (unresolved.Length > 0) unresolved.Append(' ');
                            unresolved.Append(clsValue);
                            continue;
                        }

                        // Class rules go first so declarations already on the element come later in the string.
                        if (node.Attributes.Contains(style))
                            node.Attributes[style].Value = cssElement.Value + node.Attributes[style].Value;
                        else
                            node.Attributes.Add(style, cssElement.Value);

                        ReplaceSpecialStyle(node);
                    }

                    if (unresolved.Length < 1)
                    {
                        node.Attributes.Remove(cls);
                    }
                    else
                    {
                        node.Attributes[cls].Value = unresolved.ToString();
                    }
                }

                // NOTE(review): kept from the original; presumably just releases the parsed rules — confirm it is needed.
                cssDoc.Elements.Clear();
            }

            html.Save(htmlPath, Encoding.UTF8);
        }
/// <summary>
        /// Converts a .docx file to HTML markup and hands the markup to <c>ConvertHtmlCss</c>,
        /// which post-processes it and writes <paramref name="targetFilePath"/>.
        /// </summary>
        /// <remarks>
        /// The document is loaded into memory first. Embedded images are saved into a folder named
        /// after the target file (without extension) plus <c>_images</c>, located next to the target
        /// file, and are referenced from the markup relative to that folder's name.
        /// </remarks>
        /// <param name="srcFilePath">Path of the .docx file to read.</param>
        /// <param name="targetFilePath">Path of the HTML file to produce.</param>
        private void ConvertToHtml(string srcFilePath, string targetFilePath)
        {
            FileInfo sourceInfo = new FileInfo(srcFilePath);
            byte[] documentBytes = File.ReadAllBytes(sourceInfo.FullName);

            using (MemoryStream buffer = new MemoryStream())
            {
                buffer.Write(documentBytes, 0, documentBytes.Length);

                using (WordprocessingDocument wDoc = WordprocessingDocument.Open(buffer, true, new OpenSettings { AutoSave = false }))
                {
                    // Image folder: "<target name>_images" beside the target file.
                    string imageDirectoryName = Path.Combine(
                        Path.GetDirectoryName(targetFilePath),
                        Path.GetFileNameWithoutExtension(targetFilePath) + "_images");
                    if (!Directory.Exists(imageDirectoryName))
                    {
                        Directory.CreateDirectory(imageDirectoryName);
                    }

                    // Counts every image offered to the handler, including skipped ones.
                    int imageCounter = 0;

                    // Title: the document's dc:title when available, else the source path.
                    string pageTitle = sourceInfo.FullName;
                    var corePart = wDoc.CoreFilePropertiesPart;
                    if (corePart != null)
                    {
                        pageTitle = (string)corePart.GetXDocument().Descendants(DC.title).FirstOrDefault() ?? sourceInfo.FullName;
                    }

                    HtmlConverterSettings settings = new HtmlConverterSettings();
                    settings.PageTitle = pageTitle;
                    settings.FabricateCssClasses = true;
                    settings.CssClassPrefix = "pt-";
                    settings.RestrictToSupportedLanguages = false;
                    settings.RestrictToSupportedNumberingFormats = false;
                    settings.ImageHandler = imageInfo =>
                    {
                        DirectoryInfo localDirInfo = new DirectoryInfo(imageDirectoryName);
                        if (!localDirInfo.Exists)
                        {
                            localDirInfo.Create();
                        }

                        ++imageCounter;

                        // Map the MIME subtype to an output format; unknown subtypes are skipped.
                        string extension = imageInfo.ContentType.Split('/')[1].ToLower();
                        ImageFormat imageFormat;
                        switch (extension)
                        {
                            case "png":
                                imageFormat = ImageFormat.Png;
                                break;
                            case "gif":
                                imageFormat = ImageFormat.Gif;
                                break;
                            case "bmp":
                                imageFormat = ImageFormat.Bmp;
                                break;
                            case "jpeg":
                            case "jpg":
                                imageFormat = ImageFormat.Jpeg;
                                break;
                            case "tiff":
                                // TIFF images are written out as GIF.
                                extension = "gif";
                                imageFormat = ImageFormat.Gif;
                                break;
                            case "x-wmf":
                                extension = "wmf";
                                imageFormat = ImageFormat.Wmf;
                                break;
                            default:
                                imageFormat = null;
                                break;
                        }

                        if (imageFormat == null)
                        {
                            return null;
                        }

                        // Shared tail of both the on-disk path and the src attribute.
                        string imageTail = "/image" + imageCounter.ToString() + "." + extension;
                        string imageFileName = imageDirectoryName + imageTail;
                        try
                        {
                            imageInfo.Bitmap.Save(imageFileName, imageFormat);
                        }
                        catch (System.Runtime.InteropServices.ExternalException)
                        {
                            // The image could not be saved: leave it out of the output.
                            return null;
                        }

                        string imageSource = localDirInfo.Name + imageTail;

                        XAttribute altAttribute = null;
                        if (imageInfo.AltText != null)
                        {
                            altAttribute = new XAttribute(NoNamespace.alt, imageInfo.AltText);
                        }

                        return new XElement(Xhtml.img,
                            new XAttribute(NoNamespace.src, imageSource),
                            imageInfo.ImgStyleAttribute,
                            altAttribute);
                    };

                    XElement htmlElement = HtmlConverter.ConvertToHtml(wDoc, settings);

                    XDocument htmlDocument = new XDocument(
                        new XDocumentType("html", null, null, null),
                        htmlElement);

                    ConvertHtmlCss(targetFilePath, htmlDocument.ToString(SaveOptions.DisableFormatting));
                }
            }
        }

txt、srt、lrc转HTML

if (extension == ".txt" || extension == ".srt" || extension == ".lrc")
                {
                    // Plain-text sources: every non-blank line becomes a <p>, every blank line a <br>.
                    // The markup is buffered and written out in chunks once the buffer reaches 500 characters.
                    Regex blankLine = new Regex(@"^(\s)*$", RegexOptions.Compiled);

                    using (StreamWriter sw = new StreamWriter(tempDir2 + tempFileName, true, System.Text.Encoding.UTF8))
                    {
                        // The template leaves <body> and <html> open; they are closed below.
                        sw.Write(DefaultHeadHtmlTemplate);

                        StringBuilder lines = new StringBuilder(800);
                        using (StreamReader sr = new StreamReader(filePath, CommonHelper.GetEncoding(filePath, System.Text.Encoding.Default)))
                        {
                            while (!sr.EndOfStream)
                            {
                                str = sr.ReadLine();
                                if (blankLine.IsMatch(str))
                                {
                                    lines.Append("<br>");
                                }
                                else
                                {
                                    lines.Append("<p>").Append(str).Append("</p>");
                                }

                                if (lines.Length < 500)
                                {
                                    continue;
                                }

                                // Flush the chunk and start over with a fresh buffer.
                                sw.Write(lines.ToString());
                                lines = new StringBuilder(800);
                            }
                        }

                        // Write whatever is left, then close the document.
                        if (lines.Length > 0)
                        {
                            sw.WriteLine(lines.ToString());
                        }
                        sw.WriteLine("</body>");
                        sw.WriteLine("</html>");
                    }
                }
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值