asp.net清除word格式(C#)

/// <summary>
/// Command-line entry point: reads the HTML file named by the first argument, strips
/// Word-specific markup, encodes typographic characters as HTML entities, and writes the
/// result to a file named after the input with a ".modified.htm" suffix.
/// </summary>
/// <param name="args">Command-line arguments; args[0] is the path of the input file.</param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    string filepath = args[0];

    // A bare file name (no directory part) is resolved against the current directory.
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }

    // Check the resolved path and stop here; otherwise ReadAllText below would throw.
    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");

    html = CleanWordHtml(html);
    html = FixEntities(html);

    // The output name has no directory part, so it is written relative to the current directory.
    filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
    File.WriteAllText(filepath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}

/// <summary>
/// Removes markup that Microsoft Word adds when saving a document as HTML: comments, the
/// title element, class/style attributes, wrapper tags, empty paragraphs and v: attributes.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>The cleaned HTML; a null or empty input is returned unchanged.</returns>
static string CleanWordHtml(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Applied in order; every match is replaced with the empty string.
    string[] patterns =
    {
        // get rid of unnecessary tag spans (comments and title)
        @"<!--(\w|\W)+?-->",
        @"<title>(\w|\W)+?</title>",
        // get rid of classes and styles (only unquoted class values and single-quoted styles)
        @"\s?class=\w+",
        @"\s+style='[^']+'",
        // get rid of unnecessary tags; "st\d" presumably targets Word smart tags such as st1:
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>",
        // get rid of empty paragraph tags (elements whose only content is &nbsp;)
        @"(<[^>]+>)+&nbsp;(</\w+>)+",
        // remove bizarre v: element attached to <img> tag
        @"\s+v:\w+=""[^""]+""",
        // remove extra lines: LF+CR pairs, i.e. the interior of three or more consecutive CRLFs
        @"(\n\r){2,}"
    };

    foreach (string pattern in patterns)
    {
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    }

    return html;
}

/// <summary>
/// Replaces typographic punctuation (curly double quotes and the em dash) with the
/// corresponding named HTML entities.
/// </summary>
/// <param name="html">The HTML text to process.</param>
/// <returns>The text with the characters replaced; a null or empty input is returned unchanged.</returns>
static string FixEntities(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Unicode escapes keep the keys intact regardless of the source file's encoding.
    // NOTE(review): the dash key was lost to an encoding error in the published snippet
    // (it had degraded to "?"); the em dash is assumed because the target is &mdash;.
    string[,] entities =
    {
        { "\u201C", "&ldquo;" },
        { "\u201D", "&rdquo;" },
        { "\u2014", "&mdash;" }
    };

    for (int i = 0; i < entities.GetLength(0); i++)
    {
        html = html.Replace(entities[i, 0], entities[i, 1]);
    }

    return html;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值