清理Word生成HTML的冗余;清理与清除HTML标签

众所周知,当直接将word中的内容复制到网页上时,会产生很多冗余代码;

而现在,在线编辑器又很普遍;就包括,现在用的百度空间的这个文本编辑器,如果直接从WORD中写好的文章复制进来,本来没有几个字,结果,它会提示,超出最大字数;也就是因为冗余代码过多的原因;

而用户直接拷贝Word的事儿很常见;以前我也写过一个清理的方法;在我的空间也有;当时只是为了让文章在显示时,能够统一格式;清理了一些多余样式,然后用CSS控制其样式,如果控制不了的,就用!important强制定义;呵呵;

虽然清理了样式,但也遗留了不少,例如,<font style="color:red">示例</font>;清理后成了<font>示例</font>;其实这时font标签已经没有用了,不如直接清除掉;

于是就又写了一些方法;把几个常用的简单方法发上来吧,很简单;

  1. using System; 
  2. using System.Collections.Generic; 
  3. using System.Text; 
  4. using System.Text.RegularExpressions; 
  5.  
  6. namespace Extend 
  7.     public class Article 
  8.     { 
  9.         #region 清理HTML标签 
  10.         /// <summary> 
  11.         /// 清理HTML标签的多余样式;如<div style="color:#454353">示例</div>;换成<div>示例</div> 
  12.         /// </summary> 
  13.         /// <param name="str">原始文本</param> 
  14.         /// <param name="element">要清除的标签</param> 
  15.         /// <returns></returns> 
  16.         public static string ClearElement(string str, string element) 
  17.         { 
  18.             string old = @"<" + element + "[^>]+>"
  19.             string rep = "<" + element + ">"
  20.             str = Regex.Replace(str, old, rep, RegexOptions.IgnoreCase); 
  21.             return str; 
  22.         } 
  23.         /// <summary> 
  24.         /// 清除HTML标签;如<div style="color:#454353">示例</div>;换成:示例 
  25.         /// </summary> 
  26.         /// <param name="str">原始文本</param> 
  27.         /// <param name="element">要清除的标签</param> 
  28.         /// <returns></returns> 
  29.         public static string ReMoveElement(string str,string element) 
  30.         { 
  31.             string regFront = @"<" + element + "[^>]*>"
  32.             string regAfter = "</" + element + ">"
  33.             str = Regex.Replace(str, regFront, "", RegexOptions.IgnoreCase); 
  34.             str = Regex.Replace(str, regAfter, "", RegexOptions.IgnoreCase); 
  35.             return str; 
  36.         } 
  37.         /// <summary> 
  38.         /// 清理指定字符串,大小写不敏感 
  39.         /// </summary> 
  40.         /// <param name="strText">原始文本</param> 
  41.         /// <param name="strOld">要替换的字符串,支持正则表达式,大小写不敏感</param> 
  42.         /// <param name="strNew">替换后的字符串</param> 
  43.         /// <returns></returns> 
  44.         public static string RegexReplace(string strText,string strOld,string strNew) 
  45.         { 
  46.             strText = Regex.Replace(strText, strOld, strNew, RegexOptions.IgnoreCase); 
  47.             return strText; 
  48.         } 
  49.         /// <summary> 
  50.         /// 清理Word的样式,主要是一些带冒号的标签,如o:p 
  51.         /// </summary> 
  52.         /// <param name="strText"></param> 
  53.         /// <returns></returns> 
  54.         public static string ClearWordStyle(string strText) 
  55.         { 
  56.             string regFront = @"<\w+:[^>]*>"
  57.             string regAfter = @"</\w+:[^>]*>"
  58.             strText = Regex.Replace(strText, regFront, "", RegexOptions.IgnoreCase); 
  59.             strText = Regex.Replace(strText, regAfter, "", RegexOptions.IgnoreCase); 
  60.             return strText; 
  61.         } 
  62.         #endregion 
  63.  
  64.     } 
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace Extend
{
    /// <summary>
    /// Regex-based helpers for tidying HTML that was pasted from Microsoft Word.
    /// "Clear" methods keep a tag but drop its attributes; "ReMove" methods drop
    /// the tag itself and keep the text between the tags.
    /// </summary>
    public class Article
    {

        #region HTML tag clean-up
        /// <summary>
        /// Strips every attribute from the opening tags of one element, e.g.
        /// <c>&lt;div style="color:#454353"&gt;x&lt;/div&gt;</c> becomes <c>&lt;div&gt;x&lt;/div&gt;</c>.
        /// Matching is case-insensitive; the rewritten tag uses the spelling passed in
        /// <paramref name="element"/>.
        /// </summary>
        /// <param name="str">HTML text to process.</param>
        /// <param name="element">
        /// Tag name, e.g. "div". It is inserted into the pattern as-is (not escaped),
        /// so a regular-expression fragment is also accepted.
        /// </param>
        /// <returns>
        /// The processed text. When <paramref name="element"/> is null or empty the
        /// input is returned unchanged.
        /// </returns>
        public static string ClearElement(string str, string element)
        {
            // An empty name would make the pattern match every tag in the document.
            if (string.IsNullOrEmpty(element))
                return str;

            // The lookahead requires the tag name to end right here, so cleaning "p"
            // no longer rewrites <pre> or <param ...> into <p>.
            string pattern = "<(?:" + element + @")(?=[\s/])[^>]*>";
            string replacement = "<" + element + ">";
            return Regex.Replace(str, pattern, replacement, RegexOptions.IgnoreCase);
        }
        /// <summary>
        /// Removes the opening and closing tags of one element but keeps what is between
        /// them, e.g. <c>&lt;div style="color:#454353"&gt;x&lt;/div&gt;</c> becomes <c>x</c>.
        /// Matching is case-insensitive.
        /// </summary>
        /// <param name="str">HTML text to process.</param>
        /// <param name="element">
        /// Tag name, e.g. "span". It is inserted into the pattern as-is (not escaped),
        /// so a regular-expression fragment is also accepted.
        /// </param>
        /// <returns>
        /// The processed text. When <paramref name="element"/> is null or empty the
        /// input is returned unchanged.
        /// </returns>
        public static string ReMoveElement(string str,string element)
        {
            // An empty name would make the patterns match unrelated tags.
            if (string.IsNullOrEmpty(element))
                return str;

            // The lookahead stops "b" from also removing <br> or <body>.
            string openTag = "<(?:" + element + @")(?=[\s/>])[^>]*>";
            // Tolerate whitespace before '>' in the closing tag, e.g. "</span >".
            string closeTag = "</(?:" + element + @")\s*>";
            str = Regex.Replace(str, openTag, "", RegexOptions.IgnoreCase);
            str = Regex.Replace(str, closeTag, "", RegexOptions.IgnoreCase);
            return str;
        }
        /// <summary>
        /// Case-insensitive regular-expression replace.
        /// </summary>
        /// <param name="strText">Text to process.</param>
        /// <param name="strOld">Regular-expression pattern to search for (matched case-insensitively).</param>
        /// <param name="strNew">Replacement string.</param>
        /// <returns>The text with every match of <paramref name="strOld"/> replaced.</returns>
        public static string RegexReplace(string strText,string strOld,string strNew)
        {
            return Regex.Replace(strText, strOld, strNew, RegexOptions.IgnoreCase);
        }
        /// <summary>
        /// Removes Word-specific namespaced tags, i.e. tags whose name contains a colon
        /// such as <c>&lt;o:p&gt;</c> and <c>&lt;/o:p&gt;</c>. Text between the tags is kept.
        /// </summary>
        /// <param name="strText">HTML text to process.</param>
        /// <returns>The text without namespaced opening and closing tags.</returns>
        public static string ClearWordStyle(string strText)
        {
            string openTag = @"<\w+:[^>]*>";
            string closeTag = @"</\w+:[^>]*>";
            strText = Regex.Replace(strText, openTag, "", RegexOptions.IgnoreCase);
            strText = Regex.Replace(strText, closeTag, "", RegexOptions.IgnoreCase);
            return strText;
        }
        #endregion

    }
}


以上只是清理的方法;实际操作时,可以这样写;

 

  1. /// <summary> 
  2.   /// 替换新闻内容中的Html标签的多余属性 
  3.   /// </summary> 
  4.   /// <param name="str"></param> 
  5.   /// <returns></returns> 
  6.   private string ArtilceClear(string str) 
  7.   { 
  8.       if (str == "" || str == null || string.IsNullOrEmpty(str)) 
  9.           return ""
  10.       //清理word标签,如o:p之类,带冒号的 
  11.       str = Extend.Article.ClearWordStyle(str); 
  12.       string[] el; 
  13.       //清理样式 
  14.       el = new string[] { "p", "div","table","tr","td" }; 
  15.       foreach (string s in el) 
  16.       { 
  17.           try 
  18.           { 
  19.               str = Extend.Article.ClearElement(str, s); 
  20.           } 
  21.           catch 
  22.           { 
  23.               continue
  24.           } 
  25.       } 
  26.       //清除样式 
  27.       el = new string[] { "span", "strong", "font", "h1", "tbody","o:p" }; 
  28.       foreach (string s in el) 
  29.       { 
  30.           try 
  31.           { 
  32.               str = Extend.Article.ReMoveElement(str, s); 
  33.               //while (str.IndexOf("</"+s+">") >-1) 
  34.               //{ 
  35.               //    str = Extend.Article.ReMoveElement(s, str); 
  36.               //} 
  37.           } 
  38.           catch 
  39.           { 
  40.               continue
  41.           } 
  42.       } 
  43.       str = Extend.Article.RegexReplace(str," ",""); 
  44.       return str; 
  45.   } 
  /// <summary>
    /// Cleans article HTML pasted from Word: removes namespaced Word tags, strips
    /// the attributes of structural tags, removes purely presentational tags
    /// (keeping their text) and deletes <c>&amp;nbsp;</c> entities.
    /// </summary>
    /// <param name="str">Raw article HTML; may be null or empty.</param>
    /// <returns>The cleaned HTML, or an empty string when <paramref name="str"/> is null or empty.</returns>
    private string ArtilceClear(string str)
    {
        if (string.IsNullOrEmpty(str))
            return "";

        // Drop Word tags whose name contains a colon, such as <o:p>.
        str = Extend.Article.ClearWordStyle(str);

        // "Clear": these tags carry document structure, so keep them and discard
        // only their attributes.
        string[] keepTags = new string[] { "p", "div", "table", "tr", "td" };
        foreach (string tag in keepTags)
        {
            try
            {
                str = Extend.Article.ClearElement(str, tag);
            }
            catch (System.ArgumentException)
            {
                // Best effort: a pattern problem for one tag must not stop the rest.
                continue;
            }
        }

        // "Remove": these tags only carry presentation, so drop the tags themselves
        // and keep the text between them.
        string[] dropTags = new string[] { "span", "strong", "font", "h1", "tbody", "o:p" };
        foreach (string tag in dropTags)
        {
            try
            {
                str = Extend.Article.ReMoveElement(str, tag);
            }
            catch (System.ArgumentException)
            {
                // Best effort: a pattern problem for one tag must not stop the rest.
                continue;
            }
        }

        // Remove non-breaking-space entities.
        // NOTE(review): the published listing showed a bare " " here, which looks like
        // an entity-decoded "&nbsp;". Replacing a literal space would delete every
        // space, including the ones separating attributes inside tags - confirm.
        str = Extend.Article.RegexReplace(str, "&nbsp;", "");
        return str;
    }


注意看“清理”还是“清除”;像P、div、table等,是不能清除的,只是将它们的样式清理一下,将冗余代码去掉,该标签并不删除;而像span、font、o:p等,可以连标签清除掉;

上面的代码,只作为参考;更复杂的按条件清理,可以参看我以前的文章;一般的清理,上面的代码,也足够了

当前路径:editor/dialog/fck_paste.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<!--
 * FCKeditor - The text editor for Internet - http://www.fckeditor.net
 * Copyright (C) 2003-2007 Frederico Caldeira Knabben
 *
 * == BEGIN LICENSE ==
 *
 * Licensed under the terms of any of the following licenses at your
 * choice:
 *
 *  - GNU General Public License Version 2 or later (the "GPL")
 *    http://www.gnu.org/licenses/gpl.html
 *
 *  - GNU Lesser General Public License Version 2.1 or later (the "LGPL")
 *    http://www.gnu.org/licenses/lgpl.html
 *
 *  - Mozilla Public License Version 1.1 or later (the "MPL")
 *    http://www.mozilla.org/MPL/MPL-1.1.html
 *
 * == END LICENSE ==
 *
 * This dialog is shown when, for some reason (usually security settings),
 * the user is not able to paste data from the clipboard to the editor using
 * the toolbar buttons or the context menu.
-->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
	<title></title>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<meta name="robots" content="noindex, nofollow" />

	<script type="text/javascript">
var oEditor = window.parent.InnerDialogLoaded() ;
var FCK = oEditor.FCK;
var FCKTools	= oEditor.FCKTools ;
var FCKConfig	= oEditor.FCKConfig ;

// Dialog start-up: translates the texts, then shows either the rich-text paste
// frame ('Word' and 'Security' pasting types) or the plain textarea (any other
// type), and hides the Word-only options unless the pasting type is 'Word'.
window.onload = function ()
{
	// Translate the dialog box texts before anything is revealed.
	oEditor.FCKLanguageManager.TranslatePage( document ) ;

	var sMode = window.parent.dialogArguments.CustomValue ;
	var bIsWord = ( sMode == 'Word' ) ;
	var bIsSecurity = ( sMode == 'Security' ) ;

	if ( !bIsWord && !bIsSecurity )
	{
		// Plain-text pasting goes through the textarea.
		document.getElementById( 'txtData' ).style.display = '' ;
	}
	else
	{
		if ( bIsSecurity )
			document.getElementById( 'xSecurityMsg' ).style.display = '' ;

		var oPasteFrame = document.getElementById( 'frmData' ) ;
		oPasteFrame.style.display = '' ;

		// Make the frame editable: designMode where contentDocument exists,
		// contentEditable on the frame body otherwise.
		if ( oPasteFrame.contentDocument )
			oPasteFrame.contentDocument.designMode = 'on' ;
		else
			oPasteFrame.contentWindow.document.body.contentEditable = true ;
	}

	if ( !bIsWord )
		document.getElementById( 'oWordCommands' ).style.display = 'none' ;

	window.parent.SetOkButton( true ) ;
	window.parent.SetAutoSize( true ) ;
}

// Handler for the dialog's OK action: reads the pasted content, cleans it
// according to the pasting type and inserts the result into the editor.
//  - 'Word':     the frame body is passed to FCK.CustomCleanWord (when a plugin
//                defines it) or to CleanWord, together with the two checkbox states.
//  - 'Security': the frame body's innerHTML is used as-is.
//  - otherwise:  the textarea value is HTML-encoded and line feeds become <BR>.
// Always returns true.
function Ok()
{
	var sHtml ;

	var sPastingType = window.parent.dialogArguments.CustomValue ;

	if ( sPastingType == 'Word' || sPastingType == 'Security' )
	{
		var oFrame = document.getElementById('frmData') ;
		var oBody ;

		if ( oFrame.contentDocument )
			oBody = oFrame.contentDocument.body ;
		else
			oBody = oFrame.contentWindow.document.body ;

		if ( sPastingType == 'Word' )
		{
			// If a plugin creates a FCK.CustomCleanWord function it will be called instead of the default one
			if ( typeof( FCK.CustomCleanWord ) == 'function' )
				sHtml = FCK.CustomCleanWord( oBody, document.getElementById('chkRemoveFont').checked, document.getElementById('chkRemoveStyles').checked ) ;
			else
				sHtml = CleanWord( oBody, document.getElementById('chkRemoveFont').checked, document.getElementById('chkRemoveStyles').checked ) ;
		}
		else
			sHtml = oBody.innerHTML ;

		// Fix relative anchor URLs (IE automatically adds the current page URL).
		// The URL is matched literally with split/join: compiled as a RegExp, its
		// '.', '?', '+' etc. would act as metacharacters and a URL containing a
		// query string would never match.
		sHtml = sHtml.split( window.location + '#' ).join( '#' ) ;
	}
	else
	{
		sHtml = oEditor.FCKTools.HTMLEncode( document.getElementById('txtData').value )  ;
		sHtml = sHtml.replace( /\n/g, '<BR>' ) ;
	}

	oEditor.FCK.InsertHtml( sHtml ) ;

	return true ;
}

// Empties the rich-text paste area: clears the body of the 'frmData' iframe,
// reached through contentDocument when available and contentWindow.document otherwise.
function CleanUpBox()
{
	var oPasteFrame = document.getElementById( 'frmData' ) ;
	var oPasteDoc = oPasteFrame.contentDocument ? oPasteFrame.contentDocument : oPasteFrame.contentWindow.document ;

	oPasteDoc.body.innerHTML = '' ;
}


// Cleans up HTML pasted from Microsoft Word. Called by Ok() in this dialog
// unless a plugin supplies FCK.CustomCleanWord.
// Input:
//   oNode         - object whose innerHTML holds the raw paste from the clipboard.
//   bIgnoreFont   - when true, removes face attributes and FONT-FAMILY declarations.
//   bRemoveStyles - when true, strips style="..." attributes from tags.
// Also reads FCKConfig.CleanWordKeepsStructure: when set, <Hn> headings are kept
// (attributes stripped); otherwise they are rewritten as <div><b><font size="...">,
// <P> becomes <div> and empty tags are removed.
// Output: the cleaned HTML string.
function CleanWord( oNode, bIgnoreFont, bRemoveStyles )
{
	var html = oNode.innerHTML ;

	// Empty <o:p> pairs vanish; non-empty ones collapse to a non-breaking space.
	html = html.replace(/<o:p>\s*<\/o:p>/g, '') ;
	html = html.replace(/<o:p>.*?<\/o:p>/g, '&nbsp;') ;

	// Remove mso-xxx styles.
	html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, '' ) ;

	// Remove margin styles.
	html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, '' ) ;
	html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;

	html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, '' ) ;
	html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;

	html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*tab-stops:[^;"]*;?/gi, '' ) ;
	html = html.replace( /\s*tab-stops:[^"]*/gi, '' ) ;

	// Remove FONT face attributes.
	if ( bIgnoreFont )
	{
		html = html.replace( /\s*face="[^"]*"/gi, '' ) ;
		html = html.replace( /\s*face=[^ >]*/gi, '' ) ;

		html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, '' ) ;
	}

	// Remove Class attributes
	html = html.replace(/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3") ;

	// Remove styles.
	if ( bRemoveStyles )
		html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;

	// Remove empty styles.
	html =  html.replace( /\s*style="\s*"/gi, '' ) ;

	// A span holding only a non-breaking space is reduced to that space.
	html = html.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;

	html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ;

	// Remove Lang attributes
	html = html.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ;

	// Unwrap attribute-less <SPAN> and <FONT> elements.
	html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ;

	html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;

	// Remove XML elements and declarations
	html = html.replace(/<\\?\?xml[^>]*>/gi, '' ) ;

	// Remove Tags with XML namespace declarations: <o:p><\/o:p>
	html = html.replace(/<\/?\w+:[^>]*>/gi, '' ) ;

	// Remove comments [SF BUG-1481861].
	// Non-greedy and newline-aware: each comment is removed on its own, so content
	// between two comments survives and multi-line comments are matched too.
	html = html.replace(/<\!--[\s\S]*?-->/g, '' ) ;

	html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;

	html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;

	// Remove "display:none" tags.
	html = html.replace( /<(\w+)[^>]*\sstyle="[^"]*DISPLAY\s?:\s?none(.*?)<\/\1>/ig, '' ) ;

	if ( FCKConfig.CleanWordKeepsStructure )
	{
		// The original <Hn> tag send from Word is something like this: <Hn style="margin-top:0px;margin-bottom:0px">
		html = html.replace( /<H(\d)([^>]*)>/gi, '<h$1>' ) ;

		// Word likes to insert extra <font> tags, when using MSIE. (Wierd).
		html = html.replace( /<(H\d)><FONT[^>]*>(.*?)<\/FONT><\/\1>/gi, '<$1>$2</$1>' );
		html = html.replace( /<(H\d)><EM>(.*?)<\/EM><\/\1>/gi, '<$1>$2</$1>' );
	}
	else
	{
		// Headings become bold <font> blocks; size 6 for H1 down to size 1 for H6.
		html = html.replace( /<H1([^>]*)>/gi, '<div$1><b><font size="6">' ) ;
		html = html.replace( /<H2([^>]*)>/gi, '<div$1><b><font size="5">' ) ;
		html = html.replace( /<H3([^>]*)>/gi, '<div$1><b><font size="4">' ) ;
		html = html.replace( /<H4([^>]*)>/gi, '<div$1><b><font size="3">' ) ;
		html = html.replace( /<H5([^>]*)>/gi, '<div$1><b><font size="2">' ) ;
		html = html.replace( /<H6([^>]*)>/gi, '<div$1><b><font size="1">' ) ;

		html = html.replace( /<\/H\d>/gi, '<\/font><\/b><\/div>' ) ;

		// Transform <P> to <DIV>
		var re = new RegExp( '(<P)([^>]*>.*?)(<\/P>)', 'gi' ) ;	// Different because of a IE 5.0 error
		html = html.replace( re, '<div$2<\/div>' ) ;

		// Remove empty tags (three times, just to be sure).
		// This also removes any empty anchor
		html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
		html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
		html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
	}

	return html ;
}

	</script>

</head>
<body style="overflow: hidden">
	<table cellspacing="0" cellpadding="0" width="100%" border="0" style="height: 98%">
		<tr>
			<td>
				<div id="xSecurityMsg" style="display: none">
					<span fcklang="DlgPasteSec">Because of your browser security settings,
						the editor is not able to access your clipboard data directly. You are required
						to paste it again in this window.</span><br />
					 
				</div>
				<div>
					<span fcklang="DlgPasteMsg2">Please paste inside the following box using the keyboard
						(<strong>Ctrl+V</strong>) and hit <strong>OK</strong>.</span><br />
					 
				</div>
			</td>
		</tr>
		<tr>
			<td valign="top" height="100%" style="border-right: #000000 1px solid; border-top: #000000 1px solid;
				border-left: #000000 1px solid; border-bottom: #000000 1px solid">
				<textarea id="txtData" cols="80" rows="5" style="border: #000000 1px; display: none;
					width: 99%; height: 98%"></textarea>
				<iframe id="frmData" src="javascript:void(0)" height="98%" width="99%" frameborder="0"
					style="border-right: #000000 1px; border-top: #000000 1px; display: none; border-left: #000000 1px;
					border-bottom: #000000 1px; background-color: #ffffff"></iframe>
			</td>
		</tr>
		<tr id="oWordCommands">
			<td>
				<table border="0" cellpadding="0" cellspacing="0" width="100%">
					<tr>
						<td nowrap="nowrap">
							<input id="chkRemoveFont" type="checkbox" checked="checked" />
							<label for="chkRemoveFont" fcklang="DlgPasteIgnoreFont">
								Ignore Font Face definitions</label>
							<br />
							<input id="chkRemoveStyles" type="checkbox" />
							<label for="chkRemoveStyles" fcklang="DlgPasteRemoveStyles">
								Remove Styles definitions</label>
						</td>
						<td align="right" valign="top">
							<input type="button" fcklang="DlgPasteCleanBox" value="Clean Up Box" onclick="CleanUpBox()" />
						</td>
					</tr>
				</table>
			</td>
		</tr>
	</table>
</body>
</html>


 

当前路径:editor/dialog/fck_paste.html<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><!-- * FCKeditor - The text editor for Internet - http://www.fckeditor.net * Copyright (C) 2003-2007 Frederico Caldeira Knabben * * == BEGIN LICENSE == * * Licensed under the terms of any of the following licenses at your * choice: * *  - GNU General Public License Version 2 or later (the "GPL") *    http://www.gnu.org/licenses/gpl.html * *  - GNU Lesser General Public License Version 2.1 or later (the "LGPL") *    http://www.gnu.org/licenses/lgpl.html * *  - Mozilla Public License Version 1.1 or later (the "MPL") *    http://www.mozilla.org/MPL/MPL-1.1.html * * == END LICENSE == * * This dialog is shown when, for some reason (usually security settings), * the user is not able to paste data from the clipboard to the editor using * the toolbar buttons or the context menu.--><html xmlns="http://www.w3.org/1999/xhtml"><head> <title></title> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="robots" content="noindex, nofollow" /> <script type="text/javascript">var oEditor = window.parent.InnerDialogLoaded() ;var FCK = oEditor.FCK;var FCKTools = oEditor.FCKTools ;var FCKConfig = oEditor.FCKConfig ;window.onload = function (){ // First of all, translate the dialog box texts oEditor.FCKLanguageManager.TranslatePage(document) ; var sPastingType = window.parent.dialogArguments.CustomValue ; if ( sPastingType == 'Word' || sPastingType == 'Security' ) { if ( sPastingType == 'Security' ) document.getElementById( 'xSecurityMsg' ).style.display = '' ; var oFrame = document.getElementById('frmData') ; oFrame.style.display = '' ; if ( oFrame.contentDocument ) oFrame.contentDocument.designMode = 'on' ; else oFrame.contentWindow.document.body.contentEditable = true ; } else { document.getElementById('txtData').style.display = '' ; } if ( sPastingType != 'Word' ) document.getElementById('oWordCommands').style.display = 'none' ; window.parent.SetOkButton( true 
) ; window.parent.SetAutoSize( true ) ;}function Ok(){ var sHtml ; var sPastingType = window.parent.dialogArguments.CustomValue ; if ( sPastingType == 'Word' || sPastingType == 'Security' ) { var oFrame = document.getElementById('frmData') ; var oBody ; if ( oFrame.contentDocument ) oBody = oFrame.contentDocument.body ; else oBody = oFrame.contentWindow.document.body ; if ( sPastingType == 'Word' ) { // If a plugin creates a FCK.CustomCleanWord function it will be called instead of the default one if ( typeof( FCK.CustomCleanWord ) == 'function' ) sHtml = FCK.CustomCleanWord( oBody, document.getElementById('chkRemoveFont').checked, document.getElementById('chkRemoveStyles').checked ) ; else sHtml = CleanWord( oBody, document.getElementById('chkRemoveFont').checked, document.getElementById('chkRemoveStyles').checked ) ; } else sHtml = oBody.innerHTML ; // Fix relative anchor URLs (IE automatically adds the current page URL). var re = new RegExp( window.location + "#", "g" ) ; sHtml = sHtml.replace( re, '#') ; } else { sHtml = oEditor.FCKTools.HTMLEncode( document.getElementById('txtData').value )  ; sHtml = sHtml.replace( /\n/g, '<BR>' ) ; } oEditor.FCK.InsertHtml( sHtml ) ; return true ;}function CleanUpBox(){ var oFrame = document.getElementById('frmData') ; if ( oFrame.contentDocument ) oFrame.contentDocument.body.innerHTML = '' ; else oFrame.contentWindow.document.body.innerHTML = '' ;}// This function will be called from the PasteFromWord dialog (fck_paste.html)// Input: oNode a DOM node that contains the raw paste from the clipboard// bIgnoreFont, bRemoveStyles booleans according to the values set in the dialog// Output: the cleaned stringfunction CleanWord( oNode, bIgnoreFont, bRemoveStyles ){ var html = oNode.innerHTML ; html = html.replace(/<o:p>\s*<\/o:p>/g, '') ; html = html.replace(/<o:p>.*?<\/o:p>/g, '&nbsp;') ; // Remove mso-xxx styles. html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, '' ) ; // Remove margin styles. 
html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, '' ) ; html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ; html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, '' ) ; html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ; html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ; html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ; html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ; html = html.replace( /\s*tab-stops:[^;"]*;?/gi, '' ) ; html = html.replace( /\s*tab-stops:[^"]*/gi, '' ) ; // Remove FONT face attributes. if ( bIgnoreFont ) { html = html.replace( /\s*face="[^"]*"/gi, '' ) ; html = html.replace( /\s*face=[^ >]*/gi, '' ) ; html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, '' ) ; } // Remove Class attributes html = html.replace(/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3") ; // Remove styles. if ( bRemoveStyles ) html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ; // Remove empty styles. html =  html.replace( /\s*style="\s*"/gi, '' ) ; html = html.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ; html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ; // Remove Lang attributes html = html.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ; html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ; html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ; // Remove XML elements and declarations html = html.replace(/<\\?\?xml[^>]*>/gi, '' ) ; // Remove Tags with XML namespace declarations: <o:p><\/o:p> html = html.replace(/<\/?\w+:[^>]*>/gi, '' ) ; // Remove comments [SF BUG-1481861]. html = html.replace(/<\!--.*-->/g, '' ) ; html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ; html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ; // Remove "display:none" tags. 
html = html.replace( /<(\w+)[^>]*\sstyle="[^"]*DISPLAY\s?:\s?none(.*?)<\/\1>/ig, '' ) ; if ( FCKConfig.CleanWordKeepsStructure ) { // The original <Hn> tag send from Word is something like this: <Hn style="margin-top:0px;margin-bottom:0px"> html = html.replace( /<H(\d)([^>]*)>/gi, '<h$1>' ) ; // Word likes to insert extra <font> tags, when using MSIE. (Wierd). html = html.replace( /<(H\d)><FONT[^>]*>(.*?)<\/FONT><\/\1>/gi, '<$1>$2</$1>' ); html = html.replace( /<(H\d)><EM>(.*?)<\/EM><\/\1>/gi, '<$1>$2</$1>' ); } else { html = html.replace( /<H1([^>]*)>/gi, '<div$1><b><font size="6">' ) ; html = html.replace( /<H2([^>]*)>/gi, '<div$1><b><font size="5">' ) ; html = html.replace( /<H3([^>]*)>/gi, '<div$1><b><font size="4">' ) ; html = html.replace( /<H4([^>]*)>/gi, '<div$1><b><font size="3">' ) ; html = html.replace( /<H5([^>]*)>/gi, '<div$1><b><font size="2">' ) ; html = html.replace( /<H6([^>]*)>/gi, '<div$1><b><font size="1">' ) ; html = html.replace( /<\/H\d>/gi, '<\/font><\/b><\/div>' ) ; // Transform <P> to <DIV> var re = new RegExp( '(<P)([^>]*>.*?)(<\/P>)', 'gi' ) ; // Different because of a IE 5.0 error html = html.replace( re, '<div$2<\/div>' ) ; // Remove empty tags (three times, just to be sure). // This also removes any empty anchor html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ; html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ; html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ; } return html ;} </script></head><body style="overflow: hidden"> <table cellspacing="0" cellpadding="0" width="100%" border="0" style="height: 98%"> <tr> <td> <div id="xSecurityMsg" style="display: none"> <span fcklang="DlgPasteSec">Because of your browser security settings, the editor is not able to access your clipboard data directly. 
You are required to paste it again in this window.</span><br /> &nbsp; </div> <div> <span fcklang="DlgPasteMsg2">Please paste inside the following box using the keyboard (<strong>Ctrl+V</strong>) and hit <strong>OK</strong>.</span><br /> &nbsp; </div> </td> </tr> <tr> <td valign="top" height="100%" style="border-right: #000000 1px solid; border-top: #000000 1px solid; border-left: #000000 1px solid; border-bottom: #000000 1px solid"> <textarea id="txtData" cols="80" rows="5" style="border: #000000 1px; display: none; width: 99%; height: 98%"></textarea> <iframe id="frmData" src="javascript:void(0)" height="98%" width="99%" frameborder="0" style="border-right: #000000 1px; border-top: #000000 1px; display: none; border-left: #000000 1px; border-bottom: #000000 1px; background-color: #ffffff"></iframe> </td> </tr> <tr id="oWordCommands"> <td> <table border="0" cellpadding="0" cellspacing="0" width="100%"> <tr> <td nowrap="nowrap"> <input id="chkRemoveFont" type="checkbox" checked="checked" /> <label for="chkRemoveFont" fcklang="DlgPasteIgnoreFont"> Ignore Font Face definitions</label> <br /> <input id="chkRemoveStyles" type="checkbox" /> <label for="chkRemoveStyles" fcklang="DlgPasteRemoveStyles"> Remove Styles definitions</label> </td> <td align="right" valign="top"> <input type="button" fcklang="DlgPasteCleanBox" value="Clean Up Box" οnclick="CleanUpBox()" /> </td> </tr> </table> </td> </tr> </table></body></html>

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值