再度改进,在脏字可能存在的情况下,例如出现了多个脏字前Length-1部分时,性能相比http://www.cnblogs.com/xingd/archive/2008/01/31/1060425.html中描述的又提升了300%~400%。
直接贴出全部代码了,通过新增的一个byte[char.MaxValue]和BitArray(char.MaxValue),减少了大量的Substring和GetHashCode的调用。耗的内存也不算多,除HashSet外,仅需要144k内存。
引用此文或者使用此代码请说明出处,谢谢,以便于我将来的更新。
2008-02-02修订:if (index > 0 || (fastCheck[text[index]] & 1) == 0) 应去掉index > 0的判断,这个优化考虑的不够成熟。感谢sumtec和灵感之源指出错误。避免最短匹配时,可以在 if (hash.Contains(sub)) 之后,可以加入判断 if ((fastLength[begin] >> Math.Min(j,7)) == 0),然后再return true。
2008-02-03修订:for循环内部的if ((fastCheck[current] & 1) == 0)应为if ((fastCheck[current] & 1) == 0 && count == j)。修正bug并加入大小写敏感后,效率降低1倍。
public
class
BadWordsFilter
{
private HashSet < string > hash = new HashSet < string > ();
private byte [] fastCheck = new byte [ char .MaxValue];
private byte [] fastLength = new byte [ char .MaxValue];
private BitArray charCheck = new BitArray( char .MaxValue);
private BitArray endCheck = new BitArray( char .MaxValue);
private int maxWordLength = 0 ;
private int minWordLength = int .MaxValue;
public BadWordsFilter()
{
}
public void Init( string [] badwords)
{
foreach ( string word in badwords)
{
maxWordLength = Math.Max(maxWordLength, word.Length);
minWordLength = Math.Min(minWordLength, word.Length);
for ( int i = 0 ; i < 7 && i < word.Length; i ++ )
{
fastCheck[word[i]] |= ( byte )( 1 << i);
}
for ( int i = 7 ; i < word.Length; i ++ )
{
fastCheck[word[i]] |= 0x80 ;
}
if (word.Length == 1 )
{
charCheck[word[ 0 ]] = true ;
}
else
{
fastLength[word[ 0 ]] |= ( byte )( 1 << (Math.Min( 7 , word.Length - 2 )));
endCheck[word[word.Length - 1 ]] = true ;
hash.Add(word);
}
}
}
public string Filter( string text, string mask)
{
throw new NotImplementedException();
}
public bool HasBadWord( string text)
{
int index = 0 ;
while (index < text.Length)
{
int count = 1 ;
if (index > 0 || (fastCheck[text[index]] & 1 ) == 0 )
{
while (index < text.Length - 1 && (fastCheck[text[ ++ index]] & 1 ) == 0 ) ;
}
char begin = text[index];
if (minWordLength == 1 && charCheck[begin])
{
return true ;
}
for ( int j = 1 ; j <= Math.Min(maxWordLength, text.Length - index - 1 ); j ++ )
{
char current = text[index + j];
if ((fastCheck[current] & 1 ) == 0 )
{
++ count;
}
if ((fastCheck[current] & ( 1 << Math.Min(j, 7 ))) == 0 )
{
break ;
}
if (j + 1 >= minWordLength)
{
if ((fastLength[begin] & ( 1 << Math.Min(j - 1 , 7 ))) > 0 && endCheck[current])
{
string sub = text.Substring(index, j + 1 );
if (hash.Contains(sub))
{
return true ;
}
}
}
}
index += count;
}
return false ;
}
}
{
private HashSet < string > hash = new HashSet < string > ();
private byte [] fastCheck = new byte [ char .MaxValue];
private byte [] fastLength = new byte [ char .MaxValue];
private BitArray charCheck = new BitArray( char .MaxValue);
private BitArray endCheck = new BitArray( char .MaxValue);
private int maxWordLength = 0 ;
private int minWordLength = int .MaxValue;
public BadWordsFilter()
{
}
public void Init( string [] badwords)
{
foreach ( string word in badwords)
{
maxWordLength = Math.Max(maxWordLength, word.Length);
minWordLength = Math.Min(minWordLength, word.Length);
for ( int i = 0 ; i < 7 && i < word.Length; i ++ )
{
fastCheck[word[i]] |= ( byte )( 1 << i);
}
for ( int i = 7 ; i < word.Length; i ++ )
{
fastCheck[word[i]] |= 0x80 ;
}
if (word.Length == 1 )
{
charCheck[word[ 0 ]] = true ;
}
else
{
fastLength[word[ 0 ]] |= ( byte )( 1 << (Math.Min( 7 , word.Length - 2 )));
endCheck[word[word.Length - 1 ]] = true ;
hash.Add(word);
}
}
}
public string Filter( string text, string mask)
{
throw new NotImplementedException();
}
public bool HasBadWord( string text)
{
int index = 0 ;
while (index < text.Length)
{
int count = 1 ;
if (index > 0 || (fastCheck[text[index]] & 1 ) == 0 )
{
while (index < text.Length - 1 && (fastCheck[text[ ++ index]] & 1 ) == 0 ) ;
}
char begin = text[index];
if (minWordLength == 1 && charCheck[begin])
{
return true ;
}
for ( int j = 1 ; j <= Math.Min(maxWordLength, text.Length - index - 1 ); j ++ )
{
char current = text[index + j];
if ((fastCheck[current] & 1 ) == 0 )
{
++ count;
}
if ((fastCheck[current] & ( 1 << Math.Min(j, 7 ))) == 0 )
{
break ;
}
if (j + 1 >= minWordLength)
{
if ((fastLength[begin] & ( 1 << Math.Min(j - 1 , 7 ))) > 0 && endCheck[current])
{
string sub = text.Substring(index, j + 1 );
if (hash.Contains(sub))
{
return true ;
}
}
}
}
index += count;
}
return false ;
}
}