1. 二元分词
1
<?
php
/**
 * Bigram ("2-base cut") tokenizer for UTF-8 text.
 *
 * Runs of single-byte characters are split on spaces into words; every pair
 * of adjacent multibyte characters yields one overlapping two-character token.
 *
 * @param string $text UTF-8 encoded text (typically the output of
 *                     _normalize_text(), which ends with a single space).
 * @return array Map of byte offset (where the token starts in $text) => token.
 */
function _tokenizer ( $text )
{
    $len      = strlen($text);
    $last_mbc = '';      // previous multibyte character, '' when none is adjacent
    $tmp      = '';      // single-byte word being accumulated
    $tokens   = array();

    // Scan one position past the end and treat it as a virtual space, so the
    // last word is emitted even when the text has no trailing space.
    for ($i = 0; $i <= $len; $i++) {
        $c = ($i < $len) ? $text[$i] : ' ';
        $v = ord($c);

        // Sequence length from the UTF-8 lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        } elseif ($v >= 0xe0) {
            $width = 3;
        } elseif ($v >= 0xc0) {
            $width = 2;
        } else {
            $width = 1;
        }

        if ($width > 1) {
            // A multibyte character ends any pending single-byte word.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
                $tmp = '';
            }

            // substr() never reads past the end, so a truncated sequence
            // yields a shorter string instead of an invalid offset access.
            $mbc = substr($text, $i, $width);
            $i += strlen($mbc) - 1;      // $i now sits on the last byte of $mbc

            if ($last_mbc !== '') {
                $pair = $last_mbc . $mbc;
                $tokens[$i - strlen($pair) + 1] = $pair;
            }
            $last_mbc = $mbc;
            continue;
        }

        // Single-byte character: it breaks the chain of multibyte bigrams.
        $last_mbc = '';

        if ($c == ' ') {
            // Compare against '' explicitly: the word "0" is falsy in PHP.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
            }
            $tmp = '';
        } else {
            $tmp .= $c;
        }
    }

    return $tokens;
}
60
/**
 * Dictionary-driven tokenizer for UTF-8 text (forward maximum matching).
 *
 * Runs of single-byte characters are split on spaces into words. Each run of
 * consecutive multibyte characters is segmented greedily: at every position
 * the longest prefix of up to 4 characters accepted by $this->dict->find()
 * becomes a word; when nothing matches, a single character is consumed.
 *
 * NOTE(review): $this->dict is not defined in this listing; it is assumed to
 * be an object exposing find($word), such as BsmSearchDictMemcached — confirm
 * against the enclosing class.
 *
 * @param string $text     UTF-8 encoded text.
 * @param bool   $non_word When true, unmatched single characters are emitted
 *                         as tokens too; when false only dictionary words
 *                         (2-4 characters) are emitted.
 * @return array Map of byte offset (where the token starts in $text) => token.
 */
function _tokenizer_dict ( $text , $non_word = false )
{
    $len     = strlen($text);
    $mbc_str = array();  // pending run of multibyte characters
    $tmp     = '';       // single-byte word being accumulated
    $tokens  = array();

    // Scan one position past the end and treat it as a virtual space, so a
    // trailing word or multibyte run is processed even without a final space.
    for ($i = 0; $i <= $len; $i++) {
        $c = ($i < $len) ? $text[$i] : ' ';
        $v = ord($c);

        // Sequence length from the UTF-8 lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        } elseif ($v >= 0xe0) {
            $width = 3;
        } elseif ($v >= 0xc0) {
            $width = 2;
        } else {
            $width = 1;
        }

        if ($width > 1) {
            // A multibyte character ends any pending single-byte word.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
                $tmp = '';
            }

            // substr() never reads past the end of a truncated sequence.
            $mbc = substr($text, $i, $width);
            $i += strlen($mbc) - 1;
            $mbc_str[] = $mbc;
            continue;
        }

        if ($c == ' ') {
            // Compare against '' explicitly: the word "0" is falsy in PHP.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
            }
            $tmp = '';
        } else {
            $tmp .= $c;
        }

        // Any single-byte character closes the pending multibyte run, which
        // ends right before $i; segment it now.
        $start_offset = $i - strlen(implode('', $mbc_str));

        while (count($mbc_str) > 0) {
            // Try the longest candidate first; $j falls through to 1 when
            // no dictionary word of 2..4 characters starts here.
            for ($j = min(4, count($mbc_str)); $j > 1; $j--) {
                if ($this->dict->find(implode('', array_slice($mbc_str, 0, $j)))) {
                    break;
                }
            }

            $word = implode('', array_slice($mbc_str, 0, $j));

            // Single characters are only recorded when $non_word is set.
            if ($non_word || $j > 1) {
                $tokens[$start_offset] = $word;
            }

            $start_offset += strlen($word);
            $mbc_str = array_slice($mbc_str, $j);
        }
    }

    return $tokens;
}
166 ?>
/**
 * Bigram ("2-base cut") tokenizer for UTF-8 text.
 *
 * Runs of single-byte characters are split on spaces into words; every pair
 * of adjacent multibyte characters yields one overlapping two-character token.
 *
 * @param string $text UTF-8 encoded text (typically the output of
 *                     _normalize_text(), which ends with a single space).
 * @return array Map of byte offset (where the token starts in $text) => token.
 */
function _tokenizer ( $text )
{
    $len      = strlen($text);
    $last_mbc = '';      // previous multibyte character, '' when none is adjacent
    $tmp      = '';      // single-byte word being accumulated
    $tokens   = array();

    // Scan one position past the end and treat it as a virtual space, so the
    // last word is emitted even when the text has no trailing space.
    for ($i = 0; $i <= $len; $i++) {
        $c = ($i < $len) ? $text[$i] : ' ';
        $v = ord($c);

        // Sequence length from the UTF-8 lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        } elseif ($v >= 0xe0) {
            $width = 3;
        } elseif ($v >= 0xc0) {
            $width = 2;
        } else {
            $width = 1;
        }

        if ($width > 1) {
            // A multibyte character ends any pending single-byte word.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
                $tmp = '';
            }

            // substr() never reads past the end, so a truncated sequence
            // yields a shorter string instead of an invalid offset access.
            $mbc = substr($text, $i, $width);
            $i += strlen($mbc) - 1;      // $i now sits on the last byte of $mbc

            if ($last_mbc !== '') {
                $pair = $last_mbc . $mbc;
                $tokens[$i - strlen($pair) + 1] = $pair;
            }
            $last_mbc = $mbc;
            continue;
        }

        // Single-byte character: it breaks the chain of multibyte bigrams.
        $last_mbc = '';

        if ($c == ' ') {
            // Compare against '' explicitly: the word "0" is falsy in PHP.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
            }
            $tmp = '';
        } else {
            $tmp .= $c;
        }
    }

    return $tokens;
}
60
/**
 * Dictionary-driven tokenizer for UTF-8 text (forward maximum matching).
 *
 * Runs of single-byte characters are split on spaces into words. Each run of
 * consecutive multibyte characters is segmented greedily: at every position
 * the longest prefix of up to 4 characters accepted by $this->dict->find()
 * becomes a word; when nothing matches, a single character is consumed.
 *
 * NOTE(review): $this->dict is not defined in this listing; it is assumed to
 * be an object exposing find($word), such as BsmSearchDictMemcached — confirm
 * against the enclosing class.
 *
 * @param string $text     UTF-8 encoded text.
 * @param bool   $non_word When true, unmatched single characters are emitted
 *                         as tokens too; when false only dictionary words
 *                         (2-4 characters) are emitted.
 * @return array Map of byte offset (where the token starts in $text) => token.
 */
function _tokenizer_dict ( $text , $non_word = false )
{
    $len     = strlen($text);
    $mbc_str = array();  // pending run of multibyte characters
    $tmp     = '';       // single-byte word being accumulated
    $tokens  = array();

    // Scan one position past the end and treat it as a virtual space, so a
    // trailing word or multibyte run is processed even without a final space.
    for ($i = 0; $i <= $len; $i++) {
        $c = ($i < $len) ? $text[$i] : ' ';
        $v = ord($c);

        // Sequence length from the UTF-8 lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        } elseif ($v >= 0xe0) {
            $width = 3;
        } elseif ($v >= 0xc0) {
            $width = 2;
        } else {
            $width = 1;
        }

        if ($width > 1) {
            // A multibyte character ends any pending single-byte word.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
                $tmp = '';
            }

            // substr() never reads past the end of a truncated sequence.
            $mbc = substr($text, $i, $width);
            $i += strlen($mbc) - 1;
            $mbc_str[] = $mbc;
            continue;
        }

        if ($c == ' ') {
            // Compare against '' explicitly: the word "0" is falsy in PHP.
            if ($tmp !== '') {
                $tokens[$i - strlen($tmp)] = $tmp;
            }
            $tmp = '';
        } else {
            $tmp .= $c;
        }

        // Any single-byte character closes the pending multibyte run, which
        // ends right before $i; segment it now.
        $start_offset = $i - strlen(implode('', $mbc_str));

        while (count($mbc_str) > 0) {
            // Try the longest candidate first; $j falls through to 1 when
            // no dictionary word of 2..4 characters starts here.
            for ($j = min(4, count($mbc_str)); $j > 1; $j--) {
                if ($this->dict->find(implode('', array_slice($mbc_str, 0, $j)))) {
                    break;
                }
            }

            $word = implode('', array_slice($mbc_str, 0, $j));

            // Single characters are only recorded when $non_word is set.
            if ($non_word || $j > 1) {
                $tokens[$start_offset] = $word;
            }

            $start_offset += strlen($word);
            $mbc_str = array_slice($mbc_str, $j);
        }
    }

    return $tokens;
}
166 ?>
可以看到，代码中被注释掉的部分原本使用的是mb_系列函数。我去掉了它们，一方面是为了便于迁移，另一方面是因为mb_函数很慢。我偷懒地使用了不完整的UTF-8切字，只判断2个字节和3个字节的字符，其实真正用到的只有3个字节的情况，呵呵……其余的以后再说。
1
<?
php
/**
 * Pre-process raw text before tokenizing: punctuation and control whitespace
 * become spaces, a few common particles are isolated, whitespace runs are
 * collapsed, and exactly one trailing space is appended.
 *
 * The trailing space lets the tokenizers flush the final word.
 *
 * @param string $text UTF-8 encoded input text.
 * @return string Normalized text, always ending with a single space.
 */
function _normalize_text ( $text )
{
    // ASCII punctuation, quoted for use inside a character class.
    $symbol = preg_quote('`~!@#$%^&*()_+=|{}[]:;"<>,.?', '/');
    $ret = preg_replace("/[$symbol]/", ' ', $text);

    // CR, LF and TAB. The escapes are required: a bare [rnt] would replace
    // the letters r, n and t.
    $ret = preg_replace("/[\\r\\n\\t]/", ' ', $ret);

    // For Chinese: full-width punctuation. The ASCII forms were already
    // removed above, so only the full-width variants are listed here.
    $cjk_punct = array(
        '“', '”', '‘', '’', '！', '？', '。', '，', '、', '·',
        '（', '）', '＃', '《', '》', '；', '：', '……',
        "\xe3\x80\x80",  // ideographic space U+3000 — NOTE(review): presumed; the original literal was lost
        '——',
    );
    $ret = str_replace($cjk_punct, ' ', $ret);

    // Cut words: surround common particles with spaces so they act as
    // segment boundaries.
    // NOTE(review): the exact replacement text was lost in the listing;
    // ' X ' is presumed — confirm against the original source.
    foreach (array('的', '是', '吗', '吧', '呀') as $particle) {
        $ret = str_replace($particle, ' ' . $particle . ' ', $ret);
    }

    // Collapse whitespace runs (\s, not the letter "s").
    $ret = preg_replace("/\\s+/", ' ', $ret);

    return (trim($ret) . ' ');
}
42 ?>
上面这个函数对文字做了一些简单的预处理,扔掉了一些标点符号,主要就是为了把文章先分割成“句子”,实验性函数……
/**
 * Pre-process raw text before tokenizing: punctuation and control whitespace
 * become spaces, a few common particles are isolated, whitespace runs are
 * collapsed, and exactly one trailing space is appended.
 *
 * The trailing space lets the tokenizers flush the final word.
 *
 * @param string $text UTF-8 encoded input text.
 * @return string Normalized text, always ending with a single space.
 */
function _normalize_text ( $text )
{
    // ASCII punctuation, quoted for use inside a character class.
    $symbol = preg_quote('`~!@#$%^&*()_+=|{}[]:;"<>,.?', '/');
    $ret = preg_replace("/[$symbol]/", ' ', $text);

    // CR, LF and TAB. The escapes are required: a bare [rnt] would replace
    // the letters r, n and t.
    $ret = preg_replace("/[\\r\\n\\t]/", ' ', $ret);

    // For Chinese: full-width punctuation. The ASCII forms were already
    // removed above, so only the full-width variants are listed here.
    $cjk_punct = array(
        '“', '”', '‘', '’', '！', '？', '。', '，', '、', '·',
        '（', '）', '＃', '《', '》', '；', '：', '……',
        "\xe3\x80\x80",  // ideographic space U+3000 — NOTE(review): presumed; the original literal was lost
        '——',
    );
    $ret = str_replace($cjk_punct, ' ', $ret);

    // Cut words: surround common particles with spaces so they act as
    // segment boundaries.
    // NOTE(review): the exact replacement text was lost in the listing;
    // ' X ' is presumed — confirm against the original source.
    foreach (array('的', '是', '吗', '吧', '呀') as $particle) {
        $ret = str_replace($particle, ' ' . $particle . ' ', $ret);
    }

    // Collapse whitespace runs (\s, not the letter "s").
    $ret = preg_replace("/\\s+/", ' ', $ret);

    return (trim($ret) . ' ');
}
42 ?>
我的词典是保存在内存中的,依靠memcached来维护,每一个词保存的就是一个名字为word_key,值为“t”的内存变量。memcached对这个词典进行了有效的散列。下面是词典class:
1
<?
php
/**
 * Word dictionary kept in memcached.
 *
 * Every dictionary word is stored under a short hashed key whose value is the
 * marker string 't'; a lookup is a single get() on that key.
 */
class BsmSearchDictMemcached
{
    // Memcached client handle created by the constructor.
    var $mc;

    /**
     * PHP 4 style constructor: connects to the server named by the globals
     * $dict_memcached_host / $dict_memcached_port.
     *
     * NOTE(review): memcache() and add_server() are not defined in this
     * listing; presumably they come from the memcached client extension the
     * project loads — confirm.
     */
    function BsmSearchDictMemcached ()
    {
        global $dict_memcached_host, $dict_memcached_port;

        $this->mc = memcache();
        $this->mc->add_server($dict_memcached_host, $dict_memcached_port);

        return $this->mc;
    }

    /**
     * Load the dictionary file named by the global $dict_source_file (one
     * word per line) into memcached.
     */
    function make_mem_dict ()
    {
        global $dict_source_file;

        $fp = fopen($dict_source_file, 'rb');
        if (!$fp) {
            // fopen() has already raised a warning; nothing to load.
            return;
        }

        // Compare with false explicitly so a falsy line such as "0" does not
        // end the loop early.
        while (($line = fgets($fp)) !== false) {
            $word = trim($line);

            // Skip blank lines: they would be stored under NO_KEY and make
            // find('') report a hit.
            if ($word === '') {
                continue;
            }

            $this->mc->set($this->_gen_mem_key($word), 't');
        }

        fclose($fp);
    }

    /**
     * Tell whether $word is in the dictionary.
     *
     * @param string $word Candidate word.
     * @return bool True when the word's key holds the marker 't'.
     */
    function find ( $word )
    {
        return $this->mc->get($this->_gen_mem_key($word)) === 't';
    }

    /**
     * Build the memcached key for a word: 'dict_' followed by 12 hex digits
     * taken from the word's MD5 (digits 0-3 and 16-23).
     *
     * Only 48 bits of the hash are kept, so two different words can map to
     * the same key.
     *
     * @param string $word Word to hash.
     * @return string The key, or 'NO_KEY' for an empty word.
     */
    function _gen_mem_key ( $word )
    {
        // Compare with '' explicitly: the word "0" is falsy in PHP.
        if ((string) $word === '') {
            return 'NO_KEY';
        }

        $md5_word = md5($word);

        return 'dict_' . substr($md5_word, 0, 4) . substr($md5_word, 16, 8);
    }
}
56 ?>
一些参数是在BSM的配置文件中定义的,make_mem_dict是生成内存词典的方法,它从原始词典dict.dat中导出数据插入到内存中。
/**
 * Word dictionary kept in memcached.
 *
 * Every dictionary word is stored under a short hashed key whose value is the
 * marker string 't'; a lookup is a single get() on that key.
 */
class BsmSearchDictMemcached
{
    // Memcached client handle created by the constructor.
    var $mc;

    /**
     * PHP 4 style constructor: connects to the server named by the globals
     * $dict_memcached_host / $dict_memcached_port.
     *
     * NOTE(review): memcache() and add_server() are not defined in this
     * listing; presumably they come from the memcached client extension the
     * project loads — confirm.
     */
    function BsmSearchDictMemcached ()
    {
        global $dict_memcached_host, $dict_memcached_port;

        $this->mc = memcache();
        $this->mc->add_server($dict_memcached_host, $dict_memcached_port);

        return $this->mc;
    }

    /**
     * Load the dictionary file named by the global $dict_source_file (one
     * word per line) into memcached.
     */
    function make_mem_dict ()
    {
        global $dict_source_file;

        $fp = fopen($dict_source_file, 'rb');
        if (!$fp) {
            // fopen() has already raised a warning; nothing to load.
            return;
        }

        // Compare with false explicitly so a falsy line such as "0" does not
        // end the loop early.
        while (($line = fgets($fp)) !== false) {
            $word = trim($line);

            // Skip blank lines: they would be stored under NO_KEY and make
            // find('') report a hit.
            if ($word === '') {
                continue;
            }

            $this->mc->set($this->_gen_mem_key($word), 't');
        }

        fclose($fp);
    }

    /**
     * Tell whether $word is in the dictionary.
     *
     * @param string $word Candidate word.
     * @return bool True when the word's key holds the marker 't'.
     */
    function find ( $word )
    {
        return $this->mc->get($this->_gen_mem_key($word)) === 't';
    }

    /**
     * Build the memcached key for a word: 'dict_' followed by 12 hex digits
     * taken from the word's MD5 (digits 0-3 and 16-23).
     *
     * Only 48 bits of the hash are kept, so two different words can map to
     * the same key.
     *
     * @param string $word Word to hash.
     * @return string The key, or 'NO_KEY' for an empty word.
     */
    function _gen_mem_key ( $word )
    {
        // Compare with '' explicitly: the word "0" is falsy in PHP.
        if ((string) $word === '') {
            return 'NO_KEY';
        }

        $md5_word = md5($word);

        return 'dict_' . substr($md5_word, 0, 4) . substr($md5_word, 16, 8);
    }
}
56 ?>
一个使用实例:
1
<?
php
// Usage example: store one sentence, index it with BsmSearch, search for a
// word contained in it, and print the elapsed time.
define('IN_BSM', true);
$phpEx = 'php';
error_reporting(2047);  // bitmask enabling every level from E_ERROR (1) up to E_USER_NOTICE (1024)

// NOTE(review): $include_root and $db are not defined in this script;
// presumably common.inc.php sets them up — confirm.
require('../include/kernel/common.inc.' . $phpEx);
require($include_root . 'search/search.inc.' . $phpEx);

$search = new BsmSearch('search/');
$str = '我是大傻瓜';

// microtime() returns "msec sec"; summing both parts gives seconds as a float.
$start_time = array_sum(explode(' ', microtime()));

// $str is a fixed literal here. Any externally supplied text must be escaped
// or bound as a parameter before being placed into the SQL statement.
$db->sql_query("INSERT INTO `data` SET `text` = '$str'");
$id = $db->sql_nextid();

$search->add_text($id, $str);
print_r($search->search('傻瓜'));

$end_time = array_sum(explode(' ', microtime()));
$time = $end_time - $start_time;
echo('<br>Spend Time: ' . $time . ' secs');
18
// Usage example: store one sentence, index it with BsmSearch, search for a
// word contained in it, and print the elapsed time.
define('IN_BSM', true);
$phpEx = 'php';
error_reporting(2047);  // bitmask enabling every level from E_ERROR (1) up to E_USER_NOTICE (1024)

// NOTE(review): $include_root and $db are not defined in this script;
// presumably common.inc.php sets them up — confirm.
require('../include/kernel/common.inc.' . $phpEx);
require($include_root . 'search/search.inc.' . $phpEx);

$search = new BsmSearch('search/');
$str = '我是大傻瓜';

// microtime() returns "msec sec"; summing both parts gives seconds as a float.
$start_time = array_sum(explode(' ', microtime()));

// $str is a fixed literal here. Any externally supplied text must be escaped
// or bound as a parameter before being placed into the SQL statement.
$db->sql_query("INSERT INTO `data` SET `text` = '$str'");
$id = $db->sql_nextid();

$search->add_text($id, $str);
print_r($search->search('傻瓜'));

$end_time = array_sum(explode(' ', microtime()));
$time = $end_time - $start_time;
echo('<br>Spend Time: ' . $time . ' secs');
17 ?>
18