中文分词-简单文章分析玩出来的

最近自己写了一个文章分析程序,用的是 C# 2.0,数据库是 MySQL 5.0,并且自己写了两个 MySQL 的 Helper 类。其中涉及线程和委托的东西,我请高手指点了一下,最终做出来了:分析 1 万篇文章大概需要 1 个小时左右。想要这个程序或者想交流的可以联系我。

贴一下自己的代码:

 

代码:
using  System;
using  System.Collections.Generic;
using  System.ComponentModel;
using  System.Data;
using  System.Drawing;
using  System.Text;
using  System.Windows.Forms;
using  System.IO;

using  System.Data.OleDb;

using  WoWExpress.Core;
using  MySql.Data.MySqlClient;
using  System.Text.RegularExpressions;
using  System.Threading;

using  Rainsoft.WordSeg;
namespace  CSVProject
{
    
public   partial   class  Form1 : Form
    {
        
/// <summary>
        /// Creates the form and builds its designer-generated controls.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }

 

        
/// <summary>
        /// Loads every row of the <c>stopwords</c> table into a DataSet.
        /// </summary>
        /// <returns>The DataSet produced by MySqlHelper.ExecuteDataset for the query.</returns>
        public DataSet GetStopwords()
        {
            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from stopwords";

            return WoWExpress.Core.MySqlHelper.ExecuteDataset(connectionString, CommandType.Text, query);
        }

        
/// <summary>
        /// Loads every row of the <c>ccl_addonarticle</c> table into a DataSet.
        /// </summary>
        /// <returns>The DataSet produced by MySqlHelper.ExecuteDataset for the query.</returns>
        public DataSet GetArticles()
        {
            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from ccl_addonarticle";

            DataSet articlesDataSet = WoWExpress.Core.MySqlHelper.ExecuteDataset(connectionString, CommandType.Text, query);
            return articlesDataSet;
        }


        
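/* Design notes.
         Each article is one object (topic, id, ...); for now only two fields are used.
         Article objects are kept in a list so the articles can be processed in a loop.
         1. Apply the stop-word list to break an article apart: loop over the stop-word set and
            replace every stop word found in the body with a marker (e.g. [%stopword%]).
         2. Split the body on that marker, e.g. split(articleBody, [%stopword%]). The remaining words
            go into an array that is scanned from the first element; a word seen again has its count
            incremented by 1.
         * A new word becomes a keyword object holding: keyword id, keyword text, count within this
         * article, percentage within this article (only computable once the whole article has been
         * scanned), count within the whole database, and percentage within the whole database (only
         * computable once all articles have been scanned).
         * When every loop has finished, the keyword objects are the result; they are displayed and
         * stored in the database. As the data keeps growing everything has to be recomputed to stay
         * accurate, which will get slower and slower; that is to be dealt with later.
*/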

        
/// <summary>
        /// Loads up to <paramref name="pageLength"/> articles that are not yet processed
        /// (<c>isDo = 0</c>) from <c>ccl_addonarticle</c>.
        /// </summary>
        /// <param name="pageLength">Maximum number of rows to fetch, given as a non-negative integer string.</param>
        /// <returns>One ArticleInfo, built from the row's id and body, per row read.</returns>
        /// <exception cref="ArgumentException"><paramref name="pageLength"/> is not a non-negative integer.</exception>
        public List<ArticleInfo> GetMyArticles(string pageLength)
        {
            // The limit is spliced into the SQL text, so accept nothing but a plain number.
            int limit;
            if (!int.TryParse(pageLength, out limit) || limit < 0)
            {
                throw new ArgumentException("pageLength must be a non-negative integer.", "pageLength");
            }

            List<ArticleInfo> articles = new List<ArticleInfo>();

            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select aid,body,isDo from ccl_addonarticle where isDo = 0 limit "
                + limit.ToString(System.Globalization.CultureInfo.InvariantCulture);

            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                while (reader.Read())
                {
                    string body = reader.GetString(1);

                    // Pre-segmentation pass. NOTE(review): the segmented text is discarded, exactly as
                    // in the original code; confirm whether it was meant to replace the stored body.
                    this.segment(body);

                    articles.Add(new ArticleInfo(Convert.ToInt32(reader.GetString(0)), body));
                }
            }

            return articles;
        }
        
/// <summary>
        /// Counts the articles in <c>ccl_addonarticle</c> that are not yet processed (<c>isDo = 0</c>).
        /// </summary>
        /// <returns>The count read from the first column of the first row, or 0 when no row is returned.</returns>
        public int GetArticlesCount()
        {
            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select count(*) from ccl_addonarticle where isDo = 0";

            int result = 0;
            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                if (reader.Read())
                {
                    result = reader.GetInt32(0);
                }
            }

            return result;
        }

 

        
/// <summary>
        /// Loads the stop-word table as a list of StopwordsInfo objects.
        /// </summary>
        /// <returns>
        /// One StopwordsInfo per row, built from the first column (converted to an int id) and the
        /// second column (the word, trimmed).
        /// </returns>
        public List<StopwordsInfo> GetMyStopwords()
        {
            List<StopwordsInfo> stopwords = new List<StopwordsInfo>();

            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from stopwords";

            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                while (reader.Read())
                {
                    int id = Convert.ToInt32(reader.GetString(0));
                    string word = reader.GetString(1).Trim();
                    stopwords.Add(new StopwordsInfo(id, word));
                }
            }

            return stopwords;
        }

        
/// <summary>
        /// Cleans each article body and replaces every stop word in it with a single space.
        /// </summary>
        /// <remarks>
        /// The ArticleInfo objects passed in are modified in place (their ArticleBody is overwritten)
        /// and the same objects are returned in a new list.
        /// </remarks>
        /// <param name="articlesInfo">Articles to process.</param>
        /// <param name="stopwords">Stop words to blank out; empty entries are ignored.</param>
        /// <returns>A new list holding the processed articles in their original order.</returns>
        public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> targetArticles = new List<ArticleInfo>();

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                // Strip the markup before looking for stop words.
                string body = articleInfo.ArticleBody.ToString();
                body = this.stripHtml(body);
                body = this.StripHTML3(body);

                foreach (StopwordsInfo stopwordsInfo in stopwords)
                {
                    string stopword = stopwordsInfo.Stopwords.ToString();

                    // String.Replace throws ArgumentException for an empty search string,
                    // so a blank row in the stop-word table must not reach it.
                    if (stopword.Length == 0)
                    {
                        continue;
                    }

                    body = body.Replace(stopword, " ");
                }

                // Second clean-up pass, run after the stop words have been replaced by spaces.
                body = this.stripHtml(body);

                articleInfo.ArticleBody = body;
                targetArticles.Add(articleInfo);
            }

            return targetArticles;
        }

        
/// <summary>
        /// Splits each article body on spaces, counts how often every word occurs within its
        /// article, then hands the counters to UpdateArticleAndInsertKeywords for storage.
        /// </summary>
        /// <remarks>
        /// Every article contributes at least one entry (an empty body yields one empty keyword),
        /// which is what lets UpdateArticleAndInsertKeywords see and flag the article.
        /// Counting is done per list element; article ids are assumed to be unique within
        /// <paramref name="articlesInfo"/> — TODO confirm.
        /// </remarks>
        /// <param name="articlesInfo">Articles whose bodies are already space-separated.</param>
        /// <returns>One SingleKeywords per distinct word per article, in first-seen order.</returns>
        public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
        {
            List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();

            // This may run on a worker thread, so the progress bar is driven through the
            // helpers that marshal to the UI thread.
            OnRrogressBar2Set(articlesInfo.Count);

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                int articleId = articleInfo.ArticleId;
                string[] words = articleInfo.ArticleBody.ToString().Trim().Split(' ');

                // Keyword text -> counter object, for the current article only.
                Dictionary<string, SingleKeywords> seen = new Dictionary<string, SingleKeywords>();

                // Visit every word, including the last one.
                for (int i = 0; i < words.Length; i++)
                {
                    string word = words[i].Trim();

                    SingleKeywords existing;
                    if (seen.TryGetValue(word, out existing))
                    {
                        existing.SingleCount += 1;
                    }
                    else
                    {
                        SingleKeywords added = new SingleKeywords(articleId, word, 1, 0);
                        seen.Add(word, added);
                        singleKeywordsArray.Add(added);
                    }
                }

                // TODO: compute the per-article percentage here.

                // One step of the per-batch progress bar for each finished article.
                OnRrogressBarAdd2(progressBar2.Step);
            }

            // Persisting is chained onto splitting: the counters go straight to the database.
            this.UpdateArticleAndInsertKeywords(singleKeywordsArray);

            return singleKeywordsArray;
        }

        
/// <summary>
        /// Flags every article referenced by the keyword list as processed (<c>isDo = 1</c>) and
        /// inserts one <c>articlekeywords</c> row per keyword entry.
        /// </summary>
        /// <param name="singleKeywordsArray">Per-article keyword counters to store.</param>
        /// <returns>Always the string "ok".</returns>
        public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
        {
            // Original author's note: inserts only worked once the character set was added
            // to the connection string.
            string charset = "utf8";
            // NOTE(review): host, schema and credentials are hard-coded; consider moving them to configuration.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152", charset);

            string updateSql = "Update ccl_addonarticle set isDo =1 where aid = ?aid";
            string insertSql = "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

            // Article ids already flagged in this call: the flag only needs writing once per
            // article, not once per keyword.
            Dictionary<int, bool> flaggedArticles = new Dictionary<int, bool>();

            // TODO: wrap the batch in a transaction (left open by the original author).
            foreach (SingleKeywords singleKeywords in singleKeywordsArray)
            {
                int articleId = singleKeywords.ArticleId;

                if (!flaggedArticles.ContainsKey(articleId))
                {
                    MySqlParameter[] updateParms = new MySqlParameter[] {
                        new MySqlParameter("?aid", MySqlDbType.Int32, 4)
                    };
                    updateParms[0].Value = articleId;

                    WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, updateSql, updateParms);
                    flaggedArticles.Add(articleId, true);
                }

                MySqlParameter[] keywordsParms = new MySqlParameter[] {
                    new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
                    new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
                    new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
                    new MySqlParameter("?singlePercent", MySqlDbType.Double, 4)
                };

                keywordsParms[0].Value = articleId;
                // Keywords are stored in Simplified Chinese.
                keywordsParms[1].Value = Traditional2Simplified(singleKeywords.KeywordsStr);
                keywordsParms[2].Value = singleKeywords.SingleCount;
                keywordsParms[3].Value = singleKeywords.SinglePercent;

                WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, insertSql, keywordsParms);
            }

            return "ok";
        }

        
/// <summary>
        /// Helper that changes the encoding of a string: each character is narrowed to a single
        /// byte and the resulting bytes are decoded with the system default encoding.
        /// </summary>
        /// <param name="dbStr">Text whose characters are narrowed to one byte each.</param>
        /// <returns>The text decoded from those bytes using Encoding.Default.</returns>
        private string DBStringToNormal(string dbStr)
        {
            int length = dbStr.Length;
            byte[] rawBytes = new byte[length];

            int position = 0;
            foreach (char ch in dbStr)
            {
                rawBytes[position] = (byte)ch;
                position++;
            }

            return System.Text.Encoding.Default.GetString(rawBytes, 0, length);
        }

        
/// <summary>
        /// Converts Traditional Chinese text to Simplified Chinese using the Visual Basic
        /// runtime's StrConv with locale id 0.
        /// </summary>
        /// <param name="str">Text to convert.</param>
        /// <returns>The value returned by StrConv for the SimplifiedChinese conversion.</returns>
        public string Traditional2Simplified(string str)
        {
            string simplified = Microsoft.VisualBasic.Strings.StrConv(
                str,
                Microsoft.VisualBasic.VbStrConv.SimplifiedChinese,
                0);

            return simplified;
        }
        
/// <summary>
        /// Extracts the plain text from HTML: removes script blocks, tags and comments, decodes a
        /// handful of character entities, then drops any remaining angle brackets and CR/LF pairs.
        /// </summary>
        /// <param name="strHtml">HTML source text.</param>
        /// <returns>The text after all replacements have been applied in order.</returns>
        public string StripHTML2(string strHtml)
        {
            // Patterns and replacements are paired by index and applied in this order.
            string[] aryReg = {
                @"<script[^>]*?>.*?</script>",
                @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                @"([\r\n])[\s]+",
                @"&(quot|#34);",
                @"&(amp|#38);",
                @"&(lt|#60);",
                @"&(gt|#62);",
                @"&(nbsp|#160);",
                @"&(iexcl|#161);",
                @"&(cent|#162);",
                @"&(pound|#163);",
                @"&(copy|#169);",
                @"&#(\d+);",
                @"-->",
                @"<!--.*\n"
            };

            string[] aryRep = {
                "",
                "",
                "",
                "\"",
                "&",
                "<",
                ">",
                "   ",
                "\xa1", // chr(161)
                "\xa2", // chr(162)
                "\xa3", // chr(163)
                "\xa9", // chr(169)
                "",
                "\r\n",
                ""
            };

            string strOutput = strHtml;
            for (int i = 0; i < aryReg.Length; i++)
            {
                strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
            }

            // Strings are immutable: the result of Replace has to be assigned back,
            // otherwise these three clean-up steps do nothing.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }

        
/// <summary>
        /// Removes every character that is not an ASCII letter, an ASCII digit or a character in
        /// the range U+4E00..U+9FBB.
        /// </summary>
        /// <remarks>
        /// The pattern is a regular expression, so it has to go through Regex.Replace;
        /// String.Replace would search for the pattern text literally and change nothing.
        /// Callers that relied on that no-op will now see whitespace and punctuation removed.
        /// </remarks>
        /// <param name="strHtml">Text to filter.</param>
        /// <returns>The text with all other characters removed.</returns>
        public string StripHTML3(string strHtml)
        {
            return Regex.Replace(strHtml, @"[^A-Za-z0-9\u4E00-\u9FBB]", "");
        }
        
/// <summary>
        /// Merges per-article keyword counters into database-wide totals: entries with the same
        /// (trimmed) keyword text are combined and their SingleCount values summed.
        /// </summary>
        /// <param name="singleKeywords">Per-article keyword counters; may be null or empty.</param>
        /// <returns>
        /// One AllKeywords per distinct keyword, in first-seen order; an empty list when there is
        /// no input.
        /// </returns>
        public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
        {
            List<AllKeywords> allKeywordsArray = new List<AllKeywords>();
            if (singleKeywords == null)
            {
                return allKeywordsArray;
            }

            // Trimmed keyword text -> running total object.
            Dictionary<string, AllKeywords> totals = new Dictionary<string, AllKeywords>();

            // Visit every entry, including the last one.
            foreach (SingleKeywords entry in singleKeywords)
            {
                string key = entry.KeywordsStr.Trim();

                AllKeywords total;
                if (totals.TryGetValue(key, out total))
                {
                    // A total is seeded with the first article's SingleCount, so later articles
                    // must contribute their SingleCount too (not a flat 1).
                    total.AllCount += entry.SingleCount;
                }
                else
                {
                    total = new AllKeywords(entry.KeywordsStr, entry.SingleCount, 0);
                    totals.Add(key, total);
                    allKeywordsArray.Add(total);
                }
            }

            // TODO: compute the database-wide percentage here.
            return allKeywordsArray;
        }

 

        
/**/
        
/// <summary>
        /// Removes HTML tags, escapes any angle brackets that are left, and collapses every run of
        /// whitespace into a single space.
        /// </summary>
        /// <param name="strHtml">The string to convert.</param>
        /// <returns>The converted string, without leading or trailing whitespace.</returns>
        private string stripHtml(string strHtml)
        {
            // Tags are deleted outright (replaced by the empty string).
            string strOutput = Regex.Replace(strHtml, "<(.|\n)+?>", "");

            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Turn every run of whitespace into one space.
            strOutput = Regex.Replace(strOutput, @"\s+", " ");

            // Trim returns a new string, so its result has to be the one returned.
            return strOutput.Trim();
        }


        
/// <summary>
        /// Loads up to 10 unprocessed articles, applies the stop-word list to them and shows the
        /// result in dataGridView3.
        /// </summary>
        private void btnUseStopword_Click(object sender, EventArgs e)
        {
            // The steps are kept separate so each intermediate result can be inspected.
            // The page size is passed as plain digits because it ends up in the SQL LIMIT clause.
            List<ArticleInfo> articles = this.GetMyArticles("10");
            List<StopwordsInfo> stopwords = this.GetMyStopwords();

            List<ArticleInfo> targetArticles = this.UseStopwords(articles, stopwords);
            dataGridView3.DataSource = targetArticles;
        }

        
/// <summary>
        /// Shows the first table returned by GetArticles in dataGridView2.
        /// </summary>
        private void btnGetArticle_Click(object sender, EventArgs e)
        {
            DataTable articlesTable = this.GetArticles().Tables[0];
            dataGridView2.DataSource = articlesTable;
        }

        
/// <summary>
        /// Shows the first table returned by GetStopwords in dataGridView1.
        /// </summary>
        private void btnStopwords_Click(object sender, EventArgs e)
        {
            DataTable stopwordsTable = this.GetStopwords().Tables[0];
            dataGridView1.DataSource = stopwordsTable;
        }

        
/// <summary>
        /// Processes all unprocessed articles in batches of 10 on a separate thread.
        /// </summary>
        /// <remarks>
        /// The article count is read first; it gives the number of full batches plus a remainder
        /// batch. progressBar1 advances once per full batch. When there are fewer articles than
        /// one batch, a single batch of the remainder size is run.
        /// </remarks>
        private void btnGetKeywords_Click(object sender, EventArgs e)
        {
            int articlesCount = this.GetArticlesCount();
            int pageLength = 10;
            int doCount = articlesCount / pageLength;
            int lastLength = articlesCount % pageLength;

            progressBar1.Maximum = doCount; // one unit per full batch
            progressBar1.Value = 0;
            progressBar1.Step = 1;

            // Read the step here, on the UI thread, so the worker never reads the control itself.
            int step = progressBar1.Step;

            System.Threading.Thread thread = new System.Threading.Thread(delegate(object arg)
            {
                // The stop-word list does not depend on the batch: load it once per run.
                List<StopwordsInfo> stopwords = this.GetMyStopwords();

                for (int i = 0; i < doCount; i++)
                {
                    this.ProcessKeywordBatch(pageLength, stopwords);
                    OnRrogressBarAdd(step);
                }

                // Remainder batch; also the only batch when the total is below one page.
                if (doCount == 0 || lastLength != 0)
                {
                    this.ProcessKeywordBatch(lastLength, stopwords);
                }
            });
            thread.Start();
        }

        // Fetches one batch of articles, runs the stop-word and split steps, and binds the result.
        private void ProcessKeywordBatch(int batchSize, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> articles = this.GetMyArticles(batchSize.ToString());
            List<SingleKeywords> singleKeywordsArray = this.SplitArticle(this.UseStopwords(articles, stopwords));
            OnGridViewDataBind(singleKeywordsArray);
        }

        
/// <summary>
        /// Runs the whole pipeline on up to 10 unprocessed articles and shows the merged keyword
        /// totals in dataGridView5.
        /// </summary>
        private void btnAllCompute_Click(object sender, EventArgs e)
        {
            // The page size is passed as plain digits because it ends up in the SQL LIMIT clause.
            List<ArticleInfo> articles = this.UseStopwords(this.GetMyArticles("10"), this.GetMyStopwords());

            // SplitArticle also stores its result in the database before returning it.
            List<SingleKeywords> singleKeywordsArray = this.SplitArticle(articles);

            List<AllKeywords> allKeywordsArray = this.ComputeKeywords(singleKeywordsArray);
            dataGridView5.DataSource = allKeywordsArray;
        }


        
/* Cross-thread operations */

        // Delegate type used to bind dataGridView4 on the thread that owns it.
        protected delegate void GridViewDataBind(object source);

        /// <summary>
        /// Sets dataGridView4.DataSource, going through Control.Invoke when the control reports
        /// InvokeRequired. Does nothing when the grid is null.
        /// </summary>
        /// <param name="source">The object to use as the grid's data source.</param>
        protected void OnGridViewDataBind(object source)
        {
            if (dataGridView4 == null)
            {
                return;
            }

            if (!dataGridView4.InvokeRequired)
            {
                dataGridView4.DataSource = source;
                return;
            }

            GridViewDataBind bind = delegate(object dataSource)
            {
                dataGridView4.DataSource = dataSource;
            };
            dataGridView4.Invoke(bind, source);
        }
        
        
// Overall progress bar: delegate type used to advance progressBar1 on the thread that owns it.
        protected delegate void RrogressBarAdd(int step);

        /// <summary>
        /// Adds <paramref name="step"/> to progressBar1.Value, going through Control.Invoke when
        /// the control reports InvokeRequired. Does nothing when the bar is null.
        /// </summary>
        /// <param name="step">Amount to add to the current value.</param>
        protected void OnRrogressBarAdd(int step)
        {
            if (progressBar1 == null)
            {
                return;
            }

            if (!progressBar1.InvokeRequired)
            {
                progressBar1.Value += step;
                return;
            }

            RrogressBarAdd advance = delegate(int mystep)
            {
                progressBar1.Value += mystep;
            };
            progressBar1.Invoke(advance, step);
        }


        
// Per-batch progress bar: delegate type used to advance progressBar2 on the thread that owns it.
        protected delegate void RrogressBarAdd2(int step);

        /// <summary>
        /// Adds <paramref name="step"/> to progressBar2.Value, going through Control.Invoke when
        /// the control reports InvokeRequired. Does nothing when the bar is null.
        /// </summary>
        /// <param name="step">Amount to add to the current value.</param>
        protected void OnRrogressBarAdd2(int step)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (!progressBar2.InvokeRequired)
            {
                progressBar2.Value += step;
                return;
            }

            RrogressBarAdd2 advance = delegate(int mystep)
            {
                progressBar2.Value += mystep;
            };
            progressBar2.Invoke(advance, step);
        }
        
// Delegate type used to reset progressBar2 on the thread that owns it.
        protected delegate void RrogressBar2Set(int maximum);

        /// <summary>
        /// Resets progressBar2 for a new batch: Maximum = <paramref name="maximum"/>, Value = 0,
        /// Step = 1. Goes through Control.Invoke when the control reports InvokeRequired and does
        /// nothing when the bar is null.
        /// </summary>
        /// <param name="maximum">New maximum of the bar.</param>
        protected void OnRrogressBar2Set(int maximum)
        {
            if (progressBar2 == null)
            {
                return;
            }

            RrogressBar2Set reset = delegate(int myMaximum)
            {
                progressBar2.Maximum = myMaximum;
                progressBar2.Value = 0;
                progressBar2.Step = 1;
            };

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(reset, maximum);
            }
            else
            {
                // Same reset when already on the owning thread; the bar must not be advanced by
                // "maximum" here, which could push Value past Maximum.
                reset(maximum);
            }
        }

 


        
/// <summary>
        /// Runs the Rainsoft WordSegV1 segmenter over the text.
        /// </summary>
        /// <param name="articleStr">Text to segment.</param>
        /// <returns>
        /// The string returned by WordSegV1.Segment; presumably the words joined by the space
        /// passed as the second argument — TODO confirm against the component's documentation.
        /// </returns>
        public string segment(string articleStr)
        {
            // NOTE(review): a new WordSegV1 is created on every call; if construction turns out
            // to be costly, a single shared instance may be preferable.
            WordSegV1 segmenter = new WordSegV1();

            // A char literal holds exactly one character: a single space.
            return segmenter.Segment(articleStr, ' ');
        }

 


    }
}

 

 

 

程序开发完毕后,我突然发现分词并不是那么容易。找了一下,又发现了好东西:C# 版本的开源中文分词 ICTCLAS,以及一个简单的 C# 版本的分词组件。中文分词组件 好慢,等申请首页发布后,我再给出另外下载的代码吧,呵呵,看博客园园长的了。

转载于:https://www.cnblogs.com/oxite/archive/2010/03/22/1691480.html

已标记关键词 清除标记
表情包
插入表情
评论将由博主筛选后显示,对所有人可见 | 还能输入1000个字符
相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页