汉字拼音的一个解决方法
发表日期:2006-12-06
更新日期:2006-12-06
作者文章阅读次数:1828
汉字 拼音
Author:水如烟
暂歇一下写那个区划方案。
平常中,经常用到汉字转拼音,比如批量生成姓名->拼音作为登录帐号。
这个方法只是简单的利用汉字拼音库。至于怎么找这个库,网上多有介绍。在最后提供下载的方案中也提供了这个库文本文件。
主要代码如下:
Imports System.IO
Imports System.Text.RegularExpressions
Namespace Businness.PinYin
''' <summary>
''' Converts Chinese characters to pinyin and looks characters up by pinyin.
''' The data lives in a <c>dsPinYin</c> data set (declared elsewhere in the project) that is
''' either loaded from pinyin.xml or rebuilt from the plain-text library pinyin.txt.
''' </summary>
Public Class PYService
    Private gDataSet As New dsPinYin

    ''' <summary>
    ''' The character table held by this service.
    ''' </summary>
    Public ReadOnly Property PinYinTable() As dsPinYin.PinYinDataTable
        Get
            Return gDataSet.PinYin
        End Get
    End Property

    ' Both files sit directly in the application base directory. Path.Combine is used so the
    ' result is correct whether or not the base path ends with a directory separator.
    Private gTxtFile As String = IO.Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "pinyin.txt")
    Private gxmlFile As String = IO.Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "pinyin.xml")

    ' One library line: a leading run of CJK ideographs (group "Word") followed by the rest of
    ' the line (group "PingYin"). The group names are spelled exactly as Add(line) reads them.
    Private gRegex As New Regex("(?<Word>^[\u4e00-\u9fa5]+)(?<PingYin>.*)")

    ''' <summary>
    ''' Loads the character library from pinyin.xml in the application base directory,
    ''' replacing whatever the data set currently holds.
    ''' </summary>
    ''' <exception cref="IO.FileNotFoundException">pinyin.xml does not exist.</exception>
    Public Sub Load()
        If Not IO.File.Exists(gxmlFile) Then
            ' FileNotFoundException derives from Exception, so existing Catch blocks still work.
            Throw New IO.FileNotFoundException(String.Format(" 文件{0}不存在 ", gxmlFile), gxmlFile)
        End If
        DataSetInitialize()
        gDataSet.ReadXml(gxmlFile)
    End Sub

    ''' <summary>
    ''' Rebuilds the character library from pinyin.txt in the application base directory.
    ''' </summary>
    ''' <exception cref="IO.FileNotFoundException">pinyin.txt does not exist.</exception>
    Public Sub Update()
        If Not IO.File.Exists(gTxtFile) Then
            Throw New IO.FileNotFoundException(String.Format(" 文件{0}不存在 ", gTxtFile), gTxtFile)
        End If
        UpdateFromTxt(gTxtFile)
    End Sub

    ''' <summary>
    ''' Saves the character library to pinyin.xml in the application base directory.
    ''' </summary>
    Public Sub Save()
        gDataSet.WriteXml(gxmlFile)
    End Sub

    ' Empties the data set; called before every load or rebuild.
    Private Sub DataSetInitialize()
        Me.gDataSet.Clear()
        Me.gDataSet.AcceptChanges()
    End Sub

    ' Reads the text library line by line and feeds every line to Add.
    Private Sub UpdateFromTxt(ByVal file As String)
        DataSetInitialize()
        ' Using guarantees the reader is released even when a line fails to import.
        Using mReader As New IO.StreamReader(file, System.Text.Encoding.Default)
            ' ReadLine returns Nothing only at end of file. A blank line is "" and must not
            ' end the import, so the loop tests for Nothing rather than for empty.
            Dim mLine As String = mReader.ReadLine()
            While mLine IsNot Nothing
                Add(mLine)
                mLine = mReader.ReadLine()
            End While
        End Using
        Me.gDataSet.PinYin.AcceptChanges()
    End Sub

    ' Parses one library line and stores it when it describes a single character.
    Private Sub Add(ByVal line As String)
        If line Is Nothing Then Exit Sub
        Dim mMatch As Match = gRegex.Match(line)
        If Not mMatch.Success Then Exit Sub
        ' Only single characters are imported; multi-character words are skipped.
        Dim mWord As String = mMatch.Groups("Word").Value
        If mWord.Length = 1 Then
            Add(mWord, mMatch.Groups("PingYin").Value)
        End If
    End Sub

    ' Stores the readings of one character. Readings are kept in the pinyin column separated
    ' by exactly one space; readings already stored for the character are not duplicated.
    Private Sub Add(ByVal word As String, ByVal py As String)
        Dim mReadings As String() = SplitPinyin(py)
        Dim mCode As String = ChineseCode(word)
        ' No code means the character cannot be keyed, so there is nothing to store.
        If mCode Is Nothing Then Exit Sub

        Dim mRow As dsPinYin.PinYinRow = Me.gDataSet.PinYin.FindBy代码(mCode)
        If mRow Is Nothing Then
            Me.gDataSet.PinYin.AddPinYinRow(word, mCode, String.Join(" ", mReadings))
            Exit Sub
        End If

        ' Compare whole readings, not substrings: "an" must still be added when "ang" exists.
        Dim mKnown As New System.Collections.Generic.List(Of String)(SplitPinyin(mRow.拼音))
        Dim mChanged As Boolean = False
        For Each s As String In mReadings
            If Not mKnown.Contains(s) Then
                mKnown.Add(s)
                mChanged = True
            End If
        Next
        If mChanged Then
            mRow.拼音 = String.Join(" ", mKnown.ToArray())
        End If
    End Sub

    ' Splits a space-separated pinyin list into its readings, ignoring repeated, leading and
    ' trailing whitespace. Returns an empty array for Nothing.
    Private Shared Function SplitPinyin(ByVal value As String) As String()
        If value Is Nothing Then Return New String() {}
        Return value.Trim().Split(New Char() {" "c}, StringSplitOptions.RemoveEmptyEntries)
    End Function

    ''' <summary>
    ''' Converts a string to pinyin. Characters that are not Chinese, or that have no entry in
    ''' the library, are copied through unchanged.
    ''' </summary>
    ''' <param name="line">The text to convert; Nothing or empty yields an empty string.</param>
    ''' <param name="isgetfirst">For a character with several readings, True returns only the
    ''' first one; False returns all readings in parentheses.</param>
    ''' <returns>The converted text.</returns>
    Public Function ToPinyin(ByVal line As String, ByVal isgetfirst As Boolean) As String
        If String.IsNullOrEmpty(line) Then Return String.Empty
        Dim mBuilder As New System.Text.StringBuilder
        For Each c As Char In line
            Dim mChar As String = c.ToString()
            If IsTrue(mChar) Then
                mBuilder.Append(GetPinyin(mChar, isgetfirst))
            Else
                mBuilder.Append(c)
            End If
        Next
        Return mBuilder.ToString()
    End Function

    ' Returns the pinyin of one character, or the character itself when the library has no
    ' readings for it.
    Private Function GetPinyin(ByVal word As String, ByVal isgetfirst As Boolean) As String
        Dim mArray As String() = PinYinArray(ChineseCode(word))
        If mArray Is Nothing OrElse mArray.Length = 0 Then Return word
        If mArray.Length = 1 OrElse isgetfirst Then Return mArray(0)
        ' Several readings: wrap them in parentheses, separated by commas.
        Return String.Format(" ({0}) ", String.Join(" , ", mArray))
    End Function

    ' Returns the readings stored for a character code, or Nothing when the code is unknown.
    Private Function PinYinArray(ByVal code As String) As String()
        If code Is Nothing Then Return Nothing
        Dim mRow As dsPinYin.PinYinRow = Me.gDataSet.PinYin.FindBy代码(code)
        If mRow Is Nothing Then Return Nothing
        Return SplitPinyin(mRow.拼音)
    End Function

    ''' <summary>
    ''' Finds the characters that have the given pinyin as one of their readings.
    ''' </summary>
    ''' <param name="pinyin">The reading to look for; it must match a stored reading exactly.</param>
    ''' <returns>The matching characters; an empty array when there are none.</returns>
    Public Function WordArray(ByVal pinyin As String) As String()
        Dim mResult As New System.Collections.Generic.List(Of String)
        If String.IsNullOrEmpty(pinyin) Then Return mResult.ToArray()
        ' Rows are scanned directly rather than through a filter expression built from the
        ' argument, so quotes or wildcard characters in the argument cannot break the query.
        Dim mRows As dsPinYin.PinYinRow() = CType(Me.gDataSet.PinYin.Select(), dsPinYin.PinYinRow())
        For Each mRow As dsPinYin.PinYinRow In mRows
            If Array.IndexOf(SplitPinyin(mRow.拼音), pinyin) <> -1 Then
                mResult.Add(mRow.汉字)
            End If
        Next
        Return mResult.ToArray()
    End Function

    ''' <summary>
    ''' Finds the characters that have the given pinyin and returns them as one string.
    ''' </summary>
    ''' <param name="pinyin">The reading to look for.</param>
    ''' <returns>The matching characters concatenated; empty when there are none.</returns>
    Public Function Words(ByVal pinyin As String) As String
        Return String.Concat(WordArray(pinyin))
    End Function

    ''' <summary>
    ''' Returns the code of a Chinese character: the hexadecimal form of the first two bytes
    ''' of the character in the system default encoding.
    ''' </summary>
    ''' <param name="word">A single Chinese character.</param>
    ''' <returns>The code, or Nothing when <paramref name="word"/> is not a single Chinese
    ''' character or the default encoding produces fewer than two bytes for it.</returns>
    Public Shared Function ChineseCode(ByVal word As String) As String
        If Not IsTrue(word) Then Return Nothing
        Dim bytes() As Byte = System.Text.Encoding.Default.GetBytes(word)
        ' A default encoding that cannot represent the character may yield a single byte.
        If bytes.Length < 2 Then Return Nothing
        Return String.Concat(Hex(bytes(0)), Hex(bytes(1)))
    End Function

    ''' <summary>
    ''' Tells whether the argument is exactly one Chinese character (U+4E00 to U+9FA5).
    ''' </summary>
    ''' <param name="word">The text to test.</param>
    ''' <returns>True for a single character in that range; otherwise False.</returns>
    Public Shared Function IsTrue(ByVal word As String) As Boolean
        If word Is Nothing OrElse word.Length <> 1 Then Return False
        Dim mCodePoint As Integer = AscW(word.Chars(0))
        Return mCodePoint >= &H4E00 AndAlso mCodePoint <= &H9FA5
    End Function
End Class
End Namespace
Imports System.Text.RegularExpressions
Namespace Businness.PinYin
''' <summary>
''' Converts Chinese characters to pinyin and looks characters up by pinyin.
''' The data lives in a <c>dsPinYin</c> data set (declared elsewhere in the project) that is
''' either loaded from pinyin.xml or rebuilt from the plain-text library pinyin.txt.
''' </summary>
Public Class PYService
    Private gDataSet As New dsPinYin

    ''' <summary>
    ''' The character table held by this service.
    ''' </summary>
    Public ReadOnly Property PinYinTable() As dsPinYin.PinYinDataTable
        Get
            Return gDataSet.PinYin
        End Get
    End Property

    ' Both files sit directly in the application base directory. Path.Combine is used so the
    ' result is correct whether or not the base path ends with a directory separator.
    Private gTxtFile As String = IO.Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "pinyin.txt")
    Private gxmlFile As String = IO.Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "pinyin.xml")

    ' One library line: a leading run of CJK ideographs (group "Word") followed by the rest of
    ' the line (group "PingYin"). The group names are spelled exactly as Add(line) reads them.
    Private gRegex As New Regex("(?<Word>^[\u4e00-\u9fa5]+)(?<PingYin>.*)")

    ''' <summary>
    ''' Loads the character library from pinyin.xml in the application base directory,
    ''' replacing whatever the data set currently holds.
    ''' </summary>
    ''' <exception cref="IO.FileNotFoundException">pinyin.xml does not exist.</exception>
    Public Sub Load()
        If Not IO.File.Exists(gxmlFile) Then
            ' FileNotFoundException derives from Exception, so existing Catch blocks still work.
            Throw New IO.FileNotFoundException(String.Format(" 文件{0}不存在 ", gxmlFile), gxmlFile)
        End If
        DataSetInitialize()
        gDataSet.ReadXml(gxmlFile)
    End Sub

    ''' <summary>
    ''' Rebuilds the character library from pinyin.txt in the application base directory.
    ''' </summary>
    ''' <exception cref="IO.FileNotFoundException">pinyin.txt does not exist.</exception>
    Public Sub Update()
        If Not IO.File.Exists(gTxtFile) Then
            Throw New IO.FileNotFoundException(String.Format(" 文件{0}不存在 ", gTxtFile), gTxtFile)
        End If
        UpdateFromTxt(gTxtFile)
    End Sub

    ''' <summary>
    ''' Saves the character library to pinyin.xml in the application base directory.
    ''' </summary>
    Public Sub Save()
        gDataSet.WriteXml(gxmlFile)
    End Sub

    ' Empties the data set; called before every load or rebuild.
    Private Sub DataSetInitialize()
        Me.gDataSet.Clear()
        Me.gDataSet.AcceptChanges()
    End Sub

    ' Reads the text library line by line and feeds every line to Add.
    Private Sub UpdateFromTxt(ByVal file As String)
        DataSetInitialize()
        ' Using guarantees the reader is released even when a line fails to import.
        Using mReader As New IO.StreamReader(file, System.Text.Encoding.Default)
            ' ReadLine returns Nothing only at end of file. A blank line is "" and must not
            ' end the import, so the loop tests for Nothing rather than for empty.
            Dim mLine As String = mReader.ReadLine()
            While mLine IsNot Nothing
                Add(mLine)
                mLine = mReader.ReadLine()
            End While
        End Using
        Me.gDataSet.PinYin.AcceptChanges()
    End Sub

    ' Parses one library line and stores it when it describes a single character.
    Private Sub Add(ByVal line As String)
        If line Is Nothing Then Exit Sub
        Dim mMatch As Match = gRegex.Match(line)
        If Not mMatch.Success Then Exit Sub
        ' Only single characters are imported; multi-character words are skipped.
        Dim mWord As String = mMatch.Groups("Word").Value
        If mWord.Length = 1 Then
            Add(mWord, mMatch.Groups("PingYin").Value)
        End If
    End Sub

    ' Stores the readings of one character. Readings are kept in the pinyin column separated
    ' by exactly one space; readings already stored for the character are not duplicated.
    Private Sub Add(ByVal word As String, ByVal py As String)
        Dim mReadings As String() = SplitPinyin(py)
        Dim mCode As String = ChineseCode(word)
        ' No code means the character cannot be keyed, so there is nothing to store.
        If mCode Is Nothing Then Exit Sub

        Dim mRow As dsPinYin.PinYinRow = Me.gDataSet.PinYin.FindBy代码(mCode)
        If mRow Is Nothing Then
            Me.gDataSet.PinYin.AddPinYinRow(word, mCode, String.Join(" ", mReadings))
            Exit Sub
        End If

        ' Compare whole readings, not substrings: "an" must still be added when "ang" exists.
        Dim mKnown As New System.Collections.Generic.List(Of String)(SplitPinyin(mRow.拼音))
        Dim mChanged As Boolean = False
        For Each s As String In mReadings
            If Not mKnown.Contains(s) Then
                mKnown.Add(s)
                mChanged = True
            End If
        Next
        If mChanged Then
            mRow.拼音 = String.Join(" ", mKnown.ToArray())
        End If
    End Sub

    ' Splits a space-separated pinyin list into its readings, ignoring repeated, leading and
    ' trailing whitespace. Returns an empty array for Nothing.
    Private Shared Function SplitPinyin(ByVal value As String) As String()
        If value Is Nothing Then Return New String() {}
        Return value.Trim().Split(New Char() {" "c}, StringSplitOptions.RemoveEmptyEntries)
    End Function

    ''' <summary>
    ''' Converts a string to pinyin. Characters that are not Chinese, or that have no entry in
    ''' the library, are copied through unchanged.
    ''' </summary>
    ''' <param name="line">The text to convert; Nothing or empty yields an empty string.</param>
    ''' <param name="isgetfirst">For a character with several readings, True returns only the
    ''' first one; False returns all readings in parentheses.</param>
    ''' <returns>The converted text.</returns>
    Public Function ToPinyin(ByVal line As String, ByVal isgetfirst As Boolean) As String
        If String.IsNullOrEmpty(line) Then Return String.Empty
        Dim mBuilder As New System.Text.StringBuilder
        For Each c As Char In line
            Dim mChar As String = c.ToString()
            If IsTrue(mChar) Then
                mBuilder.Append(GetPinyin(mChar, isgetfirst))
            Else
                mBuilder.Append(c)
            End If
        Next
        Return mBuilder.ToString()
    End Function

    ' Returns the pinyin of one character, or the character itself when the library has no
    ' readings for it.
    Private Function GetPinyin(ByVal word As String, ByVal isgetfirst As Boolean) As String
        Dim mArray As String() = PinYinArray(ChineseCode(word))
        If mArray Is Nothing OrElse mArray.Length = 0 Then Return word
        If mArray.Length = 1 OrElse isgetfirst Then Return mArray(0)
        ' Several readings: wrap them in parentheses, separated by commas.
        Return String.Format(" ({0}) ", String.Join(" , ", mArray))
    End Function

    ' Returns the readings stored for a character code, or Nothing when the code is unknown.
    Private Function PinYinArray(ByVal code As String) As String()
        If code Is Nothing Then Return Nothing
        Dim mRow As dsPinYin.PinYinRow = Me.gDataSet.PinYin.FindBy代码(code)
        If mRow Is Nothing Then Return Nothing
        Return SplitPinyin(mRow.拼音)
    End Function

    ''' <summary>
    ''' Finds the characters that have the given pinyin as one of their readings.
    ''' </summary>
    ''' <param name="pinyin">The reading to look for; it must match a stored reading exactly.</param>
    ''' <returns>The matching characters; an empty array when there are none.</returns>
    Public Function WordArray(ByVal pinyin As String) As String()
        Dim mResult As New System.Collections.Generic.List(Of String)
        If String.IsNullOrEmpty(pinyin) Then Return mResult.ToArray()
        ' Rows are scanned directly rather than through a filter expression built from the
        ' argument, so quotes or wildcard characters in the argument cannot break the query.
        Dim mRows As dsPinYin.PinYinRow() = CType(Me.gDataSet.PinYin.Select(), dsPinYin.PinYinRow())
        For Each mRow As dsPinYin.PinYinRow In mRows
            If Array.IndexOf(SplitPinyin(mRow.拼音), pinyin) <> -1 Then
                mResult.Add(mRow.汉字)
            End If
        Next
        Return mResult.ToArray()
    End Function

    ''' <summary>
    ''' Finds the characters that have the given pinyin and returns them as one string.
    ''' </summary>
    ''' <param name="pinyin">The reading to look for.</param>
    ''' <returns>The matching characters concatenated; empty when there are none.</returns>
    Public Function Words(ByVal pinyin As String) As String
        Return String.Concat(WordArray(pinyin))
    End Function

    ''' <summary>
    ''' Returns the code of a Chinese character: the hexadecimal form of the first two bytes
    ''' of the character in the system default encoding.
    ''' </summary>
    ''' <param name="word">A single Chinese character.</param>
    ''' <returns>The code, or Nothing when <paramref name="word"/> is not a single Chinese
    ''' character or the default encoding produces fewer than two bytes for it.</returns>
    Public Shared Function ChineseCode(ByVal word As String) As String
        If Not IsTrue(word) Then Return Nothing
        Dim bytes() As Byte = System.Text.Encoding.Default.GetBytes(word)
        ' A default encoding that cannot represent the character may yield a single byte.
        If bytes.Length < 2 Then Return Nothing
        Return String.Concat(Hex(bytes(0)), Hex(bytes(1)))
    End Function

    ''' <summary>
    ''' Tells whether the argument is exactly one Chinese character (U+4E00 to U+9FA5).
    ''' </summary>
    ''' <param name="word">The text to test.</param>
    ''' <returns>True for a single character in that range; otherwise False.</returns>
    Public Shared Function IsTrue(ByVal word As String) As Boolean
        If word Is Nothing OrElse word.Length <> 1 Then Return False
        Dim mCodePoint As Integer = AscW(word.Chars(0))
        Return mCodePoint >= &H4E00 AndAlso mCodePoint <= &H9FA5
    End Function
End Class
End Namespace
效果图: