不同字符集之间的编码转换
对于编码转换,Windows有两个基本的API:WideCharToMultiByte和MultiByteToWideChar。这两个API用起来非常繁琐,不是迫不得已,我可不想动它们。
另外,中文版的Windows提供了一个中文转换工具,可以在GB2312和BIG5之间进行转换,优点是可以把繁体转为简体,缺点是不能批量转换,而且也没有提供命令行模式。
.NET的优点之一就是提供了功能强大的类库,Encoding就是其中之一,用它来做编码转换非常方便。附录提供了一个简单的例子,功能不多,但比较实用:可以批量转换(针对目录),但不能把繁体转为简体。
程序只有两个文件:
CodePage.cs —— 主程序,提供UI
CPConvertor.cs —— 封装了具体的转换功能
代码中也尝试了一下API,但效果并不理想。
顺便提一句,程序中提供了一个类:FolderBrowserDialog,即目录对话框。按理说,C#应该提供一个和OpenFileDialog类似的目录对话框的,但是,我怎么找也没找到。
在Linux下面,有一个编码转换工具:iconv,使用非常简单。
语法:iconv -f encoding -t encoding inputfile
示例:iconv -f big5 -t gb2312 big5.txt > gb2312.txt
如果需要转换当前目录下的所有文件,可以使用下面的代码:
#!/bin/bash
# Convert every regular file below the current directory from Big5 to GB2312,
# in place. A scratch file outside the tree receives iconv's output so that a
# failed conversion never clobbers the original.
tmp=$(mktemp) || exit 1
trap 'rm -f "$tmp"' EXIT

# -type f skips directories; -print0 / read -d '' keeps names with spaces intact.
find . -type f -print0 | while IFS= read -r -d '' f
do
    if iconv -f big5 -t gb2312 "$f" > "$tmp"; then
        # Overwrite the content rather than mv-ing, so the file keeps its permissions.
        cat "$tmp" > "$f"
    else
        echo "skipped (conversion failed): $f" >&2
    fi
done
当然,Linux中也提供了iconv函数,个人认为,还是使用iconv命令方便一些。
附录:
/******************************************************
* CodePage.cs
*******************************************************/
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Windows.Forms.Design;
using System.Data;
using System.Text;
namespace CodePage
{
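/// <summary>
/// Main window of the code page converter: lets the user pick a source and a
/// destination code page, a folder and a file pattern, and starts the batch conversion.
/// </summary>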
public class CodePageConvertor : System.Windows.Forms.Form
{
// Controls of the form; they are created and configured in InitializeComponent.
private System.Windows.Forms.Label m_CodePageLabel;
private System.Windows.Forms.ComboBox m_comboSrcCodePage;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.ComboBox m_comboDestCodePage;
private System.Windows.Forms.Button m_btExit;
private System.Windows.Forms.Button m_btConvert;
private System.Windows.Forms.TextBox m_txtPattern;
private System.Windows.Forms.Label m_labelPattern;
private System.Windows.Forms.TextBox m_txtFolder;
private System.Windows.Forms.CheckBox m_ckSearchSubFolder;
private System.Windows.Forms.Button m_btBrowerFolder;
private System.Windows.Forms.Label m_labelFolder;
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form and fills the source and destination drop-downs with the
/// list returned by CCodePage.GetCodePageList, selecting the first entry in each.
/// </summary>
public CodePageConvertor()
{
    // Required for Windows Forms designer support.
    InitializeComponent();

    CCodePage[] availableCodePages = CCodePage.GetCodePageList();

    // Source first, then destination; both share the same CCodePage instances.
    ComboBox[] codePageCombos = new ComboBox[] { m_comboSrcCodePage, m_comboDestCodePage };
    foreach (ComboBox combo in codePageCombos)
    {
        combo.BeginUpdate();
        combo.Items.Clear();
        combo.Items.AddRange(availableCodePages);
        combo.EndUpdate();
        combo.SelectedIndex = 0;
    }
}
/// <summary>
/// Releases the resources held by the form.
/// </summary>
/// <param name="disposing">When true, the designer component container is disposed as well.</param>
protected override void Dispose( bool disposing )
{
    bool releaseComponents = disposing && components != null;
    if (releaseComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for designer support - do not modify the contents of
/// this method with the code editor.
/// Creates every control, sets its position, size, caption and tab index,
/// wires the Click / CheckedChanged handlers and adds the controls to the form.
/// </summary>
private void InitializeComponent()
{
this.m_comboSrcCodePage = new System.Windows.Forms.ComboBox();
this.m_txtFolder = new System.Windows.Forms.TextBox();
this.m_btBrowerFolder = new System.Windows.Forms.Button();
this.m_CodePageLabel = new System.Windows.Forms.Label();
this.m_labelFolder = new System.Windows.Forms.Label();
this.label1 = new System.Windows.Forms.Label();
this.m_comboDestCodePage = new System.Windows.Forms.ComboBox();
this.m_btExit = new System.Windows.Forms.Button();
this.m_ckSearchSubFolder = new System.Windows.Forms.CheckBox();
this.m_btConvert = new System.Windows.Forms.Button();
this.m_txtPattern = new System.Windows.Forms.TextBox();
this.m_labelPattern = new System.Windows.Forms.Label();
this.SuspendLayout();
//
// m_comboSrcCodePage
//
this.m_comboSrcCodePage.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.m_comboSrcCodePage.Location = new System.Drawing.Point(96, 24);
this.m_comboSrcCodePage.Name = "m_comboSrcCodePage";
this.m_comboSrcCodePage.Size = new System.Drawing.Size(136, 20);
this.m_comboSrcCodePage.TabIndex = 0;
//
// m_txtFolder
//
this.m_txtFolder.Location = new System.Drawing.Point(96, 96);
this.m_txtFolder.Name = "m_txtFolder";
this.m_txtFolder.RightToLeft = System.Windows.Forms.RightToLeft.No;
this.m_txtFolder.Size = new System.Drawing.Size(328, 21);
this.m_txtFolder.TabIndex = 2;
this.m_txtFolder.Text = "";
//
// m_btBrowerFolder
//
this.m_btBrowerFolder.Location = new System.Drawing.Point(424, 96);
this.m_btBrowerFolder.Name = "m_btBrowerFolder";
this.m_btBrowerFolder.Size = new System.Drawing.Size(56, 21);
this.m_btBrowerFolder.TabIndex = 11;
this.m_btBrowerFolder.Text = "Brower";
// NOTE(review): the handler is spelled "Forder" while the control is "Folder"; the names must stay in sync.
this.m_btBrowerFolder.Click += new System.EventHandler(this.m_btBrowerForder_Click);
//
// m_CodePageLabel
//
this.m_CodePageLabel.Location = new System.Drawing.Point(24, 24);
this.m_CodePageLabel.Name = "m_CodePageLabel";
this.m_CodePageLabel.Size = new System.Drawing.Size(72, 20);
this.m_CodePageLabel.TabIndex = 16;
this.m_CodePageLabel.Text = "源代码页:";
this.m_CodePageLabel.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// m_labelFolder
//
this.m_labelFolder.Location = new System.Drawing.Point(24, 96);
this.m_labelFolder.Name = "m_labelFolder";
this.m_labelFolder.Size = new System.Drawing.Size(48, 20);
this.m_labelFolder.TabIndex = 17;
this.m_labelFolder.Text = "目录:";
this.m_labelFolder.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// label1
//
this.label1.Location = new System.Drawing.Point(256, 24);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(80, 20);
this.label1.TabIndex = 20;
this.label1.Text = "目标代码页:";
this.label1.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// m_comboDestCodePage
//
this.m_comboDestCodePage.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.m_comboDestCodePage.Location = new System.Drawing.Point(336, 24);
this.m_comboDestCodePage.Name = "m_comboDestCodePage";
this.m_comboDestCodePage.Size = new System.Drawing.Size(136, 20);
this.m_comboDestCodePage.TabIndex = 19;
//
// m_btExit
//
this.m_btExit.Location = new System.Drawing.Point(280, 144);
this.m_btExit.Name = "m_btExit";
this.m_btExit.Size = new System.Drawing.Size(128, 24);
this.m_btExit.TabIndex = 27;
this.m_btExit.Text = "退 出";
this.m_btExit.Click += new System.EventHandler(this.m_btExit_Click);
//
// m_ckSearchSubFolder
//
this.m_ckSearchSubFolder.Location = new System.Drawing.Point(256, 64);
this.m_ckSearchSubFolder.Name = "m_ckSearchSubFolder";
this.m_ckSearchSubFolder.TabIndex = 33;
this.m_ckSearchSubFolder.Text = "搜索子目录";
this.m_ckSearchSubFolder.CheckedChanged += new System.EventHandler(this.m_ckSearchSubDirectory_CheckedChanged);
//
// m_btConvert
//
this.m_btConvert.Location = new System.Drawing.Point(104, 144);
this.m_btConvert.Name = "m_btConvert";
this.m_btConvert.Size = new System.Drawing.Size(120, 23);
this.m_btConvert.TabIndex = 34;
this.m_btConvert.Text = "转换";
this.m_btConvert.Click += new System.EventHandler(this.m_btConvert_Click);
//
// m_txtPattern
//
this.m_txtPattern.Location = new System.Drawing.Point(96, 64);
this.m_txtPattern.Name = "m_txtPattern";
this.m_txtPattern.TabIndex = 35;
this.m_txtPattern.Text = "";
//
// m_labelPattern
//
this.m_labelPattern.Location = new System.Drawing.Point(24, 64);
this.m_labelPattern.Name = "m_labelPattern";
this.m_labelPattern.Size = new System.Drawing.Size(72, 20);
this.m_labelPattern.TabIndex = 39;
this.m_labelPattern.Text = "文件/类型:";
this.m_labelPattern.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// CodePageConvertor
//
this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
this.ClientSize = new System.Drawing.Size(504, 189);
this.Controls.AddRange(new System.Windows.Forms.Control[] {
this.m_labelPattern,
this.m_txtPattern,
this.m_btConvert,
this.m_ckSearchSubFolder,
this.m_btExit,
this.label1,
this.m_comboDestCodePage,
this.m_labelFolder,
this.m_CodePageLabel,
this.m_btBrowerFolder,
this.m_txtFolder,
this.m_comboSrcCodePage});
this.Name = "CodePageConvertor";
this.Text = "代码页转换器";
this.ResumeLayout(false);
}
#endregion
/// <summary>
/// Application entry point: creates the converter window and runs the
/// message loop on it.
/// </summary>
[STAThread]
static void Main()
{
    CodePageConvertor mainForm = new CodePageConvertor();
    Application.Run(mainForm);
}
/// <summary>
/// Shows an "open file" dialog and returns the chosen file.
/// </summary>
/// <returns>The selected file name, or an empty string when the dialog was not confirmed with OK.</returns>
private string GetFile()
{
    // OpenFileDialog is a component; dispose it once the result has been read.
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "txt files (*.txt)|*.txt|All files (*.*)|*.*";
        openFileDialog.FilterIndex = 2;   // preselect "All files"
        openFileDialog.RestoreDirectory = true;

        if (openFileDialog.ShowDialog() == DialogResult.OK)
        {
            return openFileDialog.FileName;
        }
        return "";
    }
}
/// <summary>
/// Shows the folder picker and returns the chosen directory.
/// </summary>
/// <returns>The selected folder, or an empty string when the dialog was not confirmed with OK.</returns>
private string GetFolder()
{
    FolderBrowserDialog picker = new FolderBrowserDialog("Get Folder ");
    DialogResult answer = picker.ShowDialog();
    return (answer == DialogResult.OK) ? picker.Folder : "";
}
/// <summary>
/// Click handler of the exit button: asks the application to shut down.
/// </summary>
private void m_btExit_Click(object sender, System.EventArgs e)
{
    System.Windows.Forms.Application.Exit();
}
/// <summary>
/// Click handler of the convert button. Reads folder, file pattern, the
/// "search sub folders" flag and both code pages from the form, validates
/// them and runs CPConvertor.ConvertFolder. Problems are reported in a
/// message box instead of surfacing as unhandled exceptions.
/// </summary>
private void m_btConvert_Click(object sender, System.EventArgs e)
{
    string sFolder = m_txtFolder.Text.Trim();
    string sPattern = m_txtPattern.Text.Trim();
    bool bSearchSubFolder = this.m_ckSearchSubFolder.Checked;

    // Directory.GetFiles throws on an empty or missing path, so check up front.
    if (sFolder.Length == 0 || !System.IO.Directory.Exists(sFolder))
    {
        MessageBox.Show(this, "请选择一个存在的目录。", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }
    if (sPattern.Length == 0)
    {
        MessageBox.Show(this, "请输入要转换的文件名或通配符,例如 *.txt。", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    CCodePage srcCodePage = m_comboSrcCodePage.SelectedItem as CCodePage;
    CCodePage destCodePage = m_comboDestCodePage.SelectedItem as CCodePage;
    if (srcCodePage == null || destCodePage == null)
    {
        MessageBox.Show(this, "请选择源代码页和目标代码页。", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    Encoding srcEncoding = srcCodePage.Encoding;
    Encoding destEncoding = destCodePage.Encoding;

    // Files are converted in place; re-encoding with the same encoding gains
    // nothing and can damage bytes that are not valid in that encoding.
    if (srcEncoding.CodePage == destEncoding.CodePage)
    {
        MessageBox.Show(this, "源代码页和目标代码页相同,无需转换。", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    string sError = null;
    try
    {
        CPConvertor.ConvertFolder( sFolder, sPattern, bSearchSubFolder, srcEncoding, destEncoding );
    }
    catch (System.IO.IOException ex)
    {
        sError = ex.Message;
    }
    catch (UnauthorizedAccessException ex)
    {
        sError = ex.Message;
    }
    catch (ArgumentException ex)
    {
        // e.g. a search pattern containing invalid characters
        sError = ex.Message;
    }

    if (sError != null)
    {
        MessageBox.Show(this, sError, this.Text, MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// Placeholder TextChanged handler for the folder text box; it does nothing.
/// NOTE(review): InitializeComponent does not subscribe this handler - confirm it is still needed.
/// </summary>
private void m_txtFolder_TextChanged(object sender, System.EventArgs e)
{
}
/// <summary>
/// Click handler of the browse button: lets the user pick a folder and puts
/// it into the folder text box. A cancelled dialog leaves the current text
/// untouched instead of clearing it.
/// </summary>
private void m_btBrowerForder_Click(object sender, System.EventArgs e)
{
    string sFolder = GetFolder();

    // GetFolder returns "" when the dialog was cancelled.
    if (sFolder != null && sFolder.Length > 0)
    {
        m_txtFolder.Text = sFolder;
    }
}
/// <summary>
/// CheckedChanged handler of the "search sub folders" check box; it does
/// nothing - the check box state is read when the convert button is clicked.
/// </summary>
private void m_ckSearchSubDirectory_CheckedChanged(object sender, System.EventArgs e)
{
}
}
/// <summary>
/// Minimal "choose a folder" dialog. It derives from FolderNameEditor only to
/// reach that class's nested FolderBrowser component, which does the real work.
/// </summary>
public class FolderBrowserDialog : System.Windows.Forms.Design.FolderNameEditor
{
    /// <summary>The wrapped folder browser component.</summary>
    protected FolderNameEditor.FolderBrowser folderDlg;

    /// <summary>
    /// Creates the dialog with the RestrictToFilesystem style.
    /// </summary>
    /// <param name="description">Text assigned to the browser's Description property.</param>
    public FolderBrowserDialog(string description)
    {
        FolderNameEditor.FolderBrowser browser = new FolderNameEditor.FolderBrowser();
        browser.Style = FolderBrowserStyles.RestrictToFilesystem;
        browser.Description = description;
        folderDlg = browser;
    }

    /// <summary>Shows the dialog and returns the result of the wrapped browser.</summary>
    public DialogResult ShowDialog()
    {
        DialogResult result = folderDlg.ShowDialog();
        return result;
    }

    /// <summary>Directory path reported by the wrapped browser.</summary>
    public string Folder
    {
        get { return folderDlg.DirectoryPath; }
    }
}
}
/**********************************************************
* CPConvertor.cs
***********************************************************/
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Windows.Forms;
namespace CodePage
{
/// <summary>
/// One entry of the code page drop-downs: the code page number, a
/// human-readable name and the Encoding that is used for the conversion.
/// </summary>
public class CCodePage
{
    // Encoding.WindowsCodePage reports 1200 (UTF-16) for the Unicode encodings
    // such as UTF-8; it is not a usable replacement for them.
    private const int UnicodeWindowsCodePage = 1200;

    private int m_CodePage;
    private string m_Name;
    private string m_DisplayName;
    private Encoding m_Encoding;

    /// <summary>
    /// Creates an entry for the given code page.
    /// </summary>
    /// <param name="codePage">Code page number passed to Encoding.GetEncoding.</param>
    /// <param name="name">Human-readable name; shown after the number in the display name.</param>
    public CCodePage(int codePage,string name)
    {
        m_CodePage = codePage;
        m_Name = name;
        m_DisplayName = codePage + " " + name;
        m_Encoding = Encoding.GetEncoding(codePage);

        // Work-around from the original author for Portuguese text: converting
        // with code page 860 gave wrong results, while its Windows equivalent
        // (Encoding(860).WindowsCodePage == 1252) gave correct ones. So when
        // the two differ, the Windows code page is used for the conversion.
        // Unicode encodings are excluded: for them WindowsCodePage is 1200, and
        // substituting it would silently turn "UTF-8" into UTF-16.
        int windowsCodePage = m_Encoding.WindowsCodePage;
        if( windowsCodePage != m_CodePage && windowsCodePage != UnicodeWindowsCodePage )
        {
            m_Encoding = Encoding.GetEncoding( windowsCodePage );
        }
    }

    /// <summary>Code page number this entry was created with (not necessarily Encoding.CodePage).</summary>
    public int CodePage
    {
        get { return m_CodePage; }
        set { m_CodePage = value; }
    }

    /// <summary>Human-readable name of the code page.</summary>
    public string Name
    {
        get { return m_Name; }
        set { m_Name = value; }
    }

    /// <summary>Text shown in the drop-down: code page number, a space, then the name.</summary>
    public string DisplayName
    {
        get { return m_DisplayName; }
        set { m_DisplayName = value; }
    }

    /// <summary>Encoding actually used when converting with this entry.</summary>
    public Encoding Encoding
    {
        get { return m_Encoding; }
        set { m_Encoding = value; }
    }

    /// <summary>Returns the display name so that list controls show it.</summary>
    public override string ToString()
    {
        return m_DisplayName;
    }

    /// <summary>
    /// Builds the list of code pages offered by the UI.
    /// </summary>
    /// <returns>A new array with one entry per supported code page.</returns>
    public static CCodePage[] GetCodePageList()
    {
        // Further OEM code pages that could be added the same way:
        // 708 Arabic, 737 Greek, 775 Baltic, 850 International, 852 Slavic,
        // 855 Cyrillic, 857 Turkish, 861 Icelandic, 862 Hebrew,
        // 863 Canadian French, 864 Arabic, 865 Nordic, 866 Russian, 874 Thai.
        CCodePage[] codePageList = new CCodePage[]
        {
            new CCodePage(65001,"UTF-8"),
            new CCodePage(437,"默认:美国"),
            new CCodePage(936,"中文(简体)"),
            new CCodePage(950,"中文(台、港繁体)"),
            new CCodePage(949,"朝鲜语"),
            new CCodePage(860,"葡萄牙语"),
            new CCodePage(932,"日语"),
        };
        return codePageList;
    }
}
/// <summary>
/// Static helpers that re-encode files from one code page to another, either
/// with System.Text.Encoding (ConvertFile / ConvertFolder) or with the Win32
/// functions MultiByteToWideChar / WideCharToMultiByte (ConvertFileByAPI).
/// </summary>
public class CPConvertor
{
    // Null-terminated "?" used as replacement for characters the target code page cannot represent.
    private static readonly byte[] s_DefaultChar = new byte[] { 0x3F, 0x00 };

    /// <summary>
    /// Writes a buffer to a file, creating the file or overwriting an existing one.
    /// </summary>
    /// <param name="buf">Bytes to write.</param>
    /// <param name="filename">Path of the file to create.</param>
    /// <exception cref="ArgumentNullException">buf is null; the file is not touched in that case.</exception>
    public static void Save(byte[] buf,string filename)
    {
        // Check before opening: FileMode.Create truncates an existing file at once.
        if( buf == null )
            throw new ArgumentNullException("buf");

        using( FileStream fs = new FileStream(filename, FileMode.Create, FileAccess.Write) )
        {
            fs.Write(buf, 0, buf.Length);
        }
    }

    /// <summary>
    /// Reads a whole file into memory.
    /// </summary>
    /// <param name="filename">Path of the file to read.</param>
    /// <returns>The content of the file.</returns>
    /// <exception cref="IOException">The file is too large for a single byte array.</exception>
    public static byte[] Read(string filename)
    {
        using( FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read) )
        {
            if( fs.Length > int.MaxValue )
                throw new IOException("File is too large to convert in memory: " + filename);

            using( BinaryReader br = new BinaryReader(fs) )
            {
                return br.ReadBytes((int)fs.Length);
            }
        }
    }

    /// <summary>
    /// Converts, in place, every file in a directory that matches a search pattern.
    /// </summary>
    /// <param name="sDir">Directory to process.</param>
    /// <param name="sPattern">Search pattern passed to Directory.GetFiles.</param>
    /// <param name="bSearchSubdir">When true, sub directories are processed recursively with the same settings.</param>
    /// <param name="srcEncoding">Encoding the files currently use.</param>
    /// <param name="destEncoding">Encoding the files are rewritten in.</param>
    public static void ConvertFolder(string sDir, string sPattern, bool bSearchSubdir, Encoding srcEncoding, Encoding destEncoding )
    {
        string[] sFiles = Directory.GetFiles( sDir, sPattern );
        foreach( string sFile in sFiles )
        {
            // Source and destination are the same path: the file is overwritten.
            ConvertFile( sFile, srcEncoding, sFile, destEncoding );
        }

        if( bSearchSubdir )
        {
            string[] sDirs = Directory.GetDirectories( sDir );
            foreach( string sSubdir in sDirs )
            {
                ConvertFolder( sSubdir, sPattern, bSearchSubdir, srcEncoding, destEncoding );
            }
        }
    }

    /// <summary>
    /// Re-encodes one file with System.Text.Encoding. The source is read
    /// completely before the destination is written, so both paths may be equal.
    /// A byte order mark at the start of the source is not converted as text:
    /// it is replaced by the destination encoding's own preamble, or dropped
    /// when the destination encoding has none.
    /// </summary>
    /// <param name="sSrcFile">File to read.</param>
    /// <param name="srcEncoding">Encoding of the source file.</param>
    /// <param name="sDestFile">File to write.</param>
    /// <param name="destEncoding">Encoding of the destination file.</param>
    /// <returns>Always true; failures are reported through exceptions.</returns>
    public static bool ConvertFile(string sSrcFile, Encoding srcEncoding, string sDestFile, Encoding destEncoding )
    {
        byte[] srcBuffer = Read(sSrcFile);

        byte[] srcPreamble = srcEncoding.GetPreamble();
        int skip = StartsWith( srcBuffer, srcPreamble ) ? srcPreamble.Length : 0;

        byte[] converted = Encoding.Convert( srcEncoding, destEncoding, srcBuffer, skip, srcBuffer.Length - skip );

        byte[] destBuffer = converted;
        if( skip > 0 )
        {
            // The source carried a byte order mark: keep one in the output
            // where the destination encoding defines it.
            byte[] destPreamble = destEncoding.GetPreamble();
            if( destPreamble != null && destPreamble.Length > 0 )
            {
                destBuffer = new byte[destPreamble.Length + converted.Length];
                Buffer.BlockCopy( destPreamble, 0, destBuffer, 0, destPreamble.Length );
                Buffer.BlockCopy( converted, 0, destBuffer, destPreamble.Length, converted.Length );
            }
        }

        Save( destBuffer, sDestFile );
        return true;
    }

    /// <summary>
    /// Re-encodes one file with the Win32 conversion functions: the source
    /// bytes are first mapped to UTF-16 with the source code page and then
    /// mapped to the destination code page. This also covers UTF-8 on either
    /// side, because CP_UTF8 is an ordinary code page for both functions.
    /// Unlike ConvertFile, a byte order mark is not treated specially.
    /// </summary>
    /// <param name="sSrcFile">File to read.</param>
    /// <param name="srcCodePage">Code page of the source file.</param>
    /// <param name="sDestFile">File to write.</param>
    /// <param name="destCodePage">Code page of the destination file.</param>
    /// <returns>True when the file was converted and written; false when a
    /// conversion step failed, in which case the destination is not touched.</returns>
    public static bool ConvertFileByAPI(string sSrcFile,int srcCodePage, string sDestFile, int destCodePage )
    {
        byte[] inBuffer = Read( sSrcFile );

        byte[] wideBuffer = M2W( inBuffer, srcCodePage );
        if( wideBuffer == null )
            return false;

        byte[] outBuffer = W2M( wideBuffer, destCodePage );
        if( outBuffer == null )
            return false;

        Save( outBuffer, sDestFile );
        return true;
    }

    //#define CP_ACP 0 // default to ANSI code page
    //#define CP_OEMCP 1 // default to OEM code page
    //#define CP_MACCP 2 // default to MAC code page
    //#define CP_THREAD_ACP 3 // current thread's ANSI code page
    //#define CP_SYMBOL 42 // SYMBOL translations
    //
    //#define CP_UTF7 65000 // UTF-7 translation
    //#define CP_UTF8 65001 // UTF-8 translation
    /// <summary>Special code page identifiers, mirroring the Win32 constants listed above.</summary>
    public enum EnumCodePage
    {
        CP_ACP,
        CP_OEMCP,
        CP_MACCP,
        CP_THREAD_ACP,
        CP_SYMBOL = 42,
        CP_UTF7 = 65000,
        CP_UTF8 = 65001
    }

    // Public declaration kept unchanged for existing callers.
    // NOTE(review): lpDefaultChar and lpUsedDefaultChar are pointers in the
    // native signature; the string/bool parameters below only approximate
    // them. W2M uses the private declaration further down instead.
    [DllImport("kernel32.dll",CharSet=CharSet.Auto, ExactSpelling=true)]
    static extern public int WideCharToMultiByte(
        int CodePage, // code page
        int dwFlags, // performance and mapping flags
        byte[] lpWideCharStr, // address of wide-character string
        int cchWideChar, // number of characters in string
        byte[] lpMultiByteStr, // address of buffer for new string
        int cchMultiByte, // size of buffer
        string lpDefaultChar, // address of default for unmappable
        // characters
        bool lpUsedDefaultChar // address of flag set when default
        // char. used
        );

    // Public declaration kept unchanged for existing callers.
    [DllImport("kernel32.dll",CharSet=CharSet.Auto, ExactSpelling=true)]
    static extern public int MultiByteToWideChar(
        int CodePage, // code page
        int dwFlags, // character-type options
        byte[] lpMultiByteStr, // address of string to map
        int cchMultiByte, // number of bytes in string
        byte[] lpWideCharStr, // address of wide-character buffer
        int cchWideChar // size of buffer
        );

    // Same native function as the public WideCharToMultiByte, with the two
    // trailing pointer parameters declared as pointers so that they can be NULL.
    [DllImport("kernel32.dll", EntryPoint="WideCharToMultiByte", ExactSpelling=true)]
    private static extern int NativeWideCharToMultiByte(
        int codePage,
        int flags,
        byte[] wideChars,       // UTF-16 text
        int wideCharCount,      // length of wideChars in characters
        byte[] multiBytes,      // output buffer, null when only the size is requested
        int multiByteCapacity,  // size of multiBytes in bytes
        byte[] defaultChar,     // null-terminated replacement character, or null
        IntPtr usedDefaultChar  // not used, always IntPtr.Zero
        );

    [DllImport("kernel32.dll", EntryPoint="MultiByteToWideChar", ExactSpelling=true)]
    private static extern int NativeMultiByteToWideChar(
        int codePage,
        int flags,
        byte[] multiBytes,      // text in the given code page
        int multiByteCount,     // length of multiBytes in bytes
        byte[] wideChars,       // output buffer, null when only the size is requested
        int wideCharCapacity    // size of wideChars in characters
        );

    /// <summary>
    /// Converts UTF-16 (little endian) text to a multi-byte code page with WideCharToMultiByte.
    /// </summary>
    /// <param name="inBuffer">UTF-16 text as raw bytes; an odd trailing byte is ignored.</param>
    /// <param name="codepage">Target code page.</param>
    /// <returns>The converted bytes (empty for empty input), or null when
    /// inBuffer is null or the native conversion fails or is unavailable.</returns>
    public static byte[] W2M( byte[] inBuffer, int codepage )
    {
        if( inBuffer == null )
            return null;

        // The API counts characters, not bytes.
        int charCount = inBuffer.Length / 2;
        if( charCount == 0 )
            return new byte[0];

        // WideCharToMultiByte requires a NULL default character for UTF-7 and UTF-8.
        bool isUtf = codepage == (int)EnumCodePage.CP_UTF7 || codepage == (int)EnumCodePage.CP_UTF8;
        byte[] defaultChar = isUtf ? null : s_DefaultChar;

        try
        {
            // First call: query the required buffer size in bytes.
            int bufferlength = NativeWideCharToMultiByte(codepage, 0, inBuffer, charCount, null, 0, defaultChar, IntPtr.Zero);
            if( bufferlength <= 0 )
                return null;

            byte[] outBuffer = new byte[bufferlength];
            int written = NativeWideCharToMultiByte(codepage, 0, inBuffer, charCount, outBuffer, bufferlength, defaultChar, IntPtr.Zero);
            return written > 0 ? outBuffer : null;
        }
        catch( TypeLoadException e )
        {
            // kernel32.dll or the entry point is not available on this platform.
            ReportFailure( e );
            return null;
        }
    }

    /// <summary>
    /// Converts multi-byte text to UTF-16 (little endian) with MultiByteToWideChar.
    /// </summary>
    /// <param name="inBuffer">Text in the given code page.</param>
    /// <param name="codepage">Code page of inBuffer.</param>
    /// <returns>The UTF-16 text as raw bytes (empty for empty input), or null
    /// when inBuffer is null or the native conversion fails or is unavailable.</returns>
    public static byte[] M2W( byte[] inBuffer, int codepage )
    {
        if( inBuffer == null )
            return null;
        if( inBuffer.Length == 0 )
            return new byte[0];

        try
        {
            // First call: query the required size in characters.
            int contentlength = NativeMultiByteToWideChar(codepage, 0, inBuffer, inBuffer.Length, null, 0);
            if( contentlength <= 0 )
                return null;

            // Two bytes per UTF-16 code unit; the capacity passed to the API is in characters.
            byte[] outBuffer = new byte[contentlength * 2];
            int written = NativeMultiByteToWideChar(codepage, 0, inBuffer, inBuffer.Length, outBuffer, contentlength);
            return written > 0 ? outBuffer : null;
        }
        catch( TypeLoadException e )
        {
            // kernel32.dll or the entry point is not available on this platform.
            ReportFailure( e );
            return null;
        }
    }

    // Returns true when data begins with the (non-empty) byte sequence prefix.
    private static bool StartsWith( byte[] data, byte[] prefix )
    {
        if( prefix == null || prefix.Length == 0 || data.Length < prefix.Length )
            return false;

        for( int i = 0; i < prefix.Length; i++ )
        {
            if( data[i] != prefix[i] )
                return false;
        }
        return true;
    }

    // Writes a failure to the debug output (debug builds only).
    private static void ReportFailure( Exception e )
    {
        System.Diagnostics.Debug.WriteLine( e.ToString() );
    }
}
}