不同字符集之间的编码转换

不同字符集之间的编码转换

  对于编码转换,Windows有两个基本的API,WideCharToMultiByte和MultiByteToWideChar。这两个API用起来非常烦琐,不是迫不得已,我可不想动它。
  另外,中文版的Windows提供了一个中文转换工具,可以在GB2312和BIG5之间进行转换,优点是可以把繁体转为简体,缺点是不能批量转换,而且也没有提供命令行模式。

  .NET的优点之一就是提供了功能强大的类库,Encoding就是其中之一,用它来做编码转换就非常方便。附录提供了一个简单的例子,功能不多,但比较实用。可以批量转换(针对目录),但不能把繁体转为简体。
  程序只有两个文件:
  CodePage.cs —— 主程序,提供UI
  CPConvertor.cs —— 封装了具体的转换功能
  代码中也尝试了一下API,但效果并不理想。  
  顺便提一句,程序中提供了一个类:FolderBrowserDialog,即目录对话框。按理说,C#应该提供一个和OpenFileDialog类似的目录对话框的,但是,我怎么找也没找到(注:从 .NET Framework 1.1 起,System.Windows.Forms 命名空间已自带 FolderBrowserDialog 类,可直接使用)。
 
  在Linux下面,有一个编码转换工具:iconv,使用非常简单。
  语法:iconv -f encoding -t encoding inputfile
  示例:iconv -f big5 -t gb2312 big5.txt > gb2312.txt

  如果需要转换当前目录下的所有文件,可以使用下面的代码:

#!/bin/bash
# Convert every regular file below the current directory from Big5 to GB2312,
# in place. Directories are skipped and file names containing spaces or
# newlines are handled correctly (NUL-delimited find output).
find . -type f -print0 | while IFS= read -r -d '' f
do
    # Work in a temp file outside the tree so it is never picked up by find.
    tmp=$(mktemp) || exit 1
    # Overwrite the original only when iconv succeeded; a failed conversion
    # (e.g. a binary file) must not clobber the source.
    if iconv -f big5 -t gb2312 "$f" > "$tmp"; then
        # cat instead of mv keeps the original file's permissions and owner.
        cat "$tmp" > "$f"
    else
        echo "skipped (conversion failed): $f" >&2
    fi
    rm -f "$tmp"
done

  当然,Linux中也提供了iconv函数,个人认为,还是使用iconv命令方便一些。 


附录:

/******************************************************
* CodePage.cs
*******************************************************/
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Windows.Forms.Design;
using System.Data;
using System.Text;

namespace CodePage
{
 /// <summary>
 /// Form1 的摘要说明。
 /// </summary>
 public class CodePageConvertor : System.Windows.Forms.Form
 {
  private System.Windows.Forms.Label m_CodePageLabel;
  private System.Windows.Forms.ComboBox m_comboSrcCodePage;
  private System.Windows.Forms.Label label1;
  private System.Windows.Forms.ComboBox m_comboDestCodePage;
  private System.Windows.Forms.Button m_btExit;
  private System.Windows.Forms.Button m_btConvert;
  private System.Windows.Forms.TextBox m_txtPattern;
  private System.Windows.Forms.Label m_labelPattern;
  private System.Windows.Forms.TextBox m_txtFolder;
  private System.Windows.Forms.CheckBox m_ckSearchSubFolder;
  private System.Windows.Forms.Button m_btBrowerFolder;
  private System.Windows.Forms.Label m_labelFolder;
  /// <summary>
  /// 必需的设计器变量。
  /// </summary>
  private System.ComponentModel.Container components = null;

  public CodePageConvertor()
  {
   //
   // Windows 窗体设计器支持所必需的
   //
   InitializeComponent();

   //
   // TODO: 在 InitializeComponent 调用后添加任何构造函数代码
   //
   CCodePage[] codePageList = CCodePage.GetCodePageList();
   m_comboSrcCodePage.BeginUpdate();
   m_comboSrcCodePage.Items.Clear();
   m_comboSrcCodePage.Items.AddRange(codePageList);
   m_comboSrcCodePage.EndUpdate();
   m_comboSrcCodePage.SelectedIndex = 0;
   m_comboDestCodePage.BeginUpdate();
   m_comboDestCodePage.Items.Clear();
   m_comboDestCodePage.Items.AddRange(codePageList);
   m_comboDestCodePage.EndUpdate();
   m_comboDestCodePage.SelectedIndex = 0;
  }

  /// <summary>
  /// 清理所有正在使用的资源。
  /// </summary>
  protected override void Dispose( bool disposing )
  {
   if( disposing )
   {
    if (components != null)
    {
     components.Dispose();
    }
   }
   base.Dispose( disposing );
  }

  #region Windows Form Designer generated code
  /// <summary>
  /// 设计器支持所需的方法 - 不要使用代码编辑器修改
  /// 此方法的内容。
  /// </summary>
  private void InitializeComponent()
  {
   this.m_comboSrcCodePage = new System.Windows.Forms.ComboBox();
   this.m_txtFolder = new System.Windows.Forms.TextBox();
   this.m_btBrowerFolder = new System.Windows.Forms.Button();
   this.m_CodePageLabel = new System.Windows.Forms.Label();
   this.m_labelFolder = new System.Windows.Forms.Label();
   this.label1 = new System.Windows.Forms.Label();
   this.m_comboDestCodePage = new System.Windows.Forms.ComboBox();
   this.m_btExit = new System.Windows.Forms.Button();
   this.m_ckSearchSubFolder = new System.Windows.Forms.CheckBox();
   this.m_btConvert = new System.Windows.Forms.Button();
   this.m_txtPattern = new System.Windows.Forms.TextBox();
   this.m_labelPattern = new System.Windows.Forms.Label();
   this.SuspendLayout();
   //
   // m_comboSrcCodePage
   //
   this.m_comboSrcCodePage.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
   this.m_comboSrcCodePage.Location = new System.Drawing.Point(96, 24);
   this.m_comboSrcCodePage.Name = "m_comboSrcCodePage";
   this.m_comboSrcCodePage.Size = new System.Drawing.Size(136, 20);
   this.m_comboSrcCodePage.TabIndex = 0;
   //
   // m_txtFolder
   //
   this.m_txtFolder.Location = new System.Drawing.Point(96, 96);
   this.m_txtFolder.Name = "m_txtFolder";
   this.m_txtFolder.RightToLeft = System.Windows.Forms.RightToLeft.No;
   this.m_txtFolder.Size = new System.Drawing.Size(328, 21);
   this.m_txtFolder.TabIndex = 2;
   this.m_txtFolder.Text = "";
   //
   // m_btBrowerFolder
   //
   this.m_btBrowerFolder.Location = new System.Drawing.Point(424, 96);
   this.m_btBrowerFolder.Name = "m_btBrowerFolder";
   this.m_btBrowerFolder.Size = new System.Drawing.Size(56, 21);
   this.m_btBrowerFolder.TabIndex = 11;
   this.m_btBrowerFolder.Text = "Brower";
   this.m_btBrowerFolder.Click += new System.EventHandler(this.m_btBrowerForder_Click);
   //
   // m_CodePageLabel
   //
   this.m_CodePageLabel.Location = new System.Drawing.Point(24, 24);
   this.m_CodePageLabel.Name = "m_CodePageLabel";
   this.m_CodePageLabel.Size = new System.Drawing.Size(72, 20);
   this.m_CodePageLabel.TabIndex = 16;
   this.m_CodePageLabel.Text = "源代码页:";
   this.m_CodePageLabel.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
   //
   // m_labelFolder
   //
   this.m_labelFolder.Location = new System.Drawing.Point(24, 96);
   this.m_labelFolder.Name = "m_labelFolder";
   this.m_labelFolder.Size = new System.Drawing.Size(48, 20);
   this.m_labelFolder.TabIndex = 17;
   this.m_labelFolder.Text = "目录:";
   this.m_labelFolder.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
   //
   // label1
   //
   this.label1.Location = new System.Drawing.Point(256, 24);
   this.label1.Name = "label1";
   this.label1.Size = new System.Drawing.Size(80, 20);
   this.label1.TabIndex = 20;
   this.label1.Text = "目标代码页:";
   this.label1.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
   //
   // m_comboDestCodePage
   //
   this.m_comboDestCodePage.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
   this.m_comboDestCodePage.Location = new System.Drawing.Point(336, 24);
   this.m_comboDestCodePage.Name = "m_comboDestCodePage";
   this.m_comboDestCodePage.Size = new System.Drawing.Size(136, 20);
   this.m_comboDestCodePage.TabIndex = 19;
   //
   // m_btExit
   //
   this.m_btExit.Location = new System.Drawing.Point(280, 144);
   this.m_btExit.Name = "m_btExit";
   this.m_btExit.Size = new System.Drawing.Size(128, 24);
   this.m_btExit.TabIndex = 27;
   this.m_btExit.Text = "退     出";
   this.m_btExit.Click += new System.EventHandler(this.m_btExit_Click);
   //
   // m_ckSearchSubFolder
   //
   this.m_ckSearchSubFolder.Location = new System.Drawing.Point(256, 64);
   this.m_ckSearchSubFolder.Name = "m_ckSearchSubFolder";
   this.m_ckSearchSubFolder.TabIndex = 33;
   this.m_ckSearchSubFolder.Text = "搜索子目录";
   this.m_ckSearchSubFolder.CheckedChanged += new System.EventHandler(this.m_ckSearchSubDirectory_CheckedChanged);
   //
   // m_btConvert
   //
   this.m_btConvert.Location = new System.Drawing.Point(104, 144);
   this.m_btConvert.Name = "m_btConvert";
   this.m_btConvert.Size = new System.Drawing.Size(120, 23);
   this.m_btConvert.TabIndex = 34;
   this.m_btConvert.Text = "转换";
   this.m_btConvert.Click += new System.EventHandler(this.m_btConvert_Click);
   //
   // m_txtPattern
   //
   this.m_txtPattern.Location = new System.Drawing.Point(96, 64);
   this.m_txtPattern.Name = "m_txtPattern";
   this.m_txtPattern.TabIndex = 35;
   this.m_txtPattern.Text = "";
   //
   // m_labelPattern
   //
   this.m_labelPattern.Location = new System.Drawing.Point(24, 64);
   this.m_labelPattern.Name = "m_labelPattern";
   this.m_labelPattern.Size = new System.Drawing.Size(72, 20);
   this.m_labelPattern.TabIndex = 39;
   this.m_labelPattern.Text = "文件/类型:";
   this.m_labelPattern.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
   //
   // CodePageConvertor
   //
   this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
   this.ClientSize = new System.Drawing.Size(504, 189);
   this.Controls.AddRange(new System.Windows.Forms.Control[] {
                    this.m_labelPattern,
                    this.m_txtPattern,
                    this.m_btConvert,
                    this.m_ckSearchSubFolder,
                    this.m_btExit,
                    this.label1,
                    this.m_comboDestCodePage,
                    this.m_labelFolder,
                    this.m_CodePageLabel,
                    this.m_btBrowerFolder,
                    this.m_txtFolder,
                    this.m_comboSrcCodePage});
   this.Name = "CodePageConvertor";
   this.Text = "代码页转换器";
   this.ResumeLayout(false);

  }
  #endregion

  /// <summary>
  /// 应用程序的主入口点。
  /// </summary>
  [STAThread]
  static void Main()
  {
   Application.Run(new CodePageConvertor());
  }

  private string GetFile()
  {
   OpenFileDialog openFileDialog =new OpenFileDialog();
   //openFileDialog.InitialFolder = "c://" ;
   openFileDialog.Filter = "txt files (*.txt)|*.txt|All files (*.*)|*.*" ;
   openFileDialog.FilterIndex = 2 ;
   openFileDialog.RestoreDirectory = true ;
   if(openFileDialog.ShowDialog() == DialogResult.OK)
   {
    return openFileDialog.FileName;
   }
   else
    return "";
  }

  private string GetFolder()
  {
   FolderBrowserDialog folder = new FolderBrowserDialog("Get Folder ");
   if(folder.ShowDialog() == DialogResult.OK)
    return folder.Folder;
   else
    return "";
  }

  private void m_btExit_Click(object sender, System.EventArgs e)
  {
   Application.Exit();
  }


  private void m_btConvert_Click(object sender, System.EventArgs e)
  {
   string sFolder = m_txtFolder.Text;
   string sPattern = m_txtPattern.Text;
   bool   bSearchSubFolder = this.m_ckSearchSubFolder.Checked;
   Encoding srcEncoding = ((CCodePage)m_comboSrcCodePage.SelectedItem).Encoding;
   Encoding destEncoding = ((CCodePage)m_comboDestCodePage.SelectedItem).Encoding;
   CPConvertor.ConvertFolder( sFolder, sPattern, bSearchSubFolder, srcEncoding, destEncoding );
  }

  private void m_txtFolder_TextChanged(object sender, System.EventArgs e)
  {
  
  }

  private void m_btBrowerForder_Click(object sender, System.EventArgs e)
  {
   m_txtFolder.Text = GetFolder();  
  }

  private void m_ckSearchSubDirectory_CheckedChanged(object sender, System.EventArgs e)
  {
  
  }

 }

 public class FolderBrowserDialog : System.Windows.Forms.Design.FolderNameEditor
 {
  protected FolderNameEditor.FolderBrowser folderDlg;

  public FolderBrowserDialog(string description)
  {
   folderDlg = new FolderNameEditor.FolderBrowser();
   //folderDlg.StartLocation = FolderBrowserFolder.MyDocuments;
   folderDlg.Style = FolderBrowserStyles.RestrictToFilesystem;
   //FolderBrowserStyles.BrowseForEverything;
   //FolderBrowserStyles.BrowseForComputer|
   //FolderBrowserStyles.RestrictToDomain|
   //FolderBrowserStyles.RestrictToFilesystem|
   //FolderBrowserStyles.RestrictToSubfolders|
   //FolderBrowserStyles.ShowTextBox;
   folderDlg.Description = description;
  }

  public DialogResult ShowDialog()
  {
   return folderDlg.ShowDialog();
  }

  public string Folder
  {
   get
   {
    return folderDlg.DirectoryPath;
   }
  }
 } 
}


/**********************************************************
*  CPConvertor.cs
***********************************************************/
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Windows.Forms;

namespace CodePage
{
 public class CCodePage
 {
  private int m_CodePage;
  private string m_Name;
  private string m_DisplayName;
  private Encoding m_Encoding;

  public CCodePage(int codePage,string name)
  {
   m_CodePage = codePage;
   m_Name = name;
   m_DisplayName = codePage + " " + name;
   m_Encoding = Encoding.GetEncoding(codePage);
   // 使用Encoding.WindowsCodePage重新构造一个Encoding是为了解决葡萄牙文转换的问题
   // 对于葡萄牙文,使用860代码页,结果总是不对,
   // 实际发现Encoding(860).WindowsCodePage = 1252
   // 而使用1252进行转换,结果就是对的,所以做了个特殊处理:
   // 如果Encoding的codepage和WindowsCodePage不一致,就用WindowsCodePage重新构造一个Encoding
   if( m_CodePage != m_Encoding.WindowsCodePage )
    m_Encoding = Encoding.GetEncoding( m_Encoding.WindowsCodePage );
  }

  public int CodePage
  {
   get { return m_CodePage; }
   set { m_CodePage = value; }
  }
  public string Name
  {
   get { return m_Name; }
   set { m_Name = value; }
  }
  public string DisplayName
  {
   get { return m_DisplayName; }
   set { m_DisplayName = value; }
  }

  public Encoding Encoding
  {
   get { return m_Encoding; }
   set { m_Encoding = value; }
  }


  public override string ToString()
  {
   return m_DisplayName;
  }

  public static CCodePage[] GetCodePageList()
  {
   CCodePage[] codePageList = new CCodePage[]
    {
     new CCodePage(65001,"UTF-8"),
     new CCodePage(437,"默认:美国"),
     new CCodePage(936,"中文(简体)"),
     new CCodePage(950,"中文(台、港繁体)"),
     new CCodePage(949,"朝鲜语"),
     new CCodePage(860,"葡萄牙语"),
     new CCodePage(932,"日语"),
//     new CCodePage(708,"阿拉伯代码页"),
//     new CCodePage(737,"希腊"),
//     new CCodePage(775,"波罗的"),
//     new CCodePage(850,"国际"),
//     new CCodePage(852,"Slavic"),
//     new CCodePage(855,"西里尔语"),
//     new CCodePage(857,"土耳其语"),
//     new CCodePage(861,"冰岛语"),
//     new CCodePage(862,"希伯来语"),
//     new CCodePage(863,"加拿大法语"),
//     new CCodePage(864,"阿拉伯语"),
//     new CCodePage(865,"挪威/丹麦语"),
//     new CCodePage(866,"俄语"),
//     new CCodePage(874,"泰语")
    };
   return codePageList;

  }
 }
 /// <summary>
 /// CPConvertor 的摘要说明。
 /// </summary>
 public class CPConvertor
 {
  public static void Save(byte[] buf,string filename)
  {
   FileStream fs=new FileStream(filename , FileMode.Create, FileAccess.Write);
   BinaryWriter bw = new BinaryWriter(fs);
   bw.Write(buf);
   bw.Close();
   fs.Close();
  }
 
  public static byte[] Read(string filename)
  {
   FileStream fs=new FileStream(filename , FileMode.Open, FileAccess.Read);
   BinaryReader br = new BinaryReader(fs);
   byte[]  buf = br.ReadBytes((int)fs.Length);
   br.Close();
   fs.Close();
   return buf;
  }

  public static void ConvertFolder(string sDir, string sPattern, bool bSearchSubdir, Encoding srcEncoding, Encoding destEncoding )
  {
   string[] sFiles = Directory.GetFiles( sDir, sPattern );
   foreach( string sFile in sFiles )
   {
    ConvertFile( sFile, srcEncoding, sFile, destEncoding );
    //ConvertFileByAPI( sFile, srcEncoding.CodePage, sFile, destEncoding.CodePage );
   }
   if( bSearchSubdir )
   {
    string[] sDirs = Directory.GetDirectories( sDir );
    foreach( string sSubdir in sDirs )
     ConvertFolder( sSubdir, sPattern, bSearchSubdir, srcEncoding, destEncoding );
   }
  }
  public static bool ConvertFile(string sSrcFile, Encoding srcEncoding, string sDestFile, Encoding destEncoding )
  { 
   byte[] srcBuffer = Read(sSrcFile);
   byte[] destBuffer = Encoding.Convert( srcEncoding,destEncoding,srcBuffer );
   Save( destBuffer, sDestFile );
   return true;
  }

  public static bool ConvertFileByAPI(string sSrcFile,int srcCodePage, string sDestFile, int destCodePage )
  { 
   byte[] inBuffer = Read( sSrcFile );
   byte[] outBuffer = null;
   if( srcCodePage == (int)EnumCodePage.CP_UTF8 )
   {     
    outBuffer = W2M( inBuffer, destCodePage );    
   }
   else if( destCodePage == (int)EnumCodePage.CP_UTF8 )
   {
    outBuffer = M2W( inBuffer, srcCodePage );    
   }
   else
   {
    outBuffer = M2W( inBuffer, srcCodePage );
    outBuffer = W2M( outBuffer, destCodePage ); 
   }
   Save( outBuffer,sDestFile);
   return true;
  }


  //#define CP_ACP                    0           // default to ANSI code page
  //#define CP_OEMCP                  1           // default to OEM  code page
  //#define CP_MACCP                  2           // default to MAC  code page
  //#define CP_THREAD_ACP             3           // current thread's ANSI code page
  //#define CP_SYMBOL                 42          // SYMBOL translations
  //
  //#define CP_UTF7                   65000       // UTF-7 translation
  //#define CP_UTF8                   65001       // UTF-8 translation
  public enum EnumCodePage
  {
   CP_ACP,
   CP_OEMCP,
   CP_MACCP,
   CP_THREAD_ACP,
   CP_SYMBOL = 42,
   CP_UTF7 = 65000,
   CP_UTF8 = 65001
  }

  [DllImport("kernel32.dll",CharSet=CharSet.Auto, ExactSpelling=true)]
  static extern public int WideCharToMultiByte(
   int CodePage,         // code page
   int dwFlags,         // performance and mapping flags
   byte[] lpWideCharStr, // address of wide-character string
   int cchWideChar,       // number of characters in string
   byte[] lpMultiByteStr,  // address of buffer for new string
   int cchMultiByte,      // size of buffer
   string lpDefaultChar,  // address of default for unmappable
   // characters
   bool lpUsedDefaultChar   // address of flag set when default
   // char. used
   );
  [DllImport("kernel32.dll",CharSet=CharSet.Auto, ExactSpelling=true)]
  static extern public int MultiByteToWideChar(
   int CodePage,         // code page
   int dwFlags,         // character-type options
   byte[] lpMultiByteStr, // address of string to map
   int cchMultiByte,      // number of bytes in string
   byte[] lpWideCharStr,  // address of wide-character buffer
   int cchWideChar        // size of buffer
   );

  public static byte[] W2M( byte[] inBuffer, int codepage )
  {
   try
   {    
    //取得所需buf大小
    int bufferlength = WideCharToMultiByte(codepage,0,inBuffer,inBuffer.Length,null,0,"?",false);
    byte[] outBuffer = new byte[bufferlength];
    //将UTF-8字符串转为ANSI字符串
    WideCharToMultiByte(codepage,0,inBuffer,inBuffer.Length,outBuffer,bufferlength,"?",false);
    //File.Delete(strPPDFileFullPath);
    return outBuffer;
   }
   catch(Exception e)
   {
#if DEBUG
    System.Windows.Forms.MessageBox.Show(e.Message);
    System.Windows.Forms.MessageBox.Show(e.StackTrace);
#endif
    return null;
   }
  }
// 
  public static byte[] M2W( byte[] inBuffer, int codepage )
  { 
   try
   {
    int contentlength =
     MultiByteToWideChar(
     codepage,         // code page
     0,     // character-type options
     inBuffer, // address of string to map
     inBuffer.Length,      // number of bytes in string
     null,  // address of wide-character buffer
     0        // size of buffer
     );
    byte[] outBuffer = new Byte[contentlength*2];
    MultiByteToWideChar(
     codepage,         // code page
     0,     // character-type options
     inBuffer, // address of string to map
     inBuffer.Length,      // number of bytes in string
     outBuffer,  // address of wide-character buffer
     contentlength*2        // size of buffer
     );
    return outBuffer;
   }
   catch(Exception e)
   {
#if DEBUG
    System.Windows.Forms.MessageBox.Show(e.Message);
    System.Windows.Forms.MessageBox.Show(e.StackTrace);
#endif
    return null;
   }
  }
 }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值