Url encode和decode 相关问题

最新推荐文章于 2024-04-17 17:27:54 发布

newsyoung1

最新推荐文章于 2024-04-17 17:27:54 发布

阅读量751

点赞数 1

分类专栏： url解码文章标签： url解码

url解码专栏收录该内容

1 篇文章 0 订阅

订阅专栏


今天在做 URL 解码时折腾了很久。我的 URL 中带有中文，浏览器默认会先把中文按 UTF-8 编码，再做百分号转义；但是我对它做 URL 解码之后，显示出来的中文是乱码，并不是我想要的值。

经过查找资料，发现只要把 URL 解码得到的 UTF-8 字节再做一次 UTF-8 → GB2312 的转换，就可以正常显示了（乱码应该是因为显示环境使用的是 GB2312 而不是 UTF-8）。

下面是网上转载的内容：

在做网站分析时，我们经常要分析 baidu、google 等搜索引擎的搜索关键字，比如搜索“中国”：

在 baidu 中是:http://www.baidu.com/s?wd=%D6%D0%B9%FA&cl=3
在 google 中是：http://www.google.com/search?hl=zh-CN&q=%E4%B8%AD%E5%9B%BD&lr=

“中国”这两个字在 baidu 中的编码是 %D6%D0%B9%FA，即先按 GB2312 编码，再把每个字节写成“% 加两位十六进制数”的形式；
在 google 中的编码是 %E4%B8%AD%E5%9B%BD，即先按 UTF-8 编码，再做同样的十六进制转义。

下面的程序可以分别把这两种编码解码成原文“中国”，欢迎拍砖。

//从 URL 专用格式字符串还原成普通字符串

#include <iconv.h>

#include <cstring>
#include <iostream>
#include <string>

using namespace std;

// Converts one hexadecimal digit character to its numeric value (0..15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; every other character yields -1.
char Char2Int(char ch){
        char value = -1;

        if(ch>='A' && ch<='F'){
                value = (char)(ch-'A'+10);
        }
        else if(ch>='a' && ch<='f'){
                value = (char)(ch-'a'+10);
        }
        else if(ch>='0' && ch<='9'){
                value = (char)(ch-'0');
        }

        return value;
}

// Combines the two hexadecimal digit characters str[0] and str[1] into one
// byte, e.g. "B0" -> 0xB0. Only the first two characters are read.
// The digits are not validated here: each one is passed through Char2Int()
// and the results are merged as-is.
char Str2Bin(char *str){
        const char high = Char2Int(str[0]);        // upper nibble, e.g. 'B' -> 11
        const char low  = Char2Int(str[1]);        // lower nibble, e.g. '0' -> 0

        // Shift the upper nibble into place and merge the lower one.
        return (char)((high << 4) | low);
}

// Returns the value (0..15) of a hexadecimal digit, or -1 when `ch` is not one.
static int UrlDecodeHexValue(char ch){
        if(ch>='0' && ch<='9')return ch-'0';
        if(ch>='a' && ch<='f')return ch-'a'+10;
        if(ch>='A' && ch<='F')return ch-'A'+10;
        return -1;
}

// Decodes a URL-escaped (percent-encoded) string into raw bytes.
//
//   "%XX" (two hex digits, either case) -> the byte 0xXX
//   "+"                                 -> a space
//   anything else                       -> copied unchanged
//
// A '%' that is not followed by two valid hex digits (for example at the very
// end of the input) is copied through literally, so the function never reads
// past the end of `str` and never emits a byte built from invalid digits.
// No character-set conversion is performed on the decoded bytes.
string UrlDecode(string str){
        const size_t len=str.length();

        string output;
        output.reserve(len);        // the decoded text is never longer than the input

        size_t i=0;
        while(i<len){
                const char ch=str[i];

                // Only treat '%' as an escape when both digits are inside the string.
                if(ch=='%' && i+2<len){
                        const int high=UrlDecodeHexValue(str[i+1]);
                        const int low=UrlDecodeHexValue(str[i+2]);
                        if(high>=0 && low>=0){
                                output+=(char)((high<<4)|low);
                                i+=3;
                                continue;
                        }
                }

                output+=((ch=='+') ? ' ' : ch);
                i++;
        }

        return output;
}

// Character-set converter: a small wrapper around an iconv conversion
// descriptor (used in this file to turn UTF-8 text into GB2312).
//
// The descriptor is opened in the constructor and closed in the destructor.
// The class defines no copy operations, so a copied object would close the
// same descriptor a second time -- create it in place and do not copy it.
class CodeConverter {
        private:
                iconv_t cd;        // (iconv_t)-1 when iconv_open() failed

        public:
                // Opens a converter from `from_charset` to `to_charset`.
                // Note that iconv_open() takes the target charset first.
                CodeConverter(const char *from_charset,const char *to_charset) {
                        cd = iconv_open(to_charset,from_charset);
                }

                // Releases the descriptor, if one was successfully opened.
                ~CodeConverter() {
                        if (cd != (iconv_t)-1) {
                                iconv_close(cd);
                        }
                }

                // Converts `inlen` bytes starting at `inbuf` into `outbuf`, which has
                // room for `outlen` bytes. `outbuf` is zero-filled first.
                //
                // Returns the result of iconv() (-1 on conversion failure), or -1
                // when the converter could not be opened, `outbuf` is null, or a
                // length is negative.
                int convert(char *inbuf,int inlen,char *outbuf,int outlen) {
                        if (cd == (iconv_t)-1 || !outbuf || inlen < 0 || outlen < 0) {
                                return -1;
                        }

                        // iconv() needs real size_t objects; casting the address of an
                        // int to size_t* is undefined where the two types differ in size.
                        size_t inleft = (size_t)inlen;
                        size_t outleft = (size_t)outlen;

                        memset(outbuf,0,outleft);
                        return (int)iconv(cd,&inbuf,&inleft,&outbuf,&outleft);
                }
};

// Input: a URL-escaped string whose bytes are UTF-8. Output: the decoded text
// re-encoded as GB2312.
//
// The result of the charset conversion is not checked: if it fails, whatever
// was converted before the failure is returned (possibly an empty string).
string Url2Str_Utf8(string instr){
        string input=UrlDecode(instr);
        if(input.empty()){
                return string();
        }

        // Same capacity as before (the length of the escaped input), plus one
        // spare byte that convert() is never told about, so the buffer is always
        // NUL-terminated even when the converted text fills the whole capacity.
        const size_t capacity=instr.length();
        string buffer(capacity+1,'\0');

        CodeConverter cc("utf-8","gb2312");
        cc.convert(&input[0],(int)input.length(),&buffer[0],(int)capacity);

        // Keep the text up to the first NUL, as the C-string return did before.
        return string(buffer.c_str());
}

// Input: a URL-escaped string whose bytes are GB2312. Output: GB2312 text.
// No charset conversion is needed, so this simply forwards to UrlDecode().
string Url2Str_gb2312(string str){
        string decoded=UrlDecode(str);
        return decoded;
}


//示例程序
/*int main(){
        //char out2[OUTLEN];

        //+中国哈哈哈终于得了^_^
        cout<<"Url2String_gb2312:"<<Url2Str_gb2312("%2B%D6%D0%B9%FA%B9%FE%B9%FE%B9%FE%D6%D5%D3%DA%B5%C3%C1%CB%5E_%5E")<<endl;
        cout<<"Url2String_Utf8:"<<Url2Str_Utf8("%2B%E4%B8%AD%E5%9B%BD%E5%93%88%E5%93%88%E5%93%88%E7%BB%88%E4%BA%8E%E5%BE%97%E4%BA%86%5E_%5E")<<endl;
        
        // utf-8-->gb2312        

        //cout << "utf-8-->gb2312 in=" << out1 << ",out=" << out2 << endl;
        //cout<<Url2String_Utf8(out1);

        return 0;
}*/

#include <iostream>
#include <string>
#include <errno.h>
#include <iconv.h>

using namespace std;

// GB2312 <-> UTF-8 conversion helpers built on iconv (written for Linux,
// according to the original note).
class CUtilTools
{
public:
    CUtilTools(){}
    ~CUtilTools(){}

    // Converts iInLen bytes of UTF-8 text at sIn to GB2312, writing at most
    // iMaxOutLen bytes to sOut. iInLen excludes the terminating '\0' (use
    // strlen). No terminator is written to sOut.
    // Returns the number of bytes written, or -1 on failure.
    static int Utf8ToGb2312(char *sOut, int iMaxOutLen, const char *sIn, int iInLen)
    {
        return Convert("gb2312", "utf-8", sOut, iMaxOutLen, sIn, iInLen);
    }

    // Converts iInLen bytes of GB2312 text at sIn to UTF-8, writing at most
    // iMaxOutLen bytes to sOut. iInLen excludes the terminating '\0' (use
    // strlen). No terminator is written to sOut.
    // Returns the number of bytes written, or -1 on failure.
    static int Gb2312ToUtf8(char *sOut, int iMaxOutLen, const char *sIn, int iInLen)
    {
        return Convert("utf-8", "gb2312", sOut, iMaxOutLen, sIn, iInLen);
    }

private:
    // Shared implementation: converts sIn from sFromCharset to sToCharset.
    // Returns the number of bytes written to sOut, or -1 when a length is
    // negative, the converter cannot be opened, or iconv() reports an error.
    static int Convert(const char *sToCharset, const char *sFromCharset,
                       char *sOut, int iMaxOutLen, const char *sIn, int iInLen)
    {
        // A negative length would turn into a huge size_t below.
        if (iMaxOutLen < 0 || iInLen < 0)
        {
            return -1;
        }

        iconv_t cd = iconv_open(sToCharset, sFromCharset);
        if (cd == (iconv_t) - 1)
        {
            return -1;
        }

        // iconv() takes a non-const input pointer.
        char *pIn = const_cast<char *>(sIn);
        char *pOut = sOut;
        size_t iSrcLen = iInLen;
        size_t iLeftLen = iMaxOutLen;

        const size_t ret = iconv(cd, &pIn, &iSrcLen, &pOut, &iLeftLen);
        iconv_close(cd);

        if (ret == (size_t) - 1)
        {
            return -1;
        }

        // Bytes produced = capacity minus what iconv() left unused.
        return (int)(iMaxOutLen - iLeftLen);
    }
};


// Demo: converts a Chinese string GB2312 -> UTF-8 -> GB2312 with CUtilTools
// and prints the byte count, the errno text and the bytes after each step.
// Returns 0 on success and 1 if either conversion fails.
int main(int argc, char* argv[])
{
    // String literals are const, so they must not be bound to a plain char*.
    // NOTE(review): the literal is passed to Gb2312ToUtf8(), so this source
    // file is presumably expected to be saved as GB2312 -- confirm.
    const char* pszOri = "中文字符测试";
    cout << "strlen:" << strlen(pszOri) << endl;

    char pszDst[50] = {0};

    // One byte of each zero-initialised buffer is kept back so the result is
    // always NUL-terminated when it is printed as a C string.
    errno = 0;  // so strerror(errno) below describes this call, not an older one
    int iLen = CUtilTools::Gb2312ToUtf8(pszDst, (int)sizeof(pszDst) - 1, pszOri, strlen(pszOri)); // Gb2312ToUtf8

    cout << iLen << "," << strerror(errno) << "," << pszDst << endl;
    if (iLen < 0)
    {
        // A failed conversion returns -1, which must not be used as the next
        // input length.
        return 1;
    }

    cout << "-----------" << endl;

    char pszGbDst[50] = {0};
    errno = 0;
    int iNewLen = CUtilTools::Utf8ToGb2312(pszGbDst, (int)sizeof(pszGbDst) - 1, pszDst, iLen); // Utf8ToGb2312
    cout << iNewLen << "," << strerror(errno) << "," << pszGbDst << endl;

    /*
    Sample output recorded by the original author:
    strlen:12
    18,Success,涓枃瀛楃娴嬭瘯
    -----------
    12,Success,中文字符测试
    */
    // The sample shows that each of the six Chinese characters takes three
    // bytes in UTF-8 (18 in total) and two bytes in GB2312 (12 in total).
    // NOTE(review): the garbled second line is presumably the UTF-8 bytes
    // being rendered by a GB2312/GBK terminal -- confirm.

    return iNewLen < 0 ? 1 : 0;
}

相关代码：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
using namespace std;

typedef unsigned char BYTE;

// Maps a value 0..15 to its uppercase hexadecimal digit ('0'-'9', 'A'-'F').
// Values above 15 are not range-checked.
BYTE toHex(const BYTE &x)
{
    return static_cast<BYTE>(x > 9 ? x - 10 + 'A' : x + '0');
}

// Maps a hexadecimal digit character to its value 0..15.
// Both uppercase and lowercase letters are accepted, so escapes written as
// "%e4" decode the same way as "%E4". The argument is not validated: callers
// must pass a hex digit; any other character gives a meaningless result.
BYTE fromHex(const BYTE &x)
{
    if (x >= '0' && x <= '9')
    {
        return static_cast<BYTE>(x - '0');
    }
    if (x >= 'a' && x <= 'f')
    {
        return static_cast<BYTE>(x - 'a' + 10);
    }
    return static_cast<BYTE>(x - 'A' + 10);
}

string URLEncode(const string &sIn)
{
        string sOut;
        for( size_t ix = 0; ix < sIn.size(); ix++ )
        {      
            BYTE buf[4];
            memset( buf, 0, 4 );
            if( isalnum( (BYTE)sIn[ix] ) )
            {      
                buf[0] = sIn[ix];
            }
            //else if ( isspace( (BYTE)sIn[ix] ) ) //貌似把空格编码成%20或者+都可以
            //{
            //    buf[0] = '+';
            //}
            else
            {
                buf[0] = '%';
                buf[1] = toHex( (BYTE)sIn[ix] >> 4 );
                buf[2] = toHex( (BYTE)sIn[ix] % 16);
            }
            sOut += (char *)buf;
        }
        return sOut;
}

// Returns the value (0..15) of a hexadecimal digit in either case, or -1 for
// any other character.
static int urlHexDigitValue(char c)
{
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    return -1;
}

// Decodes a percent-encoded string.
//
//   "%XX" (two hex digits, upper or lower case) -> the byte 0xXX
//   "+"                                         -> a space
//   anything else                               -> copied unchanged
//
// A '%' that is not followed by two valid hex digits (for example a trailing
// "%" or "%4") is copied through literally, so the function never reads
// beyond the end of sIn and never builds a byte from invalid digits.
string URLDecode(const string &sIn)
{
    const size_t nLen = sIn.size();

    string sOut;
    sOut.reserve(nLen);  // the decoded text is never longer than the input

    for (size_t ix = 0; ix < nLen; ix++)
    {
        char ch = sIn[ix];
        if (ch == '%' && ix + 2 < nLen)
        {
            const int hi = urlHexDigitValue(sIn[ix + 1]);
            const int lo = urlHexDigitValue(sIn[ix + 2]);
            if (hi >= 0 && lo >= 0)
            {
                ch = (char)((hi << 4) | lo);
                ix += 2;  // skip the two digits just consumed
            }
        }
        else if (ch == '+')
        {
            ch = ' ';
        }
        sOut += ch;
    }
    return sOut;
}

int main(int argc,char *argv[])
{
    string strTest = "你好中国";
    string strEnCode = URLEncode(strTest);
    printf("strEnCode:%s\n",strEnCode.c_str());
    
    string strDecode = URLDecode(strEnCode);
    printf("strDecode:%s\n",strDecode.c_str());
    return 0;
}