IT博客汇 | C++爬虫原理(五):编码和解码URL

C++爬虫原理(五):编码和解码URL_UTF-8

admin发表于 2015-08-15 09:13:17

C++爬虫原理(五)：编码和解码URL（UTF-8方式）。网上大多数实现是ANSI方式的编码，这里讨论的是UTF-8方式的UrlEncode编码/UrlDecode解码：

一个CString版的代码如下（项目需要随手写了一个）：

// URL-encodes `str` as UTF-8 percent-escapes.
//
// The input is treated as text in the system ANSI code page (CP_ACP). It is
// converted to UTF-16 and then to UTF-8; every UTF-8 byte that is not an
// ASCII letter or digit is emitted as "%XX" with two upper-case hex digits
// (so a space becomes "%20" and a tab becomes "%09").
//
// Returns the encoded string, or an empty string if a conversion fails.
CString CTestDlg::URLEncode(CString str)
{
	// ANSI -> UTF-16. The reported length includes the terminating NUL.
	const int wideLen = MultiByteToWideChar(CP_ACP,0,str.GetBuffer(0),-1,NULL,0);
	str.ReleaseBuffer();
	if (wideLen <= 0)
		return CString();

	wchar_t *unicode = new wchar_t[wideLen];
	const int wideWritten = MultiByteToWideChar(CP_ACP,0,str.GetBuffer(0),-1,unicode,wideLen);
	str.ReleaseBuffer();
	if (wideWritten <= 0)
	{
		delete[] unicode;
		return CString();
	}

	// UTF-16 -> UTF-8, again including the terminating NUL.
	const int utf8Len = WideCharToMultiByte(CP_UTF8,0,unicode,-1,NULL,0,NULL,NULL);
	if (utf8Len <= 0)
	{
		delete[] unicode;
		return CString();
	}

	unsigned char *utf8 = new unsigned char[utf8Len];
	const int utf8Written = WideCharToMultiByte(CP_UTF8,0,unicode,-1,(LPSTR)utf8,utf8Len,NULL,NULL);
	delete[] unicode;	// the wide copy is no longer needed
	if (utf8Written <= 0)
	{
		delete[] utf8;
		return CString();
	}

	CString newStr="";
	char ch[4];	// large enough for "%XX" plus the terminating NUL
	// code by: cplusplus.me
	for (int i=0;i<utf8Len-1;i++)	// -1: do not encode the terminating NUL
	{
		const unsigned char c = utf8[i];
		// Explicit ASCII ranges instead of isalnum(): the result must not
		// depend on the C locale, otherwise UTF-8 bytes >= 0x80 could be
		// passed through unescaped.
		const bool isAsciiAlnum =
			(c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z');

		if (isAsciiAlnum)
			sprintf(ch,"%c",c);
		else
			sprintf(ch,"%%%02X",(unsigned int)c);	// %02X keeps bytes < 0x10 two digits wide

		newStr += ch;
	}
	delete[] utf8;
	return newStr;
}

另外一个实用、靠谱的版本是[代码不知道出自哪里了]：

#include <string>
#include <vector>
 
// Returns the upper-case ASCII hex digit ('0'-'9', 'A'-'F') for the low
// nibble of x. The high nibble is ignored, so a whole byte may be passed in
// to obtain its second hex digit.
inline BYTE toHex(const BYTE x)
{
    const BYTE nibble = static_cast<BYTE>(x & 0x0F);
    return static_cast<BYTE>(nibble > 9 ? nibble - 10 + 'A' : nibble + '0');
}
 
// Converts a NUL-terminated wide (UTF-16) string to a UTF-8 std::string.
// Returns an empty string if WideCharToMultiByte reports failure.
std::string WC2UT(const wchar_t* buf)
{
    // First call only measures; the length includes the terminating NUL.
    const int len=WideCharToMultiByte(CP_UTF8,0,buf,-1,NULL,0,NULL,NULL);
    if (len<=0)
        return std::string();   // avoid indexing an empty vector below

    std::vector<char> utf8(len);
    if (WideCharToMultiByte(CP_UTF8,0,buf,-1,&utf8[0],len,NULL,NULL)<=0)
        return std::string();

    // The buffer is NUL-terminated, so the C-string constructor is safe.
    return std::string(&utf8[0]);
}
 
// Converts a NUL-terminated string in the system ANSI code page (CP_ACP) to
// a wide (UTF-16) std::wstring.
// Returns an empty string if MultiByteToWideChar reports failure.
std::wstring MB2WC(const char* buf)
{
    // First call only measures; the length includes the terminating NUL.
    const int len=MultiByteToWideChar(CP_ACP,0,buf,-1,NULL,0);
    if (len<=0)
        return std::wstring();  // avoid indexing an empty vector below

    std::vector<wchar_t> unicode(len);
    if (MultiByteToWideChar(CP_ACP,0,buf,-1,&unicode[0],len)<=0)
        return std::wstring();

    // The buffer is NUL-terminated, so the C-string constructor is safe.
    return std::wstring(&unicode[0]);
}
 
// URL-encodes *str in place; the argument must be passed as a pointer.
//
// The text is converted from the ANSI code page to UTF-8 (via MB2WC and
// WC2UT). Each UTF-8 byte is then written as:
//   - itself, if it is an ASCII letter or digit;
//   - '+', if it is a space;
//   - "%XX" (two upper-case hex digits) otherwise.
// A null `str` is ignored.
void URLEncode(CString* str)
{
    if (str==NULL)
        return;

    std::string sln=str->GetBuffer(0);
    str->ReleaseBuffer();
    sln=WC2UT(MB2WC(sln.c_str()).c_str());

    std::string sOut;
    sOut.reserve(sln.size()*3);     // worst case: every byte becomes "%XX"
    for (size_t ix=0;ix<sln.size();ix++)
    {
        const BYTE c=(BYTE)sln[ix];
        // Explicit ASCII ranges instead of isalnum(): the result must not
        // depend on the C locale, otherwise UTF-8 bytes >= 0x80 could be
        // passed through unescaped.
        const bool isAsciiAlnum=
            (c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z');

        if (isAsciiAlnum)
            sOut+=(char)c;
        else if (c==' ')
            sOut+='+';              // only a real space maps to '+'; tab/CR/LF are escaped
        else
        {
            sOut+='%';
            sOut+=(char)toHex(c>>4);        // high nibble
            sOut+=(char)toHex(c&0x0F);      // low nibble (must be masked to 0..15)
        }
    }
    CString out=sOut.c_str();
    *str=out;
}

这里仅仅给出了编码方式，解码方式可自行百度。