花了几天时间研究了一下C++的<windows.h>库,然后写了一个C++版本的爬虫,爬取网上题解中的代码然后自动提交....
这里先对所有跟我一样学习ACM的同学和大佬以及hdu管理员赔个不是,毕竟这只是个取巧的方式......望见谅
(但是后来测试发现C++的通过率实在有点尴尬,并且出现了大部分转码有问题造成CE的情况....在学长的建议下转投python大法.....然后排名直冲第二...)
1.整体思路
1)使用socket编程模拟HTTP协议GET请求向服务器发送页面请求
2)借助cn.bing.com找到对应题目的csdn题解,并使用正则表达式解析HTML代码获取博客链接
3)在博客的HTML代码中提取c++代码
4)对代码进行转码
5)模拟post提交代码
6)如上方式解析Status页面的HTML代码,提取运行结果
1)使用socket编程模拟HTTP协议GET请求向服务器发送页面请求
使用Socket编程通过bind(),connect(),send(),recv()这些函数建立与服务器的连接。
在这里我们只需要实现GET的请求头即可(这个可以从网页F12开发人员工具调试页面找到),注意请求头和正文之间有一个空行,即\r\n
//因为这个我也不是很懂...所以不作过多解释
//请求建立连接
void SendRequst(char *host, char *othPath)
{
WSADATA wd;
WSAStartup(MAKEWORD(2, 2), &wd);
sock = socket(AF_INET, SOCK_STREAM, 0);
sockaddr_in sa = { AF_INET };
int n = bind(sock, (sockaddr*)&sa, sizeof(sa));
struct hostent *p = gethostbyname(host);
sa.sin_port = htons(80);
memcpy(&sa.sin_addr, p->h_addr, 4);
n = connect(sock, (sockaddr*)&sa, sizeof(sa));
string reqInfo = "GET " + (string)othPath + " HTTP/1.1\r\nHost: " + (string)host + "\r\nConnection:Close\r\n\r\n";
}
2)借助cn.bing.com找到对应题目的csdn题解,并使用正则表达式解析HTML代码获得博客地址
//之所以不用百度的原因是百度对搜索后的链接进行了加密处理...我这种菜鸡是没办法解析的...所以用微软的cnbing搜索引擎
从左下角可以看到,百度的搜索结果页面上的链接是进行了加密的
然后对比cnbing的搜索界面,可以看到左下角的链接是目标博客链接,并没有进行加密
所以这里选择cnbing作为我们的搜索引擎
/// Collects the CSDN blog links contained in a search-result page.
///
/// Empties the file-level `blogUrl` list and then appends, in document order,
/// the target of every `href="http://blog.csdn..."` attribute found in `allHtml`.
///
/// @param allHtml Raw HTML of the search-result page.
void GetCSDNurl(string &allHtml)
{
    blogUrl.clear();

    const regex linkPattern("href=\"(http://blog.csdn[^\\s\"]+)\"");
    const sregex_iterator noMoreHits;
    for (sregex_iterator hit(allHtml.begin(), allHtml.end(), linkPattern); hit != noMoreHits; ++hit)
    {
        // Capture group 1 is the URL without the surrounding href="...".
        blogUrl.push_back((*hit)[1].str());
    }
}
3)在博客的HTML代码中提取C++代码
//这张图是借用的,因为我手里没有网页代码...
在提取了HTML代码后,可以发现,我们需要提取的代码在某个特定标签页内,并且c++的语言是有关键字"include"的,所以我们只要在整段代码中查找第一个"include",从这里开始就是我们的代码,并且可以发现,这个标签页的结尾要么是</pre> 要么是</text>
/// Cleared to 0 by GetCode() when the page holds no code block, so the caller
/// can skip the submission. GetCode() never sets it to 1; the caller does that
/// before each call.
int flagcode;

/// Extracts the source code embedded in a blog page into the file-level `CodeHtml`.
///
/// The code is taken to be the content of the first element carrying the
/// attribute `name="code"`: everything after that tag's closing '>' up to the
/// first `</text` or `</pre>` (whichever comes first), or up to the end of the
/// page when neither is present. The text is still HTML-escaped.
///
/// When the marker is missing, or its tag is never closed, "can not find code"
/// is printed, `flagcode` is set to 0 and `CodeHtml` is left empty.
///
/// @param allHtml Raw HTML of the blog page.
void GetCode(string &allHtml)
{
    CodeHtml = "";

    const string marker = "name=\"code\"";
    const string::size_type markerPos = allHtml.find(marker);
    // Bounded search for the end of the opening tag; a truncated page without
    // a '>' is treated like a page without code instead of reading past the end.
    const string::size_type tagEnd =
        (markerPos == string::npos) ? string::npos : allHtml.find('>', markerPos);
    if (tagEnd == string::npos)
    {
        cout << "can not find code" << endl;
        flagcode = 0;
        return;
    }

    const string::size_type codeBegin = tagEnd + 1;

    // "</text" is presumably the start of "</textarea>" -- TODO confirm against real pages.
    string::size_type codeEnd = allHtml.find("</text", codeBegin);
    const string::size_type preClose = allHtml.find("</pre>", codeBegin);
    if (preClose < codeEnd) codeEnd = preClose; // npos is the largest value, so this picks the earlier hit

    if (codeEnd == string::npos)
        CodeHtml = allHtml.substr(codeBegin);
    else
        CodeHtml = allHtml.substr(codeBegin, codeEnd - codeBegin);
}
4)对代码进行转码
转码这里使用了大佬的转码...
/// Decodes the HTML character references that appear in scraped source code.
///
/// Handled references: `&lt;` `&gt;` `&amp;` `&quot;` `&nbsp;` `&#43;` `&#39;`.
/// Decoding is a single left-to-right pass, so `&amp;lt;` yields the text
/// `&lt;` rather than `<`. Every other character is copied unchanged; in
/// particular the two characters "/n" are ordinary code (e.g. `sum/n`) and are
/// no longer rewritten into a backslash sequence.
///
/// @param CodeHtml HTML-escaped text; not modified.
/// @return The decoded text.
string HTMLTOC(string &CodeHtml)
{
    struct Entity
    {
        const char *text;         // the reference as written in HTML
        string::size_type length; // number of characters in `text`
        char decoded;             // the character it stands for
    };
    static const Entity kEntities[] = {
        { "&lt;",   4, '<'  },
        { "&gt;",   4, '>'  },
        { "&amp;",  5, '&'  },
        { "&quot;", 6, '"'  },
        { "&nbsp;", 6, ' '  },
        { "&#43;",  5, '+'  },
        { "&#39;",  5, '\'' },
    };

    string ans;
    ans.reserve(CodeHtml.size());

    string::size_type i = 0;
    while (i < CodeHtml.size())
    {
        bool decodedHere = false;
        if (CodeHtml[i] == '&')
        {
            for (const Entity &entity : kEntities)
            {
                // compare() clamps to the end of the string, so a reference cut
                // off by the end of input simply fails to match.
                if (CodeHtml.compare(i, entity.length, entity.text) == 0)
                {
                    ans += entity.decoded;
                    i += entity.length;
                    decodedHere = true;
                    break;
                }
            }
        }
        if (!decodedHere)
        {
            ans += CodeHtml[i];
            ++i;
        }
    }
    return ans;
}
/// Formats a non-negative number as '%' followed by its upper-case hexadecimal digits.
///
/// The digits are NOT zero-padded: 9 gives "%9", 32 gives "%20", and 0 gives
/// just "%".
///
/// @param num Value to format; expected to be non-negative.
/// @return "%" followed by the hexadecimal digits of `num`.
string ASCtoHex(int num)
{
    static const char kDigits[] = "0123456789ABCDEF";

    string hex;
    // Digits come out least-significant first, so each one is placed in front.
    for (int rest = num; rest != 0; rest /= 16)
    {
        hex.insert(hex.begin(), kDigits[rest % 16]);
    }
    hex.insert(hex.begin(), '%');
    return hex;
}
/// Encodes source code for an application/x-www-form-urlencoded request body.
///
/// Rules, applied byte by byte:
///  - ASCII letters and digits, and '.', '-', '*' are copied unchanged;
///  - '\n' becomes "%0D%0A" (a CRLF line break);
///  - ' ' becomes "+", a tab becomes "++++" (four spaces) and '\r' becomes
///    "++" (two spaces) -- these substitutions are kept from the original code;
///  - every other byte, including bytes >= 0x80, becomes '%' plus exactly two
///    upper-case hexadecimal digits.
///
/// The result is stored in the file-level `ResCode` and also returned. It is
/// built in a local string first so that passing `ResCode` itself as the input
/// works.
///
/// @param CodeHtml Text to encode; not modified.
/// @return The encoded text (a copy of `ResCode`).
string GetRescode(string &CodeHtml)
{
    static const char kHex[] = "0123456789ABCDEF";

    string encoded;
    encoded.reserve(CodeHtml.size() * 2);

    for (string::size_type i = 0; i < CodeHtml.size(); ++i)
    {
        // Work on the unsigned value so bytes >= 0x80 are not seen as negative.
        const unsigned char ch = static_cast<unsigned char>(CodeHtml[i]);
        const bool isAlnum = (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');

        if (isAlnum || ch == '.' || ch == '-' || ch == '*')
        {
            encoded += static_cast<char>(ch);
        }
        else if (ch == '\n')
        {
            encoded += "%0D%0A";
        }
        else if (ch == '\t')
        {
            encoded += "++++";
        }
        else if (ch == ' ')
        {
            encoded += '+';
        }
        else if (ch == '\r')
        {
            encoded += "++";
        }
        else
        {
            // Always two digits: a one-digit escape such as "%C" is malformed.
            encoded += '%';
            encoded += kHex[ch >> 4];
            encoded += kHex[ch & 0x0F];
        }
    }

    ResCode = encoded;
    return ResCode;
}
//字体转换
char* U2G(const char* utf8) //UTF-8 to GB2312
{
int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
wchar_t* wstr = new wchar_t[len + 1];
memset(wstr, 0, len + 1);
MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wstr, len);
len = WideCharToMultiByte(CP_ACP, 0, wstr, -1, NULL, 0, NULL, NULL);
char* str = new char[len + 1];
memset(str, 0, len + 1);
WideCharToMultiByte(CP_ACP, 0, wstr, -1, str, len, NULL, NULL);
if (wstr) delete[] wstr;
return str;
}
char* G2U(const char* gb2312) //GB2312 TO UTF-8
{
int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);
wchar_t* wstr = new wchar_t[len + 1];
memset(wstr, 0, len + 1);
MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len);
len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);
char* str = new char[len + 1];
memset(str, 0, len + 1);
WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL);
if (wstr) delete[] wstr;
return str;
}
//提交代码
void SendCode(char *host, char *othPath, string &Code, string PHPSESSID, string CNZZDATA)
{
WSADATA wd;
WSAStartup(MAKEWORD(2, 2), &wd);
sock = socket(AF_INET, SOCK_STREAM, 0);
sockaddr_in sa = { AF_INET };
int n = bind(sock, (sockaddr*)&sa, sizeof(sa));
struct hostent *p = gethostbyname(host);
sa.sin_port = htons(80);
memcpy(&sa.sin_addr, p->h_addr, 4);
n = connect(sock, (sockaddr*)&sa, sizeof(sa));
string Typee = "\r\nContent-Type: application/x-www-form-urlencoded";
string ConLen = "\r\nContent-Length: ";
_itoa(ProblemID, s, 10);
string ElseInfo = "\r\nCache-Control: max-age=0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8O\r\nOrigin: http://acm.hdu.edu.cn\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36\r\nReferer: http://acm.hdu.edu.cn/submit.php?pid=";
ElseInfo = ElseInfo + (string)s + "\r\nAccept-Encoding: gzip, deflate\r\nAccept-Language: zh-CN,zh;q=0.8";
string HeaderP = "check=0&problemid=" + (string)s;
HeaderP += "&language=2&usercode=";
ResCode = HeaderP + ResCode;
char s[300];
_itoa(ResCode.length(), s, 10);
string Cookie = "exesubmitlang=2; PHPSESSID=" + PHPSESSID + "; CNZZDATA1254072405=" + CNZZDATA;
string reqInfo = "POST " + (string)othPath + " HTTP/1.1\r\nHost: " + (string)host + ElseInfo + Typee + ConLen + (string)s + "\r\nCookie: " + Cookie + "\r\nConnection:Close\r\n\r\n" + ResCode;
}
6)在status页面中提取运行结果
前面既然都能从HTML中转码出C++代码了...提取结果还是好找的..找到题号往前移动就行
//这里有个小细节,因为hdu的提交测试需要时间,以及考虑到还有其他人同时提交的情况,我这里考虑了提交之后sleep5秒后再查看status
/// Set to 1 by GetResult() when the extracted verdict is "Accepted", so the
/// caller can stop submitting further solutions for the same problem. It is
/// never reset here; the caller clears it before each call.
int flagg;

/// Extracts the judge verdict for problem `Prob` from the status page.
///
/// Finds the first occurrence of "<Prob></a>" (the problem link), steps back a
/// fixed distance, takes the text between the nearest preceding '>' and the
/// next '<', stores it in the file-level `StateAns` and prints it. `StateSapce`
/// and `StateTime` are only cleared. If the link is not found, or the page is
/// too short or malformed to contain the expected cell, nothing is printed and
/// `StateAns` stays empty.
///
/// @param allHtml Raw HTML of the status page.
/// @param Prob    Problem number whose row is looked up.
void GetResult(string &allHtml, int Prob)
{
    StateAns = "";
    StateSapce = "";
    StateTime = "";

    // NOTE(review): 52 looks like a hand-measured distance from the problem
    // link back into the verdict cell of hdu's status table -- confirm against
    // the live markup.
    const string::size_type kBackOffset = 52;

    const string anchor = to_string(Prob) + "</a>";
    const string::size_type anchorPos = allHtml.find(anchor);
    if (anchorPos == string::npos || anchorPos < kBackOffset) return;

    // Both searches are bounded by the string, so a truncated page cannot make
    // them run off either end.
    const string::size_type cellOpen = allHtml.rfind('>', anchorPos - kBackOffset);
    if (cellOpen == string::npos) return;
    const string::size_type cellClose = allHtml.find('<', cellOpen + 1);
    if (cellClose == string::npos) return;

    StateAns = allHtml.substr(cellOpen + 1, cellClose - cellOpen - 1);

    cout << "结果: " << StateAns << endl;
    if (StateAns == "Accepted") flagg = 1;
}
7)cookies模拟在线
说了这么多显然还有一点最重要的...那就是...在线登录,没有登录怎么提交呢hhh
因为C++不知道怎么模拟登录按键...所以选择的是cookies模拟在线,每次打开浏览器登录,都会有一个特定的cookies,只要记录了这个就可以模拟你的在线情况...
另外我也不知道怎么用C++在控制台模拟浏览器...所以需要自己打开浏览器不要关闭...当然这不是像按键精灵那样真的模拟提交,查找等操作,只需要开着浏览器,让控制台中存在浏览器后台即可.
cookies可以在F12的Application中找到,有的浏览器会同时存在PHPSESSID, CNZZDATA,那么就都需要记录
/// Splits an "http://host/path" URL into its host and path parts.
/// Returns false when the URL does not start with "http://" or has no host.
static bool SplitHttpUrl(const string &url, string &hostOut, string &pathOut)
{
    const string scheme = "http://";
    if (url.compare(0, scheme.size(), scheme) != 0) return false;

    const string::size_type slash = url.find('/', scheme.size());
    if (slash == string::npos)
    {
        hostOut = url.substr(scheme.size());
        pathOut = "/";
    }
    else
    {
        hostOut = url.substr(scheme.size(), slash - scheme.size());
        pathOut = url.substr(slash);
    }
    return !hostOut.empty();
}

/// Reads from the file-level `sock` into `buf` until recv() reports end of
/// stream or an error, and returns everything that was received.
static string ReceiveAll()
{
    string received;
    int n;
    while ((n = recv(sock, buf, sizeof(buf) - 1, 0)) > 0)
    {
        received.append(buf, n);
    }
    return received;
}

/// Crawler driver. For each problem number: searches cn.bing.com for CSDN
/// write-ups, downloads each blog page found, extracts and decodes the code,
/// submits it to acm.hdu.edu.cn, waits, and reads the verdict from the status
/// page. It moves on to the next problem once a submission is accepted.
int main()
{
    // Session cookies copied from a logged-in browser (F12 -> Application).
    // NOTE(review): a hard-coded session id should not be committed; replace it
    // with your own value before running.
    string PHPSESSID = "m83mg1ah8fsljf828dfqmueas6";
    string CNZZDATA = "0";

    // Mutable arrays because SendRequst()/SendCode() take non-const char*.
    char judgeHost[] = "acm.hdu.edu.cn";
    char submitPath[] = "/submit.php?action=submit";
    char statusPath[] = "/status.php";

    for (ProblemID = 3619; ProblemID < 6000; ProblemID++)
    {
        // Search query: /search?q=hdu+<id>+csdn
        strcpy(host, "cn.bing.com");
        strcpy(othPath, "/search?q=hdu+");
        _itoa(ProblemID, s, 10);
        strcat(othPath, s);
        strcat(othPath, "+csdn");
        SendRequst(host, othPath);
        allHtml = ReceiveAll();
        closesocket(sock);
        GetCSDNurl(allHtml);

        for (int i = 0; i < (int)(blogUrl.size()); i++)
        {
            cout << "Problem:" << ProblemID << " ";

            // Fetch the blog page itself, not the search page again.
            string blogHost, blogPath;
            if (!SplitHttpUrl(blogUrl[i], blogHost, blogPath)) continue;
            SendRequst(&blogHost[0], &blogPath[0]);
            allHtml = ReceiveAll();
            closesocket(sock);

            flagcode = 1;
            GetCode(allHtml);
            if (!flagcode) continue;

            CodeHtml = HTMLTOC(CodeHtml);
            char *converted = U2G(CodeHtml.c_str());
            CodeHtml = string(converted);
            delete[] converted; // U2G() hands over a new[] buffer
            ResCode = GetRescode(CodeHtml);

            SendCode(judgeHost, submitPath, ResCode, PHPSESSID, CNZZDATA);
            ReceiveAll(); // drain the response; its content is not used
            closesocket(sock);

            // Give the judge time to run the submission before reading the status page.
            Sleep(5000);
            flagg = 0;
            SendRequst(judgeHost, statusPath);
            allHtml = ReceiveAll();
            closesocket(sock);
            GetResult(allHtml, ProblemID);
            if (flagg) break; // accepted: no need to try the remaining blogs

            Sleep(12000); // pause between submissions
        }
    }
    return 0;
}