#include <winsock2.h>
#include <pthread.h>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iosfwd>
#include <iostream>
#include <map>
#include <queue>
#include <regex>
#include <string>
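//Plan: run two worker threads, one that reads html pages and one that writes image files.
//The html-reading thread pops the html queue, opens a socket for the page, reads the html, and pushes the html and image URLs it matches onto their queues.
//The image-writing thread pops the image queue, opens a socket for the image, opens the image file, and writes the image data into it.
//A third, resident thread that waits for Enter on the console to stop the program is still to be added.
//The two workers exclude each other in producer/consumer fashion: around writing the log file, and around the socket and the response read back from it.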
using namespace std;
//Socket shared by all requests: sendHttpQuery() assigns it and the callers recv() the reply from it.
SOCKET sock;
//Host and port every request is sent to
const char *testHostName="www.27270.com";
const short testPort=80;
//Same port as text, used when building the Host: header
const string testPortChar="80";
//vector<string> vectormapurl;
//vector<string> vectorhtmlurl;
//Page URLs still to be fetched (filled by getHtmlUrl)
queue<string> queueHtmlUrl;
//Image URLs still to be downloaded (filled by getMapUrl)
queue<string> queueMapUrl;
//Image URLs already seen; the value is mapUrlInt at the time of insertion
map<string,int> mapMapUrl;
//Number of distinct image URLs recorded so far
int mapUrlInt;
//Page URLs already seen; the value is htmlUrlInt at the time of insertion
map<string,int> mapHtmlUrl;
//Number of distinct page URLs recorded so far
int htmlUrlInt;
//Extension appended to every saved image file name
string mapType=".jpg";
//Mutex both worker threads hold while they send a request and read its reply through sock
//NOTE(review): together with `using namespace std;` this name becomes ambiguous with std::mutex
//if <mutex> is ever included, directly or transitively - confirm on the target toolchain.
pthread_mutex_t mutex;
//Number used as the name of the next image file; advanced by writeMapFileThread
int fileNameInt;
//Strip leading and trailing space characters (' ' only) from str in place.
//Returns a reference to the same string so the call can be used inside an expression.
string& trim(string &str)
{
    if(!str.empty())
    {
        //npos here means "nothing but spaces": erase(0,npos) then empties the string
        const string::size_type firstKept=str.find_first_not_of(' ');
        str.erase(0,firstKept);
        //npos+1 wraps to 0, so an emptied string is left untouched
        const string::size_type lastKept=str.find_last_not_of(' ');
        str.erase(lastKept+1);
    }
    return str;
}
//发送http请求包
bool sendHttpQuery(string sendQueryStr){
int n=0;
//初始化socket
sock = socket(AF_INET, SOCK_STREAM, 0);
if (sock == INVALID_SOCKET)
{
cout << "建立socket失败! 错误码: " << WSAGetLastError() << endl;
return false;
}
sockaddr_in sa = { AF_INET };
n = bind(sock, (sockaddr*)&sa, sizeof(sa));
if (n == SOCKET_ERROR)
{
cout << "bind函数失败! 错误码: " << WSAGetLastError() << endl;
return false;
}
struct hostent *p = gethostbyname(testHostName);
if (p == NULL)
{
cout << "主机无法解析出ip! 错误吗: " << WSAGetLastError() << endl;
return false;
}
sa.sin_port = htons(testPort);
memcpy(&sa.sin_addr, p->h_addr, 4);
//连接
n = connect(sock, (sockaddr*)&sa, sizeof(sa));
if (n == SOCKET_ERROR)
{
cout << "connect函数失败! 错误码: " << WSAGetLastError() << endl;
return false;
}
//按照http送GET请求
cout<<"发送http请求:"+sendQueryStr<<endl;
if (SOCKET_ERROR == send(sock, sendQueryStr.c_str(), sendQueryStr.size(), 0))
{
cout << "send error! 错误码: " << WSAGetLastError() << endl;
closesocket(sock);
return false;
}
return true;
}
//从html中找出图片的url,分辨其唯一性(使用Map),并放入queue中
void getMapUrl(string &htmlContent){
smatch mat;
cout<<"图片匹配!"<<endl;
regex pattern("data-original=\"([^ ]*?\.jpg)\"");
string::const_iterator stringStart = htmlContent.begin();
string::const_iterator stringEnd = htmlContent.end();
while (regex_search(stringStart,stringEnd, mat, pattern))
{
string msg(mat[1].first, mat[1].second);
//判断msg所在的url是否已遍历过
if(mapMapUrl.find(msg)==mapMapUrl.end()){
cout<<"插入图片map库:"+msg<<endl;
mapMapUrl.insert(pair<string, int>(msg,mapUrlInt));
//如果未遍历过,push到队列中
cout<<"插入图片队列:"+msg<<endl;
queueMapUrl.push(msg);
mapUrlInt++;
}
stringStart = mat[0].second;
}
smatch mat1;
regex pattern1("src=\"([^ ]*?\.jpg)\"");
stringStart = htmlContent.begin();
stringEnd = htmlContent.end();
while (regex_search(stringStart,stringEnd, mat1, pattern1))
{
string msg(mat1[1].first, mat1[1].second);
//判断msg所在的url是否已遍历过
if(mapMapUrl.find(msg)==mapMapUrl.end()){
cout<<"插入图片map库:"+msg<<endl;
mapMapUrl.insert(pair<string, int>(msg,mapUrlInt));
//如果未遍历过,push到队列中
cout<<"插入图片队列:"+msg<<endl;
queueMapUrl.push(msg);
mapUrlInt++;
}
stringStart = mat1[0].second;
}
}
//从html中找出html的url,分辨其唯一性(使用Map),并放入queue中
void getHtmlUrl(string &htmlContent){
smatch mat;
cout<<"html匹配!"<<endl;
regex pattern("href=\"/([^ ]*?)\"");
string::const_iterator stringStart = htmlContent.begin();
string::const_iterator stringEnd = htmlContent.end();
//pair<map<string,int>::iterator, bool> insertPair;
while (regex_search(stringStart,stringEnd, mat, pattern))
{
string msg(mat[1].first, mat[1].second);
//判断msg所在的url是否已遍历过
if(mapHtmlUrl.find(msg)==mapHtmlUrl.end()){
// cout<<msg<<endl;
if(trim(msg)!=""){
cout<<"插入Html的map库:"+msg<<endl;
mapHtmlUrl.insert(pair<string, int>(msg,htmlUrlInt));
//如果未遍历过,push到队列中
cout<<"插入Html的队列:"+msg<<endl;
queueHtmlUrl.push(msg);
htmlUrlInt++;
}
}
stringStart = mat[0].second;
}
smatch mat1;
cout<<"html匹配!"<<endl;
regex pattern1("href=\"([^ ]*?)\"");
stringStart = htmlContent.begin();
stringEnd = htmlContent.end();
while (regex_search(stringStart,stringEnd, mat1, pattern1))
{
string msg(mat1[1].first, mat1[1].second);
//判断msg所在的url是否已遍历过
if(mapHtmlUrl.find(msg)==mapHtmlUrl.end()){
// cout<<msg<<endl;
if(trim(msg)!=""){
cout<<"插入Html的map库:"+msg<<endl;
mapHtmlUrl.insert(pair<string, int>(msg,htmlUrlInt));
//如果未遍历过,push到队列中
cout<<"插入Html的队列:"+msg<<endl;
queueHtmlUrl.push(msg);
htmlUrlInt++;
}
}
stringStart = mat1[0].second;
}
}
//写图片文件
void *writeMapFileThread(void *ptr){
int n=0;
char buf[1024];
memset(buf, 0, sizeof(buf));
fstream fileTmp;
while(!queueMapUrl.empty()){
char *fileNameChar;
sprintf(fileNameChar,"%d",fileNameInt);
//打开文档,当文档不存在时会无法打开,需要再看看原因
fileTmp.open("D:\\testmap\\"+(string)fileNameChar+mapType ,ios::out|ios::binary);
cout << " 文档打开 D:\\testmap\\"+(string)fileNameChar << endl;
if(!fileTmp.is_open()){
cout << " 文档打开失败! D:\\testmap\\"+(string)fileNameChar << endl;
}
string mapUrlStr;
//得到图片的url
mapUrlStr=queueMapUrl.front();
cout << " 图片队列退出: "+mapUrlStr << endl;
queueMapUrl.pop();
string getRequestStr="GET "+mapUrlStr+" HTTP/1.1\r\nHost:"+(string)testHostName+":"+testPortChar+"\r\nConnection:Close\r\n\r\n";
//互斥锁锁定
pthread_mutex_lock(&mutex);
if(sendHttpQuery(getRequestStr)){
cout<<"开始写图文件!"<<endl;
//接收返回的jpg文件
n = recv(sock, buf, sizeof(buf)-1, 0);
//过滤掉前面的字符
char *cpos = strstr(buf, "\r\n\r\n");
fileTmp.write(cpos + strlen("\r\n\r\n"), n - (cpos - buf) - strlen("\r\n\r\n"));
while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
{
fileTmp.write(buf, n);
}
}else{
cout<<"图片http请求失败!"<<endl;
}
//线程互斥锁打开
pthread_mutex_unlock(&mutex);
cout<<"写图文件完成退出!"<<endl;
fileNameInt++;
fileTmp.close();
}
}
//读取返回的html内容,识别html和jpg,并分别加入到html和jpg的queue中
void *readHtmlContenThread(void *ptr){
int n=0;
char buf[1024];
memset(buf, 0, sizeof(buf));
string htmlUrlStr;
while(!queueHtmlUrl.empty()){
//得到图片的url
htmlUrlStr=queueHtmlUrl.front();
queueHtmlUrl.pop();
cout<<"出队列:"+htmlUrlStr<<endl;
//string reqInfo = "GET /thread-551469-1.html HTTP/1.1\r\nHost:bbs.51cto.com:80\r\nConnection:Close\r\n\r\n";
string getRequestStr="GET "+htmlUrlStr+" HTTP/1.1\r\nHost:"+(string)testHostName+":"+testPortChar+"\r\nConnection:Close\r\n\r\n";
if(trim(htmlUrlStr)==""){
cout<<"请求html内容为空!"<<endl;
//return;
}
//线程互斥锁锁定
pthread_mutex_lock(&mutex);
cout<<"请求html内容:"+getRequestStr<<endl;
if(sendHttpQuery(getRequestStr)){
//接收返回的jpg文件
n = recv(sock, buf, sizeof(buf)-1, 0);
//cout<<buf <<endl;
//过滤掉前面的字符
char *cpos = strstr(buf, "\r\n\r\n");
while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
{
//filecontent.write(buf, n);
string tmpbuf=buf;
getHtmlUrl(tmpbuf);
getMapUrl(tmpbuf);
//cout<<cpos<<endl;
}
}
//线程互斥锁打开
pthread_mutex_unlock(&mutex);
}
}
int main()
{
int n;
char buf[1024];
memset(buf, 0, sizeof(buf));
//fstream file;
WORD version(0);
WSADATA wsadata;
int socket_return(0);
version = MAKEWORD(2,0);
//socket通讯之前的初始化网络接口
socket_return = WSAStartup(version,&wsadata);
if (socket_return != 0)
{
return 0;
}
pthread_mutex_init(&mutex,NULL);
string strFirst="GET /ent/meinvtupian/ HTTP/1.1\r\nHost:www.27270.com:80\r\nConnection:Close\r\n\r\n";
//发送http请求
if(!sendHttpQuery(strFirst)){
cout<<" http 请求发送失败! "<<endl;
return 0;
}
cout<<" http 请求发送成功! "<<endl;
n = recv(sock, buf, sizeof(buf)-1, 0);
//strstr返回子串首次出现的地址,返回到char *类型中就是返回了首次出现的地址后的整个字符串
//利用strstr过滤掉响应消息的报文头:将\r\n\r\n加响应数据保存到*cpos,cpos是buf中第一次出现\r\n\r\n的地址
//将cpos+\r\n\r\n来去掉*cpos中的\r\n\r\n,放入文件中
//长度参数用接收到的字节数减去(两个地址直接的差)再减去\r\n\r\n,必须保证recv返回的是字节数,指针相减的差是字节数,strlen返回的也是字节数
//
while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
{
string tmpbuf=buf;
getHtmlUrl(tmpbuf);
getMapUrl(tmpbuf);
}
//当图片队列和html队列不为空时
//如何watch函数中queue和map??
//取html队列信息,发送socket的http请求报文,返回html内容字符串,解析html内容字符串
//将得到的html访问地址,加入到html队列中,将得到的jpg访问地址,加入到jpg队列中
//取jpg队列信息,发送socket的http请求报文,接收返回报文并写入本地文件
//创建读取html线程
pthread_t testThreadGetHtml;
int ret1= pthread_create(&testThreadGetHtml,NULL,readHtmlContenThread,NULL);
if(ret1){
cout<<"创建读取html内容线程失败!"<<endl;
return 1;
}
//创建写图片线程
pthread_t testthreadWriteMap;
int ret2= pthread_create(&testthreadWriteMap,NULL,writeMapFileThread,NULL);
if(ret2){
cout<<"创建写图片文件线程失败!"<<endl;
return 1;
}
pthread_join(testThreadGetHtml,NULL);
pthread_join(testthreadWriteMap,NULL);
return 0;
}
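//Website image crawler written in spare time in C++, with simple multithreading and a mutex added.
//(Blog page footer: "latest recommended article published 2021-01-21 11:51:02".)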