抽空写的网站图片抓取程序(C++),增加了简单的多线程和互斥锁


#include <pthread.h>
#include <winsock2.h>

#include <cstdio>
#include <cstring>
#include <fstream>
#include <iosfwd>
#include <iostream>
#include <map>
#include <queue>
#include <regex>
#include <string>

//Design notes (author's plan): two worker threads, one that reads HTML pages and one that writes image files.
//The HTML reader pops from the HTML queue, opens a socket for the page, reads its content,
//then matches page links and image links and pushes each kind into its own queue.
//The image writer pops from the image queue, opens a socket for the image, opens the output file and writes the image.
//A third "housekeeping" thread that waits for Enter on the console to stop the program is planned but not written yet.
//The reader and the writer exclude each other (log-file writes and access to the socket's response),
//in a producer/consumer arrangement guarded by a mutex.
using namespace std;

//Socket of the most recent connection; assigned by sendHttpQuery() and then read with recv() by its callers.
SOCKET sock;
//Host and port used for testing
const char *testHostName="www.27270.com";
const short testPort=80;
const string testPortChar="80";

//vector<string> vectormapurl;
//vector<string> vectorhtmlurl;

//Pending page URLs: pushed by getHtmlUrl(), popped by readHtmlContenThread().
queue<string> queueHtmlUrl;
//Pending image URLs: pushed by getMapUrl(), popped by writeMapFileThread().
queue<string> queueMapUrl;

//Image URLs already seen (key) -> value of mapUrlInt at insertion time.
map<string,int> mapMapUrl;
//Incremented once per newly seen image URL.
int mapUrlInt;
//Page URLs already seen (key) -> value of htmlUrlInt at insertion time.
map<string,int> mapHtmlUrl;
//Incremented once per newly seen page URL.
int htmlUrlInt;

//Extension appended to every downloaded image's file name.
string mapType=".jpg";

//Mutex locked by both worker threads around sendHttpQuery() and the recv() calls on `sock`.
pthread_mutex_t mutex;

//Numeric file name of the next image to write; incremented after each image by writeMapFileThread().
int fileNameInt;

//Strip leading and trailing space characters (' ' only) from str, in place.
//Returns a reference to the same string so calls can be chained or compared directly.
string& trim(string &str)
{
    if(!str.empty())
    {
        //npos here means "nothing but spaces": erase(0, npos) then clears the whole string.
        const string::size_type firstKept = str.find_first_not_of(' ');
        str.erase(0, firstKept);

        //npos + 1 wraps to 0, so an (already emptied) string is left empty.
        const string::size_type lastKept = str.find_last_not_of(' ');
        str.erase(lastKept + 1);
    }
    return str;
}

//发送http请求包
bool sendHttpQuery(string sendQueryStr){
    int n=0;
   //初始化socket
    sock = socket(AF_INET, SOCK_STREAM, 0);

    if (sock == INVALID_SOCKET)
    {
        cout << "建立socket失败! 错误码: " << WSAGetLastError() << endl;
        return false;
    }
    sockaddr_in sa = { AF_INET };
    n = bind(sock, (sockaddr*)&sa, sizeof(sa));
    if (n == SOCKET_ERROR)
    {
        cout << "bind函数失败! 错误码: " << WSAGetLastError() << endl;
        return false;
    }
    struct hostent *p = gethostbyname(testHostName);
    if (p == NULL)
    {
        cout << "主机无法解析出ip! 错误吗: " << WSAGetLastError() << endl;
        return false;
    }
    sa.sin_port = htons(testPort);
    memcpy(&sa.sin_addr, p->h_addr, 4);
    //连接
    n = connect(sock, (sockaddr*)&sa, sizeof(sa));
    if (n == SOCKET_ERROR)
    {
        cout << "connect函数失败! 错误码: " << WSAGetLastError() << endl;
        return false;
    }
    //按照http送GET请求
    cout<<"发送http请求:"+sendQueryStr<<endl;
    if (SOCKET_ERROR == send(sock, sendQueryStr.c_str(), sendQueryStr.size(), 0))
    {
        cout << "send error! 错误码: " << WSAGetLastError() << endl;
        closesocket(sock);
        return false;
    }
    return true;
}

//从html中找出图片的url,分辨其唯一性(使用Map),并放入queue中
void getMapUrl(string &htmlContent){
    smatch mat;
    cout<<"图片匹配!"<<endl;
    regex pattern("data-original=\"([^ ]*?\.jpg)\"");
    string::const_iterator stringStart = htmlContent.begin();
    string::const_iterator stringEnd = htmlContent.end();
    while (regex_search(stringStart,stringEnd, mat, pattern))
    {
        string msg(mat[1].first, mat[1].second);
        //判断msg所在的url是否已遍历过
         if(mapMapUrl.find(msg)==mapMapUrl.end()){
            cout<<"插入图片map库:"+msg<<endl;
            mapMapUrl.insert(pair<string, int>(msg,mapUrlInt));
            //如果未遍历过,push到队列中
            cout<<"插入图片队列:"+msg<<endl;
            queueMapUrl.push(msg);
            mapUrlInt++;
        }
        stringStart = mat[0].second;
    }

    smatch mat1;
    regex pattern1("src=\"([^ ]*?\.jpg)\"");
    stringStart = htmlContent.begin();
    stringEnd = htmlContent.end();
    while (regex_search(stringStart,stringEnd, mat1, pattern1))
    {
        string msg(mat1[1].first, mat1[1].second);
        //判断msg所在的url是否已遍历过
         if(mapMapUrl.find(msg)==mapMapUrl.end()){
            cout<<"插入图片map库:"+msg<<endl;
            mapMapUrl.insert(pair<string, int>(msg,mapUrlInt));
            //如果未遍历过,push到队列中
            cout<<"插入图片队列:"+msg<<endl;
            queueMapUrl.push(msg);
            mapUrlInt++;
        }
        stringStart = mat1[0].second;
    }
}

//从html中找出html的url,分辨其唯一性(使用Map),并放入queue中
void getHtmlUrl(string &htmlContent){
    smatch mat;
    cout<<"html匹配!"<<endl;
    regex pattern("href=\"/([^ ]*?)\"");
    string::const_iterator stringStart = htmlContent.begin();
    string::const_iterator stringEnd = htmlContent.end();
    //pair<map<string,int>::iterator, bool> insertPair;
    while (regex_search(stringStart,stringEnd, mat, pattern))
    {
        string msg(mat[1].first, mat[1].second);
        //判断msg所在的url是否已遍历过
        if(mapHtmlUrl.find(msg)==mapHtmlUrl.end()){
           // cout<<msg<<endl;
           if(trim(msg)!=""){
                cout<<"插入Html的map库:"+msg<<endl;
                mapHtmlUrl.insert(pair<string, int>(msg,htmlUrlInt));
                //如果未遍历过,push到队列中
                cout<<"插入Html的队列:"+msg<<endl;
                queueHtmlUrl.push(msg);
                htmlUrlInt++;
           }
        }
        stringStart = mat[0].second;
    }


    smatch mat1;
    cout<<"html匹配!"<<endl;
    regex pattern1("href=\"([^ ]*?)\"");
    stringStart = htmlContent.begin();
    stringEnd = htmlContent.end();
    while (regex_search(stringStart,stringEnd, mat1, pattern1))
    {
        string msg(mat1[1].first, mat1[1].second);
        //判断msg所在的url是否已遍历过
        if(mapHtmlUrl.find(msg)==mapHtmlUrl.end()){
           // cout<<msg<<endl;
           if(trim(msg)!=""){
           cout<<"插入Html的map库:"+msg<<endl;
            mapHtmlUrl.insert(pair<string, int>(msg,htmlUrlInt));
            //如果未遍历过,push到队列中
            cout<<"插入Html的队列:"+msg<<endl;
            queueHtmlUrl.push(msg);
            htmlUrlInt++;
           }
        }
        stringStart = mat1[0].second;
    }
}


//写图片文件
void *writeMapFileThread(void *ptr){
    int n=0;
    char buf[1024];
    memset(buf, 0, sizeof(buf));
    fstream fileTmp;

    while(!queueMapUrl.empty()){
        char *fileNameChar;
        sprintf(fileNameChar,"%d",fileNameInt);

        //打开文档,当文档不存在时会无法打开,需要再看看原因
        fileTmp.open("D:\\testmap\\"+(string)fileNameChar+mapType ,ios::out|ios::binary);

        cout << " 文档打开 D:\\testmap\\"+(string)fileNameChar  << endl;
        if(!fileTmp.is_open()){
            cout << " 文档打开失败! D:\\testmap\\"+(string)fileNameChar  << endl;
        }

        string mapUrlStr;
        //得到图片的url
        mapUrlStr=queueMapUrl.front();
        cout << " 图片队列退出:  "+mapUrlStr  << endl;
        queueMapUrl.pop();

        string getRequestStr="GET "+mapUrlStr+" HTTP/1.1\r\nHost:"+(string)testHostName+":"+testPortChar+"\r\nConnection:Close\r\n\r\n";
        //互斥锁锁定
        pthread_mutex_lock(&mutex);
        if(sendHttpQuery(getRequestStr)){
            cout<<"开始写图文件!"<<endl;
            //接收返回的jpg文件
            n = recv(sock, buf, sizeof(buf)-1, 0);
            //过滤掉前面的字符
            char *cpos = strstr(buf, "\r\n\r\n");
            fileTmp.write(cpos + strlen("\r\n\r\n"), n - (cpos - buf) - strlen("\r\n\r\n"));
            while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
            {
                fileTmp.write(buf, n);
            }
        }else{
            cout<<"图片http请求失败!"<<endl;
        }
        //线程互斥锁打开
        pthread_mutex_unlock(&mutex);
        cout<<"写图文件完成退出!"<<endl;
        fileNameInt++;
        fileTmp.close();
    }
}

//读取返回的html内容,识别html和jpg,并分别加入到html和jpg的queue中
void *readHtmlContenThread(void *ptr){

    int n=0;
    char buf[1024];
    memset(buf, 0, sizeof(buf));
    string htmlUrlStr;

    while(!queueHtmlUrl.empty()){

        //得到图片的url
        htmlUrlStr=queueHtmlUrl.front();
        queueHtmlUrl.pop();
        cout<<"出队列:"+htmlUrlStr<<endl;

        //string reqInfo = "GET /thread-551469-1.html HTTP/1.1\r\nHost:bbs.51cto.com:80\r\nConnection:Close\r\n\r\n";
        string getRequestStr="GET "+htmlUrlStr+" HTTP/1.1\r\nHost:"+(string)testHostName+":"+testPortChar+"\r\nConnection:Close\r\n\r\n";
        if(trim(htmlUrlStr)==""){
            cout<<"请求html内容为空!"<<endl;
            //return;
        }
        //线程互斥锁锁定
        pthread_mutex_lock(&mutex);
        cout<<"请求html内容:"+getRequestStr<<endl;
        if(sendHttpQuery(getRequestStr)){

            //接收返回的jpg文件
            n = recv(sock, buf, sizeof(buf)-1, 0);
            //cout<<buf <<endl;
            //过滤掉前面的字符
            char *cpos = strstr(buf, "\r\n\r\n");
            while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
            {
                //filecontent.write(buf, n);
                string tmpbuf=buf;
                getHtmlUrl(tmpbuf);
                getMapUrl(tmpbuf);
                //cout<<cpos<<endl;
            }
        }
        //线程互斥锁打开
        pthread_mutex_unlock(&mutex);
    }
}

int main()
{
    int n;

    char buf[1024];
    memset(buf, 0, sizeof(buf));
    //fstream file;

    WORD version(0);
    WSADATA wsadata;
    int socket_return(0);
    version = MAKEWORD(2,0);
    //socket通讯之前的初始化网络接口
    socket_return = WSAStartup(version,&wsadata);
    if (socket_return != 0)
    {
        return 0;
    }

    pthread_mutex_init(&mutex,NULL);

    string strFirst="GET /ent/meinvtupian/ HTTP/1.1\r\nHost:www.27270.com:80\r\nConnection:Close\r\n\r\n";
    //发送http请求
    if(!sendHttpQuery(strFirst)){
        cout<<" http 请求发送失败! "<<endl;
        return 0;
    }
    cout<<" http 请求发送成功! "<<endl;
    n = recv(sock, buf, sizeof(buf)-1, 0);
    //strstr返回子串首次出现的地址,返回到char *类型中就是返回了首次出现的地址后的整个字符串
    //利用strstr过滤掉响应消息的报文头:将\r\n\r\n加响应数据保存到*cpos,cpos是buf中第一次出现\r\n\r\n的地址

    //将cpos+\r\n\r\n来去掉*cpos中的\r\n\r\n,放入文件中
    //长度参数用接收到的字节数减去(两个地址直接的差)再减去\r\n\r\n,必须保证recv返回的是字节数,指针相减的差是字节数,strlen返回的也是字节数
    //
    while ((n = recv(sock, buf, sizeof(buf)-1, 0)) > 0)
    {
        string tmpbuf=buf;
        getHtmlUrl(tmpbuf);
        getMapUrl(tmpbuf);
    }
    //当图片队列和html队列不为空时
    //如何watch函数中queue和map??
    //取html队列信息,发送socket的http请求报文,返回html内容字符串,解析html内容字符串
    //将得到的html访问地址,加入到html队列中,将得到的jpg访问地址,加入到jpg队列中
    //取jpg队列信息,发送socket的http请求报文,接收返回报文并写入本地文件

//创建读取html线程
    pthread_t testThreadGetHtml;
    int ret1= pthread_create(&testThreadGetHtml,NULL,readHtmlContenThread,NULL);
    if(ret1){
        cout<<"创建读取html内容线程失败!"<<endl;
        return 1;
    }
//创建写图片线程
    pthread_t testthreadWriteMap;
    int ret2= pthread_create(&testthreadWriteMap,NULL,writeMapFileThread,NULL);
    if(ret2){
        cout<<"创建写图片文件线程失败!"<<endl;
        return 1;
    }
    pthread_join(testThreadGetHtml,NULL);
    pthread_join(testthreadWriteMap,NULL);

    return 0;
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值