Qt5 网页标题、关键词提取工具Findyou

Qt5 网页标题、关键词提取工具Findyou

一、程序运行
运行界面
在这里插入图片描述
辅助功能,可用于将扫描器的扫描结果转换为url
在这里插入图片描述

二、所涉及的重要知识点
1、Qt爬取https的网页
来自宇龍_
https://blog.csdn.net/qq_45809384/article/details/122049295?spm=1001.2014.3001.5506

在这里插入图片描述
打包完成后,把这两个dll补充了就可以
在这里插入图片描述
配合下面这段代码使用
在这里插入图片描述

2、对301、302的重定向进行跟踪,加入这一行代码就可以
在这里插入图片描述
项目结构
在这里插入图片描述

源代码
getTitleFromUrl.pro

#-------------------------------------------------
#
# Project created by QtCreator 2022-11-23T22:53:34
#
#-------------------------------------------------

# Qt modules used by the project. "network" provides the
# QNetworkAccessManager / QNetworkRequest / QNetworkReply classes
# that robots.h includes.
QT       += core gui
QT += network

# The widgets module is a separate module from Qt 5 onwards.
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

# Build a GUI application named getTitleFromUrl.
TARGET = getTitleFromUrl
TEMPLATE = app

# The following define makes your compiler emit warnings if you use
# any feature of Qt which has been marked as deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS

# You can also make your code fail to compile if you use deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0


# Translation units compiled into the application.
SOURCES += \
        main.cpp \
        mainwindow.cpp \
    robots.cpp \
    form.cpp

# Headers of the project (mainwindow.h and form.h declare Q_OBJECT classes).
HEADERS += \
        mainwindow.h \
    robots.h \
    form.h

# Qt Designer forms; they produce the ui_mainwindow.h / ui_form.h headers
# included by mainwindow.cpp and form.cpp.
FORMS += \
        mainwindow.ui \
    form.ui

DISTFILES +=

# Resource collection; the sources load the window icon from the
# resource path "://bgjzicon.png".
RESOURCES += \
    myicon.qrc

form.h

#ifndef FORM_H
#define FORM_H

#include <QWidget>
#include<QIcon>
#include<QDebug>
#include<QStringList>
namespace Ui {
class Form;
}

/**
 * Auxiliary window that assembles URLs from three text columns.
 *
 * The button slot reads the protocol, IP and port text boxes of the form and
 * writes one "protocol://ip:port" line per row into the URL text box.
 */
class Form : public QWidget
{
    Q_OBJECT

public:
    /// Builds the form's widgets; @p parent is the optional owning widget.
    explicit Form(QWidget *parent = nullptr);
    /// Releases the generated UI object.
    ~Form();

private slots:
    /// Auto-connected slot for the form's push button: joins the columns into URLs.
    void on_pushButton_clicked();

private:
    Ui::Form *ui;  ///< Generated UI wrapper, created in the constructor and deleted in the destructor.
};

#endif // FORM_H

mainwindow.h

#ifndef MAINWINDOW_H
#define MAINWINDOW_H

#include <QMainWindow>
#include <QFile>
#include<QIcon>
#include"robots.h"
#include"form.h"

namespace Ui {
class MainWindow;
}

/**
 * Main window of the tool.
 *
 * Takes a list of URLs (and optionally a keyword), fetches every page and
 * shows the extracted <title> contents and the URLs whose page matched the
 * keyword.
 */
class MainWindow : public QMainWindow
{
    Q_OBJECT

public:
    /// Builds the window's widgets; @p parent is the optional owning widget.
    explicit MainWindow(QWidget *parent = nullptr);
    /// Releases the generated UI object.
    ~MainWindow();

private slots:
    /// Auto-connected slot for the "start" button: crawls every URL in the input box.
    void on_pushButton_start_clicked();

    /// Auto-connected slot for the second button: opens the URL-assembling helper window (Form).
    void on_pushButton_clicked();

private:
    Ui::MainWindow *ui;  ///< Generated UI wrapper, created in the constructor and deleted in the destructor.
};

#endif // MAINWINDOW_H

robots.h

#ifndef ROBOTS_H
#define ROBOTS_H

// NOTE: the closing #endif used to sit directly after the #define, which left
// the whole header outside the include guard. It now encloses everything.

#include<QCoreApplication>
#include<QRegularExpression>
#include<QRegularExpressionMatch>
#include<QRegularExpressionMatchIterator>
#include<QString>
#include<QDebug>
#include<QtCore>
#include<QNetworkAccessManager>
#include<QUrl>
#include<QNetworkRequest>
#include<QNetworkReply>
#include<QObject>
#include<QTextCodec>

// Downloads the page at `url` and returns its body as text.
QString Robots(QString url);
// Applies the pattern `re` to `HTML` and returns "<fenzu>标签值:" followed by
// the text captured by the named group `fenzu` in every match.
QString RegularExpression(QString HTML,QString re,QString fenzu);

#endif // ROBOTS_H

form.cpp

#include "form.h"
#include "ui_form.h"

// Creates the helper window: instantiates the generated UI, installs it on
// this widget and sets the window icon from the resource file.
Form::Form(QWidget *parent)
    : QWidget(parent)
    , ui(new Ui::Form)
{
    ui->setupUi(this);

    const QIcon windowIcon("://bgjzicon.png");
    setWindowIcon(windowIcon);
}

// Frees the generated UI object allocated in the constructor.
Form::~Form()
{
    delete ui;
}

void Form::on_pushButton_clicked()
{
    qDebug()<<ui->textEdit_xieyi->toPlainText();
    QStringList xieyiList=ui->textEdit_xieyi->toPlainText().split("\n");
    xieyiList.removeFirst();
    xieyiList.removeLast();
    QStringList ipList=ui->textEdit_ip->toPlainText().split("\n");
    ipList.removeFirst();
    ipList.removeLast();
    QStringList portList=ui->textEdit_port->toPlainText().split("\n");
    portList.removeFirst();
    portList.removeLast();
    QString url="";

    for(int i=0;i<xieyiList.size();i++)
    {
        url=url+xieyiList[i]+"://"+ipList[i]+":"+portList[i]+"\n";
    }
    ui->textEdit_url->setText(url);

}




main.cpp

#include "mainwindow.h"
#include <QApplication>
#include<QTextCodec>

#include "robots.h"
int main(int argc, char *argv[])
{
    QApplication a(argc, argv);
    MainWindow w;
    w.show();

    return a.exec();
}

mainwindow.cpp

#include "mainwindow.h"
#include "ui_mainwindow.h"

// Creates the main window: instantiates the generated UI, installs it on
// this window and sets the window icon from the resource file.
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    const QIcon windowIcon("://bgjzicon.png");
    setWindowIcon(windowIcon);
}

// Frees the generated UI object allocated in the constructor.
MainWindow::~MainWindow()
{
    delete ui;
}

void MainWindow::on_pushButton_start_clicked()
{

    QString alltitleresult="";
    QString allkeywordresult="以下页面内含有关键词:";
    ui->textEdit_title->setText(alltitleresult);
    ui->textEdit_h1->setText(allkeywordresult);
    int count=0;

    //这一段是用来匹配出每一个url,可以增加一些对输入格式的兼容性...........................................................
    QRegularExpression Re("(?<url>http[s]{0,1}.*?://.*?)fengefu");
    QString urls=ui->textEdit_url->toPlainText();
    qDebug()<<"原始的:"<<urls<<endl;
    //排除url重定向的链接打乱顺序 如 http://xx.xxx.xx.x/login.php?redirect=http://xxx.xx.xx/
    urls.replace("=http","1");
    //去除\r\n
    urls.remove("\n");
    urls.remove("\r");
     urls.remove("\t");

    //这样可以使用fengefu有效分割出每个url,适应不同的输入格式
    urls.replace("http://","fengefuhttp://");
    urls.replace("https://","fengefuhttps://");
    //在末尾加上分隔符这样可以兼容最后一个url,使得最后一个url得到匹配
    urls=urls+"fengefu";
    qDebug()<<urls;
    //....................................................................................................................


       if(ui->lineEdit_keyword->text()!="")//输入了关键字
     {
       qDebug()<<"输入了关键字"<<endl;
       QRegularExpressionMatchIterator Matchs=Re.globalMatch(urls);
       QRegularExpressionMatch match=Matchs.next();
       QString oneUrl=match.captured("url");//提取每一个url
       qDebug()<<"提取到"<<oneUrl<<endl;
       //单独爬取第一个.......................................................................
       QString HTML=Robots(oneUrl);
       QString title_re="(<title.*?>(?<title>.*?)</title>)";
       QString titleresult=RegularExpression(HTML,title_re,"title");

       QString keyword=ui->lineEdit_keyword->text();
       QString keyword_re="(?<keyword>"+keyword+")";
       QString keywordresult=RegularExpression(HTML,keyword_re,"keyword");
       //qDebug()<<"关键词正则"<<keyword_re<<endl;
       if(keywordresult!="keyword标签值:")//匹配到关键词了
       {
           allkeywordresult=oneUrl;
       }

       alltitleresult=alltitleresult+titleresult+"\n";

       ui->textEdit_title->setText(alltitleresult);
       ui->textEdit_h1->setText(allkeywordresult);
       //qDebug()<<"已检查数:"<<++count;

       ui->label_jindu->setText(QString::number(++count));
     //......................................................................................

       while(Matchs.hasNext()==true)
       {
           match=Matchs.next();
           oneUrl=match.captured("url");
           qDebug()<<"提取到"<<oneUrl<<endl;
           QString HTML=Robots(oneUrl);

           QString title_re="(<title.*?>(?<title>.*?)</title>)";

           QString titleresult=RegularExpression(HTML,title_re,"title");

           QString keyword_re="(?<keyword>"+keyword+")";
           QString keywordresult=RegularExpression(HTML,keyword_re,"keyword");

           if(keywordresult!="keyword标签值:")//匹配到关键词了
           {
               allkeywordresult=allkeywordresult+"\n"+oneUrl;
           }

           alltitleresult=alltitleresult+titleresult+"\n";
           ui->textEdit_title->setText(alltitleresult);
           ui->textEdit_h1->setText(allkeywordresult);

           //滚动条置底,方便观察实时结果
            ui->textEdit_title->moveCursor(QTextCursor::End);
            ui->textEdit_h1->moveCursor(QTextCursor::End);
          // qDebug()<<"已检查数:"<<++count;
           ui->label_jindu->setText(QString::number(++count));

       }
     }
       else//未输入关键字,仅匹配title标签
     {
           qDebug()<<"未输入关键字"<<endl;
           QRegularExpressionMatchIterator Matchs=Re.globalMatch(urls);
           QRegularExpressionMatch match=Matchs.next();
           QString oneUrl=match.captured("url");//提取每一个url
           qDebug()<<"提取到1"<<oneUrl<<endl;
           //单独爬取第一个.......................................................................
           QString HTML=Robots(oneUrl);
           QString title_re="(<title.*?>(?<title>.*?)</title>)";
           qDebug()<<"爬到"<<HTML<<endl;
           QString titleresult=RegularExpression(HTML,title_re,"title");
           alltitleresult=alltitleresult+titleresult+"\n";
           ui->textEdit_title->setText(alltitleresult);
           ui->label_jindu->setText(QString::number(++count));
         //......................................................................................

           while(Matchs.hasNext()==true)
           {
               match=Matchs.next();
               oneUrl=match.captured("url");
               qDebug()<<"提取到2"<<oneUrl<<endl;
               QString HTML=Robots(oneUrl);
               QString title_re="(<title.*?>(?<title>.*?)</title>)";
              qDebug()<<"爬到2"<<HTML<<endl;
               QString titleresult=RegularExpression(HTML,title_re,"title");
               qDebug()<<"结果:"<<titleresult<<endl;
               alltitleresult=alltitleresult+titleresult+"\n";
               ui->textEdit_title->setText(alltitleresult);
               //滚动条置底,方便观察实时结果
                ui->textEdit_title->moveCursor(QTextCursor::End);
              // qDebug()<<"已检查数:"<<++count;
               ui->label_jindu->setText(QString::number(++count));

           }
     }
}

void MainWindow::on_pushButton_clicked()
{
    Form *pinjie=new Form;
    pinjie->show();
}

robots.cpp

#include "robots.h"

// Downloads the page at `url` with an HTTP GET and returns its body as text.
//
// The call blocks in a local event loop until the reply has finished or the
// timeout below has elapsed; in the latter case whatever has been received
// so far is returned. The body is decoded with the charset detected from the
// document itself (BOM / content-type meta tag), falling back to UTF-8.
QString Robots(QString url)
{
    QUrl URL=url;
    QNetworkAccessManager manager;
    QEventLoop Loop;
    QNetworkRequest request=QNetworkRequest(URL);
    // Browser-like request headers; per the original author some sites
    // (e.g. Baidu) cannot be fetched without a User-Agent.
    request.setRawHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0");
    request.setRawHeader("Accept","*/*");
    request.setRawHeader("Accept-Language","zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");

    // Follow 301/302 redirects automatically. The original author reports
    // having verified this by experiment for multiple consecutive redirects
    // (http://39.129.231.7:85), for a relative Location header
    // (http://39.129.50.68:8000 -> /web/index.html) and for an absolute one
    // (http://39.129.48.54:8090 -> http://39.129.48.54:8090/login.php).
    request.setAttribute(QNetworkRequest::FollowRedirectsAttribute,true);

    // TLS setup needed for https requests. Peer verification is switched off
    // so that sites for which a browser would show an "insecure, continue?"
    // warning are still fetched.
    // NOTE(review): this accepts any certificate - acceptable for a title
    // scanner, but do not reuse this function for sensitive traffic.
    QSslConfiguration config = request.sslConfiguration();
    config.setPeerVerifyMode(QSslSocket::VerifyNone);
    config.setProtocol(QSsl::TlsV1SslV3);
    request.setSslConfiguration(config);

    QNetworkReply *reply=manager.get(request);
    QObject::connect(reply,SIGNAL(finished()),&Loop,SLOT(quit()));
    // Leave the event loop after 10 seconds without a finished reply;
    // otherwise an unreachable URL would block the URLs that follow it.
    const int timeoutMs=10000;
    QTimer::singleShot(timeoutMs, &Loop, &QEventLoop::quit);
    Loop.exec();

    // Converting the bytes straight to QString would always assume UTF-8 and
    // garble pages served in other charsets (e.g. GBK/GB2312). Detect the
    // charset from the document and use UTF-8 only as the fallback.
    const QByteArray body=reply->readAll();
    QTextCodec *fallbackCodec=QTextCodec::codecForName("UTF-8");
    QTextCodec *codec=QTextCodec::codecForHtml(body,fallbackCodec);
    if(codec==nullptr)
    {
        return QString::fromUtf8(body);
    }
    return codec->toUnicode(body);
}

// Extracts text from a downloaded page with a regular expression.
//
// HTML  - text to search.
// re    - pattern; it must define a named capture group called `fenzu`.
// fenzu - name of the capture group to collect.
//
// Returns "<fenzu>标签值:" followed by the concatenated captures of the group
// from every match. When nothing matches, or `re` is not a valid pattern,
// only that prefix is returned (callers compare against it to detect
// "no match").
QString RegularExpression(QString  HTML,QString re,QString fenzu)
{
    QString TextAfterRe=fenzu+"标签值:";
    const QRegularExpression Re(re);
    // The pattern may contain user input; an invalid one cannot match anything.
    if(!Re.isValid())
    {
        return TextAfterRe;
    }

    // hasNext() is checked before every next(): calling next() on an
    // exhausted iterator is documented as yielding undefined results.
    QRegularExpressionMatchIterator Matchs=Re.globalMatch(HTML);
    while(Matchs.hasNext())
    {
        TextAfterRe+=Matchs.next().captured(fenzu);
    }
    return TextAfterRe;
}





  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值