使用正则表达式自动提取网页内容。
lessons learned:
- 处理前先去掉页面中的回车符和换行符(默认情况下 "." 不匹配换行符),以便下一条中的 ".*?" 能够跨越原来的多行进行匹配
- 使用 ".*?" 进行非贪婪匹配,结合正则表达式尾部的 "(?<rest>.*)",反复对剩余内容进行匹配,直至没有匹配内容
- 尖括号(< >),双引号( " ) 等特殊字符需要注意添加反斜杠( \ )转义
- 使用 QRegularExpression 的 reInfo.patternErrorOffset() 方法来确认正则表达式是否正确(返回 -1 表示没有错误,否则返回出错位置的偏移量);
- 使用 QtConcurrent::run( this, &DataSource::fetch, 7 ) 线程化耗时操作
// Regular-expression fragments for one entry of the listing page.
//
// Every fragment is a C string literal; writing fragments next to each other
// concatenates them into one pattern. In the C source "\\" is a single
// backslash in the pattern and "\\\"" is a backslash-escaped double quote.
// Most fragments end with a lazy ".*?" so that arbitrary markup may sit
// between one fragment and the next.

// Start of an opening <div ...> tag; the attributes are matched by the
// fragment that follows.
#define DIV_BGN "\\<div.*?"
// A closing </div>, with arbitrary markup allowed before and after it.
#define DIV_END ".*?\\</div\\>.*?"
// class="item" attribute.
#define ITEM_CLS "class=\\\"item\\\".*?"
// class="name" attribute.
#define NAME_CLS "class=\\\"name\\\".*?"
// Anchor with class "name-text": captures href (must end in ".html") as
// "link" and the title attribute as "name". The dot before "html" is escaped
// so that it only matches a literal '.', like "\\.torrent" in SEED_ELEM.
#define LINK_NAME_ELEM "\\<a href=\\\"(?<link>.*?\\.html)\\\"\\s+?title=\\\"(?<name>.*?)\\\"\\s.*?class=\\\"name\\-text\\\".*?"
// class="info" attribute.
#define INFO_CLS "class=\\\"info\\\".*?"
// Anchor with class "text": captures the title attribute as "desc". The href
// (again ending in a literal ".html") is captured in an unnamed group.
#define DESC_ELEM "\\<a href=\\\"(.*?\\.html)\\\".*?title=\\\"(?<desc>.*?)\\\"\\s.*?class=\\\"text\\\".*?"
// class="other" attribute.
#define OTHER_CLS "class=\\\"other\\\".*?"
// <label> whose text starts with "大小:"; the remainder of the label text is
// captured as "size".
#define SIZE_ELEM "\\<label.*?\\>大小:(?<size>.*?)\\</label\\>.*?"
// <div class="seed-down"> ... </div>: captures an href that starts with
// "http" and ends with ".torrent" as "seed".
#define SEED_ELEM DIV_BGN \
"class=\\\"seed-down\\\".*?href=\\\"(?<seed>http.*?\\.torrent)\\\".*?" \
DIV_END
// <div class="magnet"> ... </div>: captures an href that starts with
// "magnet" as "magnet".
#define MAGNET_ELEM DIV_BGN \
"\\s*class=\\\"magnet\\\".*?href=\\\"(?<magnet>magnet.*?)\\\".*?" \
DIV_END
// Greedy tail: everything after the matched entry is captured as "rest" so
// the caller can run the pattern again on the remaining text.
#define REST_DATA "(?<rest>.*)"
// Pattern for one complete listing entry, assembled from the fragment macros
// by string-literal concatenation. Named captures available after a match:
// "link", "name", "desc", "size", "seed", "magnet" and "rest" (the text that
// follows the entry).
//
// The indentation below follows the order in which the div fragments open
// and close; it has no effect on the resulting pattern.
const QRegularExpression reInfo(
    DIV_BGN ITEM_CLS
        DIV_BGN NAME_CLS
            LINK_NAME_ELEM
        DIV_END
        DIV_BGN INFO_CLS
            DESC_ELEM
            DIV_BGN OTHER_CLS
                SIZE_ELEM
                SEED_ELEM
                MAGNET_ELEM
            DIV_END
        DIV_END
    DIV_END
    REST_DATA
);
// Downloads the listing pages "sort-<idx>-<k>.html" for k = 1 .. 239 and logs
// every entry that reInfo finds in them.
//
// idx - value substituted for the first placeholder of the page URL; the
//       second placeholder is the loop counter k.
//
// Each page is requested up to three times; a page that still fails after the
// third attempt is skipped. The function blocks while downloading, which is
// why start() launches it through QtConcurrent::run.
void DataSource::fetch( int idx )
{
    for( int k = 1 ; k < 240 ; k++ )
    {
        TinyCurl curl;
        QEventLoop evtLoop;
        QByteArray replyData;
        QUrl url = QString("https://hide-the-real-url/sort-%1-%2.html").arg(idx).arg(k);

        // Retry until the request succeeds or three attempts have been made.
        // (The loop header must not be followed by ';' - otherwise the body
        // below would run exactly once and no retry would ever happen.)
        QNetworkReply::NetworkError err = QNetworkReply::UnknownNetworkError;
        for( int try_cnt = 0 ; (try_cnt < 3) && ( QNetworkReply::NoError != err ) ; try_cnt++ )
        {
            // Drop whatever a failed attempt may have left in the buffer.
            replyData.clear();
            err = curl.get( url, replyData, evtLoop );
        }
        if( QNetworkReply::NoError != err )
        {
            continue;
        }

        // Remove line breaks so the lazy ".*?" parts of the pattern can span
        // what used to be several lines of markup.
        QString data(replyData);
        data.replace("\r", "").replace("\n", "");

        // Uncomment to verify the pattern while editing the fragments:
        // if( reInfo.patternErrorOffset() >= 0 )
        // {
        //     qDebug() << "reInfo.patternErrorOffset : " << reInfo.patternErrorOffset();
        // }

        // Each match consumes one entry; "rest" holds the text after it, so
        // matching is repeated on the remainder until nothing matches.
        QRegularExpressionMatch match;
        while( ( match = reInfo.match( data ) ).hasMatch() )
        {
            qDebug() << "Name" << match.captured("name");
            qDebug() << "Page" << match.captured("link");
            qDebug() << "Desc" << match.captured("desc");
            qDebug() << "Size" << match.captured("size");
            qDebug() << "Magnet" << match.captured("magnet");
            qDebug() << "seed" << match.captured("seed");
            data = match.captured("rest");
        }
    }
}
// Starts the scraping job in the background: fetch() is handed to
// QtConcurrent::run and the handle it returns is kept in craber.
void DataSource::start()
{
    // Passed to fetch() as its idx argument.
    const int sortIdx = 7;
    craber = QtConcurrent::run( this, &DataSource::fetch, sortIdx );
}