1. 为什么使用Qt写爬虫?
老实说,爬虫最关键的是效率,所以用 Qt 来写并不是最好的选择。但是我的需求比较轻量级,就用 Qt 了:Qt 跨平台,UI 不错,连接数据库也方便,所以用它来做也不算一个坏选择。
2.爬虫主要的内容
基本的爬虫就是先请求地址,再用正则表达式对返回结果进行处理,最后存到数据库中,大概就这三步。这里只涉及用 GET 方式获取数据;有一些数据需要用 POST 获取,还有一些需要登录后才能获取,涉及 cookie、session 等,这些我没有研究过。另外,多线程并发请求等内容本文也不涉及。
3.请求地址
void MainWindow::on_btnStartGet_clicked()
{
QNetworkAccessManager *manager = new QNetworkAccessManager(this);
connect(manager,SIGNAL(finished(QNetworkReply*)),this, SLOT(query(QNetworkReply*)));
//manager->get(QNetworkRequest(QUrl(stockSource)));
QNetworkRequest request(QUrl("http://www.baidu.com"));
request.setHeader(QNetworkRequest::ContentTypeHeader,
"application/x-www-form-urlencoded");
//QByteArray postData;
//postData.append(" 5d|false|BIDU");
// QUrlQuery postData;
// postData.addQueryItem("", "5d|false|BIDU");
// manager->post(request, postData.toString(QUrl::FullyEncoded).toUtf8());
// manager->post(request, postData);
manager->get(request);
}
用正则表达式对结果进行处理,然后存到数据库中。以下代码仅供参考,直接复制无法编译通过(依赖 StockHistory、STUtility 等项目内的类)。
/**
 * Slot connected to QNetworkAccessManager::finished().
 *
 * Parses the downloaded HTML in three steps:
 *   1. Collects the date cells (at most 200) that are newer than the latest
 *      date already stored for the stock id typed into the UI. Scanning stops
 *      at the first date that is not newer.
 *   2. Fills open/high/low/close/volume for those rows from the price cells,
 *      which are consumed six per row; the sixth cell of each row is ignored.
 *   3. Inserts every collected row into the stockHistory table.
 *
 * @param reply finished network reply; it is released with deleteLater().
 */
void MainWindow::query(QNetworkReply* reply){
    // The receiver of finished() is responsible for releasing the reply, and
    // Qt requires deleteLater() rather than a direct delete inside this slot.
    reply->deleteLater();

    if (reply->error() != QNetworkReply::NoError) {
        qDebug() << "request failed:" << reply->errorString();
        return;
    }

    const int eachCount = 200;   // upper bound on rows handled per response
    const int cellsPerRow = 6;   // price cells consumed for each history row

    const QString input = QString::fromUtf8(reply->readAll());
    qDebug() << input;
    qDebug() << "request finish!";

    const QString stockId = ui->editStockId->text();

    // ---- step 1: dates -----------------------------------------------------
    QRegularExpression dateRegex("(?:<td class=\"yfnc_tabledata1\" nowrap align=\"right\">)(.*?)(?:</td>)");
    QRegularExpressionMatchIterator dateItr = dateRegex.globalMatch(input);
    const QDate databaseMaxDate = STUtility::getMaxHistoryDate(stockId);

    std::vector<StockHistory> websiteHistorys;
    websiteHistorys.reserve(eachCount);
    while (dateItr.hasNext() && static_cast<int>(websiteHistorys.size()) < eachCount) {
        const QString dateString = dateItr.next().captured(1);
        qDebug() << "date:" << dateString;
        const QDate currentDate = QDate::fromString(STUtility::getValideDate(dateString), "MM dd, yyyy");
        if (!(currentDate > databaseMaxDate)) {
            // First date that is not newer than the stored maximum (an
            // unparsable date also lands here): stop collecting.
            break;
        }
        StockHistory history;
        history.setDate(currentDate);
        websiteHistorys.push_back(history);
    }
    const int dateCount = static_cast<int>(websiteHistorys.size());

    // ---- step 2: prices ----------------------------------------------------
    QRegularExpression priceRegex("(?:<td class=\"yfnc_tabledata1\" align=\"right\">)(.*?)(?:</td>)");
    QRegularExpressionMatchIterator priceItr = priceRegex.globalMatch(input);
    int priceTypeIndex = 0;   // position of the current cell inside its row
    int priceIndex = 0;       // row currently being filled
    while (priceItr.hasNext() && priceIndex < dateCount) {
        const QString cell = priceItr.next().captured(1);
        qDebug() << "price:" << cell;

        StockHistory &row = websiteHistorys[priceIndex];
        switch (priceTypeIndex) {
        case 0: row.setOpen(cell.toFloat());  break;
        case 1: row.setHigh(cell.toFloat());  break;
        case 2: row.setLow(cell.toFloat());   break;
        case 3: row.setClose(cell.toFloat()); break;
        case 4: {
            // The volume is written with thousands separators; strip them.
            QString volumeText = cell;
            volumeText.replace(",", "");
            row.setVolume(volumeText.toInt());
            break;
        }
        default:
            break;   // sixth cell of the row is not stored
        }

        if (++priceTypeIndex == cellsPerRow) {
            priceTypeIndex = 0;
            ++priceIndex;
        }
    }

    // ---- step 3: insert into database ---------------------------------------
    // Values are bound instead of being pasted into the SQL text, so the stock
    // id typed by the user can neither break nor alter the statement.
    QSqlQuery insertQuery;
    if (!insertQuery.prepare("INSERT INTO stockHistory (stockId,date,close,volume,open,high,low) "
                             "VALUES (:stockId, :date, :close, :volume, :open, :high, :low)")) {
        qDebug() << insertQuery.lastError().text().toLocal8Bit().data();
        return;
    }

    for (int i = 0; i < dateCount; ++i) {
        const StockHistory &oneHistory = websiteHistorys[i];
        // Every value is bound as text, mirroring the original statement,
        // which embedded each value as a quoted string.
        insertQuery.bindValue(":stockId", stockId);
        insertQuery.bindValue(":date", oneHistory.getDate().toString("yyyy-MM-dd"));
        insertQuery.bindValue(":close", QString::number(oneHistory.getClose()));
        insertQuery.bindValue(":volume", QString::number(oneHistory.getVolume()));
        insertQuery.bindValue(":open", QString::number(oneHistory.getOpen()));
        insertQuery.bindValue(":high", QString::number(oneHistory.getHigh()));
        insertQuery.bindValue(":low", QString::number(oneHistory.getLow()));

        if (!insertQuery.exec()) {
            qDebug() << insertQuery.lastError().text().toLocal8Bit().data();
        } else {
            qDebug() << "insert one line success";
        }
    }
}
4.该简单爬虫应用源码下载
http://www.waitingfy.com/archives/1778
点击GetData,能从雅虎财经抓取一段数据,点击StartAnalysis对数据进行分析,结果看debug 输出。