一种不太完善的OpenStreetMap字典汉化方法

根据世界地名词典,对OpenStreetMap进行汉化,使用了下面的代码。不太完善,这里仅贴出来。
地名字典可在我的资源“世界地名大词典”中下载。

#include <algorithm>

#include <QCoreApplication>
#include <QDebug>
#include <QFile>
#include <QHash>
#include <QList>
#include <QMap>
#include <QRegExp>
#include <QSqlDatabase>
#include <QSqlError>
#include <QSqlQuery>
#include <QString>
#include <QStringList>
#include <QTextStream>
#include <QVariant>
#include <QVector>

// Forward declarations of the helpers defined after main().

// Reads the national_place_names table through `db` and returns the lookup
// dictionary: key -> (integer score -> list of Chinese names).
// Throws a QString holding the SQL error text when the query fails.
QHash <QString, QMap<int,QVector<QString> > >  make_dictionary(QSqlDatabase db);
// Dumps the dictionary, sorted by key, to "dict.txt" in the application directory.
void outputDictionary(QHash <QString, QMap<int,QVector<QString> > > dict);
// Scans the `name` column of `tableName` and appends, for every row that can be
// translated with `dict`, its osm_id, original name and translated name to the
// three output vectors (entries at the same index belong together).
// Throws a QString holding the SQL error text when the query fails.
void prepareToTranslate(const QHash <QString, QMap<int,QVector<QString> > > dict,
					   QSqlDatabase db,
					   const QString & tableName,
					   QVector<qint64> & vec_osmid,
					   QVector<QString> & vec_rawName,
					   QVector<QString> & vec_TransName
					   );
int main(int argc, char *argv[])
{
	QCoreApplication a(argc, argv);
	QTextStream Stdout(stdout,QIODevice::WriteOnly);
	QSqlDatabase db = QSqlDatabase::addDatabase("QPSQL");
	if (db.isValid()==false)
		return 0;
	db.setHostName("127.0.0.1");
	db.setDatabaseName("gis");
	db.setUserName("archosm");
	db.setPassword("archosm");
	if (db.open()==false)
	{
		Stdout << db.lastError().text()<<"\n";
		qDebug() << db.lastError().text();
		return 0;
	}
	try
	{
		QHash <QString, QMap<int,QVector<QString> > >   dict = make_dictionary(db);
		outputDictionary(dict);

		//! start to translate
		QSqlQuery queryWordsToTrans(db);

		const QString tableNames[4] = {
			QString("planet_osm_line"),QString("planet_osm_point"),QString("planet_osm_polygon"),QString("planet_osm_roads")
		};

		//输出
		QFile fpDict(QCoreApplication::applicationDirPath()+"/trans.txt");
		if (fpDict.open(QIODevice::WriteOnly)==false)
			return 0;
		QTextStream stout(&fpDict);
		QSqlQuery queryUpdate(db);
		queryUpdate.setForwardOnly(true);
		db.transaction();
		for (int i=0;i<4;++i)
		{
			QVector<qint64>  vec_osmid;
			QVector<QString>  vec_rawName;
			QVector<QString>  vec_TransName;
			prepareToTranslate(dict,db,tableNames[i],vec_osmid,vec_rawName,vec_TransName);
			QMap<QString, QString> map_trans;
			int nTransed = vec_osmid.size();
			for (int j=0;j<nTransed;++j)
				map_trans[vec_rawName[j]] = vec_TransName[j];
			QList<QString> key_raws = map_trans.keys();
			foreach (QString str_rawName, key_raws)
			{
				QString strTransName = map_trans[str_rawName];
				stout<<tableNames[i]<<","<<str_rawName<<","<<strTransName<<"\n";
				queryUpdate.prepare(QString("update %1 set name = ? , trans_name_chs = ? where name = ? and trans_name_chs is null;").arg(tableNames[i]));
				queryUpdate.addBindValue(str_rawName + ","+strTransName);
				queryUpdate.addBindValue(strTransName);
				queryUpdate.addBindValue(str_rawName);
				if (queryUpdate.exec()==false)
					throw queryUpdate.lastError().text();
				stout.flush();
				fpDict.flush();
			}
		}
		db.commit();
		fpDict.close();

	}
	catch (QString errMessage)
	{
		db.rollback();
		Stdout<<"Error!"<<errMessage<<"\n";
		qDebug()<<"Error!"<<errMessage;
	}

	db.close();


	Stdout<<"Finished!\n";
	qDebug()<<"Finished!";
	exit(0);
	return a.exec();
}

//预处理原始数据,生成词典
QHash <QString, QMap<int,QVector<QString> > >  make_dictionary(QSqlDatabase db)
{
	QVector<QString> lst_tails;
	//这些后缀去掉后,会得到更多的有效词根。
	lst_tails.push_back(QString::fromUtf8("国家野生动物保护区"));
	lst_tails.push_back(QString::fromUtf8("国家森林公园"));
	lst_tails.push_back(QString::fromUtf8("野生动物保护区"));
	lst_tails.push_back(QString::fromUtf8("森林公园"));
	lst_tails.push_back(QString::fromUtf8("国家公园"));
	lst_tails.push_back(QString::fromUtf8("深海平原"));
	lst_tails.push_back(QString::fromUtf8("海底峡谷"));
	lst_tails.push_back(QString::fromUtf8("断裂带"));
	lst_tails.push_back(QString::fromUtf8("自治区"));
	lst_tails.push_back(QString::fromUtf8("裂口"));
	lst_tails.push_back(QString::fromUtf8("盐湖"));
	lst_tails.push_back(QString::fromUtf8("内湖"));
	lst_tails.push_back(QString::fromUtf8("海岭"));
	lst_tails.push_back(QString::fromUtf8("环礁"));
	lst_tails.push_back(QString::fromUtf8("大区"));
	lst_tails.push_back(QString::fromUtf8("机场"));
	lst_tails.push_back(QString::fromUtf8("山口"));
	lst_tails.push_back(QString::fromUtf8("公园"));
	lst_tails.push_back(QString::fromUtf8("半岛"));
	lst_tails.push_back(QString::fromUtf8("冰川"));
	lst_tails.push_back(QString::fromUtf8("沙漠"));
	lst_tails.push_back(QString::fromUtf8("峡谷"));
	lst_tails.push_back(QString::fromUtf8("山谷"));
	lst_tails.push_back(QString::fromUtf8("海沟"));
	lst_tails.push_back(QString::fromUtf8("水道"));
	lst_tails.push_back(QString::fromUtf8("水库"));
	lst_tails.push_back(QString::fromUtf8("大坝"));
	lst_tails.push_back(QString::fromUtf8("神庙"));
	lst_tails.push_back(QString::fromUtf8("干河"));
	lst_tails.push_back(QString::fromUtf8("平原"));
	lst_tails.push_back(QString::fromUtf8("海岸"));
	lst_tails.push_back(QString::fromUtf8("群岛"));
	lst_tails.push_back(QString::fromUtf8("火山"));
	lst_tails.push_back(QString::fromUtf8("浅滩"));
	lst_tails.push_back(QString::fromUtf8("大桥"));
	lst_tails.push_back(QString::fromUtf8("洼地"));
	lst_tails.push_back(QString::fromUtf8("瀑布"));
	lst_tails.push_back(QString::fromUtf8("海峡"));
	lst_tails.push_back(QString::fromUtf8("熔岩"));
	lst_tails.push_back(QString::fromUtf8("岛"));
	lst_tails.push_back(QString::fromUtf8("湖"));
	lst_tails.push_back(QString::fromUtf8("湾"));
	lst_tails.push_back(QString::fromUtf8("山"));
	lst_tails.push_back(QString::fromUtf8("河"));
	lst_tails.push_back(QString::fromUtf8("滩"));
	lst_tails.push_back(QString::fromUtf8("村"));
	lst_tails.push_back(QString::fromUtf8("市"));
	lst_tails.push_back(QString::fromUtf8("坝"));
	lst_tails.push_back(QString::fromUtf8("港"));
	lst_tails.push_back(QString::fromUtf8("区"));
	lst_tails.push_back(QString::fromUtf8("县"));
	lst_tails.push_back(QString::fromUtf8("省"));
	lst_tails.push_back(QString::fromUtf8("礁"));
	lst_tails.push_back(QString::fromUtf8("角"));
	lst_tails.push_back(QString::fromUtf8("峰"));
	lst_tails.push_back(QString::fromUtf8("站"));
	lst_tails.push_back(QString::fromUtf8("岭"));
	const int remvSz = lst_tails.size();
	QSqlQuery query(db);
	query.setForwardOnly(true);
	if (false == query.exec("select * from national_place_names"))
		throw query.lastError().text();
	QHash <QString, QMap<int,QVector<QString> > >  hash_dict;
	//Make dictionary
	while (query.next())
	{
		const QString raw_name = query.value("place_name").toString()
				.replace("<u>","")
				.replace("</u>","")
				.replace("<rt>","")
				.replace("</rt>","")
				.replace("<ruby>","")
				.replace("</ruby>","");
		const QString raw_trans = query.value("trans_name").toString();
		///Replace some split comma.
		//! Replace "见"
		QStringList lst_raw_name = raw_name.split(QRegExp(QString::fromUtf8("[〈〉见,()]")),QString::SkipEmptyParts);
		if (lst_raw_name.size())
		{
			QString word = lst_raw_name.first();
			QString upperKey = word.toUpper().trimmed();
			upperKey.replace(QRegExp(QString::fromUtf8("[ ,, ]")),"_");
			upperKey.replace("-","_");
			upperKey.replace(".","_");
			QStringList listWordsKey = upperKey.split("_");
			int n = listWordsKey.size();
			for (int i = 0 ;i < n; ++i)
			{
				QString finalKey;
				for (int j = 0;j<=i;++j)
				{
					if (j)
						finalKey += "_";
					finalKey += listWordsKey.at(j);
				}
				//CHS
				QStringList chslists = raw_trans.split(QRegExp(QString::fromUtf8("[()(),;。]")),QString::SkipEmptyParts);
				if (chslists.size())
				{
					bool bfound = false;
					int deleted = 0;
					QString chs_value = chslists.first();
					do
					{
						bfound = false;
						for (int k = 0; k< remvSz ;++k)
						{
							if (chs_value.endsWith(lst_tails[k]))
							{
								QString newv = chs_value.left(chs_value.length()-lst_tails[k].length());
								if (newv.size())
								{
									bfound = true;
									chs_value = newv;
									++deleted;
									break;
								}
							}
						}//end for (int k = 0; k< remvSz && bfound==true;++k)
					}while (bfound); //end do remove laterFix
					hash_dict[finalKey][deleted-i].push_back(chs_value);

				}//end if (chslists.size())
			}//end for i = 1 ~ n n = listWordsKey.size();
		}//end if (lst_raw_name.size())
	}
	return hash_dict;
}

void outputDictionary(QHash <QString, QMap<int,QVector<QString> > > dict)
{
	QFile fpDict(QCoreApplication::applicationDirPath()+"/dict.txt");
	if (fpDict.open(QIODevice::WriteOnly)==false)
		return;
	QTextStream stout(&fpDict);
	QList<QString> words = dict.keys();
	std::sort(words.begin(),words.end());
	foreach (QString word, words)
	{
		stout<<word<<":";
		const QMap<int,QVector<QString> >  & vals = dict[word];
		QList<int> simrts = vals.keys();
		foreach (int simrt, simrts)
		{
			stout<<simrt<<"={";
			const QVector<QString> & transs = vals[simrt];
			const int nPoss = transs.size();
			for(int i=0;i<nPoss;++i)
				stout<<transs[i]<<",";
			stout<<simrt<<"}; ";
		}
		stout<<"\n";
	}
	fpDict.close();
}

void prepareToTranslate(QHash <QString, QMap<int,QVector<QString> > > dict,
					   QSqlDatabase db,
					   const QString & tableName,
					   QVector<qint64> & vec_osmid,
					   QVector<QString> & vec_rawName,
					   QVector<QString> & vec_TransName
					   )
{
	QSqlQuery query(db);
	query.setForwardOnly(true);
	if (false == query.exec(QString("select osm_id,name from %1 where name > ' ';").arg(tableName)))
		throw query.lastError().text();
	while (query.next())
	{
		qint64 osmid = query.value(0).toLongLong();
		const QString strRawName = query.value(1).toString();
		QString transName;
		if (strRawName.size()>1)
		{
			QString upperKey = strRawName.toUpper().trimmed();
			upperKey.replace(QRegExp(QString::fromUtf8("[ ,, ]")),"_");
			upperKey.replace("-","_");
			upperKey.replace(".","_");
			QStringList listWordsKey = upperKey.split("_");
			int n = listWordsKey.size();
			if (n )
			{
				for (int i = n-1 ;i >=0; --i)
				{
					QString finalKey;
					for (int j = 0;j<=i;++j)
					{
						if (j)
							finalKey += "_";
						finalKey += listWordsKey.at(j);
					}
					if (dict.contains(finalKey))
					{
						if (finalKey.size()>3)
						{
							if (transName.size())
								transName +="_";
							transName += dict[finalKey].first().first();
							for (int j = 0; j<=i;++j)
								listWordsKey.pop_front();
						}
						break;
					}
					if (i<2)
						break;
				}
			}
		}
		if (transName.size())
		{
			vec_osmid.push_back(osmid);
			vec_rawName.push_back(strRawName);
			vec_TransName.push_back(transName);
		}
	}
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

丁劲犇

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值