关闭

一步步DIY: OSM-Web服务器(四) 对万国语的地名进行翻译与检索

6304人阅读 评论(16) 收藏 举报
分类:

(-:<转载时请注明本文由goldenhawking在 CSDN blog 撰写  http://write.blog.csdn.net/postedit/7957192

   经过 前三篇的调试,已经有了一个完整的Map可以浏览,我们痛苦的世界范围数据下载、导入过程也结束了。要提醒一下的是,鉴于网速,不要下载 planetosm.lastest 文件,因为这个文件每周更新,万一一周下不完,就over了。

当然了,导入后,别忘了

sudo touch /var/lib/mod_tile/planet-import-complete
sudo chown www-data /var/lib/mod_tile/planet-import-complete

设置时间戳哦!

   导入后,只有中国、日本有些中文字符,其他国家都是鸟语,必须进行汉化。用PostgreSQL count 一下,name 字段不为null 的条目太多了,利用在线的翻译API似乎不现实。我们通过下载字典来进行本地自动匹配与翻译。字典在http://download.csdn.net/detail/goldenhawking/4556453, 导入后,含有17万个地名翻译的表如下面所示


由于place_name 里的地名有不规范的表示,比如括号中的曾用名、用逗号分隔的等效名等情况,不能直接把地名表与planet_osm_roads 、planet_osm_polygon 、planet_osm_line、planet_osm_point 四张表的name字段做 like 或者 = 的换算。同样,即使是做正则式的匹配,也要考虑到比如   XXXX 与 XX'XX (YYYY) 的情况,即原本地名已经包含阿拉伯语与英语两种语言的情况。

   为此,写一个程序,进行匹配,提前把地名进行标准化。其算法过程是:

   读取planet_osm_roads 、planet_osm_polygon 、planet_osm_line、planet_osm_point四张表里  name is not null 并多于1个字符的地名,进行简化,清除括号、非拉丁、斯拉夫字符,而后与经过规范化的 place_name 进行匹配。为了存储独立的中文字段,在四张表尾部追加了一个trans_name_chs的 text 字段,以便存储纯粹的中文地名,供搜索用。

ALTER TABLE planet_osm_point ADD COLUMN trans_name_chs text;
ALTER TABLE planet_osm_line ADD COLUMN trans_name_chs text;
ALTER TABLE planet_osm_polygon ADD COLUMN trans_name_chs text;
ALTER TABLE planet_osm_roads ADD COLUMN trans_name_chs text;


 算法伪代码表示:

  

void Match(unicode TableName)
{
	for_each (record in TableName where 长度>3)
	{
		unicode 地名 = record->name;
		//清除首尾空格
		TrimSpaces(地名);

		//只保留两类字符,根据字符的unicode取值范围筛选
		unicode 词干 = FilterChar (地名, new LanguageFilter({拉丁,斯拉夫}));

		//在翻译表中查找可能的翻译
        	unicodeList 可能结果集 = DatabaseSearch("规范化词干表","like %s%",词干);
         
		//对所有含有词干的可能结果,进行相似度排序,这里的策略是看看长度比例因子,
		//比如  Shanghai 与  Shanghai City 为 8:13, 与Shanghai 为 1:1 ,因此取 Shanghai

		unicode 最佳解=null;
		double  最佳因子=0;

	        for_each (unicode 可能解 in 可能结果集 where length(词干)/length(可能解)>0.6)
		{
			double 当前因子 = length(词干)/length(可能解);
			if (当前因子>最佳因子)
			{
				最佳解 = 可能解;
				最佳因子 = 当前因子;	
				if (最佳因子 == 1)
					break;
			}
		}
	
		//刷新数据库
		if (最佳因子 >0)
		{
			unicode 翻译结果 = 最佳解 + "(" + 地名 + ")";
			UpdateTable(TableName, record->id, 翻译结果);
		}
	}	
}


匹配过程大概需要1-2天,匹配完成后,翻译好的地名便存入了name字段中。渲染瓦片,看一看,主要的地名都OK啦

德国的


西班牙的

非洲的

还有老美的


最后,为这些字段建立索引

CREATE INDEX idx_planet_osm_roads_name ON planet_osm_roads USING btree ("name") where name is not null;
CREATE INDEX idx_planet_osm_roads_trans_name_chs ON planet_osm_roads USING btree ("trans_name_chs") where trans_name_chs is not null;
CREATE INDEX idx_planet_osm_polygon_name ON planet_osm_polygon USING btree ("name") where name is not null;
CREATE INDEX idx_planet_osm_polygon_trans_name_chs ON planet_osm_polygon USING btree ("trans_name_chs") where trans_name_chs is not null;
CREATE INDEX idx_planet_osm_line_name ON planet_osm_line USING btree ("name") where name is not null;
CREATE INDEX idx_planet_osm_line_trans_name_chs ON planet_osm_line USING btree ("trans_name_chs") where trans_name_chs is not null;
CREATE INDEX idx_planet_osm_point_name ON planet_osm_point USING btree ("name") where name is not null;
CREATE INDEX idx_planet_osm_point_trans_name_chs ON planet_osm_point USING btree ("trans_name_chs") where trans_name_chs is not null;


全部搞定后,vacuum 一下,索引就可以立刻参与查询了,而且使用 FCGI 实现地名的检索就变得简单了。这里为了测试,直接用C写CGI程序。

程序实现两个功能,一个是根据地名检索旁边的GIS对象,另一个是根据坐标检索最近的地名。这里用到 PostGIS的 CoverBy 系列函数.  CGI代码:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <postgresql/libpq-fe.h>
#include <sys/stat.h>
const char * pConnDBStr = "host='localhost' dbname='osmgis' user='用户名' password='密码'";
#define FCGI_PRG 1

#if FCGI_PRG != 0
#include <fcgi_stdio.h>
#endif

const char * sql_mask = "\
select * from(\
select name , trans_name_chs ,ST_Distance(way,ST_Transform(ST_SetSRID(ST_Point(%lf,%lf),4326),900913)) as mindis,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id\n\
	from planet_osm_polygon\n\
	where\n\
		name is not null\n\
		and\n\
		way && ST_Transform(ST_SetSRID('BOX3D(%lf %lf,%lf %lf)'::box3d,4326),900913)\n\
 union\n\
 select name , trans_name_chs,ST_Distance(way,ST_Transform(ST_SetSRID(ST_Point(%lf,%lf),4326),900913)) as mindis,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id\n\
	from planet_osm_roads\n\
	where\n\
		name is not null\n\
		and\n\
		way && ST_Transform(ST_SetSRID('BOX3D(%lf %lf,%lf %lf)'::box3d,4326),900913)\n\
 union\n\
 select name, trans_name_chs,ST_Distance(way,ST_Transform(ST_SetSRID(ST_Point(%lf,%lf),4326),900913)) as mindis,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id\n\
	from planet_osm_point\n\
	where\n\
		name is not null\n\
		and\n\
		way && ST_Transform(ST_SetSRID('BOX3D(%lf %lf,%lf %lf)'::box3d,4326),900913)\n\
 union\n\
 select name , trans_name_chs,ST_Distance(way,ST_Transform(ST_SetSRID(ST_Point(%lf,%lf),4326),900913)) as mindis,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id\n\
	from planet_osm_line\
	where\
		name is not null\
		and\
		way && ST_Transform(ST_SetSRID('BOX3D(%lf %lf,%lf %lf)'::box3d,4326),900913)\
) as full_objs\
 where mindis<200\
 order by mindis\
 limit 256\
";

const char * psql_findbyname =
"select distinct * from(\
    select name,trans_name_chs,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id from planet_osm_point where name like '%s%%' or trans_name_chs like '%s%%'\
    union\
    select name,trans_name_chs,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id from planet_osm_roads where name like '%s%%' or trans_name_chs like '%s%%'\
    union\
    select name,trans_name_chs,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id from planet_osm_line where name like '%s%%' or trans_name_chs like '%s%%'\
    union\
    select name,trans_name_chs,ST_AsText(ST_Transform(way,4326)) as geobj,osm_id from planet_osm_polygon where name like '%s%%' or trans_name_chs like '%s%%'\
) as geobjs limit 256;";

void initialize(void)
{
}

//很据坐标检索地名
bool findobjs(double dCenterX,double dCenterY)
{
    bool succeeded = false;

    double dLeft, dRight, dTop, dBottom;

    dLeft = dCenterX - 0.1;
    dRight = dCenterX + 0.1;
    dTop = dCenterY+0.1;
    dBottom = dCenterY-0.1;

    if (dLeft>=179 || dLeft <=-179 || dRight>=179 || dRight <=-179 ||
    fabs(dRight-dLeft)>0.5)
        return false;

    if (dTop>=85 || dTop <=-85 || dBottom>=85 || dBottom <=-85 ||
    fabs(dTop-dBottom)>0.5)
        return false;


    PGconn * pConn = PQconnectdb(pConnDBStr);
    if (!pConn)
        printf("<p>Error ! Could not open PostgreSQL Connections.</p>\r\n");
    else if (PQstatus(pConn)==CONNECTION_OK)
    {
                char bufsql[4096];
                sprintf(bufsql,sql_mask,
                    dCenterX,dCenterY,dLeft,dBottom,dRight,dTop,
                    dCenterX,dCenterY,dLeft,dBottom,dRight,dTop,
                    dCenterX,dCenterY,dLeft,dBottom,dRight,dTop,
                    dCenterX,dCenterY,dLeft,dBottom,dRight,dTop);
                //printf("<p>Executing %s</p>\r\n",bufsql);
                PGresult * pRes = PQexec(pConn,bufsql);
                if (!pRes)
                    printf("<p>Error!Cold not init selection recordset.%s</p>\r\n",PQerrorMessage(pConn));
                else
                {
                    if (PQresultStatus(pRes)==PGRES_TUPLES_OK)
                    {
                        int n=PQntuples(pRes);
                        if (n==0)
                        {
                            printf("<p>No symbols found.</p>\r\n");
                            succeeded = true;
                        }
                        else
                        {
                            //fetching results.
                            printf("<p>nItems = %d</p>\r\n<table border=1>",n);
                             printf("<tr><td>No.</td><td>name</td><td>trans_name</td><td>distance</td><td>geobj</td><td>geo_id</td></tr>");
                            for (int i=0;i<n;i++)
                            {
                                printf("    <tr><td>%d</td>\r\n",i);
                                for (int j=0;j<5;j++)
                                {
                                    char * pstrVal = PQgetvalue(pRes,i,j);
                                    char buf_text[2048];
                                    if (pstrVal)
                                    {
                                        if (strlen(pstrVal)>1024)
                                        {
                                            strncpy(buf_text,pstrVal,1024);
                                            strcat(buf_text,"...");
                                        }
                                        else
                                            strcpy(buf_text,pstrVal);
                                        printf("        <td>%s</td>\r\n",buf_text);
                                    }
                                }
                                printf("</tr>\r\n");
                            }
                            printf("</table>\r\n");
                            succeeded = true;

                        }
                    }
                    else
                        printf("<p>Error!Requesting Failed.%s</p>\r\n",PQresultErrorMessage(pRes));
                    PQclear (pRes);
                }

   }
    else
        printf("<p>Error!Connection Failed.%s</p>\r\n",PQerrorMessage(pConn));

    if (pConn)
        PQfinish (pConn);
    return succeeded;
}
//根据地名检索坐标
bool findobjs(const char * name)
{
    bool succeeded = false;

    if (!name)
        return succeeded;
    int nLen_name = strlen(name);
    bool failed = false;
    for (int i=0;i<nLen_name && failed==false;i++)
    {
        const char * pMask = "~!@#$%^&*()_+=-`{}[]:;'\"|\\<>/?";
        const char * pfinded = strchr(pMask,name[i]);
        if (pfinded>=pMask && pfinded<=pMask+strlen(pMask)-1)
            failed = true;
    }

    printf("<p>Find Placename: %s</p>\r\n",name);
    if (failed==true)
    {
        printf("<p>Name \"%s\" has invalid charactors. Request rejected.</p>\r\n",name);
        return false;
    }

    PGconn * pConn = PQconnectdb(pConnDBStr);

    if (!pConn)
        printf("<p>Error ! Could not open PostgreSQL Connections.</p>\r\n");
    else if (PQstatus(pConn)==CONNECTION_OK)
    {
                char bufsql[4096];
                sprintf(bufsql,psql_findbyname,
                    name,name,name,name,name,name,name,name);
                //printf("<p>Executing %s</p>\r\n",bufsql);
                PGresult * pRes = PQexec(pConn,bufsql);
                if (!pRes)
                    printf("<p>Error!Cold not init selection recordset.%s</p>\r\n",PQerrorMessage(pConn));
                else
                {
                    if (PQresultStatus(pRes)==PGRES_TUPLES_OK)
                    {
                        int n=PQntuples(pRes);
                        if (n==0)
                        {
                            printf("<p>No symbols found.</p>\r\n");
                            succeeded = true;
                        }
                        else
                        {
                            //fetching results.
                            printf("<p>nItems = %d</p>\r\n<table border=1>",n);
                            printf("<tr><td>num</td><td>name</td><td>trans_name</td><td>geobj</td><td>geo_id</td></tr>");
                            for (int i=0;i<n;i++)
                            {
                                printf("<tr><td>%d</td>",i);
                                {
                                    char * pstrVal = PQgetvalue(pRes,i,0);
                                    if (pstrVal)
                                        printf("<td>%s</td>",pstrVal);
                                }
                                {
                                    char * pstrVal = PQgetvalue(pRes,i,1);
                                    if (pstrVal)
                                        printf("<td>%s</td>",pstrVal);
                                }
                                {
                                    char * pstrVal = PQgetvalue(pRes,i,2);
                                    char buf_text[2048];
                                    if (pstrVal)
                                    {
                                        if (strlen(pstrVal)>1024)
                                        {
                                            strncpy(buf_text,pstrVal,1024);
                                            strcat(buf_text,"...");
                                        }
                                        else
                                            strcpy(buf_text,pstrVal);
                                        printf("        <td>%s</td>\r\n",buf_text);
                                    }
                                }
                                 {
                                    char * pstrVal = PQgetvalue(pRes,i,3);
                                    if (pstrVal)
                                        printf("<td>%s</td>",pstrVal);
                                }
                                printf("</tr>\r\n");
                            }
                            succeeded = true;
                            printf("</table>\r\n");

                        }
                    }
                    else
                        printf("<p>Error!Requesting Failed.%s</p>\r\n",PQresultErrorMessage(pRes));
                    PQclear (pRes);
                }

   }
    else
        printf("<p>Error!Connection Failed.%s</p>\r\n",PQerrorMessage(pConn));

    if (pConn)
        PQfinish (pConn);
    return succeeded;
}

int main(void)

{

    initialize();

#if FCGI_PRG != 0

    while (FCGI_Accept() >= 0)
    {
        char * rawdata = getenv("QUERY_STRING");
        //char * server_name = getenv("SERVER_NAME");
        double  cenx=0,ceny = 0;

        printf("Content-type: text/html; charset=UTF-8\r\n"
            "\r\n"
            "<title>GEO Results</title>\r\n");

        if (rawdata)
        {
            int nLen = strlen(rawdata);
            char * data = new char [nLen+1];
            strcpy (data,rawdata);
            char * pcenx = 0,* pceny = 0, * pname=0;
            char * tmp = 0;

            tmp = strstr(data,"cenx=");
            if (tmp)
                pcenx = tmp;
            tmp = strstr(data,"ceny=");
            if (tmp)
                pceny = tmp;
            tmp = strstr(data,"name=");
            if (tmp)
                pname = tmp;
            if ( pcenx && pceny)
            {
                for (int i=0;i<nLen;i++)
                    if (data[i]=='?'||data[i]=='&')
                        data[i] = 0;
                cenx =  atof(pcenx+5);
                ceny =  atof(pceny+5);
                delete [] data;
                if (false==findobjs(cenx,ceny))
                {
                    printf("<p>Error!DBoper Failed!</p><p>\r\n");
                    continue;
                }
           }
            else if (pname)
            {
                unsigned char bufCodes[1024];
                unsigned char * pCode = (unsigned char *)pname+5;
                int nCodes = 0;
                if (*pCode=='%')
                {
                    while (*pCode=='%'&&nCodes<1024-1)
                    {
                        if (pCode[1]==0)
                            break;
                        if (pCode[2]==0)
                            break;
                        if (pCode[1]>='0' && pCode[1]<='9')
                            bufCodes[nCodes] = pCode[1]-'0';
                        else if (pCode[1]>='A' && pCode[1]<='F')
                            bufCodes[nCodes] = pCode[1]-'A'+10;
                        else
                            break;

                        bufCodes[nCodes] <<=4;

                        if (pCode[2]>='0' && pCode[2]<='9')
                            bufCodes[nCodes] += pCode[2]-'0';
                        else if (pCode[2]>='A' && pCode[2]<='F')
                            bufCodes[nCodes] += pCode[2]-'A'+10;
                        else
                            break;
                        nCodes++;
                        pCode+=3;
                    }
                    bufCodes[nCodes]=0;
                }
                else
                    strcpy((char *)bufCodes,(char *)pCode);
                if (bufCodes[0])
                {
                    if (false==findobjs((char *)bufCodes))
                    {
                        printf("<p>Error!DBoper Failed!</p><p>\r\n");
                        continue;
                    }
                }
                else
                     printf("<p>Error!Wrong query place names!</p><p>\r\n");

            }
            else
            {
                printf("<p>Error! At least Missing para cenx,ceny or name.</p>\r\n");
                delete [] data;
                continue;
            }

        }


    }
#else
    //测试
    findobjs (121.538,31.221);
#endif
    return 0;
}


这里的功能及其有限,也没有考虑到安全性。仅仅实现功能而已。



下一篇,我们来看看使用 GeoServer 显示叠加的矢量图层数据。

3
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:290326次
    • 积分:4098
    • 等级:
    • 排名:第7551名
    • 原创:86篇
    • 转载:3篇
    • 译文:0篇
    • 评论:317条
    最新评论