pg_jieba在windows上编译安装

2024.02.03修改,添加关键词增加了独占锁.

1 源码下载

cd D:\build
git clone https://github.com/jaiminpan/pg_jieba
git clone --depth=10 --branch=master https://github.com/yanyiwu/cppjieba.git


# 复制cppjieba/deps至pg_jieba/libjieba 
# 复制cppjieba/include至pg_jieba/libjieba 
# 复制cppjieba/dict至pg_jieba/libjieba 

2 修改pg_jieba目录中的jieba_token.h

去掉lex_descr数组初始化列表中每一项的".token ="和".descr ="。

改完后的效果

/*-------------------------------------------------------------------------
 *
 * jieba_token.c
 *    token description
 *
 * Author: Jaimin Pan <jaimin.pan@gmail.com>
 *
 * IDENTIFICATION
 *    jieba_token.h
 *
 *-------------------------------------------------------------------------
 */

#ifndef JIEBA_TOKEN_H_
#define JIEBA_TOKEN_H_

typedef struct TokenDescData {
	const char* token;
	const char* descr;
} TokenDescData;

typedef struct TokenDescData *TokenDesc;

/*
 * Table of token types. Every row is written positionally in the member
 * order of TokenDescData: { token, descr }.
 * The first row is an empty placeholder (both strings are "").
 * NOTE(review): row positions are presumably used as token type numbers,
 * so rows should not be reordered - confirm against the code that indexes
 * this table.
 */
static const TokenDescData lex_descr[] =
{
  { "",  ""},
  { "eng",  "letter"},
  { "nz",  "other proper noun"},
  { "n",  "noun"},
  { "m",  "numeral"},
  { "i",  "idiom"},
  { "l",  "temporary idiom"},
  { "d",  "adverb"},
  { "s",  "space"},
  { "t",  "time"},
  { "mq",  "numeral-classifier compound"},
  { "nr",  "person's name"},
  { "j",  "abbreviate"},
  { "a",  "adjective"},
  { "r",  "pronoun"},
  { "b",  "difference"},
  { "f",  "direction noun"},
  { "nrt",  "nrt"},
  { "v",  "verb"},
  { "z",  "z"},
  { "ns",  "location"},
  { "q",  "quantity"},
  { "vn",  "vn"},
  { "c",  "conjunction"},
  { "nt",  "organization"},
  { "u",  "auxiliary"},
  { "o",  "onomatopoeia"},
  { "zg",  "zg"},
  { "nrfg",  "nrfg"},
  { "df",  "df"},
  { "p",  "prepositional"},
  { "g",  "morpheme"},
  { "y",  "modal verbs"},
  { "ad",  "ad"},
  { "vg",  "vg"},
  { "ng",  "ng"},
  { "x",  "unknown"},
  { "ul",  "ul"},
  { "k",  "k"},
  { "ag",  "ag"},
  { "dg",  "dg"},
  { "rr",  "rr"},
  { "rg",  "rg"},
  { "an",  "an"},
  { "vq",  "vq"},
  { "e",  "exclamation"},
  { "uv",  "uv"},
  { "tg",  "tg"},
  { "mg",  "mg"},
  { "ud",  "ud"},
  { "vi",  "vi"},
  { "vd",  "vd"},
  { "uj",  "uj"},
  { "uz",  "uz"},
  { "h",  "h"},
  { "ug",  "ug"},
  { "rz",  "rz"}
};

/*
 * Token numbering starts from 1 and LASTNUM is the last number.
 * Only declared here (extern); the definition lives in another source file.
 */
extern int LASTNUM;

#endif /* JIEBA_TOKEN_H_ */

3 在pg_jieba目录中添加pg_main.h文件

/*
 * pg_main.h
 *    Prototypes for the pg_jieba entry points, each marked PGDLLEXPORT.
 *
 * NOTE(review): PGDLLEXPORT is expected to make the symbol visible in the
 * DLL export table on MSVC builds - confirm against the PostgreSQL headers
 * of the version being built against.
 */
#ifndef H_3069A164_35BD_4687_868B_1F5B20E2030E
#define H_3069A164_35BD_4687_868B_1F5B20E2030E
#include <postgres.h>
#include <fmgr.h>

/* Library load / unload hooks; they take no arguments and return nothing. */
PGDLLEXPORT void _PG_init(void);
PGDLLEXPORT void _PG_fini(void);

/*
 * The remaining entry points all share the fmgr calling convention:
 * they take PG_FUNCTION_ARGS and return a Datum.
 */
PGDLLEXPORT Datum jieba_start(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_query_start(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_mp_start(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_hmm_start(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_gettoken(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_end(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_lextype(PG_FUNCTION_ARGS);

PGDLLEXPORT Datum jieba_reload_dict(PG_FUNCTION_ARGS);

#endif /*H_3069A164_35BD_4687_868B_1F5B20E2030E*/

4 修改pg_jieba.c文件

大约在45行

// Include pg_main.h, which declares the entry points below with PGDLLEXPORT.
#include "pg_main.h"

/*
 * prototypes
 *
 * Each entry point is registered with PG_FUNCTION_INFO_V1. The plain
 * "Datum name(PG_FUNCTION_ARGS);" prototypes are commented out because
 * the declarations now come from pg_main.h.
 */
PG_FUNCTION_INFO_V1(jieba_start);
//Datum jieba_start(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_query_start);
//Datum jieba_query_start(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_mp_start);
//Datum jieba_mp_start(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_hmm_start);
//Datum jieba_hmm_start(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_gettoken);
//Datum jieba_gettoken(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_end);
//Datum jieba_end(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_lextype);
//Datum jieba_lextype(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(jieba_reload_dict);
//Datum jieba_reload_dict(PG_FUNCTION_ARGS);

5 修改CMakeLists.txt

添加以下内容

# Raise the minimum required CMake version; pg_jieba itself targets CMake 2.
cmake_minimum_required(VERSION 3.11)

#message(STATUS "Setting ${CMAKE_PROJECT_NAME} build type - ${CMAKE_BUILD_TYPE}")后面添加以下内容
if(MSVC)
        # Language-standard settings, left commented out.
        #set(CMAKE_C_STANDARD 99)
        #set(CMAKE_CXX_STANDARD 17)
        #set(CMAKE_CXX_STANDARD_REQUIRED ON)

        # Preprocessor definitions for the MSVC toolchain:
        #   _CRT_SECURE_NO_WARNINGS - silence CRT "unsafe function" warnings
        #   _HAS_AUTO_PTR_ETC=1     - keep library features removed in C++17
        #   _SILENCE_CXX17_NEGATORS_DEPRECATION_WARNING - silence the
        #                             not1/not2 deprecation warning
        # Directory-scoped on purpose: this block is inserted near the top of
        # CMakeLists.txt, presumably before the library target exists.
        add_definitions(
                -D_CRT_SECURE_NO_WARNINGS
                -D_HAS_AUTO_PTR_ETC=1
                -D_SILENCE_CXX17_NEGATORS_DEPRECATION_WARNING
        )

        # pg_jieba_path_default(<var> <default>)
        #
        # Gives <var> a value in the caller's scope:
        #   * environment variable <var> not set -> <var> = <default>
        #   * environment variable <var> set and <var> not yet defined
        #     -> <var> = the environment value, converted to CMake path
        #     form (forward slashes, ';'-separated list)
        #   * environment variable <var> set and <var> already defined
        #     (e.g. passed with -D) -> <var> is left untouched
        #
        # The original stanzas only tested ENV{<var>} and never read it, so
        # exporting the environment variable left <var> unset.
        function(pg_jieba_path_default var default_value)
                if(NOT DEFINED ENV{${var}})
                        set(${var} "${default_value}" PARENT_SCOPE)
                elseif(NOT DEFINED ${var})
                        file(TO_CMAKE_PATH "$ENV{${var}}" env_value)
                        set(${var} "${env_value}" PARENT_SCOPE)
                endif()
        endfunction()

        # PostgreSQL header directories
        pg_jieba_path_default(PostgreSQL_INCLUDE_DIR
                "C:/pgsql/include/server/port/win32_msvc;C:/pgsql/include/server/port/win32;C:/pgsql/include;C:/pgsql/include/server")

        # pg_config executable, used to query the PostgreSQL configuration
        pg_jieba_path_default(PostgreSQL_PG_CONFIG "C:/pgsql/bin/pg_config.exe")

        # PostgreSQL import library to link against
        pg_jieba_path_default(PostgreSQL_LIBRARY "C:/pgsql/lib/postgres.lib")

        # PostgreSQL library directory
        pg_jieba_path_default(PostgreSQL_LIBRARY_DIRS "C:/pgsql/lib")
endif()

# Remove every occurrence of PostgreSQL_TYPE_INCLUDE_DIR from CMakeLists.txt.
# Then add the line below directly after
#   add_library(${PG_JIEBA_LIBRARY_NAME} ${LIBRARY_MODE_TARGET} ${SOURCE_FILES})
target_link_libraries(${PG_JIEBA_LIBRARY_NAME} ${PostgreSQL_LIBRARY})

6 开始编译

以下所有命令均在“开始-》Visual Studio 2019-》x64 Native Tools Command Prompt for VS 2019”命令行中执行.

#nmake 不能编译:修改CMakeLists.txt后,生成的项目文件中会自动带入"没有被记录"这几个字符,它们来自windows上运行pg_config时输出的C/C++编译标志
#cmake -G "NMake Makefiles" -Wno-dev -DCMAKE_BUILD_TYPE=Release -B vc_build -S .
#cmake --build vc_build --config Release
#cmake --install vc_build --config Release

运行pg_config可以看到,"没有被记录"出现在CC、CPPFLAGS、CFLAGS、CFLAGS_SL、LDFLAGS、LDFLAGS_EX、LDFLAGS_SL、LIBS这几项的输出中:

BINDIR = c:/pgsql/bin
DOCDIR = c:/pgsql/doc
HTMLDIR = c:/pgsql/doc
INCLUDEDIR = c:/pgsql/include
PKGINCLUDEDIR = c:/pgsql/include
INCLUDEDIR-SERVER = c:/pgsql/include/server
LIBDIR = c:/pgsql/lib
PKGLIBDIR = c:/pgsql/lib
LOCALEDIR = c:/pgsql/share/locale
MANDIR = c:/pgsql/man
SHAREDIR = c:/pgsql/share
SYSCONFDIR = c:/pgsql/etc
PGXS = c:/pgsql/lib/pgxs/src/makefiles/pgxs.mk
CONFIGURE = --enable-thread-safety --enable-nls --with-ldap --with-ssl=openssl --with-uuid --with-libxml --with-libxslt --with-lz4 --with-icu --with-tcl --with-perl --with-python
CC = 没有被记录
CPPFLAGS = 没有被记录
CFLAGS = 没有被记录
CFLAGS_SL = 没有被记录
LDFLAGS = 没有被记录
LDFLAGS_EX = 没有被记录
LDFLAGS_SL = 没有被记录
LIBS = 没有被记录
VERSION = PostgreSQL 14.13
rd /S /Q vc_build
cmake-gui -Wno-dev -DCMAKE_BUILD_TYPE=Release -B vc_build -S .
#点击"Configure"后选择"Visual Studio 16 2019"和x64,然后点击"Finish"
#完成后点击"Generate",在"Generate"成功后要修改一下vc_build\pg_jieba.vcxproj,用文本编辑器打开修改,查找删除"没有被记录"这几个字符,修改完成后保存
#打开"Visual Studio",选择"Release x64",然后在项目"INSTALL"上右键编译, "Visual Studio"即开始编译并安装

#注意:编译完成安装成功后要修改文件名
del C:\pgsql\lib\pg_jieba.dll
rename C:\pgsql\lib\libpg_jieba.dll pg_jieba.dll


#查看依赖和导出(可选)
dumpbin /DEPENDENTS C:/pgsql/lib/pg_jieba.dll
dumpbin /EXPORTS C:/pgsql/lib/pg_jieba.dll

7 pg_jieba安装的文件

有关更多信息请参看:(GitHub - fxsjy/jieba: 结巴中文分词)

#库文件,用于pg_jieba二次开发,如果没有可以删除
C:/pgsql/lib/pg_jieba.lib

#pg启动时会加载这个dll和加载字典数据(即第6步中由libpg_jieba.dll改名得到的文件)
C:/pgsql/lib/pg_jieba.dll
#pg专用控制文件(不能修改,必须要)
C:/pgsql/share/extension/pg_jieba.control
#pg_jieba中的函数定义(不能修改,必须要)
C:/pgsql/share/extension/pg_jieba--1.1.1.sql

#后面的都是字典数据,如果要添加自定义数据,请在share/tsearch_data/jieba_user.dict中添加.具体信息请参看结巴中文分词
C:/pgsql/share/tsearch_data/jieba_base.dict
C:/pgsql/share/tsearch_data/jieba_hmm.model
C:/pgsql/share/tsearch_data/jieba_user.dict
C:/pgsql/share/tsearch_data/jieba.stop
C:/pgsql/share/tsearch_data/jieba.idf

自定义数据设置完成后,无需重启数据库,执行以下命令即可(可能需要管理员权限)

select jieba_reload_dict();

只要服务器的PostgreSQL版本和你编译pg_jieba时的PostgreSQL版本一致,你只要复制上述安装文件至服务器上即可,服务器不需要重新编译.

8 修改postgresql.conf

shared_preload_libraries = 'pg_jieba'	# (change requires restart)
C:/pgsql/bin/pg_ctl -D F:/pgdata stop -m fast
C:/pgsql/bin/pg_ctl -D F:/pgdata start



c:\pgsql\bin\psql -h 127.0.0.1 -U postgres
create extension pg_jieba;
select * from to_tsquery('jiebacfg', '是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。');
                                                            to_tsquery

----------------------------------------------------------------------------------------------------------------------------------
 '拖拉机' & '学院' & '手扶拖拉机' & '专业' & '不用' & '多久' & '会' & '升职' & '加薪' & '当上' & 'ceo' & '走上' & '人生' & '巅峰'
使用方法:打开scws-1.2.3\win32里的解决方案文件,里面包括了scws和zhparser,另外一个是scws的php扩展不用编译 zhparser是Postgresql中进行中文分词的常用扩展,网上的相关资料很多,安装的教程也不少,但大多数是linux的,并没有windows安装介绍。原因有两个方面,一个是本身像这种数据库服务器一般都是linux系统的,另外一个比较致命,zhparser本身并没提供windows环境下的编译工程,连依赖库scws的readme里也建议在linux环境下使用,或者用cygwin或mingw一类工具。对于博主这种只使用windows环境(其实Postgresql也是刚接触),连makefile也搞不明白的真是头大。好在万变不离其宗,编译环境只是工具,只要搞清楚个中原理(看代码),移植到windows+VS的环境应该也不是难事(当然这个也只是在轻量级的库下面适用)。下面进入正题: 干货在这里 ① 编译scws:由于 zhparser是基于scws(scws是简易中文分词系统的缩写,它的原理其实很简单,基于词典,将文本中的内容按照词典进行分词,提取关键字等。)做的分词,因此先要编译scws,网上下载下来的源码有vs2008的版本,因此,直接编译,后来出现头文件无法加入的错误,将相关文件编码方式修改成unicode解决(利用notepad++ 编码->转换为UTF-8编码)。 ② 编译zhparser:由于zhparser只提供了linux下面的makefile文件,想着用Mingw去编译,后来因为postgresql所在目录有空格,导致编译过程也失败。只好到vs2008里自建一个工程,建好工程后,首先把scws的头文件和库文件加进去,然后把postgresql的头文件和库文件加进去,设置项目属性为生成dll文件。 ③ 扩展安装:编译成功以后,在postgresql下面运行:create extension zhparser; 提示找不到control文件,在下载的源码文件夹里找到该文件,放到指定目录,再次运行sql,提示找不到dll文件,将dll文件放到指定目录,再运行,提示找不到zhprs_start函数,看来是dll没有正常export出函数,修改源代码中的相关函数声明,再次运行后,显示成功。但zhparser源文件下其实还有很多文件,包括一个词典文件和ini配置文件,还不知道应该放在何处。 ④ zhparser运行环境配置:运行测试sql语句,发现并没有实现分词,想起来应该是词典位置不正确,到zhparser源码中去搜寻,发现如下代码,看来是放到tsearch_data目录下面,用同样的方法找到ini文件的目录,将它们都放进去,再次运行测试代码,得到正确结果。 测试代码: CREATE EXTENSION zhparser; -- make test configuration using parser CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser); ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple; select to_tsvector('testzhcfg','南京市长江大桥');
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

kmblack1

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值