完成搜索扩展需求-按首字母、拼音、汉字查询
思路:
- 数据库表增加两个字段分别存储全拼和首字母简拼
- 插入/更新数据时把拼音同步更新
- 搜索接口需要实现参数分词,拼接以及sql 查询并返回结果
一、mysql 方面需要处理的问题
1、汉字提取首字母
fristPinyin : 此函数是将一个中文字符串的第一个汉字转成拼音字母 (例如:“李”->l)
CREATE FUNCTION `fristPinyin`(P_NAME VARCHAR(255)) RETURNS varchar(255) CHARSET utf8
BEGIN
DECLARE V_RETURN VARCHAR(255);
SET V_RETURN = ELT(INTERVAL(CONV(HEX(left(CONVERT(P_NAME USING gbk),1)),16,10),
0xB0A1,0xB0C5,0xB2C1,0xB4EE,0xB6EA,0xB7A2,0xB8C1,0xB9FE,0xBBF7,
0xBFA6,0xC0AC,0xC2E8,0xC4C3,0xC5B6,0xC5BE,0xC6DA,0xC8BB,
0xC8F6,0xCBFA,0xCDDA,0xCEF4,0xD1B9,0xD4D1),
'a','b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','w','x','y','z');
RETURN V_RETURN;
END
2、姓名首字母全拼
pinyin :此函数是将一个中文字符串对应拼音母的每个相连 (例如:“李佳航”->ljh(或者说"张伟"-zw))
CREATE FUNCTION `pinyin`(P_NAME VARCHAR(255)) RETURNS varchar(255) CHARSET utf8
BEGIN
DECLARE V_COMPARE VARCHAR(255);
DECLARE V_RETURN VARCHAR(255);
DECLARE I INT;
SET I = 1;
SET V_RETURN = '';
while I < LENGTH(P_NAME) do
SET V_COMPARE = SUBSTR(P_NAME, I, 1);
IF (V_COMPARE != '') THEN
#SET V_RETURN = CONCAT(V_RETURN, ',', V_COMPARE);
SET V_RETURN = CONCAT(V_RETURN, fristPinyin(V_COMPARE));
#SET V_RETURN = fristPinyin(V_COMPARE);
END IF;
SET I = I + 1;
end while;
IF (ISNULL(V_RETURN) or V_RETURN = '') THEN
SET V_RETURN = P_NAME;
END IF;
RETURN V_RETURN;
END
3、汉字转换拼音
getPY:例如李磊&—> lilei&
CREATE DEFINER=`root`@`%` FUNCTION `getPY`(in_string VARCHAR(16000)) RETURNS varchar(21845) CHARSET utf8
BEGIN
#截取字符串,每次做截取后的字符串存放在该变量中,初始为函数参数in_string值
DECLARE tmp_str VARCHAR(21845) CHARSET gbk DEFAULT '' ;
#tmp_str的长度
DECLARE tmp_len SMALLINT DEFAULT 0;
#tmp_str的长度
DECLARE tmp_loc SMALLINT DEFAULT 0;
#截取字符,每次 left(tmp_str,1) 返回值存放在该变量中
DECLARE tmp_char VARCHAR(2) CHARSET gbk DEFAULT '';
#结果字符串
DECLARE tmp_rs VARCHAR(16000)CHARSET gbk DEFAULT '';
#拼音字符,存放单个汉字对应的拼音首字符
DECLARE tmp_cc VARCHAR(2) CHARSET gbk DEFAULT '';
#初始化,将in_string赋给tmp_str
SET tmp_str = in_string;
#初始化长度
SET tmp_len = LENGTH(tmp_str);
#如果被计算的tmp_str长度大于0则进入该while
WHILE tmp_len > 0 DO
#获取tmp_str最左端的首个字符,注意这里是获取首个字符,该字符可能是汉字,也可能不是。
SET tmp_char = LEFT(tmp_str,1);
#左端首个字符赋值给拼音字符
SET tmp_cc = tmp_char;
#获取字符的编码范围的位置,为了确认汉字拼音首字母是那一个
SET tmp_loc=INTERVAL(CONV(HEX(tmp_char),16,10),0xB0A1,0xB0C5,0xB2C1,0xB4EE,0xB6EA,0xB7A2,0xB8C1,0xB9FE,0xBBF7,0xBFA6,0xC0AC
,0xC2E8,0xC4C3,0xC5B6,0xC5BE,0xC6DA,0xC8BB,0xC8F6,0xCBFA,0xCDDA ,0xCEF4,0xD1B9,0xD4D1);
#判断左端首个字符是多字节还是单字节字符,要是多字节则认为是汉字且作以下拼音获取,要是单字节则不处理。如果是多字节字符但是不在对应的编码范围之内,即对应的不是大写字母则也不做处理,这样数字或者特殊字符就保持原样了
IF (LENGTH(tmp_char)>1 AND tmp_loc>0 AND tmp_loc<24) THEN
#获得汉字拼音首字符
SELECT ELT(tmp_loc,'A','B','C','D','E','F','G','H','J','K','L','M','N','O','P','Q','R','S','T','W','X','Y','Z') INTO tmp_cc;
END IF;
#将当前tmp_str左端首个字符拼音首字符与返回字符串拼接
SET tmp_rs = CONCAT(tmp_rs,tmp_cc);
#将tmp_str左端首字符去除
SET tmp_str = SUBSTRING(tmp_str,2);
#计算当前字符串长度
SET tmp_len = LENGTH(tmp_str);
END WHILE;
#返回结果字符串,如果需要小写加 LOWER()
RETURN LOWER(tmp_rs);
END
二、代码层面
- 导入依赖
<!-- ik分词 -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- 汉字和拼音转换 -->
<dependency>
<groupId>com.pinyin4j</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
- 完善表结构
whole_py:全拼
short_py:首字母简拼
- 编写汉字转拼音工具类
public class PinYinUtils {
/**
* 获取汉字首字母简拼
*
* @param hanZi 汉字
* @return
*/
public static String getJaneSpell(String hanZi) {
String result = null;
if (StringUtils.isNotBlank(hanZi)) {
char[] charArray = hanZi.toCharArray();
StringBuffer sb = new StringBuffer();
for (char ch : charArray) {
// 逐个汉字进行转换,每个汉字返回值为一个String数组(因为有多音字)
String[] stringArray = PinyinHelper.toHanyuPinyinStringArray(ch);
if (null != stringArray) {
sb.append(stringArray[0].charAt(0));
}
}
if (sb.length() > 0) {
result = sb.toString();
}
}
return result;
}
/**
* 获取汉字全拼
*
* @param hanZi 汉字
* @return 转换失败 返回null
*/
public static List<String> getWholeSpell(String hanZi) {
List<String> charList = new ArrayList<>();
hanZi = hanZi.trim().toLowerCase();
if (StringUtils.isNotBlank(hanZi)) {
char[] charArray = hanZi.toCharArray();
StringBuilder eng = null;
LinkedHashSet<String> pySet = new LinkedHashSet<>();
for (char ch : charArray) {
String str = String.valueOf(ch);
if (!str.matches("[a-zA-Z\\d]")) {
if (eng != null) {
charList.add(eng.toString());
eng = null;
}
String[] pyArray = PinyinHelper.toHanyuPinyinStringArray(ch);
if (null != pyArray && pyArray.length > 0) {
pySet.add(pyArray[0].replaceAll("\\d", ""));
}
} else {
if (eng == null) {
eng = new StringBuilder();
}
eng.append(ch);
}
}
charList.addAll(pySet);
if (eng != null) {
charList.add(eng.toString());
}
}
return charList;
}
}
- 分词处理
private static final String PY_REGEX = "[^aoeiuv]?h?[iuv]?(ai|ei|ao|ou|er|ang?|eng? |ong|a|o|e|i|u|ng|n)?";
private Set<String> textSplit(String keyword) {
Set<String> set = new LinkedHashSet<>();
StringReader reader = new StringReader(keyword.trim());
IKSegmenter iks = new IKSegmenter(reader, true);
try {
Lexeme lexeme;
while ((lexeme = iks.next()) != null) {
set.add(lexeme.getLexemeText());
}
} catch (IOException e) {
e.printStackTrace();
}
return set;
}
private Set<String> pySplit(String keyword) {
int tag = 0;
String s = keyword;
Set<String> set = new LinkedHashSet<>();
for (int i = s.length(); i > 0; i = i - tag) {
Matcher matcher = Pattern.compile(PY_REGEX).matcher(s);
if (matcher.find()) {
set.add(matcher.group());
tag = matcher.end() - matcher.start();
s = s.substring(tag);
}
}
return set;
}
/**
* 业务实现类搜索方法.
*
* @param keyWord 搜索的关键词
*/
@Override
public List<DiseaseListVO> search(String keyWord) {
if (StringUtils.isEmpty(keyWord)) {
return null;
}
// 过滤出中英文、数字
Set<String> keywordSet = textSplit(keyWord.replaceAll("[^0-9a-zA-Z\\u4e00-\\u9fa5]", ""));
StringBuilder pyBuilder = new StringBuilder();
StringBuilder hzBuilder = new StringBuilder();
keywordSet.forEach(k -> {
if (Pattern.compile("^[a-zA-Z]+$").matcher(k).matches()) {
pyBuilder.append(k);
} else {
hzBuilder.append(k).append("|");
}
});
DiseaseQueryParam query = new DiseaseQueryParam();
query.setExactName(keyWord);
// 多种检索条件为了使搜索范围更加广泛
if (pyBuilder.length() > 0) {
String whole = pyBuilder.toString().toLowerCase();
query.setShortPyRegexp(whole);
Set<String> pySet = pySplit(whole);
if (CollectionUtils.isNotEmpty(pySet)) {
whole = StringUtils.join(pySet, "|");
query.setWholePyLike(StringUtils.join(pySet, ","));
}
query.setWholePyRegexp(whole);
}
if (hzBuilder.length() > 0) {
String hz = hzBuilder.toString();
query.setLikeName(hz.substring(0, hz.length() - 1));
} else {
query.setLikeName(keyWord);
}
return libMapper.listForSearch(query);
}
- Mapper映射文件
<!-- CASE...WHEN...是为了按照满足条件数排序,从而筛选出最优结果。 -->
<select id="listForSearch" parameterType="parameterType" resultType="resultType">
<bind name="exactNameLike" value="'%'+exactName+'%'"/>
SELECT dl.id,dl.`name`,(
(CASE WHEN (dl.`name` = #{exactName}) THEN 1 ELSE 0 END) +
(CASE WHEN (dl.`name` LIKE #{exactNameLike}) THEN 1 ELSE 0 END) +
(CASE WHEN (dl.`name` REGEXP #{likeName}) THEN 1 ELSE 0 END) +
(CASE WHEN (dc.`name` REGEXP #{likeName}) THEN 1 ELSE 0 END)
<if test="wholePyRegexp!=null and wholePyRegexp!=''">
<bind name="pyLike" value="'%'+wholePyLike+'%'"/>
+ (CASE WHEN (dl.whole_py LIKE #{pyLike}) THEN 1 ELSE 0 END)
+ (CASE WHEN (dl.whole_py REGEXP #{wholePyRegexp}) THEN 1 ELSE 0 END)
</if>
<if test="shortPyRegexp!=null and shortPyRegexp!=''">
+ (CASE WHEN (dl.short_py REGEXP #{shortPyRegexp}) THEN 1 ELSE 0 END)
</if>
) i
FROM t_disease_lib dl
LEFT JOIN t_disease_classify dc ON dl.classify_id=dc.id
WHERE dl.`status`=1
HAVING i>1
ORDER BY i DESC
</select>
三、测试
{
"code": "200",
"message": "OK",
"data": [
{
"id": "1aefc5bdb8e444be8e47531eec1ce810",
"name": "蚕豆"
}
]
}
输入 cand / cd / 蚕豆 / 蚕 ,也均能正确返回,再广泛一点输入 c / d,则会 返回包含该字母的结果,字段文本更长的话同理
四、总结
- 需要先判断空,如果为空全查询 需要单独做处理,要不然分词查询会出问题
if(!name.isEmpty() && name != "") {
queryMap = PinYinUtil.nameSplit(name);
}
- 可满足简单搜索功能需求
- 缺点:只支持少数据量搜索,数据量大时SQL执行慢。