ngram模型 java_NgramContextUtils.java

/*

* Copyright (C) 2014 The Android Open Source Project

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

* http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*/

package com.android.inputmethod.latin.utils;

import com.android.inputmethod.latin.NgramContext;

import com.android.inputmethod.latin.NgramContext.WordInfo;

import com.android.inputmethod.latin.define.DecoderSpecificConstants;

import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.Arrays;

import java.util.regex.Pattern;

import javax.annotation.Nonnull;

public final class NgramContextUtils {

    /** Matches a run of one or more carriage-return / line-feed characters. */
    private static final Pattern NEWLINE_REGEX = Pattern.compile("[\\r\\n]+");

    /** Matches a run of one or more whitespace characters. */
    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");

    private NgramContextUtils() {
        // Utility class: only static members, never instantiated.
    }

    /**
     * Builds the n-gram context starting from the nth word before the cursor.
     *
     * <p>{@code n = 1} starts at the word immediately before the cursor, {@code n = 2} at the
     * word before that, and so on. Only the last line of {@code prev} is considered, and that
     * line is split on whitespace only. Slot {@code i} of the context is filled from the
     * {@code (n + i)}th word before the cursor. Filling stops at the first slot that cannot be
     * given a plain word: slots left unfilled keep the empty word info, and a word that is
     * missing, empty, or ends in a sentence terminator is recorded as beginning-of-sentence.
     *
     * <p>Examples (when DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
     * <pre>{@code
     * (n = 1) "abc def|" -> abc, def
     * (n = 1) "abc def |" -> abc, def
     * (n = 1) "abc 'def|" -> empty, 'def
     * (n = 1) "abc def. |" -> beginning-of-sentence
     * (n = 1) "abc def . |" -> beginning-of-sentence
     * (n = 2) "abc def|" -> beginning-of-sentence, abc
     * (n = 2) "abc def |" -> beginning-of-sentence, abc
     * (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
     * represent this situation using NgramContext. See TODO in the method.
     * TODO: The next example's result should be "abc, def". This has to be fixed before we
     * retrieve the prior context of Beginning-of-Sentence.
     * (n = 2) "abc def. |" -> beginning-of-sentence, abc
     * (n = 2) "abc def . |" -> abc, def
     * (n = 2) "abc|" -> beginning-of-sentence
     * (n = 2) "abc |" -> beginning-of-sentence
     * (n = 2) "abc. def|" -> beginning-of-sentence
     * }</pre>
     *
     * @param prev the text before the cursor; {@code null} yields
     *         {@code NgramContext.EMPTY_PREV_WORDS_INFO}
     * @param spacingAndPunctuations supplies the word connector, word separator and sentence
     *         terminator tests applied to word boundaries
     * @param n which previous word to start from, counted from 1
     * @return the context built from the words found; never {@code null}
     */
    @Nonnull
    public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
        if (null == prev) {
            return NgramContext.EMPTY_PREV_WORDS_INFO;
        }
        final String[] lineArray = NEWLINE_REGEX.split(prev);
        if (0 == lineArray.length) {
            // The text consisted of line breaks only.
            return new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
        }
        final String lastLine = lineArray[lineArray.length - 1];
        final String[] tokens = SPACE_REGEX.split(lastLine);

        final WordInfo[] context =
                new WordInfo[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
        Arrays.fill(context, WordInfo.EMPTY_WORD_INFO);

        // tokenIndex walks backwards through the tokens, one step per context slot.
        int tokenIndex = tokens.length - n;
        for (int slot = 0; slot < context.length; slot++, tokenIndex--) {
            // Look at the token that comes right after the one we are about to record.
            final int followerIndex = tokenIndex + 1;
            if (0 <= followerIndex && followerIndex < tokens.length
                    && startsWithWordConnector(tokens[followerIndex], spacingAndPunctuations)) {
                // The following token starts with a word connector.
                // TODO: Return meaningful context for this case.
                break;
            }

            // Fewer than (n + slot) words are available: beginning-of-sentence.
            if (tokenIndex < 0) {
                context[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }

            final String candidate = tokens[tokenIndex];
            // An empty token also means beginning-of-sentence.
            if (candidate.isEmpty()) {
                context[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }

            final char tail = candidate.charAt(candidate.length() - 1);
            // A token closing a sentence puts us at beginning-of-sentence.
            if (spacingAndPunctuations.isSentenceTerminator(tail)) {
                context[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }

            // A trailing word separator or connector leaves the context unclear.
            // TODO: Return meaningful context for this case.
            if (spacingAndPunctuations.isWordSeparator(tail)
                    || spacingAndPunctuations.isWordConnector(tail)) {
                break;
            }

            context[slot] = new WordInfo(candidate);
        }
        return new NgramContext(context);
    }

    /** Returns whether {@code word} is non-empty and its first character is a word connector. */
    private static boolean startsWithWordConnector(final String word,
            final SpacingAndPunctuations spacingAndPunctuations) {
        return !word.isEmpty() && spacingAndPunctuations.isWordConnector(word.charAt(0));
    }
}

Java程序

|

114行

|

5.43 KB

/*

* Copyright (C) 2014 The Android Open Source Project

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

* http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*/

package com.android.inputmethod.latin.utils;

import com.android.inputmethod.latin.NgramContext;

import com.android.inputmethod.latin.NgramContext.WordInfo;

import com.android.inputmethod.latin.define.DecoderSpecificConstants;

import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.Arrays;

import java.util.regex.Pattern;

import javax.annotation.Nonnull;

public final class NgramContextUtils {

    // One or more consecutive CR / LF characters.
    private static final Pattern NEWLINE_REGEX = Pattern.compile("[\\r\\n]+");

    // One or more consecutive whitespace characters.
    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");

    private NgramContextUtils() {
        // Static helpers only; this class is never instantiated.
    }

    // Returns the context made of the words preceding the cursor, starting at the nth one.
    // n = 1 starts with the word right before the cursor, n = 2 with the one before that, etc.
    // Only the last line of the text is used, and it is split on whitespace only.
    // A word ending in a sentence terminator (or a word that does not exist or is empty) is
    // reported as beginning-of-sentence, and no earlier word is looked at after that.
    // Example (when DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
    // (n = 1) "abc def|" -> abc, def
    // (n = 1) "abc def |" -> abc, def
    // (n = 1) "abc 'def|" -> empty, 'def
    // (n = 1) "abc def. |" -> beginning-of-sentence
    // (n = 1) "abc def . |" -> beginning-of-sentence
    // (n = 2) "abc def|" -> beginning-of-sentence, abc
    // (n = 2) "abc def |" -> beginning-of-sentence, abc
    // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
    // represent this situation using NgramContext. See TODO in the method.
    // TODO: The next example's result should be "abc, def". This has to be fixed before we
    // retrieve the prior context of Beginning-of-Sentence.
    // (n = 2) "abc def. |" -> beginning-of-sentence, abc
    // (n = 2) "abc def . |" -> abc, def
    // (n = 2) "abc|" -> beginning-of-sentence
    // (n = 2) "abc |" -> beginning-of-sentence
    // (n = 2) "abc. def|" -> beginning-of-sentence
    @Nonnull
    public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
        if (prev == null) {
            return NgramContext.EMPTY_PREV_WORDS_INFO;
        }

        final String[] textLines = NEWLINE_REGEX.split(prev);
        final int lineCount = textLines.length;
        if (lineCount == 0) {
            // Nothing but line breaks before the cursor.
            return new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
        }

        final String[] words = SPACE_REGEX.split(textLines[lineCount - 1]);
        final int wordCount = words.length;

        final WordInfo[] result =
                new WordInfo[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
        Arrays.fill(result, WordInfo.EMPTY_WORD_INFO);

        int slot = 0;
        while (slot < result.length) {
            final int current = wordCount - n - slot;
            final int next = current + 1;

            // Inspect the word that follows the one under consideration, when there is one.
            final boolean nextExists = next >= 0 && next < wordCount;
            if (nextExists && !words[next].isEmpty()
                    && spacingAndPunctuations.isWordConnector(words[next].charAt(0))) {
                // That following word begins with a word connector.
                // TODO: Return meaningful context for this case.
                break;
            }

            // Running out of words, or hitting an empty one, means beginning-of-sentence.
            if (current < 0 || words[current].isEmpty()) {
                result[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }

            final String word = words[current];
            final char ending = word.charAt(word.length() - 1);

            // So does a word whose last character terminates a sentence.
            if (spacingAndPunctuations.isSentenceTerminator(ending)) {
                result[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }

            // With a trailing word separator or connector the context is unclear.
            // TODO: Return meaningful context for this case.
            final boolean unclearBoundary = spacingAndPunctuations.isWordSeparator(ending)
                    || spacingAndPunctuations.isWordConnector(ending);
            if (unclearBoundary) {
                break;
            }

            result[slot] = new WordInfo(word);
            slot++;
        }

        return new NgramContext(result);
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值