jieba java 磁性_jieba分詞的應用（java）

最新推荐文章于 2024-04-25 19:52:07 发布

以号拼命多次

最新推荐文章于 2024-04-25 19:52:07 发布

阅读量181

点赞数

文章标签： jieba java 磁性

本文链接：https://blog.csdn.net/weixin_36091906/article/details/114314354

版权

在上一篇說的猜你喜歡功能中，又加了新的需求，需要對關鍵詞進行分詞，擴大推薦文章的范圍，這樣能夠拓展用戶的喜歡范圍，這時候我就想到可以用jieba分詞對中文進行分詞，同樣的需要去官網下載源碼，這樣方便自己對源碼的修改以達到自己的目的。這里，我需要判斷切分出來的詞是否是無意義的詞，就需要對切出來的詞進行篩選，這時候，jieba分詞的一個屬性就體現出它的強大之處了，jieba分詞會將切分出來的詞進行詞性的定義，我可以通過對jieba分詞后的詞的詞性進行判斷，篩選出名詞，去掉無用的連接詞、形容詞等其他詞性的詞來達到我的分詞目的。下面是對源碼進行修改的部分。(大家也可以根據自己的需要，暴露原來隱藏的屬性來實現自己的功能。)

/**

*在jieba分詞的SegToken.java中對SegToken類增加一個成員變量properties來存儲單詞的詞性

**/

package com.huaban.analysis.jieba;

/**
 * One token produced by the segmenter: the word text, its start and end offsets, and the
 * part-of-speech tag stored alongside it.
 */
public class SegToken {

    /** Text of the token. */
    public String word;

    /** Start offset supplied by the creator of the token. */
    public int startOffset;

    /** End offset supplied by the creator of the token. */
    public int endOffset;

    /** Part-of-speech tag of {@link #word}; added so callers can filter tokens by word class. */
    public String properties;

    /**
     * Creates a token.
     *
     * @param word        text of the token
     * @param startOffset start offset of the token
     * @param endOffset   end offset of the token
     * @param properties  part-of-speech tag of the token
     */
    public SegToken(String word, int startOffset, int endOffset, String properties) {
        this.word = word;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.properties = properties;
    }

    /**
     * Formats the token as {@code [word, startOffset, endOffset, properties]}.
     *
     * @return the bracketed, comma-separated field values
     */
    @Override
    public String toString() {
        return "[" + word + ", " + startOffset + ", " + endOffset + ", " + properties + "]";
    }
}

將從字典文件dict.txt中讀取出來的單詞的詞性存儲到properties 的字段中

/**

*在WordDictionary.java中增加property的Map存儲word與詞性的關系。建立索引關系，增加獲取詞性的公共方法

**/

package com.huaban.analysis.jieba;

import java.io.BufferedReader;

import java.nio.file.DirectoryStream;

import java.nio.file.Files;

import java.nio.file.Path;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.nio.charset.Charset;

import java.nio.charset.StandardCharsets;

import java.util.HashMap;

import java.util.HashSet;

import java.util.Locale;

import java.util.Map;

import java.util.Map.Entry;

import java.util.Set;

public class WordDictionary {

private static WordDictionary singleton;

private static final String MAIN_DICT = "/dict.txt";

private static String USER_DICT_SUFFIX = ".dict";

public final Map freqs = new HashMap();

public final Map property = new HashMap();

public final Set loadedPath = new HashSet();

private Double minFreq = Double.MAX_VALUE;

private Double total = 0.0;

private DictSegment _dict;

private WordDictionary() {

this.loadDict();

}

public static WordDictionary getInstance() {

if (singleton == null) {

synchronized (WordDictionary.class) {

if (singleton == null) {

singleton = new WordDictionary();

return singleton;

}

return singleton;

}

/**

* for ES to initialize the user dictionary.

*@param configFile

public void init(Path configFile) {

String abspath = configFile.toAbsolutePath().toString();

System.out.println("initialize user dictionary:" + abspath);

synchronized (WordDictionary.class) {

if (loadedPath.contains(abspath))

return;

DirectoryStream stream;

try {

stream = Files.newDirectoryStream(configFile, String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX));

for (Path path: stream){

System.err.println(String.format(Locale.getDefault(), "loading dict %s", path.toString()));

singleton.loadUserDict(path);

}

loadedPath.add(abspath);

} catch (IOException e) {

// TODO Auto-generated catch block

// e.printStackTrace();

System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));

}

/**

* let user just use their own dict instead of the default dict

public void resetDict(){

_dict = new DictSegment((char) 0);

freqs.clear();

}

public void loadDict() {

_dict = new DictSegment((char) 0);

InputStream is = this.getClass().getResourceAsStream(MAIN_DICT);

try {

BufferedReader br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));

long s = System.currentTimeMillis();

while (br.ready()) {

String line = br.readLine();

String[] tokens = line.split("[\t ]+");

if (tokens.length < 2)

continue;

String word = tokens[0];

String properties = "";

double freq = Double.valueOf(tokens[1]);

if(tokens.length == 3)

properties = tokens[2];

total += freq;

word = addWord(word);

freqs.put(word, freq);

property.put(word, properties);//存儲單詞與詞性的索引關系

}

// normalize

for (Entry entry : freqs.entrySet()) {

entry.setValue((Math.log(entry.getValue() / total)));

minFreq = Math.min(entry.getValue(), minFreq);

}

System.out.println(String.format(Locale.getDefault(), "main dict load finished, time elapsed %d ms",

System.currentTimeMillis() - s));

}

catch (IOException e) {

System.err.println(String.format(Locale.getDefault(), "%s load failure!", MAIN_DICT));

}

finally {

try {

if (null != is)

is.close();

}

catch (IOException e) {

System.err.println(String.format(Locale.getDefault(), "%s close failure!", MAIN_DICT));

}

private String addWord(String word) {

if (null != word && !"".equals(word.trim())) {

String key = word.trim().toLowerCase(Locale.getDefault());

_dict.fillSegment(key.toCharArray());

return key;

}

else

return null;

}

public void loadUserDict(Path userDict) {

loadUserDict(userDict, StandardCharsets.UTF_8);

}

public void loadUserDict(Path userDict, Charset charset) {

try {

BufferedReader br = Files.newBufferedReader(userDict, charset);

long s = System.currentTimeMillis();

int count = 0;

while (br.ready()) {

String line = br.readLine();

String[] tokens = line.split("[\t ]+");

if (tokens.length < 1) {

// Ignore empty line

continue;

}

String word = tokens[0];

double freq = 3.0d;

String properties = "";

if (tokens.length == 2)

freq = Double.valueOf(tokens[1]);

if(tokens.length == 3)

properties = tokens[2];//獲取單詞的詞性，存入map中

word = addWord(word);

freqs.put(word, Math.log(freq / total));

property.put(word, properties);

count++;

}

System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));

br.close();

}

catch (IOException e) {

System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));

}

public DictSegment getTrie() {

return this._dict;

}

public boolean containsWord(String word) {

return freqs.containsKey(word);

}

public String getProperties(String word){//通過單詞獲取單詞的詞性

if(containsWord(word))

return property.get(word);

else

return "";

}

public Double getFreq(String key) {

if (containsWord(key))

return freqs.get(key);

else

return minFreq;

}

將詞性存儲到SegToken的成員變量中，方便生成和調取。

/**

*在JiebaSegmenter.java中生成每個切分詞的SegToken對象進行存儲，方便使用

**/

public List process(String paragraph, SegMode mode) {//對paragraphs進行切分，存儲到SegToken中

List tokens = new ArrayList();

StringBuilder sb = new StringBuilder();

int offset = 0;

for (int i = 0; i < paragraph.length(); ++i) {

char ch = CharacterUtil.regularize(paragraph.charAt(i));

if (CharacterUtil.ccFind(ch))

sb.append(ch);

else {

if (sb.length() > 0) {

// process

if (mode == SegMode.SEARCH) {

for (String word : sentenceProcess(sb.toString())) {

tokens.add(new SegToken(word, offset, offset += word.length(), wordDict.getProperties(word)));//將詞性存儲進去

}

else {

for (String token : sentenceProcess(sb.toString())) {

if (token.length() > 2) {

String gram2;

int j = 0;

for (; j < token.length() - 1; ++j) {

gram2 = token.substring(j, j + 2);

if (wordDict.containsWord(gram2))

tokens.add(new SegToken(gram2, offset + j, offset + j + 2, wordDict.getProperties(gram2)));

}

if (token.length() > 3) {

String gram3;

int j = 0;

for (; j < token.length() - 2; ++j) {

gram3 = token.substring(j, j + 3);

if (wordDict.containsWord(gram3))

tokens.add(new SegToken(gram3, offset + j, offset + j + 3, wordDict.getProperties(gram3)));

}

tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getProperties(token)));

}

sb = new StringBuilder();

offset = i;

}

if (wordDict.containsWord(paragraph.substring(i, i + 1)))

tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset, wordDict.getProperties(paragraph.substring(i, i + 1))));

else

tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset, wordDict.getProperties(paragraph.substring(i, i + 1))));

}

if (sb.length() > 0)

if (mode == SegMode.SEARCH) {

for (String token : sentenceProcess(sb.toString())) {

tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getProperties(token)));

}

else {

for (String token : sentenceProcess(sb.toString())) {

if (token.length() > 2) {

String gram2;

int j = 0;

for (; j < token.length() - 1; ++j) {

gram2 = token.substring(j, j + 2);

if (wordDict.containsWord(gram2))

tokens.add(new SegToken(gram2, offset + j, offset + j + 2, wordDict.getProperties(gram2)));

}

if (token.length() > 3) {

String gram3;

int j = 0;

for (; j < token.length() - 2; ++j) {

gram3 = token.substring(j, j + 3);

if (wordDict.containsWord(gram3))

tokens.add(new SegToken(gram3, offset + j, offset + j + 3, wordDict.getProperties(gram3)));

}

tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getProperties(token)));

}

return tokens;

}

然后在關鍵詞切分的方法中進行判斷，選擇所需要詞性的word即可

// Segment every keyword with jieba and append each resulting word longer than one
// character to the keyword string.
// NOTE(review): the surrounding text describes selecting words by part of speech, but only
// the length filter is applied here; SegToken.properties is available if that is wanted.
for (String sentence : keyword_list) {
    List<SegToken> tokens = segmenter.process(sentence, SegMode.SEARCH);
    for (SegToken s : tokens) {
        if (s.word.length() > 1) {
            keyword += " " + s.word;
        }
    }
}

keyword_list = keyword.split("[,;\\s'\\*\\+|\\^]+");

// A LinkedHashSet drops duplicate keywords while keeping their first-seen order.
Set<String> keywordList = new LinkedHashSet<String>(Arrays.asList(keyword_list));

到此完成新需求的實現。與大家共勉~

最后附上jieba分詞的詞性類別及表示方法：

形語素

形容詞性語素。形容詞代碼為 a，語素代碼ｇ前面置以A。

形容詞

取英語形容詞 adjective的第1個字母。

副形詞

直接作狀語的形容詞。形容詞代碼 a和副詞代碼d並在一起。

名形詞

具有名詞功能的形容詞。形容詞代碼 a和名詞代碼n並在一起。

區別詞

取漢字“別”的聲母。

連詞

取英語連詞 conjunction的第1個字母。

副語素

副詞性語素。副詞代碼為 d，語素代碼ｇ前面置以D。

副詞

取 adverb的第2個字母，因其第1個字母已用於形容詞。

嘆詞

取英語嘆詞 exclamation的第1個字母。

方位詞

取漢字“方”的聲母。

語素

絕大多數語素都能作為合成詞的“詞根”，取漢字“根”的聲母。

前接成分

取英語 head的第1個字母。

成語

取英語成語 idiom的第1個字母。

簡稱略語

取漢字“簡”的聲母。

后接成分

習用語

習用語尚未成為成語，有點“臨時性”，取“臨”的聲母。

數詞

取英語 numeral的第3個字母，n，u已有他用。

名語素

名詞性語素。名詞代碼為 n，語素代碼ｇ前面置以N。

名詞

取英語名詞 noun的第1個字母。

人名

名詞代碼 n和“人(ren)”的聲母並在一起。

地名

名詞代碼 n和處所詞代碼s並在一起。

機構團體

“團”的聲母為 t，名詞代碼n和t並在一起。

其他專名

“專”的聲母的第 1個字母為z，名詞代碼n和z並在一起。

擬聲詞

取英語擬聲詞 onomatopoeia的第1個字母。

介詞

取英語介詞 prepositional的第1個字母。

量詞

取英語 quantity的第1個字母。

代詞

取英語代詞 pronoun的第2個字母,因p已用於介詞。

處所詞

取英語 space的第1個字母。

時語素

時間詞性語素。時間詞代碼為 t,在語素的代碼g前面置以T。

時間詞

取英語 time的第1個字母。

助詞

取英語助詞 auxiliary的第2個字母，因a已用於形容詞。

動語素

動詞性語素。動詞代碼為 v。在語素的代碼g前面置以V。

動詞

取英語動詞 verb的第一個字母。

副動詞

直接作狀語的動詞。動詞和副詞的代碼並在一起。

名動詞

指具有名詞功能的動詞。動詞和名詞的代碼並在一起。

標點符號

非語素字

非語素字只是一個符號，字母 x通常用於代表未知數、符號。

語氣詞

取漢字“語”的聲母。

狀態詞

取漢字“狀”的聲母的前一個字母。

未知詞

不可識別詞及用戶自定義詞組。取英文Unknown首兩個字母。(非北大標准，CSW分詞中定義)

以号拼命多次

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
jieba java 磁性_jieba分詞的應用（java）

在上一篇說的猜你喜歡功能中，又加了新的需求，需要對關鍵詞進行分詞，擴大推薦文章的范圍，這樣能夠拓展用戶的喜歡范圍，這時候我就想到可以用jieba分詞對中文進行分詞，同樣的需要去官網下載源碼，這樣方便自己對源碼的修改以達到自己的目的。這里，我需要判斷切分出來的詞是否是無意義的詞，就需要對切出來的詞進行篩選，這時候，jieba分詞的一個屬性就體現出它的強大之處了，jieba分詞會將切分出來的詞進行詞性...
复制链接

扫一扫