java 文本倾向分析_网页文本倾向性分析

import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.springframework.stereotype.Component;

import com.yidatec.vis.psms.commons.PSMSConstants;

/**

* 加载词汇

* @author William Xu

*/

@Component
public class TrendencyWordsLoader {

/** 负面词汇 -> 权重 */
private Map<String, Integer> negWordMap;

/** 正面词汇 -> 权重 */
private Map<String, Integer> posWordMap;

/** 相关词汇列表(每行一个词) */
private List<String> refWordList;

public TrendencyWordsLoader(){

loadWords();

}

/**
 * 从 classpath 加载三类词汇表:
 * 负面/正面词汇文件每行格式为 "词-权重",相关词汇文件每行一个词。
 * 加载失败仅打印日志,词表保持为空(与原实现的降级行为一致)。
 */
private void loadWords(){

negWordMap = new HashMap<String, Integer>();

posWordMap = new HashMap<String, Integer>();

refWordList = new ArrayList<String>();

loadWeightedWords(PSMSConstants.NEG_WORDS_PATH, negWordMap);

loadWeightedWords(PSMSConstants.POS_WORDS_PATH, posWordMap);

loadPlainWords(PSMSConstants.REL_WORDS_PATH, refWordList);

}

/**
 * 加载 "词-权重" 格式的词汇文件。
 * 使用 try-with-resources 保证 reader 总被关闭
 * (原实现复用同一个变量打开多个文件,且异常时不关闭,存在句柄泄漏)。
 * 单行格式错误只跳过该行,不再中断整个词表的加载。
 */
private void loadWeightedWords(String path, Map<String, Integer> target) {

try (BufferedReader br = new BufferedReader(new FileReader(
        this.getClass().getClassLoader().getResource(path).getFile()))) {

String line;

while ((line = br.readLine()) != null) {

String[] words = line.split("-");

if (words.length < 2) {
continue; // 非 "词-权重" 格式的行直接跳过
}

try {
target.put(words[0], Integer.parseInt(words[1].trim()));
} catch (NumberFormatException e) {
e.printStackTrace(); // 权重非数字:跳过该行
}

}

} catch (IOException e) {

e.printStackTrace();

}

}

/** 加载每行一个词的词汇文件 */
private void loadPlainWords(String path, List<String> target) {

try (BufferedReader br = new BufferedReader(new FileReader(
        this.getClass().getClassLoader().getResource(path).getFile()))) {

String line;

while ((line = br.readLine()) != null) {

target.add(line);

}

} catch (IOException e) {

e.printStackTrace();

}

}

public Map<String, Integer> getNegWordMap() {

return negWordMap;

}

public Map<String, Integer> getPosWordMap() {

return posWordMap;

}

public List<String> getRefWordList() {

return refWordList;

}

}

加载词汇表后,就可以使用mmSeg4j对网页文本进行分词,并进行分析了,代码如下:

import java.io.IOException;

import java.io.Reader;

import java.io.StringReader;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.Set;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Component;

import com.chenlb.mmseg4j.ComplexSeg;

import com.chenlb.mmseg4j.Dictionary;

import com.chenlb.mmseg4j.MMSeg;

import com.chenlb.mmseg4j.Word;

import com.yidatec.vis.psms.entity.SolrQueryResult;

@Component

public class TrendencyAnalyser {

@Autowired

TrendencyWordsLoader wordLoader;

protected static final Dictionary dic = Dictionary.getInstance();

protected static final ComplexSeg seg = new ComplexSeg(dic);

/**

* 正序阈值

*/

private final int PS_THRESHOLD = 50;

/**

* 逆序阈值

*/

private final int NS_THRESHOLD = 30;

/**

* 整片文章分词Map

*/

private Map> segments = null;

private List negs = null;

private List poses = null;

private List rels = null;

public int analyzeTrendency(String title, String content) {

try {

boolean flag = isRelTitle(title);

if (flag) {

int titleTendency = getTitleTrendency();

if (titleTendency < 0) {

return SolrQueryResult.NEGATIVE_NATURE;

} else if (titleTendency > 0) {

return SolrQueryResult.POSITIVE_NATURE;

}

}

clearAll();

initSegmentsMap(new StringReader(title + " " + content));

parseNegWordsMap();

parsePosWordsMap();

int result = analyzeContentsTrendency();

if (flag) { // 标题相关,仅判断文本倾向性

if (result < 0) {

return SolrQueryResult.NEGATIVE_NATURE;

} else if (result == 0) {

return SolrQueryResult.NEUTRAL_NATURE;

} else {

return SolrQueryResult.POSITIVE_NATURE;

}

} else { // 标题无关,需要复杂的矩阵算法

parseRelWordsMap();

if (result < 0) {

if (analyzeTrendencyByMatrix()) {

return SolrQueryResult.NEGATIVE_NATURE;

} else {

return SolrQueryResult.NEUTRAL_NATURE;

}

} else if (result == 0) {

return SolrQueryResult.NEUTRAL_NATURE;

} else {

return SolrQueryResult.POSITIVE_NATURE;

}

}

} catch (IOException e) {

return SolrQueryResult.NEUTRAL_NATURE;

}

}

private void clearAll() {

if (segments != null) {

segments.clear();

}

if (negs != null) {

negs.clear();

}

if (poses != null) {

poses.clear();

}

}

/**

* 是否是倾向性相关标题

*

* @param title

* @return

*/

private boolean isRelTitle(String title) {

try {

initTitleSegmentsMap(new StringReader(title));

List relWords = wordLoader.getRefWordList();

for (String word : relWords) {

if (segments.containsKey(word)) {

return true;

}

}

} catch (IOException e) {

return false;

}

return false;

}

/**

* 获取标题倾向性

*

* @param title

* @return

*/

private int getTitleTrendency() {

parseNegWordsMap();

parsePosWordsMap();

return analyzeContentsTrendency();

}

/**

* 判断整篇文章的倾向性

*

* @param title

* @param content

* @return

*/

private int analyzeContentsTrendency() {

int negScore = 0;

int posScore = 0;

if (negs != null && negs.size() > 0) {

for (Word word : negs) {

negScore += wordLoader.getNegWordMap().get(word.getString());

}

}

if (poses != null && poses.size() > 0) {

for (Word word : poses) {

posScore += wordLoader.getPosWordMap().get(word.getString());

}

}

return posScore - negScore;

}

/**

* 交叉矩阵判断文本倾向性

*

* @return

*/

private boolean analyzeTrendencyByMatrix() {

if (rels == null || rels.size() == 0) {

return false;

}

if (negs == null || negs.size() == 0) {

return false;

}

for (int i = 0; i < rels.size(); i++) {

for (int j = 0; j < negs.size(); j++) {

Word relWord = rels.get(i);

Word negWord = negs.get(j);

if (relWord.getStartOffset() < negWord.getStartOffset()) {

if (negWord.getStartOffset() - relWord.getStartOffset()

- relWord.getLength() < PS_THRESHOLD) {

return true;

}

} else {

if (relWord.getStartOffset() - negWord.getStartOffset()

- negWord.getLength() < NS_THRESHOLD) {

return true;

}

}

}

}

return false;

}

/**

* 先对标题进行分词

*

* @param reader

* @throws IOException

*/

private void initTitleSegmentsMap(Reader reader) throws IOException {

segments = new HashMap>();

MMSeg mmSeg = new MMSeg(reader, seg);

Word word = null;

while ((word = mmSeg.next()) != null) {

if (segments.containsKey(word.getString())) {

segments.get(word.getString()).add(word);

}

List words = new ArrayList();

words.add(word);

segments.put(word.getString(), words);

}

}

/**

* 对正文进行分词

*

* @param reader

* @throws IOException

*/

private void initSegmentsMap(Reader reader) throws IOException {

if (segments == null) {

segments = new HashMap>();

}

MMSeg mmSeg = new MMSeg(reader, seg);

Word word = null;

while ((word = mmSeg.next()) != null) {

if (segments.containsKey(word.getString())) {

segments.get(word.getString()).add(word);

}

List words = new ArrayList();

words.add(word);

segments.put(word.getString(), words);

}

}

/**

* 解析负面词汇

*/

private void parseNegWordsMap() {

Map negMap = wordLoader.getNegWordMap();

Set negKeys = negMap.keySet();

for (String negKey : negKeys) {

List negWords = segments.get(negKey);

if (negWords != null) {

if (negs == null) {

negs = new ArrayList();

}

negs.addAll(negWords);

}

}

}

/**

* 解析正面词汇

*/

private void parsePosWordsMap() {

Map posMap = wordLoader.getPosWordMap();

Set posKeys = posMap.keySet();

for (String posKey : posKeys) {

List posWords = segments.get(posKey);

if (posWords != null) {

if (poses == null) {

poses = new ArrayList();

}

poses.addAll(posWords);

}

}

}

/**

* 解析相关词汇

*/

private void parseRelWordsMap() {

List refWords = wordLoader.getRefWordList();

for (String word : refWords) {

List relWords = segments.get(word);

if (relWords != null) {

if (rels == null) {

rels = new ArrayList();

}

rels.addAll(relWords);

}

}

}

}

这里面用了一些策略:

先分析标题:如果标题中出现相关词汇,但标题本身没有明显倾向,则只需判断正文的倾向性即可。

如果标题中出现相关词汇,并且标题本身存在明显倾向,则直接以标题的倾向为最终结果。

如果上述都不成立,则合并标题与正文,一起进行分词与情感词汇识别。

对于通篇识别为负面情感的文章需要进一步判断相关性。

采用距离矩阵的方式判断相关性。

需要设定正向最大距离阈值与反向最大距离阈值。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值