Sentiment Analysis of Web Page Text with mmSeg4j Word Segmentation
I have recently been working on sentiment analysis of web pages. After going through some papers, I found that machine-learning-based approaches were a poor fit for the project, so I put together my own analysis algorithm based on Chinese word segmentation and positive/negative word lists.
The principle is simple:
article polarity = Σ(occurrences of positive words × weight) − Σ(occurrences of negative words × weight)
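For example, if a positive word with weight 2 appears once and a negative word with weight 1 appears three times, the score is 2 × 1 − 1 × 3 = −1, so the article leans negative.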
On top of that, negative articles get an additional relevance check.
For Chinese word segmentation I picked mmSeg4j, for no reason other than that I have been using it all along and its performance is quite good; some domain words, however, you have to add to its words.dic file yourself. mmSeg4j download: http://code.google.com/p/mmseg4j/.
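Instead of editing the bundled words.dic, mmSeg4j can also be pointed at a custom dictionary directory. A minimal sketch, with the directory path being an assumption of mine rather than anything from the project:

```java
import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;

public class CustomDicDemo {
    public static void main(String[] args) {
        // Assumed path: a directory holding one or more words*.dic files,
        // one word per line (UTF-8); mmSeg4j loads them in addition to its
        // built-in dictionary.
        Dictionary dic = Dictionary.getInstance("/path/to/custom-dic");
        ComplexSeg seg = new ComplexSeg(dic);
        System.out.println("segmenter ready: " + seg);
    }
}
```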
Before writing any code, I planned three text files:
- neg_words: negative words, one per line, in the form “太差-1”; the number after the “-” is the word's weight.
- pos_words: positive words, configured the same way as the negative ones.
- rel_words: relevance words, one word per line. This file exists so that sentiment can be tied to a particular subject, e.g. when we only care about recent analysis related to “万科” (sample contents follow this list).
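For illustration, neg_words might contain lines like these (made-up entries, not the project's actual list):

```
太差-1
下跌-2
亏损-3
```

pos_words follows the same word-weight format, and rel_words is simply one word per line, e.g. 万科.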
At application startup the three files are loaded into a singleton object; the code follows:
```java
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.springframework.stereotype.Component;

import com.yidatec.vis.psms.commons.PSMSConstants;

/**
 * Loads the sentiment word lists.
 *
 * @author William Xu
 */
@Component
public class TrendencyWordsLoader {

    private Map<String, Integer> negWordMap;
    private Map<String, Integer> posWordMap;
    private List<String> refWordList;

    public TrendencyWordsLoader() {
        loadWords();
    }

    private void loadWords() {
        negWordMap = new HashMap<String, Integer>();
        posWordMap = new HashMap<String, Integer>();
        refWordList = new ArrayList<String>();
        try {
            // Negative words: one "word-weight" pair per line, e.g. "太差-1".
            BufferedReader br = new BufferedReader(new FileReader(
                    this.getClass().getClassLoader().getResource(PSMSConstants.NEG_WORDS_PATH).getFile()));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] words = line.split("-");
                negWordMap.put(words[0], Integer.parseInt(words[1]));
            }
            br.close();

            // Positive words, same format as the negative list.
            br = new BufferedReader(new FileReader(
                    this.getClass().getClassLoader().getResource(PSMSConstants.POS_WORDS_PATH).getFile()));
            while ((line = br.readLine()) != null) {
                String[] words = line.split("-");
                posWordMap.put(words[0], Integer.parseInt(words[1]));
            }
            br.close();

            // Relevance words: one word per line.
            br = new BufferedReader(new FileReader(
                    this.getClass().getClassLoader().getResource(PSMSConstants.REL_WORDS_PATH).getFile()));
            while ((line = br.readLine()) != null) {
                refWordList.add(line);
            }
            br.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (NumberFormatException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public Map<String, Integer> getNegWordMap() {
        return negWordMap;
    }

    public Map<String, Integer> getPosWordMap() {
        return posWordMap;
    }

    public List<String> getRefWordList() {
        return refWordList;
    }
}
```
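One caveat: line.split("-") assumes the words themselves never contain a “-”, and a malformed line throws a NumberFormatException that aborts loading of the remaining entries, so the word files need to be kept clean.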
With the word lists loaded, we can use mmSeg4j to segment the web page text and run the analysis:
```java
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Word;
import com.yidatec.vis.psms.entity.SolrQueryResult;

@Component
public class TrendencyAnalyser {

    @Autowired
    TrendencyWordsLoader wordLoader;

    protected static final Dictionary dic = Dictionary.getInstance();
    protected static final ComplexSeg seg = new ComplexSeg(dic);

    /**
     * Forward threshold: max distance when the relevance word comes first.
     */
    private final int PS_THRESHOLD = 50;

    /**
     * Backward threshold: max distance when the negative word comes first.
     */
    private final int NS_THRESHOLD = 30;

    /**
     * Segmentation map for the whole article: token string -> occurrences.
     */
    private Map<String, List<Word>> segments = null;

    private List<Word> negs = null;
    private List<Word> poses = null;
    private List<Word> rels = null;

    public int analyzeTrendency(String title, String content) {
        try {
            boolean flag = isRelTitle(title);
            if (flag) {
                int titleTendency = getTitleTrendency();
                if (titleTendency < 0) {
                    return SolrQueryResult.NEGATIVE_NATURE;
                } else if (titleTendency > 0) {
                    return SolrQueryResult.POSITIVE_NATURE;
                }
            }
            clearAll();
            initSegmentsMap(new StringReader(title + " " + content));
            parseNegWordsMap();
            parsePosWordsMap();
            int result = analyzeContentsTrendency();
            if (flag) { // title is relevant: the text polarity alone decides
                if (result < 0) {
                    return SolrQueryResult.NEGATIVE_NATURE;
                } else if (result == 0) {
                    return SolrQueryResult.NEUTRAL_NATURE;
                } else {
                    return SolrQueryResult.POSITIVE_NATURE;
                }
            } else { // title not relevant: the distance-matrix check is needed
                parseRelWordsMap();
                if (result < 0) {
                    if (analyzeTrendencyByMatrix()) {
                        return SolrQueryResult.NEGATIVE_NATURE;
                    } else {
                        return SolrQueryResult.NEUTRAL_NATURE;
                    }
                } else if (result == 0) {
                    return SolrQueryResult.NEUTRAL_NATURE;
                } else {
                    return SolrQueryResult.POSITIVE_NATURE;
                }
            }
        } catch (IOException e) {
            return SolrQueryResult.NEUTRAL_NATURE;
        }
    }

    private void clearAll() {
        if (segments != null) {
            segments.clear();
        }
        if (negs != null) {
            negs.clear();
        }
        if (poses != null) {
            poses.clear();
        }
        if (rels != null) { // also reset relevance hits between articles
            rels.clear();
        }
    }

    /**
     * Does the title contain any of the relevance words?
     */
    private boolean isRelTitle(String title) {
        try {
            initTitleSegmentsMap(new StringReader(title));
            List<String> relWords = wordLoader.getRefWordList();
            for (String word : relWords) {
                if (segments.containsKey(word)) {
                    return true;
                }
            }
        } catch (IOException e) {
            return false;
        }
        return false;
    }

    /**
     * Polarity of the title alone (uses the title-only segments map).
     */
    private int getTitleTrendency() {
        parseNegWordsMap();
        parsePosWordsMap();
        return analyzeContentsTrendency();
    }

    /**
     * Polarity of the whole article:
     * Σ(positive weights) − Σ(negative weights).
     */
    private int analyzeContentsTrendency() {
        int negScore = 0;
        int posScore = 0;
        if (negs != null && negs.size() > 0) {
            for (Word word : negs) {
                negScore += wordLoader.getNegWordMap().get(word.getString());
            }
        }
        if (poses != null && poses.size() > 0) {
            for (Word word : poses) {
                posScore += wordLoader.getPosWordMap().get(word.getString());
            }
        }
        return posScore - negScore;
    }

    /**
     * Cross-matrix relevance check: is any negative word close enough
     * to any relevance word?
     */
    private boolean analyzeTrendencyByMatrix() {
        if (rels == null || rels.size() == 0) {
            return false;
        }
        if (negs == null || negs.size() == 0) {
            return false;
        }
        for (int i = 0; i < rels.size(); i++) {
            for (int j = 0; j < negs.size(); j++) {
                Word relWord = rels.get(i);
                Word negWord = negs.get(j);
                if (relWord.getStartOffset() < negWord.getStartOffset()) {
                    // relevance word comes first: forward threshold applies
                    if (negWord.getStartOffset() - relWord.getStartOffset()
                            - relWord.getLength() < PS_THRESHOLD) {
                        return true;
                    }
                } else {
                    // negative word comes first: backward threshold applies
                    if (relWord.getStartOffset() - negWord.getStartOffset()
                            - negWord.getLength() < NS_THRESHOLD) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /**
     * Segment the title first (resets the segments map).
     */
    private void initTitleSegmentsMap(Reader reader) throws IOException {
        segments = new HashMap<String, List<Word>>();
        MMSeg mmSeg = new MMSeg(reader, seg);
        Word word = null;
        while ((word = mmSeg.next()) != null) {
            if (segments.containsKey(word.getString())) {
                segments.get(word.getString()).add(word);
            } else {
                List<Word> words = new ArrayList<Word>();
                words.add(word);
                segments.put(word.getString(), words);
            }
        }
    }

    /**
     * Segment the body (keeps any existing title segments).
     */
    private void initSegmentsMap(Reader reader) throws IOException {
        if (segments == null) {
            segments = new HashMap<String, List<Word>>();
        }
        MMSeg mmSeg = new MMSeg(reader, seg);
        Word word = null;
        while ((word = mmSeg.next()) != null) {
            if (segments.containsKey(word.getString())) {
                segments.get(word.getString()).add(word);
            } else {
                List<Word> words = new ArrayList<Word>();
                words.add(word);
                segments.put(word.getString(), words);
            }
        }
    }

    /**
     * Collect occurrences of negative words from the segments map.
     */
    private void parseNegWordsMap() {
        Map<String, Integer> negMap = wordLoader.getNegWordMap();
        Set<String> negKeys = negMap.keySet();
        for (String negKey : negKeys) {
            List<Word> negWords = segments.get(negKey);
            if (negWords != null) {
                if (negs == null) {
                    negs = new ArrayList<Word>();
                }
                negs.addAll(negWords);
            }
        }
    }

    /**
     * Collect occurrences of positive words from the segments map.
     */
    private void parsePosWordsMap() {
        Map<String, Integer> posMap = wordLoader.getPosWordMap();
        Set<String> posKeys = posMap.keySet();
        for (String posKey : posKeys) {
            List<Word> posWords = segments.get(posKey);
            if (posWords != null) {
                if (poses == null) {
                    poses = new ArrayList<Word>();
                }
                poses.addAll(posWords);
            }
        }
    }

    /**
     * Collect occurrences of relevance words from the segments map.
     */
    private void parseRelWordsMap() {
        List<String> refWords = wordLoader.getRefWordList();
        for (String word : refWords) {
            List<Word> relWords = segments.get(word);
            if (relWords != null) {
                if (rels == null) {
                    rels = new ArrayList<Word>();
                }
                rels.addAll(relWords);
            }
        }
    }
}
```
```java
// Classpath locations of the three word files (the resource names are assumed
// to match the files described above; the loader resolves them via getResource()).
public class PSMSConstants {
    public static final String NEG_WORDS_PATH = "neg_words.txt";
    public static final String POS_WORDS_PATH = "pos_words.txt";
    public static final String REL_WORDS_PATH = "rel_words.txt";
}

// Polarity constants: positive = 1, negative = -1, neutral = 0.
public class SolrQueryResult {
    public static final int NEGATIVE_NATURE = -1;
    public static final int NEUTRAL_NATURE = 0;
    public static final int POSITIVE_NATURE = 1;
}
```
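To tie it together, a hypothetical usage sketch; the base package to scan and the sample strings are assumptions of mine, not part of the project:

```java
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;

import com.yidatec.vis.psms.entity.SolrQueryResult;
// import for TrendencyAnalyser omitted; its package is project-specific

public class TrendencyDemo {
    public static void main(String[] args) {
        // Both classes are Spring @Components, so they are picked up by a
        // component scan; the package name here is assumed.
        ApplicationContext ctx =
                new AnnotationConfigApplicationContext("com.yidatec.vis.psms");
        TrendencyAnalyser analyser = ctx.getBean(TrendencyAnalyser.class);
        int nature = analyser.analyzeTrendency("万科近期表现太差", "……正文……");
        if (nature == SolrQueryResult.NEGATIVE_NATURE) {
            System.out.println("relevant negative article");
        }
    }
}
```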
A few strategies are at work here:
- Analyze the title first: if it contains a relevance word, only the polarity of the text still needs to be judged.
- If the title contains a relevance word and the title itself shows a polarity, the title's polarity is taken as final.
- If neither of the above holds, the title and body are merged, then segmented and scanned for sentiment words together.
- An article whose overall sentiment comes out negative needs a further relevance check.
- Relevance is judged via a distance matrix.
- A maximum forward distance threshold and a maximum backward distance threshold have to be set; a worked example follows.
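To make the thresholds concrete: with PS_THRESHOLD = 50, if the relevance word “万科” (length 2) starts at offset 120 and a negative word starts at offset 150, the gap is 150 − 120 − 2 = 28, which is under the threshold, so the article is confirmed as relevant negative news; when the negative word comes first, the stricter backward threshold NS_THRESHOLD = 30 applies instead.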