TextRank Java 实现:基于 TextRank 的文本摘要(summarization)实现

这是一个Java实现的TextRank算法,用于文本摘要。通过建立句子的相似度矩阵并进行归一化处理,然后使用TextRank模型计算句子权重,最终根据权重选取最重要的句子作为摘要。
摘要由CSDN通过智能技术生成

/**

*

*/

package net.phoenix.nlp.summarization;

import java.io.IOException;

import java.io.Reader;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.Iterator;

import java.util.List;

import net.phoenix.nlp.pos.Term;

import net.phoenix.nlp.pos.Tokenizer;

import net.phoenix.nlp.sentence.Detector;

import net.phoenix.nlp.sentence.Sentence;

import org.ejml.ops.CommonOps;

import org.ejml.simple.SimpleMatrix;

/**

* 基于TextRank的summarization实现,算法描述参见论文: ,

* Rada Mihalcea and Paul Tarau

*

* @author lixf

*

*/

public class TextRankSummarization implements Summarization {

private Tokenizer tokenizer;

private Detector detector;

private double threshold;

private double damp;

/**

*

* @param tokenizer

*            设置分词算法;

* @param detector

*            设置断句算法;

*/

public TextRankSummarization(Tokenizer tokenizer, Detector detector) {

this.tokenizer = tokenizer;

this.detector = detector;

this.threshold = 0.001;

this.damp = 0.8;

}

@Override

public List summarize(Reader paragraph, int size)

throws IOException {

List sentences = this.toList(this.detector.detect(paragraph));

SimpleMatrix similarity = this.buildSimilarityMatrix(sentences);

//              System.out.println("similarity========================");

//              System.out.println(similarity);

this.normalize(similarity);

//              System.out.println("normalize========================");

//              System.out.println(similarity);

SimpleMatrix weights = this.buildWeightVector(similarity);

//              System.out.println("weights========================");

//              System.out.println(weights);

return this.rankByWeight(sentences, weights, size);

}

private List rankByWeight(List sentences,

SimpleMatrix weights, int size) {

if(size> sentences.size())

return sentences;

int index = 0;

for(Sentence sentence : sentences){

((SentenceWrapper)sentence).setScore(weights.get(index));

this.scoreSentence((SentenceWrapper)sentence);

index ++;

}

//排序,获得权重最高的size个句子;

Collections.sort(sentences, new Comparator(){

@Override

public int compare(Sentence o1, Sentence o2) {

double sub =  ((SentenceWrapper)o1).getScore() - ((SentenceWrapper)o2).getScore();

if(sub > 0)

return -1;

else

return 1;

}});

//对权重最高的size个句子,恢复它们的顺序;

List result = new ArrayList(sentences.subList(0, size));

Collections.sort(result,  new Comparator(){

@Override

public int compare(Sentence o1, Sentence o2) {

return o1.getStartOffset() - o2.getStartOffset();

}});

return result;

}

/**

* 可以继承这个方法来给句子打分;

* @param sentence

*/

protected void scoreSentence(SentenceWrapper sentence){

}

/**

* 用textRank算法计算权重矩阵。

*

* @param matrix

* @return

*/

protected SimpleMatrix buildWeightVector(SimpleMatrix matrix) {

SimpleMatrix vector = new SimpleMatrix(matrix.numCols(), 1);

vector.set(1);

SimpleMatrix vecDamp = new SimpleMatrix(matrix.numCols(), 1);

vecDamp.set(1- this.damp);

double diff = 1;

while(diff > this.threshold){

SimpleMatrix next = matrix.mult(vector);

//next = (1-damp)+damp * next;

next = vecDamp.plus(this.damp, next);

diff = next.minus(vector).normF();

vector = next;

//                      System.out.println("weight==========");

//                      System.out.println(vector);

}

return vector;

}

/**

* 建立相似度矩阵

*

* @param sentences

* @return

*/

protected SimpleMatrix buildSimilarityMatrix(List sentences) {

for (Sentence sentence : sentences) {

List terms = this.tokenizer.tokenize(sentence.toString());

((SentenceWrapper) sentence).setTerms(terms);

}

SimpleMatrix matrix = new SimpleMatrix(sentences.size(),

sentences.size());

matrix.set(0);

for (int i = 0; i < sentences.size(); i++)

for (int j = i + 1; j < sentences.size(); j++) {

// 相似度+1,消除0值;

double similarity = this.similarity(sentences.get(i),

sentences.get(j)) + 1;

matrix.set(i, j, similarity);

matrix.set(j, i, similarity);

}

return matrix;

}

/**

* 将matrix归一化处理。

*

* @param matrix

*/

protected void normalize(SimpleMatrix matrix) {

SimpleMatrix one = new SimpleMatrix(matrix.numCols(), matrix.numRows());

one.set(1);

SimpleMatrix sum = matrix.mult(one);

CommonOps.elementDiv(matrix.getMatrix(), sum.getMatrix());

CommonOps.transpose(matrix.getMatrix());

}

/**

* 计算句子之间的相似度

*

* @param first

* @param second

* @return

*/

protected double similarity(Sentence sentence1, Sentence sentence2) {

List tokens1 = ((SentenceWrapper) sentence1).getTerms();

List tokens2 = ((SentenceWrapper) sentence2).getTerms();

if (tokens1.size() == 0 || tokens2.size() == 0)

return 0;

int count = 0;

for (Term first : tokens1)

for (Term second : tokens2) {

if (first.getName().equalsIgnoreCase(second.getName()))

count++;

}

return count / Math.log(tokens1.size()) + Math.log(tokens2.size());

}

/**

* 将iterator转为List。

*

* @param sentences

* @return

*/

private List toList(Iterator sentences) {

List list = new ArrayList();

while (sentences.hasNext())

list.add(new SentenceWrapper(sentences.next()));

return list;

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值