<pre name="code" class="java">public static void main(String[] args) {
    // Demo: find which word positions the phrase "mRNA level" occupies,
    // given its character offsets inside the sentence.
    // NOTE(review): the two concatenated literals join as "level" + "in"
    // with no space between them — presumably intentional demo data; confirm.
    String text = "There was no difference in the effects of AzaC versus AzadC, as both increased the IRF-4 mRNA level"
            + "in CML-T1 cells as well (data not shown).";
    int startIndex = 89; // character offset where the target phrase starts ('m' of "mRNA")
    int endIndex = 99;   // character offset of the space just after "level"
    // Everything before the target phrase; counting its whitespace-separated
    // chunks tells us how many words precede the phrase.
    String str = text.substring(0, startIndex);
    // Also counts colons and commas as part of words (limitation noted below).
    // "\\s{1,}" matches one or more whitespace characters.
    // FIX: the original line was truncated after the '.', missing ".length;"
    // (a compile error); mirror the working pattern used for count2 below.
    int count = str.split("\\s{1,}").length;
    String str2 = text.substring(startIndex, endIndex);
    int count2 = str2.split("\\s{1,}").length;
    // Print the word index of each word inside the selected span.
    for (int i = 0; i < count2; i++) {
        System.out.println(count + i - 1);
    }
}
其中要确定位置的是“mRNA level”是句中的第几个单词,我们所知道的信息是m字符在句中的位置是89,level后面空格的位置是99。
以上方式有个缺点:就是遇到标点符号和括号的时候,不能将之认为是一个“单词”,于是进行改进后的算法如下,以下运用了斯坦福的分词器
package com.a2Process;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/**
 * Locates which word numbers a character span occupies inside a sentence,
 * using the Stanford tokenizer so that punctuation and brackets are each
 * counted as their own token (unlike plain whitespace splitting).
 */
public class FindPosition2 {

    /**
     * Shared tokenization pipeline. Building a {@link StanfordCoreNLP}
     * instance loads models and is expensive, so it is created once and
     * reused instead of being rebuilt on every wordNum() call (the original
     * code re-created it per call).
     */
    private static final StanfordCoreNLP PIPELINE = createPipeline();

    /** Builds the pipeline with only the annotators needed for token counting. */
    private static StanfordCoreNLP createPipeline() {
        Properties props = new Properties();
        // Only tokenization and sentence splitting are required here.
        props.put("annotators", "tokenize, ssplit");
        return new StanfordCoreNLP(props);
    }

    public static void main(String[] args) {
        String text = "To define the requirements of Foxp3 with respect to inhibition of NF-kappaB-dependent transcription, we utilized a mutant of Foxp3 acking the FKH domain (Figure 2A) [16], similar to the scurfy mutant Foxp3 of mice, and a mutant Foxp3 protein from a patient with IPEX [4,11,14,17].";
        int startIndex = 143; // 1-based offset of the first character of the target span
        int endIndex = 153;   // offset just past the last character of the target span
        // Tokens that appear before the selected span.
        String str = text.substring(0, startIndex - 1);
        int preCountWord = wordNum(str);
        // Tokens inside the selected span itself.
        String selectedStr = text.substring(startIndex - 1, endIndex);
        int selectedCountWord = wordNum(selectedStr);
        // Print the word index of every token the span covers.
        for (int i = 0; i < selectedCountWord; i++) {
            System.out.println(preCountWord + i);
        }
    }

    /**
     * Counts how many tokens (words, punctuation marks, brackets, ...) the
     * given string contains according to the Stanford tokenizer, printing
     * each token as a side effect.
     *
     * @param str text to tokenize; may span several sentences
     * @return the total number of tokens found
     */
    public static int wordNum(String str) {
        Annotation document = new Annotation(str);
        PIPELINE.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        int wordnum = 0;
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                wordnum++;
                String word = token.get(TextAnnotation.class);
                System.out.println(word);
            }
        }
        return wordnum;
    }
}