package com.triggerprotein;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Rewrites BioNLP'09 trigger/protein annotation files so that the character
 * offsets of each protein ("#" lines) and trigger ("@" lines) refer to word
 * positions in a re-tokenized version of the sentence.
 *
 * How it works: every character of the original sentence is stored in
 * {@code oldSentMap} (index -> char) and every character of the re-tokenized
 * sentence in {@code newSentMap}. The two maps are walked in parallel; spaces
 * that the tokenizer inserted (present in the new sentence but not the old)
 * are counted so the word index of each annotated span can be computed.
 */
public class TriggerLocation
{
    /** Input directory of per-document trigger/protein info files. */
    private static final String IN_DIR =
            "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_triggerInfo";
    /** Output directory for the rewritten files (trailing separator included). */
    private static final String OUT_DIR =
            "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_NewTriggerInfo\\";

    public static void main(String[] args)
    {
        SentTokenizer sentT = new SentTokenizer();
        File dir = new File(IN_DIR);
        File[] files = dir.listFiles();
        if (files == null) // listFiles() returns null for a missing/unreadable directory
        {
            System.out.println("Input directory not found or not readable: " + IN_DIR);
            return;
        }
        for (int fileId = 0; fileId < files.length; fileId++)
        {
            String fileName = files[fileId].getName();
            // try-with-resources: the original opened a reader/writer per file
            // and never closed them (resource leak). The catch is per-file so a
            // failure on one document no longer aborts the whole batch.
            try (BufferedReader br = new BufferedReader(new FileReader(files[fileId]));
                 BufferedWriter bw = new BufferedWriter(new FileWriter(OUT_DIR + fileName)))
            {
                // char index -> char, for the original and the re-tokenized sentence
                Map<Integer, Character> oldSentMap = new HashMap<Integer, Character>();
                Map<Integer, Character> newSentMap = new HashMap<Integer, Character>();
                String sent;
                while ((sent = br.readLine()) != null)
                {
                    if (sent.length() == 0)
                    {
                        // blank line = sentence boundary: reset the per-sentence maps
                        oldSentMap.clear();
                        newSentMap.clear();
                        bw.newLine();
                        bw.flush();
                    }
                    else if (sent.startsWith("#")) // protein, e.g. "# T6 Protein S5 75 79 ERK2"
                    {
                        String[] pro = sent.split(" ");
                        int proStart = Integer.parseInt(pro[4]); // char offset, inclusive
                        int proEnd = Integer.parseInt(pro[5]);   // char offset, exclusive
                        // tokens 6.. form the protein surface string (may contain spaces)
                        StringBuilder proB = new StringBuilder();
                        for (int j = 6; j < pro.length; j++)
                        {
                            proB.append(pro[j]);
                            if (j < pro.length - 1)
                            {
                                proB.append(" ");
                            }
                        }
                        String protein = proB.toString();
                        // num1: tokenizer-inserted spaces before the span
                        // num2: original spaces before the span
                        // num3: tokenizer-inserted spaces inside the span
                        // num4: original spaces inside the span
                        // (meanings inferred from usage -- TODO confirm)
                        int num1 = 0, num2 = 0, num3 = 0, num4 = 0;
                        String oldMapStr = "", newMapStr = "";
                        int mark = 0; // 1 => an inserted space was consumed; re-visit same old index
                        for (int proX = 0; proX < oldSentMap.size(); proX++)
                        {
                            if (mark == 1)
                            {
                                proX = proX - 1;
                                mark = 0;
                            }
                            Character oldMapChar = oldSentMap.get(proX);
                            Character newMapChar = newSentMap.get(proX + num1 + num3);
                            if (proX >= proStart && proX < proEnd) // inside the protein span
                            {
                                if (oldMapChar.equals(newMapChar) && oldMapChar.equals(' '))
                                {
                                    num4 += 1;
                                    oldMapStr += oldSentMap.get(proX);
                                    newMapStr += newSentMap.get(proX + num1 + num3);
                                }
                                else if (oldMapChar.equals(newMapChar) && !oldMapChar.equals(' '))
                                {
                                    oldMapStr += oldSentMap.get(proX);
                                    newMapStr += newSentMap.get(proX + num1 + num3);
                                    if (proX == proEnd - 1) // reached end of the span
                                    {
                                        if (oldMapStr.equals(newMapStr) && oldMapStr.equals(protein))
                                        {
                                            String newProtein = sentT.sentTokenizer(protein);
                                            // number of words in the re-tokenized protein
                                            int numm;
                                            if (newProtein.indexOf(" ") != -1)
                                            {
                                                numm = newProtein.split(" ").length;
                                            }
                                            else
                                            {
                                                numm = 1;
                                            }
                                            // If the char just before the span is a separator the
                                            // tokenizer splits on, the span starts one word later.
                                            if (proStart > 0 && (oldSentMap.get(proStart - 1).equals('-')
                                                    || oldSentMap.get(proStart - 1).equals('(')
                                                    || oldSentMap.get(proStart - 1).equals('/')))
                                            {
                                                int proIndexStart = num1 + num2 + 1;
                                                int proIndexEnd = num1 + num2 + num3 + num4;
                                                if (numm == (proIndexEnd - proIndexStart + 1))
                                                {
                                                    bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
                                                    bw.newLine();
                                                    bw.flush();
                                                }
                                                else
                                                {
                                                    System.out.println(fileName + " %%%");
                                                    System.out.println(sent + " %%% ");
                                                    System.out.println(protein + " " + proIndexStart + " " + proIndexEnd + " " + newProtein);
                                                    System.out.println("计算蛋白质单词个数的时候出错");
                                                }
                                                break;
                                            }
                                            else
                                            {
                                                // NOTE(review): unlike the branch above (and unlike
                                                // the trigger handling) this path does not validate
                                                // numm against the index range -- confirm intended.
                                                int proIndexStart = num1 + num2;
                                                int proIndexEnd = num1 + num2 + num3 + num4;
                                                bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
                                                bw.newLine();
                                                bw.flush();
                                                break;
                                            }
                                        }
                                    }
                                }
                                else if (!oldMapChar.equals(newMapChar) && newMapChar.equals(' '))
                                {
                                    // tokenizer inserted a space inside the span
                                    num3 += 1;
                                    mark = 1;
                                }
                                else
                                {
                                    System.out.println(fileName);
                                    System.out.println(sent + "***");
                                    System.out.println("处理蛋白质时字符出错");
                                }
                            }
                            else if (!oldMapChar.equals(newMapChar) && newMapChar.equals(' '))
                            {
                                // tokenizer inserted a space before the span
                                num1 += 1;
                                mark = 1;
                            }
                            else if (oldMapChar.equals(newMapChar) && oldMapChar.equals(' '))
                            {
                                num2 += 1;
                            }
                            else if (oldMapChar.equals(newMapChar) && !oldMapChar.equals(' '))
                            {
                                // matching non-space characters: nothing to count
                            }
                            else
                            {
                                System.out.println(fileName);
                                System.out.println(sent);
                                System.out.println(proX + " " + oldMapChar + "***" + newMapChar + "不相等");
                            }
                        }
                    }
                    else if (sent.startsWith("@")) // trigger, e.g. "@ T120 Gene_expression S4 90 97 produce"
                    {
                        String[] tri = sent.split(" ");
                        String triType = tri[2];
                        int triStart = Integer.parseInt(tri[4]); // char offset, inclusive
                        int triEnd = Integer.parseInt(tri[5]);   // char offset, exclusive
                        // tokens 6.. form the trigger surface string (may contain spaces)
                        StringBuilder triB = new StringBuilder();
                        for (int j = 6; j < tri.length; j++)
                        {
                            triB.append(tri[j]);
                            if (j < tri.length - 1)
                            {
                                triB.append(" ");
                            }
                        }
                        String trigger = triB.toString();
                        // counters have the same meaning as in the protein branch
                        int num1 = 0, num2 = 0, num3 = 0, num4 = 0;
                        String oldMapStr = "", newMapStr = "";
                        int label = 0; // same role as 'mark' above
                        for (int triX = 0; triX < oldSentMap.size(); triX++)
                        {
                            if (label == 1)
                            {
                                triX = triX - 1;
                                label = 0;
                            }
                            Character oldMapChar = oldSentMap.get(triX);
                            Character newMapChar = newSentMap.get(triX + num1 + num3);
                            if (triX >= triStart && triX < triEnd) // inside the trigger span
                            {
                                if (oldMapChar.equals(newMapChar) && oldMapChar.equals(' '))
                                {
                                    num4 += 1;
                                    oldMapStr += oldSentMap.get(triX);
                                    newMapStr += newSentMap.get(triX + num1 + num3);
                                }
                                else if (oldMapChar.equals(newMapChar) && !oldMapChar.equals(' '))
                                {
                                    oldMapStr += oldSentMap.get(triX);
                                    newMapStr += newSentMap.get(triX + num1 + num3);
                                    if (triX == triEnd - 1) // reached end of the span
                                    {
                                        if (oldMapStr.equals(newMapStr) && oldMapStr.equals(trigger))
                                        {
                                            String newTrigger = sentT.sentTokenizer(trigger);
                                            // number of words in the re-tokenized trigger
                                            int numm;
                                            if (newTrigger.indexOf(" ") != -1)
                                            {
                                                numm = newTrigger.split(" ").length;
                                            }
                                            else
                                            {
                                                numm = 1;
                                            }
                                            // separator set is wider than for proteins: also '[' and
                                            // digits (original listed '-' twice; duplicate removed)
                                            if (triStart > 0 && (oldSentMap.get(triStart - 1).equals('-')
                                                    || oldSentMap.get(triStart - 1).equals('(')
                                                    || oldSentMap.get(triStart - 1).equals('/')
                                                    || oldSentMap.get(triStart - 1).equals('[')
                                                    || Character.isDigit(oldSentMap.get(triStart - 1))))
                                            {
                                                // NOTE(review): this branch does not validate numm,
                                                // while the else branch does (the protein branch is
                                                // the other way round) -- confirm intended.
                                                int triIndexStart = num1 + num2 + 1;
                                                int triIndexEnd = num1 + num2 + num3 + num4;
                                                bw.write("@ " + triType + " " + triIndexStart + " " + triIndexEnd + " " + newTrigger);
                                                bw.newLine();
                                                bw.flush();
                                                break;
                                            }
                                            else
                                            {
                                                int triIndexStart = num1 + num2;
                                                int triIndexEnd = num1 + num2 + num3 + num4;
                                                if (numm == (triIndexEnd - triIndexStart + 1))
                                                {
                                                    bw.write("@ " + triType + " " + triIndexStart + " " + triIndexEnd + " " + newTrigger);
                                                    bw.newLine();
                                                    bw.flush();
                                                }
                                                else
                                                {
                                                    System.out.println(fileName + " *** ");
                                                    System.out.println(sent + " *** ");
                                                    System.out.println(trigger + " " + newTrigger + " " + triIndexStart + " " + triIndexEnd);
                                                    System.out.println("计算的trigger词长度有问题");
                                                }
                                                break;
                                            }
                                        }
                                    }
                                }
                                else if (!oldMapChar.equals(newMapChar) && newMapChar.equals(' '))
                                {
                                    // tokenizer inserted a space inside the span
                                    num3 += 1;
                                    label = 1;
                                }
                                else
                                {
                                    System.out.println(fileName);
                                    System.out.println(sent + "&&&");
                                    System.out.println("处理触发词时出错");
                                }
                            }
                            else if (!oldMapChar.equals(newMapChar) && newMapChar.equals(' '))
                            {
                                // tokenizer inserted a space before the span
                                num1 += 1;
                                label = 1;
                            }
                            else if (oldMapChar.equals(newMapChar) && oldMapChar.equals(' '))
                            {
                                num2 += 1;
                            }
                            else if (oldMapChar.equals(newMapChar) && !oldMapChar.equals(' '))
                            {
                                // matching non-space characters: nothing to count
                            }
                            else
                            {
                                System.out.println(oldMapChar + "***" + newMapChar + "不相等");
                            }
                        }
                    }
                    else if (sent.startsWith("%")) // event line: intentionally ignored
                    {
                    }
                    else // the sentence itself
                    {
                        // index every char of the original sentence ...
                        char[] oldSentChar = sent.toCharArray();
                        for (int charId = 0; charId < oldSentChar.length; charId++)
                        {
                            oldSentMap.put(charId, oldSentChar[charId]);
                        }
                        // ... and of the re-tokenized sentence, then emit the latter
                        String newSent = sentT.sentTokenizer(sent);
                        char[] newSentChar = newSent.toCharArray();
                        for (int chId = 0; chId < newSentChar.length; chId++)
                        {
                            newSentMap.put(chId, newSentChar[chId]);
                        }
                        bw.write(newSent);
                        bw.newLine();
                        bw.flush();
                    }
                }
            }
            catch (IOException io)
            {
                io.printStackTrace();
            }
        }
    }
}
The program above is essentially correct. The idea is: every character of the sentence before tokenization is stored in one map, and every character of the tokenized sentence in another map. The two maps are compared character by character, and by matching a word's characters, the word's position in the tokenized sentence is located.
package com.triggerprotein;
import java.util.List;
import java.util.StringTokenizer;
import cmu.arktweetnlp.Twokenize;
/**
 * Post-processes the output of the CMU ArkTweetNLP tokenizer (Twokenize) for
 * biomedical text: tokens containing '/', '-', '+', '(' or ')' are split
 * further so that each separator becomes a stand-alone token, and the result
 * is returned as a single space-joined string.
 */
public class SentTokenizer
{
    /**
     * Re-tokenizes {@code sent} and returns the tokens joined by single spaces
     * (no trailing space after the final token).
     *
     * @param sent the raw sentence
     * @return the sentence with separator characters split into their own tokens
     */
    public String sentTokenizer(String sent)
    {
        StringBuilder sentBuff = new StringBuilder();
        List<String> tokenList = Twokenize.tokenizeRawTweetText(sent);
        for (int i = 0; i < tokenList.size(); i++)
        {
            // hoisted: the original re-evaluated tokenList.get(i) throughout,
            // and also kept an unused local (str11), now removed
            String token = tokenList.get(i);
            if (token.indexOf("/") != -1 && !token.startsWith("/") && token.endsWith("/"))
            {
                // "abc/" -> "abc / "
                String stri = token.substring(0, token.length() - 1);
                sentBuff.append(stri + " / ");
            }
            else if (token.indexOf("/") != -1 && token.startsWith("/") && !token.endsWith("/"))
            {
                // "/abc" -> "/ abc", additionally splitting any '-' inside
                String st = token.substring(1);
                sentBuff.append("/ ");
                if (st.indexOf("-") != -1)
                {
                    sentBuff.append(st.replace("-", " - "));
                    sentBuff.append(" ");
                }
                else
                {
                    sentBuff.append(st + " ");
                }
            }
            else if (token.indexOf("/") != -1 && !token.endsWith("/"))
            {
                // '/' in the middle: split on it and re-emit each segment,
                // handling '-', '+', '(' and ')' inside the segments
                String[] string = token.split("/");
                for (int p = 0; p < string.length; p++)
                {
                    if (p == string.length - 1)
                    {
                        // last segment: no trailing "/" is emitted
                        if (string[p].indexOf("-") != -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1)
                        {
                            if (string[p].equals("-"))
                            {
                                sentBuff.append(string[p].replace("-", "- "));
                            }
                            else
                            {
                                sentBuff.append(string[p].replace("-", " - "));
                                sentBuff.append(" ");
                            }
                        }
                        else if (string[p].indexOf("-") == -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1)
                        {
                            sentBuff.append(string[p] + " ");
                        }
                        else if (string[p].indexOf("(") != -1 && string[p].indexOf(")") != -1)
                        {
                            // returnDelims=true: '(' and ')' come back as tokens too
                            StringTokenizer strToke = new StringTokenizer(string[p], "()", true);
                            while (strToke.hasMoreElements())
                            {
                                sentBuff.append(strToke.nextToken() + " ");
                            }
                        }
                        else if (string[p].indexOf("(") != -1 && string[p].indexOf(")") == -1)
                        {
                            if (!string[p].endsWith("("))
                            {
                                sentBuff.append(string[p].replace("(", " ( "));
                                sentBuff.append(" ");
                            }
                            else
                            {
                                sentBuff.append(string[p].replace("(", " ( "));
                            }
                        }
                        else if (string[p].indexOf(")") != -1 && !string[p].endsWith(")"))
                        {
                            String strr = string[p].replace(")", " )");
                            if (strr.indexOf(")-") != -1)
                            {
                                sentBuff.append(strr.replace("-", " - "));
                            }
                            else
                            {
                                sentBuff.append(strr);
                                sentBuff.append(" ");
                            }
                            sentBuff.append(" ");
                        }
                        else if (string[p].equals("+)"))
                        {
                            sentBuff.append("+ ) ");
                        }
                        else if (string[p].equals("-)"))
                        {
                            sentBuff.append("- ) ");
                        }
                        else
                        {
                            sentBuff.append(string[p] + " ");
                        }
                    }
                    else
                    {
                        // non-last segment: re-emit the '/' that split() removed
                        if (string[p].indexOf("-") != -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1)
                        {
                            if (string[p].startsWith("-"))
                            {
                                sentBuff.append(string[p].replace("-", "- "));
                                sentBuff.append("/ ");
                            }
                            else
                            {
                                sentBuff.append(string[p].replace("-", " - "));
                                sentBuff.append(" / ");
                            }
                        }
                        else if (string[p].indexOf("-") == -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1)
                        {
                            if (string[p].endsWith("+"))
                            {
                                sentBuff.append(string[p].replace("+", " + "));
                                sentBuff.append("/ ");
                            }
                            else
                            {
                                sentBuff.append(string[p] + " ");
                                sentBuff.append("/ ");
                            }
                        }
                        else if (string[p].indexOf("(") != -1 && string[p].indexOf(")") != -1)
                        {
                            // returnDelims=true: '(' and ')' come back as tokens too
                            StringTokenizer strToke = new StringTokenizer(string[p], "()", true);
                            while (strToke.hasMoreElements())
                            {
                                sentBuff.append(strToke.nextToken() + " ");
                            }
                            sentBuff.append("/ ");
                        }
                        else if (string[p].indexOf(")") != -1 && string[p].endsWith(")") && string[p].indexOf("(") == -1)
                        {
                            String st = string[p].replace(")", " ) ");
                            if (st.indexOf("-") != -1)
                            {
                                // NOTE(review): pads '-' on the raw segment, discarding the
                                // ')' padding computed just above -- confirm this is intended
                                sentBuff.append(string[p].replace("-", " - "));
                            }
                            else
                            {
                                sentBuff.append(st);
                            }
                            sentBuff.append("/ ");
                        }
                        else if (string[p].endsWith("+") || string[p].endsWith("-"))
                        {
                            if (string[p].endsWith("+"))
                            {
                                sentBuff.append(string[p].replace("+", " + "));
                                sentBuff.append("/ ");
                            }
                            else if (string[p].endsWith("-"))
                            {
                                sentBuff.append(string[p].replace("-", " - "));
                                sentBuff.append("/ ");
                            }
                        }
                        // segments matching none of the above emit nothing (original behavior)
                    }
                }
            }
            else if (token.indexOf("-") != -1 && !token.startsWith("-") && !token.endsWith("-"))
            {
                // '-' in the middle: split and re-emit "- " between segments
                String[] str = token.split("-");
                for (int p = 0; p < str.length; p++)
                {
                    if (p == str.length - 1)
                    {
                        sentBuff.append(str[p] + " ");
                    }
                    else
                    {
                        if (str[p].endsWith(")"))
                        {
                            if (str[p].indexOf("(") != -1)
                            {
                                String rr = str[p].replace("(", " ( ");
                                sentBuff.append(rr.replace(")", " ) "));
                                sentBuff.append("- ");
                            }
                            else
                            {
                                sentBuff.append(str[p].replace(")", " ) "));
                                sentBuff.append("- ");
                            }
                        }
                        else
                        {
                            sentBuff.append(str[p] + " ");
                            sentBuff.append("- ");
                        }
                    }
                }
            }
            else if (token.startsWith("-") && !token.equals("-"))
            {
                sentBuff.append("- " + token.substring(1) + " ");
            }
            else if (token.endsWith("-") && !token.equals("-"))
            {
                // strip the trailing '-', split the remainder on '+' or '-' if present
                String preStr = token.substring(0, token.length() - 1);
                if (preStr.indexOf("+") != -1)
                {
                    String[] plusSplit = preStr.split("\\+");
                    for (int p = 0; p < plusSplit.length; p++)
                    {
                        sentBuff.append(plusSplit[p] + " ");
                        if (p < plusSplit.length - 1)
                        {
                            sentBuff.append("+ ");
                        }
                    }
                }
                else if (preStr.indexOf("-") != -1)
                {
                    String[] plusSplit = preStr.split("-");
                    for (int p = 0; p < plusSplit.length; p++)
                    {
                        sentBuff.append(plusSplit[p] + " ");
                        if (p < plusSplit.length - 1)
                        {
                            sentBuff.append("- ");
                        }
                    }
                }
                else
                {
                    sentBuff.append(preStr + " ");
                }
                sentBuff.append("- ");
            }
            else if (token.startsWith("+") && !token.equals("+"))
            {
                sentBuff.append("+ " + token.substring(1) + " ");
            }
            else if (token.endsWith("+") && !token.equals("+"))
            {
                // strip the trailing '+', split the remainder on any inner '+'
                String plusStr = token.substring(0, token.length() - 1);
                if (plusStr.indexOf("+") != -1)
                {
                    String[] plusSplit = plusStr.split("\\+");
                    for (int p = 0; p < plusSplit.length; p++)
                    {
                        sentBuff.append(plusSplit[p] + " ");
                        if (p < plusSplit.length - 1)
                        {
                            sentBuff.append("+ ");
                        }
                    }
                }
                else
                {
                    sentBuff.append(plusStr + " ");
                }
                sentBuff.append("+ ");
            }
            else if (token.indexOf("+") != -1 && !token.startsWith("+") && !token.endsWith("+"))
            {
                // '+' in the middle: split and re-emit "+ " between segments
                String[] str = token.split("\\+");
                for (int p = 0; p < str.length; p++)
                {
                    sentBuff.append(str[p] + " ");
                    if (p < str.length - 1)
                    {
                        sentBuff.append("+ ");
                    }
                }
            }
            else if (token.indexOf(")") != -1 && !token.equals(")") && token.endsWith(")"))
            {
                String plusStr = token.substring(0, token.length() - 1);
                sentBuff.append(plusStr + " ) ");
            }
            else if (token.indexOf(")") != -1 && !token.equals(")") && token.startsWith(")"))
            {
                String sufixStr = token.substring(1);
                sentBuff.append(") " + sufixStr + " ");
            }
            else if (token.indexOf("(") != -1 && !token.endsWith("(") && token.indexOf(")") == -1)
            {
                sentBuff.append(token.replace("(", " ( "));
                sentBuff.append(" ");
            }
            else if (i == tokenList.size() - 1)
            {
                // last token: no trailing space
                sentBuff.append(token);
            }
            else
            {
                sentBuff.append(token + " ");
            }
        }
        return sentBuff.toString();
    }
}
The program above post-processes the output of the Twitter tokenizer (Twokenize), further splitting tokens around separator characters.