此代码有错误:判断预测的 segment 是否在 goldSegment 中时,仅仅对比了字符串是否相等,需要改进。
List<InputStructure> input = new ArrayList<InputStructure>();
List<OutputStructure> output = new ArrayList<OutputStructure>();
Stemmer stem = new Stemmer();
CorpusProcessing corpus = new CorpusProcessing();
//加载InputStructure和OutputStructure
try
{
FileReader frX = new FileReader("E:\\multiword-trigger\\merge\\train.txt");
BufferedReader brX = new BufferedReader(frX);
FileReader frY = new FileReader("E:\\multiword-trigger\\merge\\goldSent.txt");
BufferedReader brY = new BufferedReader(frY);
String sentX, sentY;
while((sentX = brX.readLine()) != null && (sentY = brY.readLine()) != null)
{
InputStructure XStructure = new InputStructure();
List<String> tokens = corpus.getToken(sentX);
XStructure.sentStructure = corpus.getToken(sentX);
input.add(XStructure);
OutputStructure YStructure = new OutputStructure();
//对 YStructure 中的 segment 进行赋值
String[] goldSeg= sentY.split(" ");
for(int yi = 0; yi < goldSeg.length; yi++)
{
if(goldSeg[yi].startsWith("@"))
{
YStructure.goldSegment.add(goldSeg[yi].substring(1));
}else
{
YStructure.goldSegment.add(goldSeg[yi]);
}
}
//对 YStructure 中 goldFeature 进行赋值
List<List<String>> result = corpus.getSentInfo(sentX);
List<String> sentPos = result.get(1);
List<String> sentLemma = result.get(2);
for(int endId = 0; endId < tokens.size(); endId++)
{
int maxLength = 5;
if(endId + 1 >= maxLength)
{
for(int segLength = 1; segLength <= maxLength; segLength++)
{
String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";
// 两行 * 号之间的 for 循环就是为了得到 segment
//************************************
for(int k = 1; k <= segLength; k++)
{
if(k == 1)
{
preSeg = tokens.get(endId - k + 1);
xtokenPos = sentPos.get(endId - k + 1);
xtokenLemma = sentLemma.get(endId - k + 1);
xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");
}else
{
preSeg = tokens.get(endId - k + 1) + " " + preSeg;
xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;
xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;
xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + xtokenStem;
}
}
//************************************
//判断得到的 xtoken 是不是标准的 trigger
for(int segId = 0; segId < endId; segId++)
{
String[] segType = goldSeg[segId].split("__");
String seg = segType[0];
String type = segType[1];
if(preSeg.equals(seg))
{
//计算 seg#type 的 goldFeature
Map<String, Double> goldFeature = new HashMap<String, Double>();
goldFeature.put("f1=" + preSeg + "#" + type, 1.0);
goldFeature.put("f2=" + xtokenPos + "#" + type, 1.0);
goldFeature.put("f3=" + xtokenLemma + "#" + type, 1.0);
goldFeature.put("f4=" + xtokenStem + "#" + type, 1.0);
YStructure.sentGoldFeature.add(goldFeature);
}else
{
//计算 seg#non 的 goldFeature
Map<String, Double> goldFeature = new HashMap<String, Double>();
goldFeature.put("f1=" + preSeg + "#non", 1.0);
goldFeature.put("f2=" + xtokenPos + "#non", 1.0);
goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);
goldFeature.put("f4=" + xtokenStem + "#non", 1.0);
YStructure.sentGoldFeature.add(goldFeature);
}
}//内部代码逻辑有错误
}
}else
{
maxLength = endId+1;
for(int segLength = 1; segLength <= maxLength; segLength++)
{
String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";
// 两行 * 号之间的 for 循环就是为了得到 segment
//************************************
for(int k = 1; k <= segLength; k++)
{
if(k == 1)
{
preSeg = tokens.get(endId - k + 1);
xtokenPos = sentPos.get(endId - k + 1);
xtokenLemma = sentLemma.get(endId - k + 1);
xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");
}else
{
preSeg = tokens.get(endId - k + 1) + " " + preSeg;
xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;
xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;
xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + xtokenStem;
}
}
//************************************
//判断得到的 xtoken 是不是标准的 trigger
boolean equal = false;
String triType = "";
for(int segId = 0; segId <= endId; segId++)
{
String[] segType = goldSeg[segId].split("__");
String seg = segType[0];
String type = segType[1];
triType = type;
if(preSeg.equals(seg))
{
equal = true;
break;
}else
{
equal = false;
}
}
//&&&&&&&&&&&&
if(equal == true)
{
//计算 seg#type 的 goldFeature
Map<String, Double> goldFeature = new HashMap<String, Double>();
goldFeature.put("f1=" + preSeg + "#" + triType, 1.0);
goldFeature.put("f2=" + xtokenPos + "#" + triType, 1.0);
goldFeature.put("f3=" + xtokenLemma + "#" + triType, 1.0);
goldFeature.put("f4=" + xtokenStem + "#" + triType, 1.0);
YStructure.sentGoldFeature.add(goldFeature);
}else
{
//计算 seg#non 的 goldFeature
Map<String, Double> goldFeature = new HashMap<String, Double>();
goldFeature.put("f1=" + preSeg + "#non", 1.0);
goldFeature.put("f2=" + xtokenPos + "#non", 1.0);
goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);
goldFeature.put("f4=" + xtokenStem + "#non", 1.0);
YStructure.sentGoldFeature.add(goldFeature);
}
//************
}
}
}
output.add(YStructure);
}//对句子进行循环的结尾
brY.close();
frY.close();
brX.close();
frX.close();
}catch(IOException io)
{
io.printStackTrace();
}
2883

被折叠的 条评论
为什么被折叠?



