segment-based 解析

此代码有错误:判断预测的 segment 是否在 goldSegment 中时,仅仅对比了字符串是否相等,需要改进。
List<InputStructure> input = new ArrayList<InputStructure>();
		
		List<OutputStructure> output = new ArrayList<OutputStructure>();
		
		Stemmer stem = new Stemmer();
		
		CorpusProcessing corpus = new CorpusProcessing();
		//加载InputStructure和OutputStructure
		try
		{
			FileReader frX = new FileReader("E:\\multiword-trigger\\merge\\train.txt");
			
			BufferedReader brX = new BufferedReader(frX);
			
			FileReader frY = new FileReader("E:\\multiword-trigger\\merge\\goldSent.txt");
			
			BufferedReader brY = new BufferedReader(frY);
			
			String sentX, sentY;
			
			while((sentX = brX.readLine()) != null && (sentY = brY.readLine()) != null)
			{
				InputStructure XStructure = new InputStructure();
				
				List<String> tokens = corpus.getToken(sentX);
				
				XStructure.sentStructure = corpus.getToken(sentX);
								
				input.add(XStructure);
				
				OutputStructure YStructure = new OutputStructure();
				//对 YStructure 中的 segment 进行赋值
				String[] goldSeg= sentY.split(" ");
				
				for(int yi = 0; yi < goldSeg.length; yi++)
				{
					if(goldSeg[yi].startsWith("@"))
					{
						YStructure.goldSegment.add(goldSeg[yi].substring(1));				
					}else
					{
						YStructure.goldSegment.add(goldSeg[yi]);
					}
				}
				//对 YStructure 中 goldFeature 进行赋值
				List<List<String>> result = corpus.getSentInfo(sentX);
				
				List<String> sentPos = result.get(1);
				
				List<String> sentLemma = result.get(2);
				
				for(int endId = 0; endId < tokens.size(); endId++)
				{	
					int maxLength = 5;
					
					if(endId + 1 >= maxLength)
					{
						for(int segLength = 1; segLength <= maxLength; segLength++)
						{
							String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";
							// 两行 * 号之间的 for 循环就是为了得到 segment
							//************************************
							for(int k = 1; k <= segLength; k++)
							{
								if(k == 1)
								{
									preSeg = tokens.get(endId - k + 1);
									
									xtokenPos = sentPos.get(endId - k + 1);
									
									xtokenLemma = sentLemma.get(endId - k + 1);
									
									xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");
								}else
								{
									preSeg = tokens.get(endId - k + 1) + " " + preSeg;
									
									xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;
									
									xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;
									
									xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + xtokenStem;
								}
							}
							//************************************
							//判断得到的 xtoken 是不是标准的 trigger
							
							for(int segId = 0; segId < endId; segId++)
							{
								String[] segType = goldSeg[segId].split("__");
								
								String seg = segType[0];
								
								String type = segType[1];
								
								if(preSeg.equals(seg))
								{
									//计算 seg#type 的 goldFeature
									Map<String, Double> goldFeature = new HashMap<String, Double>();
									
									goldFeature.put("f1=" + preSeg + "#" + type, 1.0);
									
									goldFeature.put("f2=" + xtokenPos + "#" + type, 1.0);
									
									goldFeature.put("f3=" + xtokenLemma + "#" + type, 1.0);
									
									goldFeature.put("f4=" + xtokenStem + "#" + type, 1.0);
									
									YStructure.sentGoldFeature.add(goldFeature);
								}else
								{
									//计算 seg#non 的 goldFeature
									Map<String, Double> goldFeature = new HashMap<String, Double>();
									
									goldFeature.put("f1=" + preSeg + "#non", 1.0);
									
									goldFeature.put("f2=" + xtokenPos + "#non", 1.0);
									
									goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);
									
									goldFeature.put("f4=" + xtokenStem + "#non", 1.0);
									
									YStructure.sentGoldFeature.add(goldFeature);
								}
							}//内部代码逻辑有错误
						}
						
					}else
					{
                        maxLength = endId+1;
						
						for(int segLength = 1; segLength <= maxLength; segLength++)
						{
							String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";
							// 两行 * 号之间的 for 循环就是为了得到 segment
							//************************************
							for(int k = 1; k <= segLength; k++)
							{
								if(k == 1)
								{
									preSeg = tokens.get(endId - k + 1);
									
									xtokenPos = sentPos.get(endId - k + 1);
									
									xtokenLemma = sentLemma.get(endId - k + 1);
									
									xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");
								}else
								{
									preSeg = tokens.get(endId - k + 1) + " " + preSeg;
									
									xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;
									
									xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;
									
									xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + xtokenStem;
								}
							}
							//************************************
							
							//判断得到的 xtoken 是不是标准的 trigger	
							boolean equal = false;
							
							String triType = "";
							
							for(int segId = 0; segId <= endId; segId++)
							{
								String[] segType = goldSeg[segId].split("__");
								
								String seg = segType[0];
								
								String type = segType[1];
								
								triType = type;
								
								if(preSeg.equals(seg))
								{
									equal = true;
									break;
								}else
								{
									equal = false;
								}
							}
							//&&&&&&&&&&&&
							if(equal == true)
							{
								//计算 seg#type 的 goldFeature
								Map<String, Double> goldFeature = new HashMap<String, Double>();
								
								goldFeature.put("f1=" + preSeg + "#" + triType, 1.0);
								
								goldFeature.put("f2=" + xtokenPos + "#" + triType, 1.0);
								
								goldFeature.put("f3=" + xtokenLemma + "#" + triType, 1.0);
								
								goldFeature.put("f4=" + xtokenStem + "#" + triType, 1.0);
								
								YStructure.sentGoldFeature.add(goldFeature);
							}else
							{
								//计算 seg#non 的 goldFeature
								Map<String, Double> goldFeature = new HashMap<String, Double>();
								
								goldFeature.put("f1=" + preSeg + "#non", 1.0);
								
								goldFeature.put("f2=" + xtokenPos + "#non", 1.0);
								
								goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);
								
								goldFeature.put("f4=" + xtokenStem + "#non", 1.0);
								
								YStructure.sentGoldFeature.add(goldFeature);
							}
							//************
						}					
					}						
			   }
			   
			   output.add(YStructure);
			}//对句子进行循环的结尾
			
			brY.close();
			
			frY.close();
			
			brX.close();
			
			frX.close();
			
		}catch(IOException io)
		{
			io.printStackTrace();
		}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值
>