英语分词

最新推荐文章于 2021-09-13 19:48:08 发布

小妖精Fsky

最新推荐文章于 2021-09-13 19:48:08 发布

阅读量752

点赞数

分类专栏： Text Process 文章标签：文本处理

本文链接：https://blog.csdn.net/appleml/article/details/46581163

版权

Text Process 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

package com.triggerprotein;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Converts character-offset protein and trigger annotations into token positions of the
 * re-tokenized sentence.
 *
 * <p>Every file of the input directory is read line by line and a file with the same name is
 * written to the output directory. Lines are interpreted by their first character:
 * <ul>
 *   <li>empty line: ends the current sentence block; an empty line is written;</li>
 *   <li>{@code #} line: protein annotation, e.g. {@code # T6 Protein S5 75 79 ERK2}, rewritten as
 *       {@code # <firstToken> <lastToken> <tokenized text>};</li>
 *   <li>{@code @} line: trigger annotation, e.g. {@code @ T120 Gene_expression S4 90 97 produce},
 *       rewritten as {@code @ <type> <firstToken> <lastToken> <tokenized text>};</li>
 *   <li>{@code %} line: event annotation, ignored;</li>
 *   <li>anything else: a sentence, which is tokenized with {@link SentTokenizer} and written.</li>
 * </ul>
 *
 * <p>The alignment walks the original sentence character by character and compares it with the
 * tokenized sentence, skipping the spaces that tokenization inserted. Token positions are derived
 * from the number of spaces seen so far.
 */
public class TriggerLocation 
{
	/** Input directory used when no command-line argument is given. */
	private static final String DEFAULT_INPUT_DIR = "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_triggerInfo";

	/** Output directory used when fewer than two command-line arguments are given. */
	private static final String DEFAULT_OUTPUT_DIR = "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_NewTriggerInfo";

	/**
	 * Processes every file of the input directory.
	 *
	 * @param args optional: {@code args[0]} is the input directory and {@code args[1]} the output
	 *             directory; the original hard-coded paths are used for missing arguments
	 */
	public static void main(String[] args) 
	{
		String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT_DIR;
		String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;

		SentTokenizer sentT = new SentTokenizer();
		File[] files = new File(inputDir).listFiles();
		if(files == null)
		{
			// listFiles() returns null when the path is not a readable directory.
			System.err.println("Cannot list input directory: " + inputDir);
			return;
		}
		try
		{
			for(int fileId = 0; fileId < files.length; fileId++)
			{
				processFile(files[fileId], new File(outputDir, files[fileId].getName()), sentT);
			}
		}catch(IOException io)
		{
			io.printStackTrace();
		}
	}

	/** Rewrites one annotation file; reader and writer are closed even when an error occurs. */
	private static void processFile(File input, File output, SentTokenizer sentT) throws IOException
	{
		String fileName = input.getName();
		// Character index -> character, for the original and for the tokenized sentence.
		Map<Integer,Character> oldSentMap = new HashMap<Integer,Character>();
		Map<Integer,Character> newSentMap = new HashMap<Integer,Character>();

		try(BufferedReader br = new BufferedReader(new FileReader(input));
			BufferedWriter bw = new BufferedWriter(new FileWriter(output)))
		{
			String sent;
			while((sent = br.readLine()) != null)
			{
				if(sent.length() == 0)
				{
					oldSentMap.clear();
					newSentMap.clear();
					bw.newLine();
				}else if(sent.startsWith("#")) // protein
				{
					alignProtein(sent, fileName, oldSentMap, newSentMap, sentT, bw);
				}else if(sent.startsWith("@")) // trigger
				{
					alignTrigger(sent, fileName, oldSentMap, newSentMap, sentT, bw);
				}else if(sent.startsWith("%")) // event
				{
					// Event lines are not re-indexed and produce no output.
				}else // sentence
				{
					// Drop characters of a previous, possibly longer sentence that was not
					// followed by a blank line; otherwise they would leak into this alignment.
					oldSentMap.clear();
					newSentMap.clear();

					char[] oldSentChar = sent.toCharArray();
					for(int charId = 0; charId < oldSentChar.length; charId++)
					{
						oldSentMap.put(charId, oldSentChar[charId]);
					}

					String newSent = sentT.sentTokenizer(sent);
					char[] newSentChar = newSent.toCharArray();
					for(int chId = 0; chId < newSentChar.length; chId++)
					{
						newSentMap.put(chId, newSentChar[chId]);
					}
					bw.write(newSent);
					bw.newLine();
				}
			}
		}
	}

	/** Aligns one {@code #} protein line with the current sentence and writes its token span. */
	private static void alignProtein(String sent, String fileName, Map<Integer,Character> oldSentMap,
			Map<Integer,Character> newSentMap, SentTokenizer sentT, BufferedWriter bw) throws IOException
	{
		// # T6 Protein S5 75 79 ERK2
		String[] pro = sent.split(" ");
		if(pro.length < 6)
		{
			System.out.println(fileName);
			System.out.println(sent);
			System.out.println("malformed protein line: expected at least 6 fields");
			return;
		}
		int proStart = Integer.parseInt(pro[4]);
		int proEnd = Integer.parseInt(pro[5]);
		String protein = joinFrom(pro, 6);

		// num1: inserted spaces outside the span, num2: original spaces outside the span,
		// num3: inserted spaces inside the span,  num4: original spaces inside the span.
		int num1 = 0, num2 = 0, num3 = 0, num4 = 0;
		// Characters of the span that matched in both sentences.
		StringBuilder matched = new StringBuilder();

		int proX = 0;
		while(proX < oldSentMap.size())
		{
			Character oldMapChar = oldSentMap.get(proX);
			Character newMapChar = newSentMap.get(proX + num1 + num3);
			if(newMapChar == null)
			{
				// The tokenized sentence ended early; nothing is left to align against.
				System.out.println(fileName);
				System.out.println(sent);
				System.out.println("tokenized sentence is shorter than the original; protein not aligned");
				return;
			}
			boolean same = oldMapChar.equals(newMapChar);
			boolean insertedSpace = !same && newMapChar.equals(' ');

			if(proX >= proStart && proX < proEnd)
			{
				if(same && oldMapChar.equals(' '))
				{
					num4 += 1;
					matched.append(oldMapChar);
				}else if(same)
				{
					matched.append(oldMapChar);
					if(proX == proEnd - 1 && matched.toString().equals(protein))
					{
						String newProtein = sentT.sentTokenizer(protein);
						int numm = countTokens(newProtein);
						Character before = proStart > 0 ? oldSentMap.get(proStart - 1) : null;
						if(before != null && (before.equals('-') || before.equals('(') || before.equals('/')))
						{
							// The span directly follows a symbol that tokenization splits off,
							// so the space in front of the span was counted inside it (num3).
							int proIndexStart = num1 + num2 + 1;
							int proIndexEnd = num1 + num2 + num3 + num4;
							if(numm == (proIndexEnd - proIndexStart + 1))
							{
								bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
								bw.newLine();
							}else
							{
								System.out.println(fileName +" %%%");
								System.out.println(sent+" %%% ");
								System.out.println(protein + " " + proIndexStart + " " + proIndexEnd + " " + newProtein);
								System.out.println("计算蛋白质单词个数的时候出错");
							}
						}else
						{
							int proIndexStart = num1 + num2;
							int proIndexEnd = num1 + num2 + num3 + num4;
							bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
							bw.newLine();
						}
						return;
					}
				}else if(insertedSpace)
				{
					num3 += 1;
					continue; // re-examine the same original character against the next one
				}else
				{
					System.out.println(fileName);
					System.out.println(sent + "***");
					System.out.println("处理蛋白质时字符出错");
				}
			}else if(insertedSpace)
			{
				num1 += 1;
				continue; // re-examine the same original character against the next one
			}else if(same && oldMapChar.equals(' '))
			{
				num2 += 1;
			}else if(!same)
			{
				System.out.println(fileName);
				System.out.println(sent);
				System.out.println(proX +"   " + oldMapChar +"***"+ newMapChar+"不相等");
			}
			proX++;
		}
	}

	/** Aligns one {@code @} trigger line with the current sentence and writes its token span. */
	private static void alignTrigger(String sent, String fileName, Map<Integer,Character> oldSentMap,
			Map<Integer,Character> newSentMap, SentTokenizer sentT, BufferedWriter bw) throws IOException
	{
		// @ T120 Gene_expression S4 90 97 produce
		String[] tri = sent.split(" ");
		if(tri.length < 6)
		{
			System.out.println(fileName);
			System.out.println(sent);
			System.out.println("malformed trigger line: expected at least 6 fields");
			return;
		}
		String triType = tri[2];
		int triStart = Integer.parseInt(tri[4]);
		int triEnd = Integer.parseInt(tri[5]);
		String trigger = joinFrom(tri, 6);

		// Same counters as in alignProtein: inserted/original spaces outside and inside the span.
		int num1 = 0, num2 = 0, num3 = 0, num4 = 0;
		StringBuilder matched = new StringBuilder();

		int triX = 0;
		while(triX < oldSentMap.size())
		{
			Character oldMapChar = oldSentMap.get(triX);
			Character newMapChar = newSentMap.get(triX + num1 + num3);
			if(newMapChar == null)
			{
				// The tokenized sentence ended early; nothing is left to align against.
				System.out.println(fileName);
				System.out.println(sent);
				System.out.println("tokenized sentence is shorter than the original; trigger not aligned");
				return;
			}
			boolean same = oldMapChar.equals(newMapChar);
			boolean insertedSpace = !same && newMapChar.equals(' ');

			if(triX >= triStart && triX < triEnd)
			{
				if(same && oldMapChar.equals(' '))
				{
					num4 += 1;
					matched.append(oldMapChar);
				}else if(same)
				{
					matched.append(oldMapChar);
					if(triX == triEnd - 1 && matched.toString().equals(trigger))
					{
						String newTrigger = sentT.sentTokenizer(trigger);
						int numm = countTokens(newTrigger);
						Character before = triStart > 0 ? oldSentMap.get(triStart - 1) : null;
						if(before != null && (before.equals('-') || before.equals('(') || before.equals('/')
								|| before.equals('[') || Character.isDigit(before)))
						{
							// The span directly follows a character that tokenization splits off,
							// so the space in front of the span was counted inside it (num3).
							int triIndexStart = num1 + num2 + 1;
							int triIndexEnd = num1 + num2 + num3 + num4;
							bw.write("@ " + triType + " " + triIndexStart + " " + triIndexEnd + " " + newTrigger);
							bw.newLine();
						}else
						{
							int triIndexStart = num1 + num2;
							int triIndexEnd = num1 + num2 + num3 + num4;
							if(numm == (triIndexEnd - triIndexStart + 1))
							{
								bw.write("@ "+ triType + " " + triIndexStart + " " + triIndexEnd  + " " + newTrigger);
								bw.newLine();
							}else
							{
								System.out.println(fileName +" *** ");
								System.out.println(sent + " *** ");
								System.out.println(trigger + " " + newTrigger + " "+ triIndexStart + " " + triIndexEnd);
								System.out.println("计算的trigger词长度有问题");
							}
						}
						return;
					}
				}else if(insertedSpace)
				{
					num3 += 1;
					continue; // re-examine the same original character against the next one
				}else
				{
					System.out.println(fileName);
					System.out.println(sent + "&&&");
					System.out.println("处理触发词时出错");
				}
			}else if(insertedSpace)
			{
				num1 += 1;
				continue; // re-examine the same original character against the next one
			}else if(same && oldMapChar.equals(' '))
			{
				num2 += 1;
			}else if(!same)
			{
				System.out.println(oldMapChar +"***"+ newMapChar+"不相等");
			}
			triX++;
		}
	}

	/** Joins {@code parts[from..]} with single spaces; returns "" when there is nothing to join. */
	private static String joinFrom(String[] parts, int from)
	{
		StringBuilder joined = new StringBuilder();
		for(int j = from; j < parts.length; j++)
		{
			if(j > from)
			{
				joined.append(" ");
			}
			joined.append(parts[j]);
		}
		return joined.toString();
	}

	/** Counts the space-separated pieces of a tokenized string; a string without spaces counts as 1. */
	private static int countTokens(String tokenized)
	{
		if(tokenized.indexOf(" ") != -1)
		{
			return tokenized.split(" ").length;
		}
		return 1;
	}
}

上面的这个程序基本没有问题。原理是：把分词前句子的每个字符按下标存入一个 Map，把分词后句子的每个字符按下标存入另一个 Map，然后逐字符比对两个 Map（比对时跳过分词新插入的空格），再根据蛋白质或触发词在原句中的字符起止位置，找到它在分词后句子中的词位置。

package com.triggerprotein;

import java.util.List;
import java.util.StringTokenizer;

import cmu.arktweetnlp.Twokenize;

/**
 * Post-processes the output of {@code Twokenize.tokenizeRawTweetText}: symbols that are still
 * attached to a token ("/", "-", "+", "(" and ")") are separated from it by spaces.
 *
 * <p>The result only ever adds spaces to the tokens; no character of a token is removed.
 */
public class SentTokenizer 
{
	/**
	 * Tokenizes a sentence and returns the tokens joined into one space-separated string.
	 *
	 * <p>Most branches append a trailing space after the piece they emit, so the returned string
	 * may end with a space and may contain consecutive spaces.
	 *
	 * @param sent the raw sentence
	 * @return the sentence with tokens and split-off symbols separated by spaces
	 */
	public String sentTokenizer(String sent)
	{
		StringBuilder sentBuff = new StringBuilder();
		List<String> tokenList = Twokenize.tokenizeRawTweetText(sent);

		for(int i = 0; i < tokenList.size(); i++)
		{
			String token = tokenList.get(i);
			boolean hasSlash = token.indexOf("/") != -1;

			if(hasSlash && !token.startsWith("/") && token.endsWith("/"))
			{
				// "abc/" -> "abc / "
				sentBuff.append(token.substring(0, token.length() - 1) + " / ");
			}else if(hasSlash && token.startsWith("/") && !token.endsWith("/"))
			{
				// "/abc" -> "/ abc ", additionally splitting hyphens in the remainder
				String st = token.substring(1);
				sentBuff.append("/ ");
				if(st.indexOf("-") != -1)
				{
					sentBuff.append(st.replace("-", " - "));
					sentBuff.append(" ");
				}else
				{
					sentBuff.append(st + " ");
				}
			}else if(hasSlash && !token.endsWith("/"))
			{
				// "a/b/c": handle every segment, re-inserting the "/" between segments
				String[] string = token.split("/");
				for(int p = 0; p < string.length; p++)
				{
					if(p == string.length - 1)
					{
						appendLastSlashSegment(sentBuff, string[p]);
					}else
					{
						appendInnerSlashSegment(sentBuff, string[p]);
					}
				}
			}else if(token.indexOf("-") != -1 && !token.startsWith("-") && !token.endsWith("-"))
			{
				// "a-b" -> "a - b "; a piece ending in ")" also gets its parentheses separated
				String[] str = token.split("-");
				for(int p = 0; p < str.length; p++)
				{
					if(p == str.length - 1)
					{
						sentBuff.append(str[p] + " ");
					}else if(str[p].endsWith(")"))
					{
						sentBuff.append(str[p].replace("(", " ( ").replace(")", " ) "));
						sentBuff.append("- ");
					}else
					{
						sentBuff.append(str[p] + " ");
						sentBuff.append("- ");
					}
				}
			}else if(token.startsWith("-") && !token.equals("-"))
			{
				sentBuff.append("- " + token.substring(1) + " ");
			}else if(token.endsWith("-") && !token.equals("-"))
			{
				String preStr = token.substring(0, token.length() - 1);
				if(preStr.indexOf("+") != -1)
				{
					appendSeparated(sentBuff, preStr, "\\+", "+");
				}else if(preStr.indexOf("-") != -1)
				{
					appendSeparated(sentBuff, preStr, "-", "-");
				}else
				{
					sentBuff.append(preStr + " ");
				}
				sentBuff.append("- ");
			}else if(token.startsWith("+") && !token.equals("+"))
			{
				sentBuff.append("+ " + token.substring(1) + " ");
			}else if(token.endsWith("+") && !token.equals("+"))
			{
				String plusStr = token.substring(0, token.length() - 1);
				if(plusStr.indexOf("+") != -1)
				{
					appendSeparated(sentBuff, plusStr, "\\+", "+");
				}else
				{
					sentBuff.append(plusStr + " ");
				}
				sentBuff.append("+ ");
			}else if(token.indexOf("+") != -1 && !token.startsWith("+") && !token.endsWith("+"))
			{
				appendSeparated(sentBuff, token, "\\+", "+");
			}else if(token.indexOf(")") != -1 && !token.equals(")") && token.endsWith(")"))
			{
				sentBuff.append(token.substring(0, token.length() - 1) + " ) ");
			}else if(token.indexOf(")") != -1 && !token.equals(")") && token.startsWith(")"))
			{
				sentBuff.append(") " + token.substring(1) + " ");
			}else if(token.indexOf("(") != -1 && !token.endsWith("(") && token.indexOf(")") == -1)
			{
				sentBuff.append(token.replace("(", " ( "));
				sentBuff.append(" ");
			}else if(i == tokenList.size() - 1)
			{
				// Only an unmodified final token is written without a trailing space.
				sentBuff.append(token);
			}else
			{
				sentBuff.append(token + " ");
			}
		}
		return sentBuff.toString();
	}

	/** Appends the final segment of a "/"-separated token, splitting off hyphens and parentheses. */
	private static void appendLastSlashSegment(StringBuilder out, String seg)
	{
		boolean hasHyphen = seg.indexOf("-") != -1;
		boolean hasOpen = seg.indexOf("(") != -1;
		boolean hasClose = seg.indexOf(")") != -1;

		if(hasHyphen && !hasOpen && !hasClose)
		{
			if(seg.equals("-"))
			{
				out.append(seg.replace("-", "- "));
			}else
			{
				out.append(seg.replace("-", " - "));
				out.append(" ");
			}
		}else if(!hasHyphen && !hasOpen && !hasClose)
		{
			out.append(seg + " ");
		}else if(hasOpen && hasClose)
		{
			appendParenTokens(out, seg);
		}else if(hasOpen)
		{
			out.append(seg.replace("(", " ( "));
			if(!seg.endsWith("("))
			{
				out.append(" ");
			}
		}else if(hasClose && !seg.endsWith(")"))
		{
			String strr = seg.replace(")", " )");
			if(strr.indexOf(")-") != -1)
			{
				out.append(strr.replace("-", " - "));
			}else
			{
				out.append(strr);
				out.append(" ");
			}
			out.append(" ");
		}else if(seg.equals("+)"))
		{
			out.append("+ ) ");
		}else if(seg.equals("-)"))
		{
			out.append("- ) ");
		}else
		{
			out.append(seg + " ");
		}
	}

	/** Appends a non-final segment of a "/"-separated token followed by the "/" separator. */
	private static void appendInnerSlashSegment(StringBuilder out, String seg)
	{
		boolean hasHyphen = seg.indexOf("-") != -1;
		boolean hasOpen = seg.indexOf("(") != -1;
		boolean hasClose = seg.indexOf(")") != -1;

		if(hasHyphen && !hasOpen && !hasClose)
		{
			if(seg.startsWith("-"))
			{
				out.append(seg.replace("-", "- "));
				out.append("/ ");
			}else
			{
				out.append(seg.replace("-", " - "));
				out.append(" / ");
			}
		}else if(!hasHyphen && !hasOpen && !hasClose)
		{
			if(seg.endsWith("+"))
			{
				out.append(seg.replace("+", " + "));
				out.append("/ ");
			}else
			{
				out.append(seg + " ");
				out.append("/ ");
			}
		}else if(hasOpen && hasClose)
		{
			appendParenTokens(out, seg);
			out.append("/ ");
		}else if(hasClose && seg.endsWith(")") && !hasOpen)
		{
			String st = seg.replace(")", " ) ");
			if(st.indexOf("-") != -1)
			{
				// Split the hyphens of the already paren-separated text; splitting the raw
				// segment instead would lose the spacing around ")" and fuse it with "/".
				out.append(st.replace("-", " - "));
			}else
			{
				out.append(st);
			}
			out.append("/ ");
		}else if(seg.endsWith("+"))
		{
			out.append(seg.replace("+", " + "));
			out.append("/ ");
		}else if(seg.endsWith("-"))
		{
			out.append(seg.replace("-", " - "));
			out.append("/ ");
		}else
		{
			// Segments matching none of the shapes above (e.g. "IL(2" from "IL(2/3)") are
			// kept unchanged rather than dropped from the sentence.
			out.append(seg + " / ");
		}
	}

	/** Appends the pieces of {@code seg} split at "(" and ")", keeping the parentheses, each followed by a space. */
	private static void appendParenTokens(StringBuilder out, String seg)
	{
		StringTokenizer strToke = new StringTokenizer(seg, "()", true); // delimiters are returned as tokens
		while(strToke.hasMoreElements())
		{
			out.append(strToke.nextToken() + " ");
		}
	}

	/**
	 * Appends the pieces of {@code text} split at {@code regex}, each followed by a space, with
	 * {@code symbol} re-inserted between neighbouring pieces.
	 */
	private static void appendSeparated(StringBuilder out, String text, String regex, String symbol)
	{
		// Limit -1 keeps trailing empty pieces, so a symbol at the end of text (e.g. the "+"
		// of "a+") is re-inserted instead of being lost.
		String[] pieces = text.split(regex, -1);
		for(int p = 0; p < pieces.length; p++)
		{
			out.append(pieces[p] + " ");
			if(p < pieces.length - 1)
			{
				out.append(symbol + " ");
			}
		}
	}
}

上面的程序对推特分词工具（Twokenize）的分词结果做后处理：把仍然与单词粘连在一起的 “/”、“-”、“+” 和括号用空格拆分开。