// Required dependency: ark-tweet-nlp-0.3.2.jar (a toolkit for processing Twitter text)
package com.regex;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import cmu.arktweetnlp.Twokenize;
/**
 * Post-processes the output of the ARK {@code Twokenize} tweet tokenizer.
 *
 * <p>Every line of the input file is tokenized with
 * {@code Twokenize.tokenizeRawTweetText} and the resulting tokens are merged or
 * split according to the rules described on {@link #retokenize(List)}. One
 * output line is printed to standard output per input line.
 */
public class Token {

    /** Input file read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "F:\\test.txt";

    /**
     * Terms that are glued back onto their surrounding parentheses, so that the
     * token sequence {@code (}, {@code FKH}, {@code )} is emitted as {@code (FKH)}.
     */
    private static final List<String> PARENTHESIZED_TERMS =
            Arrays.asList("FKH", "S", "48-kDa", "P", "G/T)GGGCGG(G/A)(G/A)(C/T");

    /**
     * Reads the input file line by line and prints the re-tokenized form of each line.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When
     *             absent, {@code F:\test.txt} is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // try-with-resources closes the reader even when tokenizing a line fails.
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String sent;
            while ((sent = br.readLine()) != null) {
                List<String> tokenList = Twokenize.tokenizeRawTweetText(sent);
                System.out.println(retokenize(tokenList));
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Applies the merge/split rules to one line worth of tokens.
     *
     * <p>Rules, checked in this order for the token at the current position:
     * <ol>
     *   <li>If the next token is {@code '}, both are emitted joined together.</li>
     *   <li>If the token contains {@code /}, it is split on {@code /} and each part
     *       is emitted separately.</li>
     *   <li>If the token is {@code (}, followed by one of {@link #PARENTHESIZED_TERMS}
     *       and then {@code )}, the three tokens are emitted joined together.</li>
     *   <li>If the token has a {@code -} that is neither its first nor its last
     *       character, it is kept intact when any hyphen-separated part is purely
     *       numeric; otherwise each part is emitted separately.</li>
     *   <li>Any other token is emitted unchanged.</li>
     * </ol>
     *
     * @param tokens tokens of a single line
     * @return the emitted pieces, each followed by a single space
     */
    private static String retokenize(List<String> tokens) {
        StringBuilder out = new StringBuilder();
        int size = tokens.size();
        int i = 0;

        // The index is advanced past every consumed token right where the token is
        // consumed, and every look-ahead is bounds-checked, so a merged group at the
        // very end of a line cannot push the index beyond the list.
        while (i < size) {
            String token = tokens.get(i);

            if (i + 1 < size && tokens.get(i + 1).equals("'")) {
                out.append(token).append(tokens.get(i + 1)).append(' ');
                i += 2;
            } else if (token.indexOf("/") != -1) {
                // String.split drops trailing empty parts, so a token that is
                // exactly "/" contributes nothing to the output.
                appendEach(out, token.split("/"));
                i++;
            } else if (token.equals("(")
                    && i + 2 < size
                    && PARENTHESIZED_TERMS.contains(tokens.get(i + 1))
                    && tokens.get(i + 2).equals(")")) {
                out.append(token).append(tokens.get(i + 1)).append(tokens.get(i + 2)).append(' ');
                i += 3;
            } else if (token.indexOf("-") != -1 && !token.endsWith("-") && !token.startsWith("-")) {
                String[] parts = token.split("-");
                if (hasNumericPart(parts)) {
                    // The token neither starts nor ends with '-', so re-joining the
                    // parts with '-' would reproduce the token itself.
                    out.append(token).append(' ');
                } else {
                    appendEach(out, parts);
                }
                i++;
            } else {
                out.append(token).append(' ');
                i++;
            }
        }
        return out.toString();
    }

    /** Returns {@code true} when at least one part consists solely of the digits 0-9. */
    private static boolean hasNumericPart(String[] parts) {
        for (String part : parts) {
            if (part.matches("[0-9]+")) {
                return true;
            }
        }
        return false;
    }

    /** Appends every part followed by a single space. */
    private static void appendEach(StringBuilder out, String[] parts) {
        for (String part : parts) {
            out.append(part).append(' ');
        }
    }
}
1733

被折叠的 条评论
为什么被折叠?



