stanford parser 中文句法分析

最新推荐文章于 2021-04-11 16:12:51 发布

vincent2610

最新推荐文章于 2021-04-11 16:12:51 发布

阅读量2.7k

点赞数

分类专栏：自然语言处理

自然语言处理专栏收录该内容

13 篇文章 0 订阅

订阅专栏

来自http://blog.csdn.net/boy178564309/article/details/12418691

为了进行中文句法分析，我使用了 Stanford Parser。刚开始时一头茫然，网上搜到的示例代码运行时也会抛出很多异常，经过一番处理后，现将可以运行的代码粘贴如下，希望对用到的人有所帮助：

[java]view plaincopy 
   
 import java.util.*;  
 import java.io.StringReader;  
   
   
 import edu.stanford.nlp.process.CoreLabelTokenFactory;  
 import edu.stanford.nlp.process.DocumentPreprocessor;  
 import edu.stanford.nlp.process.PTBTokenizer;  
 import edu.stanford.nlp.process.TokenizerFactory;  
 import edu.stanford.nlp.ling.CoreLabel;    
 import edu.stanford.nlp.ling.HasWord;    
 import edu.stanford.nlp.trees.*;  
 import edu.stanford.nlp.parser.lexparser.LexicalizedParser;  
   
 class ParserDemo {  
   
   public static void main(String[] args) {  
     LexicalizedParser lp =   
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz");  
     if (args.length > 0) {  
       demoDP(lp, args[0]);  
     } else {  
       demoAPI(lp);  
     }  
   }  
   
   public static void demoDP(LexicalizedParser lp, String filename) {  
     // This option shows loading and sentence-segment and tokenizing  
     // a file using DocumentPreprocessor  
     TreebankLanguagePack tlp = new PennTreebankLanguagePack();  
     GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();  
     // You could also create a tokenier here (as below) and pass it  
     // to DocumentPreprocessor  
     for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {  
       Tree parse = lp.apply(sentence);  
       parse.pennPrint();  
       System.out.println();  
         
       GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);  
       Collection tdl = gs.typedDependenciesCCprocessed(true);  
       System.out.println(tdl);  
       System.out.println();  
     }  
   }  
   
   public static void demoAPI(LexicalizedParser lp) {  
     // This option shows parsing a list of correctly tokenized words  
     String[] sent = { "我", "是", "一名", "好", "学生", "。" };  
     List<CoreLabel> rawWords = new ArrayList<CoreLabel>();  
     for (String word : sent) {  
       CoreLabel l = new CoreLabel();  
       l.setWord(word);  
       rawWords.add(l);  
     }  
     Tree parse = lp.apply(rawWords);  
     parse.pennPrint();  
     System.out.println();  
   
     TreebankLanguagePack tlp = lp.getOp().langpack();  
     GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();  
     GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);  
     List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();  
     System.out.println(tdl);  
     System.out.println();  
     
     TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed",tlp);  
     tp.printTree(parse);  
   }  
   
   private ParserDemo() {} // static methods only  
   
 }  

运行结果：

[plain]view plaincopy 
   
 Loading parser from serialized file edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz ... done [3.3 sec].  
 (ROOT  
   (IP  
     (NP (PN 我))  
     (VP (VC 是)  
       (NP  
         (QP (CD 一名))  
         (ADJP (JJ 好))  
         (NP (NN 学生))))  
     (PU 。)))  
   
 [top(是-2, 我-1), root(ROOT-0, 是-2), nummod(学生-5, 一名-3), amod(学生-5, 好-4), attr(是-2, 学生-5)]  
   
 (ROOT  
   (IP  
     (NP (PN 我))  
     (VP (VC 是)  
       (NP  
         (QP (CD 一名))  
         (ADJP (JJ 好))  
         (NP (NN 学生))))  
     (PU 。)))  
   
 top(是-2, 我-1)  
 root(ROOT-0, 是-2)  
 nummod(学生-5, 一名-3)  
 amod(学生-5, 好-4)  
 attr(是-2, 学生-5)