NLP:stanfordNLP

http://www.zmonster.me/2016/06/08/use-stanford-nlp-package-in-nltk.html

http://stanfordnlp.github.io/CoreNLP/

http://blog.csdn.net/ltbylc/article/details/8557965


1. stanfordNLP

  • 分词: StanfordTokenizer
  • 词性标注: StanfordPOSTagger
  • 命名实体识别: StanfordNERTagger
  • 句法分析: StanfordParser
  • 依存句法分析: StanfordDependencyParser, StanfordNeuralDependencyParser

2. 分词

2.1 分词demo

  • vm参数
-mx1g
import java.util.*;  
import edu.stanford.nlp.ie.crf.CRFClassifier;  


/**
 * Minimal demo of word segmentation with a Stanford {@code CRFClassifier}.
 */
public class stanfordSeg {

    /**
     * Segments {@code data} with the given classifier and joins the tokens.
     *
     * @param data text to segment
     * @param c    classifier whose {@code segmentString} method performs the segmentation
     * @return the tokens in order, each followed by a single space (a non-empty
     *         result therefore ends with a trailing space)
     */
    public static String doSegment(String data, CRFClassifier c) {
        StringBuilder buf = new StringBuilder();

        // Iterate over the returned list directly. The no-argument toArray()
        // yields an Object[], so the former (String[]) cast failed with a
        // ClassCastException at run time.
        for (Object token : c.segmentString(data)) {
            buf.append(token).append(' ');
        }

        return buf.toString();
    }

    /**
     * Configures a classifier, loads the model from {@code data/ctb.gz},
     * segments one sample sentence and prints the result to standard output.
     *
     * @param args unused
     * @throws Exception declared for any failure while setting up the classifier
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): the relative paths below are presumably resolved against
        // the working directory -- confirm where the model files live.
        Properties props = new Properties();
        props.setProperty("sighanCorporaDict", "data");
        props.setProperty("serDictionary", "data/dict-chris6.ser.gz");
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");

        CRFClassifier classifier = new CRFClassifier(props);
        classifier.loadClassifierNoExceptions("data/ctb.gz", props);
        classifier.flags.setProperties(props);

        String sentence = "某处女同志去吃饭。";
        String ret = doSegment(sentence, classifier);
        System.out.println(ret);
    }

}

3. 词性标注

java -mx300m -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger
-model models/chinese-distsim.tagger -textFile inputFile > outputFile

import java.util.*;  
import edu.stanford.nlp.ie.crf.CRFClassifier;  

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.List;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
 * Command-line demo that part-of-speech tags every sentence of a text file
 * with a {@link MaxentTagger} and prints the tagged sentences.
 */
class stanfordSeg {

  private stanfordSeg() {} // static methods only

  /**
   * Tags the file named by {@code args[1]} using the tagger model named by
   * {@code args[0]} and prints one tagged sentence per line to standard output.
   * Prints a usage message to standard error and returns when the argument
   * count is not exactly two.
   *
   * @param args {@code modelFile fileToTag}
   * @throws Exception declared for any failure while loading the model or reading the file
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      // The message names this class; the old text referred to a
      // non-existent "TaggerDemo" entry point.
      System.err.println("usage: java stanfordSeg modelFile fileToTag");
      return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);

    List<List<HasWord>> sentences;
    // try-with-resources closes the input file even if tokenization throws;
    // the reader was previously never closed.
    try (BufferedReader reader = new BufferedReader(new FileReader(args[1]))) {
      sentences = MaxentTagger.tokenizeText(reader);
    }

    for (List<HasWord> sentence : sentences) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      System.out.println(Sentence.listToString(tSentence, false));
    }
  }

}

4. 实体识别

http://blog.csdn.net/sparkexpert/article/details/49497231

http://blog.csdn.net/shijiebei2009/article/details/42525091

import edu.stanford.nlp.ie.AbstractSequenceClassifier;   
import edu.stanford.nlp.ie.crf.CRFClassifier;   
import edu.stanford.nlp.ling.CoreLabel;   

/**
 * Demo that loads a serialized NER classifier and runs it over a sentence.
 *
 * <p>Original class name: ExtractDemo. Created 2015-01-08 14:53:45.
 *
 * @author wangxu wangx89@126.com
 * @version V1.0.0
 */
public class stanfordtest {

    /** Classifier shared by all instances; assigned by {@link #InitNer()} when still null. */
    private static AbstractSequenceClassifier<CoreLabel> ner;

    /** Creates the demo object and makes sure the shared classifier is assigned. */
    public stanfordtest() {
        InitNer();
    }

    /**
     * Assigns the shared classifier from the serialized model file if it has
     * not been assigned yet; otherwise does nothing.
     */
    public void InitNer() {
        if (ner != null) {
            return;
        }
        ner = CRFClassifier.getClassifierNoExceptions("classifiers/chinese.misc.distsim.crf.ser.gz");
    }

    /**
     * Passes the text to the shared classifier's {@code classifyWithInlineXML}
     * method and returns its result.
     *
     * @param sent text to classify
     * @return the string produced by {@code classifyWithInlineXML}
     */
    public String doNer(String sent) {
        String tagged = ner.classifyWithInlineXML(sent);
        return tagged;
    }

    /**
     * Runs the classifier over one sample sentence, prints the result and
     * then prints a completion message.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String sentence = "我 去 吃饭 , 告诉 李强 一声 。";
        stanfordtest demo = new stanfordtest();
        String result = demo.doNer(sentence);
        System.out.println(result);
        System.out.println("Complete!");
    }

}

5. 依存句法分析

http://blog.sina.com.cn/s/blog_8af106960101abvu.html

//package com.parser;  

import java.util.List;  
import java.io.StringReader;  

import edu.stanford.nlp.process.Tokenizer;  
import edu.stanford.nlp.process.TokenizerFactory;  
import edu.stanford.nlp.process.CoreLabelTokenFactory;  

import edu.stanford.nlp.process.PTBTokenizer;  
import edu.stanford.nlp.ling.CoreLabel;  
import edu.stanford.nlp.trees.*;  
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;  

/**
 * Demo that parses one sentence with a {@link LexicalizedParser} and prints
 * its typed dependencies, one per line.
 */
public class stanfordtest {

  private stanfordtest() {} // static methods only

  /**
   * Loads the parser model and runs {@link #demoAPI} on a sample sentence.
   *
   * @param args unused
   */
  public static void main(String[] args) {
      LexicalizedParser lp =
          LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz");
      demoAPI(lp, "他 和 我  在 学校  里 常  打 台球.");
  }

  /**
   * Tokenizes {@code str}, parses the tokens with {@code lp}, and prints each
   * CC-processed typed dependency of the parse on its own line.
   *
   * @param lp  parser used to build the tree; also supplies the language pack
   * @param str sentence to tokenize and parse
   */
  public static void demoAPI(LexicalizedParser lp, String str) {
      // Tokenize the sentence and parse the resulting tokens.
      Tokenizer<CoreLabel> tokenizer =
          PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(new StringReader(str));
      Tree tree = lp.apply(tokenizer.tokenize());

      // The grammatical-structure factory comes from the parser's own language pack.
      GrammaticalStructure structure =
          lp.treebankLanguagePack().grammaticalStructureFactory().newGrammaticalStructure(tree);

      for (TypedDependency dependency : structure.typedDependenciesCCprocessed()) {
          System.out.println(dependency);
      }
  }

}

6. 句法分析

import java.util.Collection;
import java.util.List;
import java.io.StringReader;

import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

/**
 * Demo of loading a {@link LexicalizedParser} and using it either on a text
 * file or on sentences built in code.
 */
class stanfordtest {

  /**
   * Loads a parser model and runs one of the two demos.
   *
   * <p>The model path passed to {@code loadModel} defaults to
   * {@code edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz}.
   *
   * <p>Usage: {@code java stanfordtest [[model] textFile]}
   * <ul>
   *   <li>no arguments: default model, runs {@link #demoAPI};</li>
   *   <li>one argument: default model, the argument is the text file;</li>
   *   <li>two arguments: first is the model, second is the text file.</li>
   * </ul>
   * e.g.: java stanfordtest edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz data/chinese-onesent-utf8.txt
   *
   * @param args optional model path and text file, as described above
   */
  public static void main(String[] args) {
    String parserModel = "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz";
    // Only the two-argument form names a model. Previously a lone argument was
    // used both as the model and as the text file, contradicting the usage line.
    if (args.length > 1) {
      parserModel = args[0];
    }
    LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);

    if (args.length == 0) {
      demoAPI(lp);
    } else {
      String textFile = (args.length > 1) ? args[1] : args[0];
      demoDP(lp, textFile);
    }
  }

  /**
   * Reads {@code filename} through a {@link DocumentPreprocessor}, parses each
   * sentence it yields, and prints the tree with {@code pennPrint}. When the
   * parser's language pack supports grammatical structures, the CC-processed
   * typed dependencies are printed as well.
   *
   * @param lp       parser to apply to each sentence
   * @param filename path handed to {@code DocumentPreprocessor}
   */
  public static void demoDP(LexicalizedParser lp, String filename) {
    // The language pack comes from the loaded model.
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = null;
    if (tlp.supportsGrammaticalStructures()) {
      gsf = tlp.grammaticalStructureFactory();
    }
    // You could also create a tokenizer here (as in demoAPI) and pass it
    // to DocumentPreprocessor
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
      Tree parse = lp.apply(sentence);
      parse.pennPrint();
      System.out.println();

      if (gsf != null) {
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        System.out.println();
      }
    }
  }

  /**
   * Demonstrates two other ways of calling the parser: on a list of already
   * tokenized words, and on a raw string that is first run through an explicit
   * tokenizer. The second parse's typed dependencies are printed, and the tree
   * is printed again through a {@link TreePrint} object.
   *
   * @param lp parser to apply
   */
  public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "他", "和", "我", "经常", "打", "台球","." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok =
        tokenizerFactory.getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);

    // The language pack comes from the loaded model.
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
  }

  private stanfordtest() {} // static methods only

}
阅读更多
版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/mijian1207mijian/article/details/52717073
个人分类: NLP
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭