阅读nutch.Analysis.jj

最新推荐文章于 2021-02-28 18:08:55 发布

kauu

最新推荐文章于 2021-02-28 18:08:55 发布

阅读量2.4k

点赞数

分类专栏：程序人生、搜索引擎　文章标签：token string query import manager whitespace

本文链接：https://blog.csdn.net/kauu/article/details/1393220

版权

程序人生同时被 2 个专栏收录

21 篇文章 0 订阅

订阅专栏

搜索引擎

20 篇文章 0 订阅

订阅专栏

今晚详细阅读了 Nutch 的 analysis.jj（即 NutchAnalysis 的 JavaCC 语法文件）。
下面是我为它加入 ICTCLAS 分词支持之后的 .jj 文件。

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/** JavaCC code for the Nutch lexical analyzer. */

// JavaCC generation options for this grammar.
options {
// Generate instance (non-static) members; the class body below creates a
// parser object per query with "new NutchAnalysis(...)".
STATIC = false;
// The generated code reads from a caller-supplied character stream instead
// of a JavaCC-generated one; this file passes in FastCharStream instances.
USER_CHAR_STREAM = true;
// NOTE(review): token-manager optimization switch; presumably only relevant
// to older JavaCC releases -- confirm against the JavaCC version in use.
OPTIMIZE_TOKEN_MANAGER = true;
// Input is treated as Unicode; the token definitions below rely on Unicode
// character ranges.
UNICODE_INPUT = true;
// Uncomment to have the generated token manager emit debugging traces.
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.*;
import java.util.*;

/**
 * The JavaCC-generated Nutch lexical analyzer and query parser.
 *
 * <p>This hand-written part of the class holds the stop-word list, the static
 * {@code parseQuery} entry points and a small interactive driver. The parsing
 * methods themselves ({@code parse}, {@code phrase}, {@code compound}, ...)
 * are generated from the grammar productions in this file.
 */
public class NutchAnalysis {

  /** Words that are dropped from queries (see {@link #isStopWord(String)}). */
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  /** Lookup set built from {@link #STOP_WORDS}. */
  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  /** Analyzer used by the {@code compound} production to tokenize term text. */
  private Analyzer analyzer = null;

  /** The raw query text; raw fields take substrings of it verbatim. */
  private String queryString;

  /** Consulted by the grammar to decide whether a field is a raw field. */
  private QueryFilters queryFilters;

  /**
   * Constructs a nutch analysis over the given query text.
   *
   * @param query    the query text to tokenize and parse
   * @param analyzer the analyzer applied to the terms of compound expressions
   */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /**
   * True iff word is a stop word. Stop words are only removed from queries.
   * Every word is indexed.
   *
   * @param word the word to look up
   * @return whether {@code word} is contained in the stop set
   */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /**
   * Parses a query string using the default analyzer.
   *
   * @param queryString the query text
   * @param conf        configuration handed to the analyzer, filters and query
   * @return the parsed query
   * @throws IOException declared by the parsing machinery
   */
  public static Query parseQuery(String queryString, Configuration conf) throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /**
   * Parses a query string.
   *
   * @param queryString the query text
   * @param analyzer    analyzer to use; when {@code null} a
   *                    {@code NutchDocumentAnalyzer} built from {@code conf}
   *                    is used instead
   * @param conf        configuration handed to the analyzer, filters and query
   * @return the parsed query
   * @throws IOException declared by the parsing machinery
   */
  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
      throws IOException {
    NutchAnalysis parser = new NutchAnalysis(
        queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /**
   * For debugging: reads queries from standard input, one per line, and
   * prints the parsed form of each. Terminates when standard input is
   * exhausted.
   */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {
        // readLine() returns null at end of input; passing that on would make
        // new StringReader(null) throw a NullPointerException.
        break;
      }
      System.out.println(parseQuery(line, NutchConfiguration.create()));
    }
  }

}

PARSER_END(NutchAnalysis)

// Extra declarations added to the generated token manager class.
TOKEN_MGR_DECLS : {

/**
 * Constructs a token manager for the provided Reader by wrapping it in a
 * FastCharStream and delegating to the constructor that takes that stream.
 */
public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
}

}

TOKEN : {                    // token regular expressions

  // basic word -- lowercase it
  <WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
  {                                               // remove dots
    for (int i = 0; i < image.length(); i++) {
      if (image.charAt(i) == '.')
        image.deleteCharAt(i--);                  // step back: the next char shifted into slot i
    }
    matchedToken.image = image.toString().toLowerCase();
  }

  // CJK characters outside the CHINESE range: each one is a token of its own
| <SIGRAM: <OTHER_CJK> >
  // (upstream form, kept for reference)  | <SIGRAM: <CJK> >

  // a run of consecutive characters from the CHINESE range is a single token
| <CNWORD: (<CHINESE>)+ >

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace

  // primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| <#LETTER:                                       // alphabets
    [
      "\u0041"-"\u005a",
      "\u0061"-"\u007a",
      "\u00c0"-"\u00d6",
      "\u00d8"-"\u00f6",
      "\u00f8"-"\u00ff",
      "\u0100"-"\u1fff"
    ]
  >

  // CJK ranges other than U+4E00-U+9FFF (that range is CHINESE below);
  // upstream kept all of these together in one private token named CJK
| <#OTHER_CJK:
    [
      "\u3040"-"\u318f",
      "\u3300"-"\u337f",
      "\u3400"-"\u3d2d",
      "\uf900"-"\ufaff"
    ]
  >

| <#CHINESE:                                      // chinese characters
    [
      "\u4e00"-"\u9fff"
    ]
  >

| <#DIGIT:                                        // unicode digits
    [
      "\u0030"-"\u0039",
      "\u0660"-"\u0669",
      "\u06f0"-"\u06f9",
      "\u0966"-"\u096f",
      "\u09e6"-"\u09ef",
      "\u0a66"-"\u0a6f",
      "\u0ae6"-"\u0aef",
      "\u0b66"-"\u0b6f",
      "\u0be7"-"\u0bef",
      "\u0c66"-"\u0c6f",
      "\u0ce6"-"\u0cef",
      "\u0d66"-"\u0d6f",
      "\u0e50"-"\u0e59",
      "\u0ed0"-"\u0ed9",
      "\u1040"-"\u1049"
    ]
  >

}

/**
 * Parse a query.
 *
 * Reads zero or more clauses. Each clause is an optional "+" or "-" operator,
 * an optional "field:" prefix, and then either a quoted phrase or a compound
 * term. Every clause is added to the query as a required phrase, or as a
 * prohibited phrase when it was prefixed with "-". A clause that is a single,
 * unadorned stop word in the default field is dropped.
 *
 * @param conf configuration used to construct the Query
 * @return the query built from all accepted clauses
 */
Query parse(Configuration conf) :
{
Query query = new Query(conf);
ArrayList terms;
Token token;
String field;
boolean stop;
boolean prohibited;

}
{
nonOpOrTerm()                                   // skip noise
(
    // reset the per-clause state before each clause
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    // only treat WORD ":" as a field prefix when a phrase or compound follows
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // NOTE(review): "field == Clause.DEFAULT_FIELD" compares references,
      // not contents. It looks intended to detect "no field prefix was given"
      // (field is only reassigned from token.image above) -- confirm.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
)*

{ return query; }

}

/**
 * Parse an explicitly quoted phrase query. Note that this may return a single
 * term, a trivial phrase.
 *
 * The phrase starts at a quote and ends at the next quote or at end of input.
 * For a raw field (as reported by queryFilters.isRawField) the parsed terms
 * are discarded and the text between the recorded start and end columns of
 * the query string is returned as the only element.
 *
 * @param field the field the phrase applies to
 * @return the list of terms found inside the quotes, or the single raw
 *         substring for raw fields
 */
ArrayList phrase(String field) :
{
int start;
int end;
ArrayList result = new ArrayList();
String term;
}
{
<QUOTE>

// "token" is not declared in this production; presumably it is the
// generated parser's most recently consumed token, here the opening quote
{ start = token.endColumn; }

(nonTerm())*                                    // skip noise
( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

// end column of the last token consumed before the closing quote / EOF
{ end = token.endColumn; }

(<QUOTE>|<EOF>)

{
    if (this.queryFilters.isRawField(field)) {
      // raw fields bypass tokenization: keep the original text verbatim
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
}

}

/**
 * Parse a compound term that is interpreted as an implicit phrase query.
 * A compound is a sequence of terms separated by one or more infix
 * characters. Note that this may return a single term, a trivial compound.
 *
 * For a raw field the untouched slice of the query string covered by the
 * compound is returned as the only element. Otherwise the terms are joined
 * with spaces, run through the analyzer, and the text of every token the
 * analyzer produces is returned.
 *
 * @param field the field the compound applies to
 * @return the resulting list of term strings
 */
ArrayList compound(String field) :
{
  int begin;
  ArrayList result = new ArrayList();
  String word;
  StringBuffer joined = new StringBuffer();
}
{
  // end column of the token consumed just before the compound starts
  { begin = token.endColumn; }

  word = term() { joined.append(word).append(" "); }

  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    word = term() { joined.append(word).append(" "); }
  )*

  {
    if (this.queryFilters.isRawField(field)) {
      // raw fields keep the original query text verbatim
      result.add(queryString.substring(begin, token.endColumn));
      return result;
    }

    TokenStream stream =
        analyzer.tokenStream(field, new StringReader(joined.toString()));
    for (;;) {
      org.apache.lucene.analysis.Token analyzed;
      try {
        analyzed = stream.next();
      } catch (IOException e) {
        analyzed = null;            // a read failure ends the loop like end of stream
      }
      if (analyzed == null) {
        break;
      }
      result.add(analyzed.termText());
    }
    try {
      stream.close();
    } catch (IOException e) {
      // ignore
    }
    return result;
  }

}

/**
 * Parse a single term: one WORD, ACRONYM, SIGRAM or CNWORD token.
 *
 * @return the image (text) of the matched token
 */
String term() :
{
// local variable; within this production it hides any outer "token"
Token token;
}
{
( token=<WORD> | token=<ACRONYM> | token=<SIGRAM> | token=<CNWORD>)

{ return token.image; }
}

/**
 * Parse anything but a term or a quote: a single WHITE token or a single
 * infix character.
 */
void nonTerm() :
{}
{
<WHITE> | infix()
}

/** Parse a single non-term (see nonTerm) or the end of input. */
void nonTermOrEOF() :
{}
{
nonTerm() | <EOF>
}

/**
 * Parse anything but a term or an operator (plus or minus or quote).
 *
 * Consumes zero or more of: a WHITE token, a non-operator infix character,
 * or a plus/minus that is followed by a non-term or by end of input. The
 * two-token lookahead is what lets a plus/minus be inspected together with
 * the token after it before this choice is taken.
 */
void nonOpOrTerm() :
{}
{
(LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/**
 * Characters which can be used to form compound terms: plus, minus, or any
 * of the characters accepted by nonOpInfix.
 */
void infix() :
{}
{
<PLUS> | <MINUS> | nonOpInfix()
}

/**
 * Parse infix characters except plus and minus: colon, slash, dot, at-sign
 * or apostrophe.
 */
void nonOpInfix() :
{}
{
<COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}

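用 JavaCC 编译这个 .jj 文件后，会根据其中的定义生成多个 Java 源文件，例如解析器 NutchAnalysis.java、词法分析器 NutchAnalysisTokenManager.java、常量接口 NutchAnalysisConstants.java，以及 Token.java、ParseException.java 等辅助类。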