本文分析nutch提供的脚本,nutch提供的脚本包括nutch和crawl。
1、nutch脚本:可以分别执行inject、fetch、generate等等操作
2、crawl脚本:批量命令,调用nutch脚本先执行inject,随后循环执行generate、fetch、parse、update操作。
脚本分析目的:1)帮助理解nutch的工作方式,即爬取数据并存储分析的完整流程;
2)程序的入口,从程序入口分析代码,为后续庖丁解牛式的分析nutch提供入口和基础。
首先分析nutch脚本:
1 #!/bin/bash
2 #
3 # Licensed to the Apache SoftwareFoundation (ASF) under one or more
4 # contributor license agreements. See the NOTICE file distributed with
5 # this work for additional informationregarding copyright ownership.
6 # The ASF licenses this file to You underthe Apache License, Version 2.0
7 # (the "License"); you may notuse this file except in compliance with
8 # the License. You may obtain a copy of the License at
9 #
10 # http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law oragreed to in writing, software
13 # distributed under the License isdistributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OFANY KIND, either express or implied.
15 # See the License for the specificlanguage governing permissions and
16 # limitations under the License.
17 #
18 # The Nutch command script
19 #
20 # Environment Variables
21 #
22 # NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
23 #
24 # NUTCH_HEAPSIZE The maximum amountof heap to use, in MB.
25 # Default is 1000.
26 #
27 # NUTCH_OPTS Extra Java runtimeoptions.
28 # Multiple options must beseparated by white space.
29 #
30 # NUTCH_LOG_DIR Log directory(default: $NUTCH_HOME/logs)
31 #
32 # NUTCH_LOGFILE Log file (default:hadoop.log)
33 #
34 # NUTCH_CONF_DIR Path(s) toconfiguration files (default: $NUTCH_HOME/conf).
35 # Multiple paths must beseparated by a colon ':'.
36 #
[1~36行]##注释,声明nutch命令脚本依赖的环境变量
37 cygwin=false
38 case "`uname`" in
39 CYGWIN*) cygwin=true;;
40 esac
41[37~40行]##使用uname命令获取系统类型,判断是否在cygwin中运行nutch脚本
42
43 # resolve links - $0 may be a softlink
44 THIS="$0"
45 [44行]##$0为脚本运行时的第0个参数,即脚本名本身
46 while [ -h "$THIS" ]; do
47 [46行]##当前运行的nutch文件是个软链接
48 ls=`ls -ld "$THIS"`
49 [48行]##获取软链接详细信息,eg:lrwxrwxrwx 1 swrd swrd 51 Jul 11 23:45 nutch -> /home/swrd/software/apache-nutch-2.3.1-bin/bin/nutch
50 link=`expr "$ls" : '.*-> \(.*\)$'`
51 [50行]##获取软链接文件的真实路径,expr后加冒号表示使用该表达式匹配$ls的内容,eg:/home/swrd/software/apache-nutch-2.3.1-bin/bin/nutch
52 if expr "$link" : '.*/.*' > /dev/null; then
53 THIS="$link"
54[52~53行]##link中包含"/"(即带有目录部分)时,直接把link作为新的THIS
55 else
56 THIS=`dirname "$THIS"`/"$link"
57[55~56行]##link中不包含"/"(只是一个文件名)时,在其前面拼接当前软链接所在的目录
58 fi
59 done
60
61 # if no args specified, show usage
62 if [ $# = 0 ]; then
64 echo "Usage: nutch COMMAND"
65 echo "where COMMAND is one of:"
66 echo " inject injectnew urls into the database"
67 echo " hostinject createsor updates an existing host table from a text file"
68 echo " generate generate newbatches to fetch from crawl db"
69 echo " fetch fetchURLs marked during generate"
70 echo " parse parseURLs marked during fetch"
71 echo " updatedb update webtable after parsing"
72 echo " updatehostdb updatehost table after parsing"
73 echo " readdb read/dumprecords from page database"
74 echo " readhostdb displayentries from the hostDB"
75 echo " index run the plugin-based indexer on parsedbatches"
76 echo " elasticindex run theelasticsearch indexer - DEPRECATED use the index command instead"
77 echo " solrindex run thesolr indexer on parsed batches - DEPRECATED use the index command instead"
78 echo " solrdedup removeduplicates from solr"
79 echo " solrclean removeHTTP 301 and 404 documents from solr - DEPRECATED use the clean commandinstead"
80 echo " clean removeHTTP 301 and 404 documents and duplicates from indexing backends configured viaplugins"
81 echo " parsechecker checkthe parser for a given url"
82 echo " indexchecker checkthe indexing filters for a given url"
83 echo " plugin load aplugin and run one of its classes main()"
84 echo " nutchserver run a(local) Nutch server on a user defined port"
85 echo " webapp run alocal Nutch web application"
86 echo " junit runs the given JUnit test"
87 echo " or"
88 echo " CLASSNAME run theclass named CLASSNAME"
89 echo "Most commands print help when invoked w/o parameters."
90 exit 1
91 fi
[62~91行]##如果nutch后带有的参数为空,输出nutch命令使用指导并退出
92
93 # get arguments
94 COMMAND=$1
95 [94行]##获取nutch命令后带的第一个参数
96 shift
97 [96行]##shift将位置参数整体左移一位:原来的$2变为$1,参数个数$#减一
98
99 # some directories
100 THIS_DIR="`dirname"$THIS"`"
101 [100行]##获取当前执行脚本所在路径
102 NUTCH_HOME="`cd"$THIS_DIR/.." ; pwd`"
103 [102行]##回退到上一层目录,到达nutch的home目录
104
105 # some Java parameters
106 if [ "$NUTCH_JAVA_HOME" !="" ]; then
107 #echo "run java in $NUTCH_JAVA_HOME"
108 JAVA_HOME="$NUTCH_JAVA_HOME"
109 fi
110[106~109行]##如果配置了NUTCH_JAVA_HOME环境变量,则使用该环境变量作为JAVA_HOME
111
112 if [ "$JAVA_HOME" ="" ]; then
113 echo "Error: JAVA_HOME is not set."
114 exit 1
115 fi
116[112~115行]##如果未配置JAVA_HOME,报错退出
117
118
119 # NUTCH_JOB
120 if [ -f"${NUTCH_HOME}"/*nutch*.job ]; then
121 local=false
122 for f in "$NUTCH_HOME"/*nutch*.job; do
123 NUTCH_JOB="$f";
124[120~123行]##遍历"$NUTCH_HOME"/*nutch*.job,赋值给NUTCH_JOB
125 done
126 # cygwin path translation
127 if $cygwin; then
128 NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
129 fi
130 else
131 local=true
132 fi
133[120~132行]##判断运行在local模式(本地文件系统运行,只能有一个Map和Reduce)还是deploy模式(hadoop运行):NUTCH_HOME下存在*nutch*.job文件时为deploy模式,否则为local模式;deploy模式下将nutch job提交到jobtracker
134
135 JAVA="$JAVA_HOME/bin/java"
136 JAVA_HEAP_MAX=-Xmx1000m
137[135~136行]##定义运行java堆内存大小
138
139 # check envvars which might overridedefault args
140 if [ "$NUTCH_HEAPSIZE" !="" ]; then
141 #echo "run with heapsize $NUTCH_HEAPSIZE"
142 JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
143 #echo $JAVA_HEAP_MAX
144 fi
145[140~144行]##如果配置了NUTCH_HEAPSIZE,则用"-Xmx${NUTCH_HEAPSIZE}m"覆盖JAVA_HEAP_MAX;未配置时保持默认值-Xmx1000m
146
147 # CLASSPATH initially contains$NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
148CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
149 [148行]##将CLASSPATH初始化为NUTCH_CONF_DIR的值;如果NUTCH_CONF_DIR未设置或为空,":="会先把它赋值为$NUTCH_HOME/conf再使用
150CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
151
152
153 # so that filenames w/ spaces arehandled correctly in loops below
154 IFS=
155 [154行]##IFS(Internal Field Separator,内部字段分隔符)是一个内置环境变量,存储着默认的文本分隔符,默认情况下分隔符是空格符(space character)、制表符(tab)以及换行符(newline)。此处将IFS置空,使后面循环中带空格的文件名不会被拆分
156
157 # add libs to CLASSPATH
158 if $local; then
159 for f in "$NUTCH_HOME"/lib/*.jar; do
160 CLASSPATH="${CLASSPATH}:$f";
161 done
162 # local runtime
163 # add plugins to classpath
164 if [ -d "$NUTCH_HOME/plugins" ]; then
165 CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
166 fi
167 fi
168[158~167行]##local运行模式下,将NUTCH_HOME/lib下所有jar包加入到CLASSPATH;如果存在NUTCH_HOME/plugins目录,再把NUTCH_HOME目录本身加到CLASSPATH最前面(这里加入的是NUTCH_HOME目录,而不是plugins下的jar包)
169
170 # cygwin path translation
171 if $cygwin; then
172 CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
173 fi
174[171~173行]##在cygwin中运行时,需用cygpath把CLASSPATH转换为Windows格式的路径列表
175
176 # setup 'java.library.path' fornative-hadoop code if necessary
177 # used only in local mode
178 JAVA_LIBRARY_PATH=''
179 if [ -d"${NUTCH_HOME}/lib/native" ]; then
180
181 JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH"org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
182 [179~181行]##存在native文件夹时,运行hadoop中的PlatformName类获取平台名称,并使用sed命令将空格替换成"_",此处目的是获取当前系统使用的系统平台。eg:Linux-amd64-64
183
184 if [ -d "${NUTCH_HOME}/lib/native" ]; then
185 if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
186 [185行]##判断JAVA_LIBRARY_PATH是否为空
187 JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
188 else
189 JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
190 fi
191
192 fi
193 fi
194
195 if [ $cygwin = true -a"X${JAVA_LIBRARY_PATH}" != "X" ]; then
196 JAVA_LIBRARY_PATH="`cygpath -p -w"$JAVA_LIBRARY_PATH"`"
197 fi
198[195~197行]##cygwin下运行nutch且JAVA_LIBRARY_PATH非空时,用cygpath将JAVA_LIBRARY_PATH转换为Windows格式的路径
199
200 # restore ordinary behaviour
201 unset IFS
202
203 # default log directory & file
204 if [ "$NUTCH_LOG_DIR" ="" ]; then
205 NUTCH_LOG_DIR="$NUTCH_HOME/logs"
206 fi
207 if [ "$NUTCH_LOGFILE" ="" ]; then
208 NUTCH_LOGFILE='hadoop.log'
209 fi
210[204~209行]##设置NUTCH_LOG_DIR和NUTCH_LOGFILE
211
212 #Fix log path under cygwin
213 if $cygwin; then
214 NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
215 fi
216
217 NUTCH_OPTS=($NUTCH_OPTS-Dhadoop.log.dir="$NUTCH_LOG_DIR")
218NUTCH_OPTS=("${NUTCH_OPTS[@]}"-Dhadoop.log.file="$NUTCH_LOGFILE")
219[217~218行]##${NUTCH_OPTS[@]}表示数组NUTCH_OPTS的所有元素
220
221 if [ "x$JAVA_LIBRARY_PATH" !="x" ]; then
222 NUTCH_OPTS=("${NUTCH_OPTS[@]}"-Djava.library.path="$JAVA_LIBRARY_PATH")
223 fi
224
225 # figure out which class to run
226 if [ "$COMMAND" ="crawl" ] ; then
227 echo "Command $COMMAND is deprecated, please use bin/crawlinstead"
228 exit -1
229 elif [ "$COMMAND" ="inject" ] ; then
230CLASS=org.apache.nutch.crawl.InjectorJob
231 elif [ "$COMMAND" ="hostinject" ] ; then
232CLASS=org.apache.nutch.host.HostInjectorJob
233 elif [ "$COMMAND" ="generate" ] ; then
234CLASS=org.apache.nutch.crawl.GeneratorJob
235 elif [ "$COMMAND" ="fetch" ] ; then
236CLASS=org.apache.nutch.fetcher.FetcherJob
237 elif [ "$COMMAND" ="parse" ] ; then
238 CLASS=org.apache.nutch.parse.ParserJob
239 elif [ "$COMMAND" ="updatedb" ] ; then
240 CLASS=org.apache.nutch.crawl.DbUpdaterJob
241 elif [ "$COMMAND" ="updatehostdb" ] ; then
242CLASS=org.apache.nutch.host.HostDbUpdateJob
243 elif [ "$COMMAND" ="readdb" ] ; then
244CLASS=org.apache.nutch.crawl.WebTableReader
245 elif [ "$COMMAND" ="readhostdb" ] ; then
246 CLASS=org.apache.nutch.host.HostDbReader
247 elif [ "$COMMAND" ="elasticindex" ] ; then
248CLASS=org.apache.nutch.indexer.elastic.ElasticIndexerJob
249 elif [ "$COMMAND" ="solrindex" ] ; then
250CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
251 shift
252 elif [ "$COMMAND" ="index" ] ; then
253CLASS=org.apache.nutch.indexer.IndexingJob
254 elif [ "$COMMAND" ="solrdedup" ] ; then
255CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
256 elif [ "$COMMAND" ="solrclean" ] ; then
257 CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2$1"
258 shift; shift
259 elif [ "$COMMAND" ="clean" ] ; then
260 CLASS=org.apache.nutch.indexer.CleaningJob
261 elif [ "$COMMAND" ="parsechecker" ] ; then
262 CLASS=org.apache.nutch.parse.ParserChecker
263 elif [ "$COMMAND" ="indexchecker" ] ; then
264 CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
265 elif [ "$COMMAND" ="plugin" ] ; then
266CLASS=org.apache.nutch.plugin.PluginRepository
267 elif [ "$COMMAND" ="webapp" ] ; then
268CLASS=org.apache.nutch.webui.NutchUiServer
269 elif [ "$COMMAND" ="nutchserver" ] ; then
270 CLASS=org.apache.nutch.api.NutchServer
271 elif [ "$COMMAND" ="junit" ] ; then
272 CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
273 CLASS=org.junit.runner.JUnitCore
274 else
275 CLASS=$COMMAND
276 fi
277[226~276行]##根据输入的命令指定CLASS,后面java运行时,指定运行的入口类
278
279
280 if $local; then
281 # fix for the external Xerces lib issue with SAXParserFactory
282 NUTCH_OPTS=(-Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"${NUTCH_OPTS[@]}")
283 EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}"-classpath "$CLASSPATH")
284 [282~283行]##本地模式下,EXEC_CALL的值。eg:/home/用户名/software/jdk1.8.0_101/bin/java -Xmx1000m -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl -Dhadoop.log.dir="$NUTCH_LOG_DIR" -Dhadoop.log.file="$NUTCH_LOGFILE" -Djava.library.path="$JAVA_LIBRARY_PATH" -classpath /home/swrd/software/apache-nutch-2.3.1-bin/lib/*.jar:/home/swrd/software/apache-nutch-2.3.1-bin/conf/*.jar
286 ##${#数组名[@/*]} 可得到数组的长度
287
288 else
289 # check that hadoop can be found on the path
290 if[ $(which hadoop | wc -l ) -eq 0 ]; then
291 echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the pathor run in local mode."
292 exit -1;
293 fi
294 # distributed mode
295 EXEC_CALL=(hadoop jar "$NUTCH_JOB")
296 fi
297[290~295行]##deploy模式下,EXEC_CALL的值
298
299 # run it
300 exec "${EXEC_CALL[@]}" $CLASS"$@"
301 [300行]##最终执行的命令:在EXEC_CALL基础上追加CLASS以及nutch命令后带的其余参数,如<seedDir>、<crawlID>等