浅析爬虫nutch2.0(三)

本文分析nutch提供的脚本,nutch提供的脚本包括nutch和crawl。

1、nutch脚本:可以分别执行inject、fetch、generate等等操作

2、crawl脚本,批量命令,调用nutch脚本执行:inject,随后循环执行generate、fetch、parse、update操作

脚本分析目的:1)帮助理解nutch的工作方式,爬取数据并存储分析的完整流程;

                            2)程序的入口,从程序入口分析代码,为后续庖丁解牛式的分析nutch提供入口和基础。

首先分析nutch脚本:


1 #!/bin/bash

2 #

3 # Licensed to the Apache SoftwareFoundation (ASF) under one or more

4 # contributor license agreements.  See the NOTICE file distributed with

5 # this work for additional informationregarding copyright ownership.

6 # The ASF licenses this file to You underthe Apache License, Version 2.0

7 # (the "License"); you may notuse this file except in compliance with

8 # the License.  You may obtain a copy of the License at

9 #

10 #    http://www.apache.org/licenses/LICENSE-2.0

11 #

12 # Unless required by applicable law oragreed to in writing, software

13 # distributed under the License isdistributed on an "AS IS" BASIS,

14 # WITHOUT WARRANTIES OR CONDITIONS OFANY KIND, either express or implied.

15 # See the License for the specificlanguage governing permissions and

16 # limitations under the License.

17 #

18 # The Nutch command script

19 #

20 # Environment Variables

21 #

22 #  NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.

23 #

24 #  NUTCH_HEAPSIZE  The maximum amountof heap to use, in MB.

25 #                   Default is 1000.

26 #

27 #  NUTCH_OPTS      Extra Java runtimeoptions.

28 #                   Multiple options must beseparated by white space.

29 #

30 #  NUTCH_LOG_DIR   Log directory(default: $NUTCH_HOME/logs)

31 #

32 #  NUTCH_LOGFILE   Log file (default:hadoop.log)

33 #

34 #  NUTCH_CONF_DIR  Path(s) toconfiguration files (default: $NUTCH_HOME/conf).

35 #                   Multiple paths must beseparated by a colon ':'.

36 #

[1~36]##注释,声明nutch命令脚本依赖的环境变量

37 cygwin=false

38 case "`uname`" in

39 CYGWIN*) cygwin=true;;

40 esac

41[37~40]##使用uname命令获取系统类型,判断是否在cygwin中运行nutch脚本

42

43 # resolve links - $0 may be a softlink

44 THIS="$0"

45 [44]##$0为脚本的第0个参数,即调用脚本时使用的脚本路径(名称)本身

46 while [ -h "$THIS" ]; do

47 [46]##当前运行的nutch文件是个软链接

48  ls=`ls -ld "$THIS"`

49 [48]##获取软链接详细信息,eg: lrwxrwxrwx 1 swrd swrd 51 Jul 11 23:45 nutch -> /home/swrd/software/apache-nutch-2.3.1-bin/bin/nutch

50  link=`expr "$ls" : '.*-> \(.*\)$'`

51 [50]##获取软链接指向的目标路径:expr STRING : REGEX 表示用正则表达式从头匹配$ls的内容,并输出\(...\)捕获的部分,即"-> "之后的内容,eg: /home/swrd/software/apache-nutch-2.3.1-bin/bin/nutch

52  if expr "$link" : '.*/.*' > /dev/null; then

53    THIS="$link"

54[52~53]##link中包含"/"(带有目录部分),直接将其作为新的THIS

55  else

56    THIS=`dirname "$THIS"`/"$link"

57[55~56]##link中不包含"/"(只有文件名),则在前面拼接当前软链接所在的目录

58  fi

59 done

60

61 # if no args specified, show usage

62 if [ $# = 0 ]; then

64  echo "Usage: nutch COMMAND"

65  echo "where COMMAND is one of:"

66  echo " inject            injectnew urls into the database"

67  echo " hostinject     createsor updates an existing host table from a text file"

68  echo " generate   generate newbatches to fetch from crawl db"

69  echo " fetch           fetchURLs marked during generate"

70  echo " parse                   parseURLs marked during fetch"

71  echo " updatedb   update webtable after parsing"

72  echo " updatehostdb   updatehost table after parsing"

73  echo " readdb       read/dumprecords from page database"

74  echo " readhostdb     displayentries from the hostDB"

75  echo " index          run the plugin-based indexer on parsedbatches"

76  echo " elasticindex   run theelasticsearch indexer - DEPRECATED use the index command instead"

77  echo " solrindex    run thesolr indexer on parsed batches - DEPRECATED use the index command instead"

78  echo " solrdedup removeduplicates from solr"

79  echo " solrclean      removeHTTP 301 and 404 documents from solr - DEPRECATED use the clean commandinstead"

80  echo " clean          removeHTTP 301 and 404 documents and duplicates from indexing backends configured viaplugins"

81  echo " parsechecker   checkthe parser for a given url"

82  echo " indexchecker   checkthe indexing filters for a given url"

83  echo " plugin         load aplugin and run one of its classes main()"

84  echo " nutchserver    run a(local) Nutch server on a user defined port"

85  echo " webapp         run alocal Nutch web application"

86  echo " junit           runs the given JUnit test"

87  echo " or"

88  echo " CLASSNAME       run theclass named CLASSNAME"

89  echo "Most commands print help when invoked w/o parameters."

90  exit 1

91 fi

[62~91]##如果nutch后带有的参数为空,输出nutch命令使用指导并退出

92

93 # get arguments

94 COMMAND=$1

95 [94]##获取nutch命令后带的第一个参数

96 shift

97 [96]##shift:丢弃当前的$1,其余参数位置依次提前一位,nutch后带的参数个数($#)减一

98

99 # some directories

100 THIS_DIR="`dirname "$THIS"`"

101 [100]##获取当前执行脚本所在目录

102 NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"

103 [102]##回退到上一层目录,到达nutchhome目录

104

105 # some Java parameters

106 if [ "$NUTCH_JAVA_HOME" !="" ]; then

107  #echo "run java in $NUTCH_JAVA_HOME"

108  JAVA_HOME="$NUTCH_JAVA_HOME"

109 fi

110[106~109]##如果配置了NUTCH_JAVA_HOME环境变量,则使用该环境变量作为JAVA_HOME

111  

112 if [ "$JAVA_HOME" ="" ]; then

113  echo "Error: JAVA_HOME is not set."

114  exit 1

115 fi

116[112~115]##如果未配置JAVA_HOME,报错退出

117

118

119 # NUTCH_JOB

120 if [ -f"${NUTCH_HOME}"/*nutch*.job ]; then

121  local=false

122  for f in "$NUTCH_HOME"/*nutch*.job; do

123    NUTCH_JOB="$f";

124[120~123]##遍历"$NUTCH_HOME"/*nutch*.job,赋值给NUTCH_JOB

125  done

126  # cygwin path translation

127  if $cygwin; then

128    NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"

129  fi

130 else

131  local=true

132 fi

133[120~132]##根据$NUTCH_HOME下是否存在*nutch*.job文件,判断是运行在local模式(本地文件系统运行,只能有一个MapReduce)还是deploy模式(在hadoop上运行);deploy模式下将nutch job提交到jobtracker

134

135 JAVA="$JAVA_HOME/bin/java"

136 JAVA_HEAP_MAX=-Xmx1000m

137[135~136]##定义运行java堆内存大小

138

139 # check envvars which might overridedefault args

140 if [ "$NUTCH_HEAPSIZE" !="" ]; then

141  #echo "run with heapsize $NUTCH_HEAPSIZE"

142  JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"

143  #echo $JAVA_HEAP_MAX

144 fi

145[140~144]##如果配置了NUTCH_HEAPSIZE(单位MB),则用它覆盖JAVA_HEAP_MAX;如果未配置,则使用默认的JAVA_HEAP_MAX(-Xmx1000m)

146

147 # CLASSPATH initially contains$NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf

148CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"

149 [148]##将java运行参数classpath初始化为NUTCH_CONF_DIR的值;如果NUTCH_CONF_DIR未设置或为空,则先将其赋值为$NUTCH_HOME/conf再使用(":="在取默认值的同时也会给变量本身赋值)

150CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"

151

152

153 # so that filenames w/ spaces arehandled correctly in loops below

154 IFS=

155 [154]##IFS(Internal Field Separator,"内部字段分隔符")是一个内置环境变量,存储着默认的文本分隔符,默认下这分隔符是空格符(space character)、制表符(tab)以及新行(newline)。此处将IFS置为空,使下面循环中带空格的文件名不会被拆分

156

157 # add libs to CLASSPATH

158 if $local; then

159  for f in "$NUTCH_HOME"/lib/*.jar; do

160   CLASSPATH="${CLASSPATH}:$f";

161  done

162  # local runtime

163   # add plugins to classpath

164  if [ -d "$NUTCH_HOME/plugins" ]; then

165     CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"

166  fi

167 fi

168[158~167]##local运行模式下,将NUTCH_HOME目录lib下所有jar包逐个加入到CLASSPATH;如果存在$NUTCH_HOME/plugins目录,则把NUTCH_HOME目录本身加到CLASSPATH最前面(脚本并没有把plugins下的jar包逐个加入CLASSPATH)

169

170 # cygwin path translation

171 if $cygwin; then

172  CLASSPATH="`cygpath -p -w "$CLASSPATH"`"

173 fi

174[171~173]##cygwin中运行时,需使用cygpath -p -w将CLASSPATH(路径列表)转换为Windows格式的路径

175

176 # setup 'java.library.path' fornative-hadoop code if necessary

177 # used only in local mode

178 JAVA_LIBRARY_PATH=''

179 if [ -d"${NUTCH_HOME}/lib/native" ]; then

180

181  JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH"org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`

182   [179~181]##存在native文件夹,执行hadoop的org.apache.hadoop.util.PlatformName类获取平台名称,并使用sed命令将空格替换成"_",此处目的是获取当前系统使用的系统平台。eg: Linux-amd64-64

183

184  if [ -d "${NUTCH_HOME}/lib/native" ]; then

185    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then

186       [185]##判断JAVA_LIBRARY_PATH是否为空

187      JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"

188    else

189      JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"

190    fi

191         

192  fi

193 fi

194

195 if [ $cygwin = true -a"X${JAVA_LIBRARY_PATH}" != "X" ]; then

196  JAVA_LIBRARY_PATH="`cygpath -p -w"$JAVA_LIBRARY_PATH"`"

197 fi

198[195~197]##cygwin下运行nutch且JAVA_LIBRARY_PATH不为空时,使用cygpath将JAVA_LIBRARY_PATH转换为Windows格式后重新赋值

199

200 # restore ordinary behaviour

201 unset IFS

202

203 # default log directory & file

204 if [ "$NUTCH_LOG_DIR" ="" ]; then

205  NUTCH_LOG_DIR="$NUTCH_HOME/logs"

206 fi

207 if [ "$NUTCH_LOGFILE" ="" ]; then

208  NUTCH_LOGFILE='hadoop.log'

209 fi

210[204~209]##设置NUTCH_LOG_DIR和NUTCH_LOGFILE:未配置时,NUTCH_LOG_DIR默认为$NUTCH_HOME/logs,NUTCH_LOGFILE默认为hadoop.log

211

212 #Fix log path under cygwin

213 if $cygwin; then

214  NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"

215 fi

216

217 NUTCH_OPTS=($NUTCH_OPTS-Dhadoop.log.dir="$NUTCH_LOG_DIR")

218NUTCH_OPTS=("${NUTCH_OPTS[@]}"-Dhadoop.log.file="$NUTCH_LOGFILE")

219[217~218]##${NUTCH_OPTS[@]}表示数组NUTCH_OPTS所有元素

220

221 if [ "x$JAVA_LIBRARY_PATH" !="x" ]; then

222  NUTCH_OPTS=("${NUTCH_OPTS[@]}"-Djava.library.path="$JAVA_LIBRARY_PATH")

223 fi

224

225 # figure out which class to run

226 if [ "$COMMAND" ="crawl" ] ; then

227  echo "Command $COMMAND is deprecated, please use bin/crawlinstead"

228  exit -1

229 elif [ "$COMMAND" ="inject" ] ; then

230CLASS=org.apache.nutch.crawl.InjectorJob

231 elif [ "$COMMAND" ="hostinject" ] ; then

232CLASS=org.apache.nutch.host.HostInjectorJob

233 elif [ "$COMMAND" ="generate" ] ; then

234CLASS=org.apache.nutch.crawl.GeneratorJob

235 elif [ "$COMMAND" ="fetch" ] ; then

236CLASS=org.apache.nutch.fetcher.FetcherJob

237 elif [ "$COMMAND" ="parse" ] ; then

238 CLASS=org.apache.nutch.parse.ParserJob

239 elif [ "$COMMAND" ="updatedb" ] ; then

240 CLASS=org.apache.nutch.crawl.DbUpdaterJob

241 elif [ "$COMMAND" ="updatehostdb" ] ; then

242CLASS=org.apache.nutch.host.HostDbUpdateJob

243 elif [ "$COMMAND" ="readdb" ] ; then

244CLASS=org.apache.nutch.crawl.WebTableReader

245 elif [ "$COMMAND" ="readhostdb" ] ; then

246 CLASS=org.apache.nutch.host.HostDbReader

247 elif [ "$COMMAND" ="elasticindex" ] ; then

248CLASS=org.apache.nutch.indexer.elastic.ElasticIndexerJob

249 elif [ "$COMMAND" ="solrindex" ] ; then

250CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"

251 shift

252 elif [ "$COMMAND" ="index" ] ; then

253CLASS=org.apache.nutch.indexer.IndexingJob

254 elif [ "$COMMAND" ="solrdedup" ] ; then

255CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates

256 elif [ "$COMMAND" ="solrclean" ] ; then

257  CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2$1"

258  shift; shift

259 elif [ "$COMMAND" ="clean" ] ; then

260  CLASS=org.apache.nutch.indexer.CleaningJob

261 elif [ "$COMMAND" ="parsechecker" ] ; then

262  CLASS=org.apache.nutch.parse.ParserChecker

263 elif [ "$COMMAND" ="indexchecker" ] ; then

264  CLASS=org.apache.nutch.indexer.IndexingFiltersChecker

265 elif [ "$COMMAND" ="plugin" ] ; then

266CLASS=org.apache.nutch.plugin.PluginRepository

267 elif [ "$COMMAND" ="webapp" ] ; then

268CLASS=org.apache.nutch.webui.NutchUiServer

269 elif [ "$COMMAND" ="nutchserver" ] ; then

270 CLASS=org.apache.nutch.api.NutchServer

271 elif [ "$COMMAND" ="junit" ] ; then

272  CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"

273  CLASS=org.junit.runner.JUnitCore

274 else

275 CLASS=$COMMAND

276 fi

277[226~276]##根据输入的命令指定CLASS,后面java运行时,指定运行的入口类

278

279

280 if $local; then

281 # fix for the external Xerces lib issue with SAXParserFactory

282 NUTCH_OPTS=(-Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"${NUTCH_OPTS[@]}")

283 EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}"-classpath "$CLASSPATH")

284  [282~283]##本地模式下,EXEC_CALL的值。eg: /home/用户名/software/jdk1.8.0_101/bin/java -Xmx1000m -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl -Dhadoop.log.dir="$NUTCH_LOG_DIR" -Dhadoop.log.file="$NUTCH_LOGFILE" -Djava.library.path="$JAVA_LIBRARY_PATH" -classpath /home/swrd/software/apache-nutch-2.3.1-bin/conf:$JAVA_HOME/lib/tools.jar:/home/swrd/software/apache-nutch-2.3.1-bin/lib/*.jar(此处lib/*.jar表示lib目录下逐个展开的jar文件;-Djava.library.path仅在JAVA_LIBRARY_PATH非空时才会出现)

286  ##${#数组名[@/*]} 可得到数组的长度

287

288 else

289 # check that hadoop can be found on the path

290  if[ $(which hadoop | wc -l ) -eq 0 ]; then

291    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the pathor run in local mode."

292    exit -1;

293 fi

294 # distributed mode

295 EXEC_CALL=(hadoop jar "$NUTCH_JOB")

296 fi

297[290~295]##deploy模式下,EXEC_CALL的值

298

299 # run it

300 exec "${EXEC_CALL[@]}" $CLASS "$@"

301 [300]##最终执行的java命令。eg:在EXEC_CALL基础上增加CLASS以及nutch命令后带的参数,如<seedDir><crawlID>


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值