An annotated walk-through of the crawl command in Nutch 1.15

The latest Nutch release is 1.15. When I tried to use it in production I could not get it to crawl anything, and none of the walk-throughs of the crawl command floating around online worked either. In frustration I spent 26 yuan on a second-hand copy of 《Shell脚本学习指南》 (*Classic Shell Scripting*), read it, opened the crawl script in an editor, and found that it looks nothing like the old one, and it even contains a bug. I am annotating it here for future reference.

One more note: in 1.15 the crawl script, once configured correctly, can drive a whole crawl on its own; there is no need to run the individual bin/nutch commands by hand.
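For reference, a minimal invocation looks like this (a sketch only: the seed directory urls/, the crawl directory crawl/ and the round count are placeholders to adjust for your setup):

bin/crawl -i -s urls/ crawl/ 2

This injects the seed URLs from urls/ into crawl/crawldb, then runs two rounds of generate / fetch / parse / updatedb and indexes each segment because -i is given.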

#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
#   <crawl_dir>                           Directory where the crawl/host/link/segments dirs are saved
#   <num_rounds>                          The number of rounds to run this crawl for
#
# Options:
#   -i|--index                            Indexes crawl results into a configured indexer (this flag takes no value)
#   -D                                    A Java property to pass to Nutch calls
#   -w|--wait <NUMBER[SUFFIX]>            Time to wait before generating a new segment when no URLs
#                                         are scheduled for fetching. Suffix can be: s for second,
#                                         m for minute, h for hour and d for day. If no suffix is
#                                         specified second is used by default. [default: -1]
#   -s <seed_dir>                         Path to seeds file(s)
#   -sm <sitemap_dir>                     Path to sitemap URL file(s)
#
#   --hostdbupdate                        Boolean indicator if we call hostdbupdate or not
#   --hostdbgenerate                      Boolean indicator if we use hostdb in generate or not
#
#   --num-slaves <num_slaves>             Number of slave nodes [default: 1]
#                                         Note: This can only be set when running in distribution mode
#   --num-tasks <num_tasks>               Number of reducer tasks [default: 2]
#   --size-fetchlist <size_fetchlist>     Number of URLs to fetch in one iteration [default: 50000]
#   --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
#   --num-threads <num_threads>           Number of threads for fetching / sitemap processing [default: 50]
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH SEGMENT

function __to_seconds() {
  NUMBER=$(echo $1 | tr -dc '0-9')                 # keep only the digits of the first argument
  MODIFIER=$(echo $1 | tr -dc '[^s|h|m|d]]')       # keep only the time-unit suffix (s/m/h/d)

  case $MODIFIER in
      m|M)
        SECONDS=`expr $NUMBER \* 60`
        ;;
      h|H)
        SECONDS=`expr $NUMBER \* 120`              # this looks like a bug in the shipped script: an hour is 3600 seconds, not 120
        ;;
      d|D)
        SECONDS=`expr $NUMBER \* 86400`
        ;;
      s|S|*)
        SECONDS=$NUMBER
        ;;
  esac

  echo $SECONDS
}
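# A quick illustration of the conversion (hypothetical inputs, shown to make the bug above concrete):
#   __to_seconds 10m  -> 600    (10 * 60)
#   __to_seconds 2h   -> 240    (2 * 120 because of the bug; the correct result would be 7200)
#   __to_seconds 30   -> 30     (no suffix, treated as seconds)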

# The function below prints the usage message for crawl and exits.
function __print_usage {
  echo "Usage: crawl [options] <crawl_dir> <num_rounds>"
  echo -e "" #-e的作用是:若字符串中出现以下字符,则特别加以处理,而不会将它当成一般文字输出
  echo -e "Arguments:"
  echo -e "  <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved"
  echo -e "  <num_rounds>\t\t\t\tThe number of rounds to run this crawl for"
  echo -e ""
  echo -e "Options:"
  echo -e "  -i|--index\t\t\t\tIndexes crawl results into a configured indexer"
  echo -e "  -D\t\t\t\t\tA Java property to pass to Nutch calls"
  echo -e "  -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs"
  echo -e "  \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second,"
  echo -e "  \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is"
  echo -e "  \t\t\t\t\tspecified second is used by default. [default: -1]"
  echo -e "  -s <seed_dir>\t\t\t\tPath to seeds file(s)"
  echo -e "  -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
  echo -e "  --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
  echo -e "  --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
  echo -e "  --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
  echo -e "  \t\t\t\t\tNote: This can only be set when running in distribution mode"
  echo -e "  --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
  echo -e "  --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
  echo -e "  --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
  echo -e "  --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]"
  echo -e "  --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB."
  echo -e "  \t\t\t\t\tSupported values are:"
  echo -e "  \t\t\t\t\t  - never [default]"
  echo -e "  \t\t\t\t\t  - always (processing takes place in every iteration)"
  echo -e "  \t\t\t\t\t  - once (processing only takes place in the first iteration)"

  exit 1
}

# default values
# The values below are used when the corresponding option is not given on the command line
INDEXFLAG=false
HOSTDBUPDATE=false                   # do not update the HostDB after each round
HOSTDBGENERATE=false                 # do not use the HostDB during generate
JAVA_PROPERTIES=""                   # no extra Java properties by default
WAIT=-1                              # don't wait if there are no URLs to fetch
SEEDDIR=""                           # no seed directory by default
NUM_SLAVES=1                         # a single node unless running in distributed mode
NUM_TASKS=2                          # 2 x NUM_SLAVES; default number of reducer tasks
SIZE_FETCHLIST=50000                 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never  # how often to process sitemaps from the HostDB; oddly missing from the Options header comment above, though it does appear in the usage message

while [[ $# > 0 ]]                    # $# is the number of remaining positional parameters
do
    case $1 in
        -i|--index)                   # if the current argument is -i/--index, set the flag and shift it off
            INDEXFLAG=true
            shift
            ;;
        -D)
            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
            shift 2
            ;;
        -s)
            SEEDDIR="${2}"               #获取seed目录
            shift  2
            ;;
        -sm)
            SITEMAPDIR="${2}"
            shift 2
            ;;
        -w|--wait)
            WAIT="${2}"
            shift 2
            ;;
        --num-slaves)
            NUM_SLAVES="${2}"
            shift 2
            ;;
        --num-tasks)
            NUM_TASKS="${2}"
            shift 2
            ;;
        --size-fetchlist)
            SIZE_FETCHLIST="${2}"
            shift 2
            ;;
        --time-limit-fetch)
            TIME_LIMIT_FETCH="${2}"
            shift 2
            ;;
        --num-threads)
            NUM_THREADS="${2}"
            shift 2
            ;;
        --sitemaps-from-hostdb)
            SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
            shift 2
            ;;
        --hostdbupdate)
            HOSTDBUPDATE=true
            shift
            ;;
        --hostdbgenerate)
            HOSTDBGENERATE=true
            shift
            ;;
        *)
            break
            ;;
    esac
done
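
# Example of how the parsing above plays out (hypothetical command line):
#   bin/crawl -i -D http.agent.name=MyCrawler -s urls crawl 3
# leaves INDEXFLAG=true, JAVA_PROPERTIES="-Dhttp.agent.name=MyCrawler ", SEEDDIR=urls,
# and "crawl 3" as the two remaining positional arguments (picked up as CRAWL_PATH and LIMIT below).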
# If SITEMAPS_FROM_HOSTDB_FREQUENCY is not exactly one of never, always or once, report an error and print the usage.
if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

# If exactly two positional arguments (crawl dir and number of rounds) are not left over, print the usage.
if [[ $# != 2 ]]; then
  __print_usage
fi


CRAWL_PATH="$1"
LIMIT="$2"

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  echo "Time to wait (--wait) = $WAIT sec."
fi
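# e.g. a value passed as -w 10m ends up stored here as WAIT=600 (hypothetical example)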

bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"

# determine the mode based on the presence of a job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
  mode=distributed
fi
if [[ "$mode" = "local" ]]; then
  NUM_SLAVES=1
fi
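# In other words: if the script sits next to a *nutch*.job file (as in runtime/deploy of a
# source build) the jobs are submitted to Hadoop; otherwise everything runs locally and the
# number of fetchers is forced back to 1.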

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$NUM_TASKS -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"

 # check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
 if [ $(which hadoop | wc -l ) -eq 0 ]; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    exit -1;
 fi
fi

# Wrapper that runs the nutch command and aborts the whole crawl script if it fails
function __bin_nutch {
    # run $bin/nutch, exit if exit value indicates error

    echo "$bin/nutch $@" ;# echo command and arguments
    "$bin/nutch" "$@"  #传递给脚本或函数的所有参数

    RETCODE=$?    #上个命令的退出状态,或函数的返回值
    if [ $RETCODE -ne 0 ]    #-ne 表示 不等于 
    then
        echo "Error running:"
        echo "  $bin/nutch $@"
        echo "Failed with exit value $RETCODE."
        exit $RETCODE
    fi
}
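
# For example, __bin_nutch inject crawl/crawldb urls (hypothetical arguments) simply runs
#   bin/nutch inject crawl/crawldb urls
# and exits the crawl script if that command returns a non-zero exit status.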

# check if a directory exists locally or on HDFS
function __directory_exists {
  if [[ "$mode" == local  &&  -d "$1" ]]; then                          #如果是本地模式并且第一个参数是目录,那么返回0
    return 0
  elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
    return 0
  else
    return 1
  fi
}

 

# Update the HostDB by running "nutch updatehostdb", but only once a crawldb exists
function __update_hostdb {
  if __directory_exists "$CRAWL_PATH"/crawldb; then
    echo "Updating HostDB"
    __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
  fi
}

# initial injection: inject the seed URLs into the crawldb
if [[ ! -z $SEEDDIR  ]]; then                # if SEEDDIR is non-empty ([ -z STRING ] is true when STRING has zero length; ! negates it)
  echo "Injecting seed URLs"
  __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
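# Note: the seed directory passed above normally holds plain-text files with one URL per line;
# inject reads them and adds those URLs to the crawldb.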

# sitemap processing based on sitemap definition file(s)

# A sitemap lets a site owner tell search engines which pages on the site are available for crawling
if [[ ! -z $SITEMAPDIR ]]; then                                               # if a sitemap directory was given
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
fi

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
  if [ -e ".STOP" ]; then                                                   #[ -e FILE ] 如果 FILE 存在则为真。
    echo "STOP file found - escaping loop"
    break
  fi

  if [ $LIMIT -ne -1 ]; then                                                  # if a round limit was given (not -1)
    if [ $a -gt $LIMIT ]; then                                                 # once the loop counter exceeds the limit, leave the loop
      echo `date` ": Finished loop with $LIMIT iterations"
      break
    fi
    echo `date` ": Iteration $a of $LIMIT"                           # print the date and the current iteration out of LIMIT
  else
    echo `date` ": Iteration $a"                                           # print the date and the iteration number
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    [[ $a -eq 1 ]] && __update_hostdb

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
    fi
  fi

  echo "Generating a new segment"
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
  else
   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
  fi
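  # With the defaults in local mode, the call below roughly expands to (illustrative only;
  # crawl/ stands for whatever CRAWL_PATH you passed):
  #   bin/nutch generate -D mapreduce.job.reduces=2 ... crawl/crawldb crawl/segments -topN 50000 -numFetchers 1 -noFilter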

  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
      : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep $WAIT
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment
  # call hadoop in distributed mode
  # or use ls

  if [ $mode = "local" ]; then
   SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
  else
   SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments |  sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
  fi
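  # Segment directories are named with a timestamp (yyyyMMddHHmmss, e.g. 20190301123456),
  # so sorting numerically and taking the last entry yields the newest segment.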

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $NUM_THREADS

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable record skipping for the parse step so that a single dodgy document
  # does not fail the whole task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT

# note that the link inversion - indexing routine can be done within the main loop
# on a per segment basis
  echo "Link inversion"
  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

  echo "Dedup on crawldb"
  __bin_nutch dedup "$CRAWL_PATH"/crawldb

  if $INDEXFLAG; then
      echo "Indexing $SEGMENT to index"
      __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

      echo "Cleaning up index if possible"
      __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
  else
      echo "Skipping indexing ..."
  fi

  echo "HostDB update"
  if $HOSTDBUPDATE; then
  __update_hostdb
  fi

  #######################################################
  # The following commands fall into WebGraph territory
  # and should be uncommented based on your requirements
  #######################################################
  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph within $CRAWL_PATH"
  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

done

exit 0
 
