#!/bin/bash
# Default settings. Each one is overwritten by the matching command-line
# option in the getopts loop further down.

# -u: file holding the URLs to fetch.
url_path="-"

# -t / -T: values handed to wget's -t and -T options.
WGET_TIMECOUNT=2
WGET_TIME=10

# -s: argument to the sleep issued before each worker is forked.
FORK_SLEEP_TIME=1

# -S: argument to the sleep issued before each URL is fetched.
ONEURL_SLEEP_TIME=1

# -p: number of url_N shard files the URL list is spread over (the
# round-robin counter wraps when it reaches this value).
SPIDER_PID_NUM=6
#######################################
# Print the one-line usage summary and terminate the script.
# Outputs: the usage line on stdout
# Returns: never returns; exits with status 3
#######################################
usage() {
  # The space after `echo` matters: `echo"usage:..."` is parsed as a single
  # (non-existent) command name and prints nothing.
  echo "usage:spider.sh -u url_path -d page_store_dir"
  exit 3
}
#######################################
# Print the tool version string and terminate the script.
# Outputs: the version string on stdout
# Returns: never returns; exits with status 4
#######################################
version() {
  echo "same-source-tools-spider-1.0.0"
  exit 4
}

#######################################
# Parse command-line options into the script's global settings.
# Globals (written): url_path, spider_dir, WGET_TIMECOUNT, WGET_TIME,
#                    FORK_SLEEP_TIME, ONEURL_SLEEP_TIME, SPIDER_PID_NUM,
#                    LOG_PATH
# Arguments: the script's command-line arguments ("$@")
# Returns:   0 when parsing completes; -h and unknown options call usage,
#            -v calls version (both of which exit the script)
#######################################
parse_options() {
  # Keep getopts' bookkeeping local so the function can be called repeatedly.
  local OPTION OPTARG
  local OPTIND=1

  # h and v must be listed in the optstring, otherwise getopts reports them
  # as illegal options and their case arms can never run.
  while getopts l:u:d:t:T:s:S:p:hv OPTION; do
    case "${OPTION}" in
      u) url_path=${OPTARG} ;;
      d) spider_dir=${OPTARG} ;;
      t) WGET_TIMECOUNT=${OPTARG} ;;
      T) WGET_TIME=${OPTARG} ;;
      s) FORK_SLEEP_TIME=${OPTARG} ;;
      S) ONEURL_SLEEP_TIME=${OPTARG} ;;
      p) SPIDER_PID_NUM=${OPTARG} ;;
      l) LOG_PATH=${OPTARG} ;;
      h) usage ;;
      v) version ;;
      # getopts sets OPTION to a literal "?" for an unknown option or a
      # missing argument; the backslash stops "?" acting as a glob.
      \?) usage ;;
    esac
  done
}

parse_options "$@"
# Create the log file up front. Skipped when -l was not given, because
# `touch` with no operand only produces an error message.
if [[ -n "${LOG_PATH:-}" ]]; then
  touch -- "${LOG_PATH}"
fi

# Check that the file listing the URLs to fetch exists; exit 1 if it does not.
if [[ -e "${url_path}" ]]; then
  echo "spider test: ${url_path} is exist"
else
  echo "url_path spider test: ${url_path} is not exist"
  exit 1
fi
# Check that the directory used to store fetched pages exists; exit 2 if not.
# The expansion is quoted on purpose: an unquoted empty value turns the test
# into `[ -e ]`, which is true, so a missing -d option would slip through.
if [[ -e "${spider_dir}" ]]; then
  echo "spider test: ${spider_dir} is exist"
else
  echo "spider_dir spider test: ${spider_dir} is not exist"
  exit 2
fi
# Remove the url_N shard files left over from a previous run. The presence of
# url_0 is used as the marker that such files exist.
url_first_path="${spider_dir}/url_0"
if [[ -e "${url_first_path}" ]]; then
  # ${spider_dir:?} aborts instead of expanding to "/url_*" if the directory
  # variable is empty.
  rm -- "${spider_dir:?}"/url_*
fi
#######################################
# Create the per-worker shard files and deal the URL list into them
# round-robin (line 1 -> url_0, line 2 -> url_1, ... wrapping at the count).
# Arguments:
#   $1 - file containing one URL per line
#   $2 - directory in which the url_N shard files are created
#   $3 - number of shard files
# Outputs: appends to <dir>/url_0 .. <dir>/url_<count-1>
#######################################
distribute_urls() {
  local url_file=$1
  local out_dir=$2
  local shard_count=$3
  local i line
  local shard=0

  # Create every shard file, so each one exists even when there are fewer
  # URLs than shards.
  # NOTE(review): the original loop header was truncated after "i"; the bound
  # is inferred from the round-robin reset below - confirm.
  for ((i = 0; i < shard_count; i++)); do
    touch -- "${out_dir}/url_${i}"
  done

  # Write the URLs into the shard files. `|| [[ -n ... ]]` keeps a final line
  # that has no trailing newline; printf with a quoted argument keeps "?" and
  # "*" in a URL from being glob-expanded.
  while read -r line || [[ -n "${line}" ]]; do
    [[ -n "${line}" ]] || continue # skip blank lines
    printf '%s\n' "${line}" >> "${out_dir}/url_${shard}"
    shard=$((shard + 1))
    if ((shard >= shard_count)); then
      shard=0
    fi
  done < "${url_file}"
}

distribute_urls "${url_path}" "${spider_dir}" "${SPIDER_PID_NUM}"
#######################################
# Fetch one URL with wget and report the outcome.
# The page is stored under the md5 of the URL (plus trailing newline, as
# produced by the original `echo url | md5sum`).
# Globals (read): spider_dir, LOG_PATH, WGET_TIMECOUNT, WGET_TIME
# Arguments: $1 - URL to fetch
# Outputs:   one NOTICE or ERROR line on stdout
# Returns:   wget's exit status
#######################################
fetch_url() {
  local url=$1
  local url_md5 date_flag
  local rc=0

  url_md5=$(printf '%s\n' "${url}" | md5sum | awk '{print $1}')

  # "--" ends wget's option list so a line starting with "-" is treated as a
  # URL, not as an option.
  # Previous variant kept for reference:
  #   wget ${url} -o ${LOG_PATH}_${url_md5} -a ${LOG_PATH} -O ${spider_dir}/${url_md5} -t ${WGET_TIMECOUNT} -T ${WGET_TIME}
  wget -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" \
    -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}" -- "${url}" || rc=$?

  # wget's status is captured above, before running date: testing $? after
  # the date assignment would report date's status and never see a failure.
  date_flag=$(date +"%Y%m%d-%H:%M:%S")
  if ((rc == 0)); then
    echo "${date_flag} NOTICE:spiderwgetsuccess ${url}"
  else
    echo "${date_flag} ERROR:spiderwgeterror ${url}"
    # Drop the output file left behind by the failed download.
    rm -f -- "${spider_dir}/${url_md5}"
  fi
  return "${rc}"
}

#######################################
# Fetch every URL listed in one shard file, sleeping between fetches.
# Globals (read): ONEURL_SLEEP_TIME (plus those read by fetch_url)
# Arguments: $1 - shard file with one URL per line
# Returns:   0; a missing shard file is silently skipped
#######################################
spider_worker() {
  local shard_file=$1
  local url

  [[ -e "${shard_file}" ]] || return 0

  while read -r url || [[ -n "${url}" ]]; do
    [[ -n "${url}" ]] || continue # skip blank lines
    sleep "${ONEURL_SLEEP_TIME}"
    # stdin is detached so nothing in fetch_url can consume the shard file;
    # a failed URL must not stop the remaining ones.
    fetch_url "${url}" < /dev/null || true
  done < "${shard_file}"
}

# Start the multi-process fetch: one background worker per shard file, with
# a pause before each fork, then wait for all workers to finish.
# NOTE(review): the original loop header was truncated after "i"; the bound
# SPIDER_PID_NUM is inferred from the shard round-robin logic - confirm.
for ((i = 0; i < SPIDER_PID_NUM; i++)); do
  sleep "${FORK_SLEEP_TIME}"
  spider_worker "${spider_dir}/url_${i}" &
done
wait