python wget 跨进程_wget多进程抓取的实现

#!/bin/bash
# Default settings for the multi-process wget spider.
# Each value below can be replaced by a command-line option.

# File listing the URLs to fetch (option -u).
url_path="-"

# Retry count handed to `wget -t` (option -t).
WGET_TIMECOUNT=2
# Timeout handed to `wget -T` (option -T).
WGET_TIME=10

# Argument to `sleep` before each worker is started (option -s).
FORK_SLEEP_TIME=1
# Argument to `sleep` before each URL is fetched (option -S).
ONEURL_SLEEP_TIME=1

# Number of URL shard files / background workers (option -p).
SPIDER_PID_NUM=6

#######################################
# Print the one-line usage summary and terminate the script.
# Outputs: usage text on stdout
# Exits:   always, with status 3
#######################################
function usage() {
  # `echo` needs a space before its argument; fused together, the shell
  # searches for a command literally named `echousage:...` and prints nothing.
  echo "usage:spider.sh -u url_path -d page_store_dir"
  exit 3
}

#######################################
# Print the tool version string and terminate the script.
# Outputs: version string on stdout
# Exits:   always, with status 4
#######################################
function version() {
  echo "same-source-tools-spider-1.0.0"
  exit 4
}

#######################################
# Parse command-line options into the script's global settings.
# Globals (written): url_path, spider_dir, WGET_TIMECOUNT, WGET_TIME,
#                    FORK_SLEEP_TIME, ONEURL_SLEEP_TIME, SPIDER_PID_NUM,
#                    LOG_PATH
# Arguments: the script's positional parameters ("$@")
# Exits:     through usage (-h or an invalid option) or version (-v)
#######################################
function parse_options() {
  local OPTION OPTARG
  local OPTIND=1

  # -h and -v take no argument but still have to be listed in the optstring;
  # left out, getopts reports them as illegal options and -v can never reach
  # version.
  while getopts "l:u:d:t:T:s:S:p:hv" OPTION; do
    case "${OPTION}" in
      u) url_path=${OPTARG} ;;
      d) spider_dir=${OPTARG} ;;
      t) WGET_TIMECOUNT=${OPTARG} ;;
      T) WGET_TIME=${OPTARG} ;;
      s) FORK_SLEEP_TIME=${OPTARG} ;;
      S) ONEURL_SLEEP_TIME=${OPTARG} ;;
      p) SPIDER_PID_NUM=${OPTARG} ;;
      l) LOG_PATH=${OPTARG} ;;
      h) usage ;;
      v) version ;;
      # getopts sets OPTION to "?" for an unknown option or missing argument.
      \?) usage ;;
    esac
  done
}

parse_options "$@"

# Make sure the log file exists. Skipped when no log path was given (-l),
# because `touch` without an operand is an error.
if [[ -n "${LOG_PATH:-}" ]]; then
  touch -- "${LOG_PATH}"
fi

# Check that the URL list file exists.
# The value must be quoted: with an empty url_path an unquoted `[ -e ]`
# collapses to a one-argument test, which is always true.
if [[ -e "${url_path}" ]]; then
  echo "spider test: ${url_path} is exist"
else
  echo "url_path spider test: ${url_path} is not exist"
  exit 1
fi

# Check that the directory used to store the fetched pages exists.
# Quoted so that an unset spider_dir fails the check instead of passing it,
# and tested with -d because the path is used as a directory afterwards.
if [[ -d "${spider_dir}" ]]; then
  echo "spider test: ${spider_dir} is exist"
else
  echo "spider_dir spider test: ${spider_dir} is not exist"
  exit 2
fi

# Remove the shard files (url_0, url_1, ...) left over from an earlier run.
url_first_path="${spider_dir}/url_0"
if [[ -e "${url_first_path}" ]]; then
  # The glob stays outside the quotes so it still expands. ${spider_dir:?}
  # aborts rather than expanding to /url_* should the variable be empty.
  rm -- "${spider_dir:?}"/url_*
fi

#######################################
# Split a URL list into per-worker shard files, round-robin.
# Arguments:
#   $1 - file with one URL per line
#   $2 - directory that receives the shard files url_0 .. url_<N-1>
#   $3 - number of shards (N)
# Outputs: appends each URL to one of the shard files
# Returns: non-zero if the URL list cannot be opened
#######################################
function split_urls() {
  local src=$1 dir=$2 count=$3
  local i line
  local no=0

  # Create every shard file up front so it exists even if it receives no URL.
  for ((i = 0; i < count; i++)); do
    touch -- "${dir}/url_${i}"
  done

  # Deal the URLs out to the shards in turn. `read` keeps the default IFS on
  # purpose so whitespace around a URL is trimmed; -r keeps backslashes
  # intact. The `|| [[ -n ... ]]` part picks up a last line that has no
  # trailing newline.
  while read -r line || [[ -n "${line}" ]]; do
    # Blank lines carry no URL and would only produce failed fetches.
    [[ -n "${line}" ]] || continue
    printf '%s\n' "${line}" >> "${dir}/url_${no}"
    no=$((no + 1))
    if ((no >= count)); then
      no=0
    fi
  done < "${src}"
}

split_urls "${url_path}" "${spider_dir}" "${SPIDER_PID_NUM}"

#######################################
# Fetch every URL listed in one shard file, one after another.
# Each page is saved as <spider_dir>/<md5 of the URL line>; wget's log for
# that URL is written to <LOG_PATH>_<md5>.
# Globals (read): spider_dir, LOG_PATH, ONEURL_SLEEP_TIME, WGET_TIMECOUNT,
#                 WGET_TIME
# Arguments: $1 - shard file with one URL per line
# Outputs:   one NOTICE or ERROR line per URL on stdout
# Returns:   0 when the shard file does not exist (nothing to do)
#######################################
function fetch_url_list() {
  local list=$1
  local url url_md5 wget_status dateFlag

  # A missing shard file simply means this worker has no work.
  [[ -e "${list}" ]] || return 0

  while read -r url; do
    sleep "${ONEURL_SLEEP_TIME}"

    # Hash the URL followed by a newline, i.e. the bytes `echo` would emit,
    # but without exposing the URL to word splitting or globbing.
    url_md5=$(printf '%s\n' "${url}" | md5sum | awk '{print $1}')

    wget "${url}" -o "${LOG_PATH}_${url_md5}" -O "${spider_dir}/${url_md5}" \
      -t "${WGET_TIMECOUNT}" -T "${WGET_TIME}"
    # Save wget's status right away: any later command, including the `date`
    # substitution below, overwrites $?.
    wget_status=$?

    dateFlag=$(date +"%Y%m%d-%H:%M:%S")
    if ((wget_status == 0)); then
      echo "${dateFlag} NOTICE:spiderwgetsuccess ${url}"
    else
      echo "${dateFlag} ERROR:spiderwgeterror ${url}"
      # Drop the output file of the failed download.
      rm -f -- "${spider_dir}/${url_md5}"
    fi
  done < "${list}"
}

# Start one background worker per shard file, pausing before each start,
# then block until all of them have finished.
for ((i = 0; i < SPIDER_PID_NUM; i++)); do
  sleep "${FORK_SLEEP_TIME}"
  fetch_url_list "${spider_dir}/url_${i}" &
done
wait

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值