使用 shell 和 curl 登录带验证码的网站。
使用 curl 对需要登录(含验证码)的网站进行爬虫。
function main(){
url=$1;
doname=$(getDoname $url);
# Cookie存在位置
cookie="/tmp/$doname.cookie";
# 删除旧cookiew
rm -rf $cookie;
# 定义User Agent
ua="Mozilla/5.0 (Linux; Android 10; VOG-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36";
# 请求获取验证码图片,
curl -c $cookie -b $cookie -H "User-Agent: $ua" "https://sso.39doo.com/captcha/" -o "/tmp/captcha.jpg";
# 显示验证码图片
ristretto "/tmp/captcha.jpg" &
# 读取用户输入的验证码
read -t 30 -p "Please input your username:" captcha;
echo $captcha;
# 获取表单 csrf 验证值
csrfmiddlewaretoken=`curl -c $cookie -b $cookie https://sso.39doo.com/cas/login | grep "csrfmiddlewaretoken" | sed -r "s/.*csrfmiddlewaretoken'\s*value='([^']+).*/\1/g"`;
# 判断是否获取成功
if [[ $csrfmiddlewaretoken == "" ]]; then
echo "Err : Unable to obtain crsf!"
return;
fi
# 登陆服务器
curl -c $cookie -b $cookie -d "captcha=$captcha&csrfmiddlewaretoken=$csrfmiddlewaretoken&email=你的邮箱&password=您的密码" -H "User-Agent: $ua" "https://sso.39doo.com/cas/login";
curl -c $cookie -b $cookie -H "User-Agent: $ua" $url;
# 开始爬虫
echo -e "\nget api data ....";
for (( i = 0; i < 15; i++ )); do
sleep 2;
echo -ne "\rpage$i ....";
psearch=`echo $search | sed -r "s/page=[0-9]+/page=$i/g"`;
curl -c $cookie -b $cookie --referer "$url" -H "User-Agent: $ua" $psearch &>/dev/null >> "./zoomeye.log";
echo -e "\n" >> "./zoomeye.log";
done
}

本文介绍如何利用shell和curl命令行工具登录含有验证码的网站,探讨在爬虫过程中解决验证码问题的方法。
186

被折叠的 条评论
为什么被折叠?



