linux 脚本爬虫,shell脚本--练习1(爬虫)

#!/bin/bash
#
# Crawl data from a 51cto blogger's pages, starting from the home page URL
# entered by the user.
#
# date: 2018-03-xx
# author: yk
# description: Crawl 51cto blog data
# version: 0.1

source /etc/profile
. /etc/init.d/functions

# Scratch file for downloaded pages. mktemp creates the file itself and adds
# a random suffix, so the name cannot be predicted or pre-created by another
# user of /tmp.
TmpFile=$(mktemp "/tmp/.$(date +%Y%m%d_%H%M%S).log.tmp.XXXXXX") || exit 1

# File that stores the extracted web page information. The timestamped name
# is kept as-is because it is the script's output file.
BlogFile="/tmp/$(date +%Y%m%d_%H%M%S)_blog.html"
touch -- "$BlogFile"

# Let the user enter the 51cto blogger's homepage URL.
# -r keeps backslashes in the input literal.
read -r -p 'please input websitei' Website

# Download the blogger's home page. The URL is quoted and placed after "--"
# so that user input can neither be word-split nor interpreted as a wget
# option.
if ! wget -q -O "$TmpFile" -- "$Website" &>/dev/null; then
  echo "you input website is not exist"
  # Do not leave the freshly created files behind on failure.
  rm -f -- "$TmpFile" "$BlogFile"
  exit 1
fi

# Find the "last page" (末页) link on the downloaded home page. The matched
# URL ends in p<N>, where N is the number of listing pages. Both http and
# https links are accepted.
MainURL=$(sed -n '/class="last".*末页.*/p' "$TmpFile" \
  | grep -Eo 'https?:.*p[0-9]{1,}')

# Page count: everything after the final "p" of that URL (e.g. 28).
Pages=${MainURL##*p}

# If the URL entered was not a blogger home page, no such link exists and
# $Pages is empty or not a number. Check that explicitly instead of relying
# on a silenced error from the numeric test.
if [[ "$Pages" =~ ^[0-9]+$ ]] && [ "$Pages" -gt 0 ] 2>/dev/null; then
  echo "please wait ......"
else
  echo "you input url is not homepage"
  rm -f -- "$TmpFile" "$BlogFile"
  exit 1
fi

# Base URL for paging: the last-page URL with its trailing page number
# removed. $MainURL is quoted so that characters such as "?" or "*" in the
# URL are not glob-expanded or word-split before reaching sed.
UR=$(printf '%s\n' "$MainURL" | sed -rn 's#[0-9]{1,}$##gp')

# Fetch every listing page, numbered 1 through $Pages.

for ((i=1;i<=$Pages;i++))

do

# Page URL = base URL ($UR) followed by the page number; each download
# overwrites $TmpFile.

wget -q -O $TmpFile ${UR}$i &>/dev/null

# Get time, title, link
# NOTE(review): the egrep command below is cut off in this copy -- its
# pattern, input file and output redirection are missing, and so is the
# `done` that should close this for loop. Restore from the original script.

egrep -A 1 '

# Second phase: turn the lines collected in $BlogFile into one record per
# blog post, appended to $TmpFile.
# NOTE(review): the sed expressions that set Time and Title and the echo that
# writes the record look damaged in this copy (HTML fragments appear to have
# been stripped; the Title command has no closing quote). Href is extracted
# but does not appear in the visible echo. Verify against the original script.

echo "Extracting required data from downloaded data ......"

echo "please wait ....."

# ===============================================================

# Line counter within the current 4-line group (reuses the name i).

i=0

# Read $BlogFile line by line and pick the wanted field out of each line.

while read line

do

# Each blog post occupies 4 consecutive lines, so i cycles 1..4:
# line 1 -> Time, line 3 -> Href, line 4 -> Title; line 2 is ignored.

((++i))

case "$i" in

1)

# Get blog posting time

Time=$(echo $line | sed -r 's#^.*>发布于:(.*)

#\1#g')

;;

3)

# get href

Href=$(echo $line | sed -r 's#^.*href=\"(.*)\">#\1#g')

;;

4)

# get blog title

Title=$(echo $line | sed -r 's#^(.*)<.>

;;

*)

esac

# Every 4 acts as a blog, appends the acquired information to a temporary file

if [ "$i" -eq "4" ]

then

i=0

echo "$Time---$Title
" >> $TmpFile

fi

done < $BlogFile

# Overwrite $BlogFile with the records from $TmpFile, sorted in reverse order
# on field 2 onward using '>' as the field separator. A single truncating
# redirect replaces the former "clear, then append" pair, and sort reads the
# file directly instead of through cat.
sort -rt '>' -k2 -- "$TmpFile" > "$BlogFile"

# The scratch file is no longer needed.
rm -f -- "$TmpFile"

# Report success; action is not defined in this file and is presumably
# provided by the sourced /etc/init.d/functions.
action "success" /bin/true

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值