1 #!/bin/bash
2 #download blogs on csdn
3 #by iambefu
4
5 #first,download table of contents
# table_contents_download: build the list of blog posts to fetch.
#   1. Requests http://blog.csdn.net/index.html?page=N for N = 1..10 with curl
#      and appends every response to ./csdntb. The file is opened with >>, so
#      it is never truncated and repeated runs keep accumulating data.
#   2. Filters csdntb with grep -A4 (match plus the 4 following lines) into
#      ./csdntb1. The trailing note on that line reads "extract the key
#      information to reduce the amount of data".
#   3. Runs a grep | sed | sort -u | awk pipeline whose last stage prints two
#      fields joined by "@" per line into ./csdntb2.
# Takes no arguments; works on files in the current directory.
# NOTE(review): the text from the first `grep "` down to the closing brace
# looks corrupted by the page extraction. Both grep patterns and the awk field
# separator appear to have been stripped, and a second copy of the script's
# opening lines is embedded in the middle. The leading integer on every line
# looks like a line number copied from the web page. Restore this function from
# the original script before running it.
6 table_contents_download()
7 {
8 for ((i=1;i<=10;i++)) #download the first 10 page
9 do
10 url="http://blog.csdn.net/index.html?"
11 url=$url"page=$i"
12 curl $url >>csdntb
13 done
14 grep "
2 #download blogs on csdn
3 #by iambefu
4
5 #first,download table of contents
6 table_contents_download()
7 {
8 for ((i=1;i<=10;i++)) #download the first 10 page
9 do
10 url="http://blog.csdn.net/index.html?"
11 url=$url"page=$i"
12 curl $url >>csdntb
13 done
14 grep "
" -A4 csdntb >csdntb1 #提取关键信息,减少数据量
15 grep " ]' '{print $3 "@"$2}' | sed -e 's/href=\"/@/' -e 's/\" target/@/'|sort -u |awk -F@ '{print $1"@" $3}'>csdntb2
16 }
17
# Associative array, presumably meant to hold table-of-contents entries.
# Declared with an explicit empty initializer instead of a trailing "[]"
# subscript, which is not part of the declaration syntax.
# NOTE(review): nothing in the visible script reads or writes this array;
# confirm it is still needed.
declare -A table_contents=()
#######################################
# Download every blog post listed in the index file, one at a time.
# Each line of ./csdntb2 is read as "<anything>@<url>": only the text after
# the last "@" is used. The Nth attempted download (counting from 1) is saved
# to a file named N in the current directory.
# Globals:   none
# Arguments: none
# Inputs:    ./csdntb2
# Outputs:   files 1, 2, 3, ... in the current directory; warnings on stderr
# Returns:   1 if ./csdntb2 cannot be read, 0 otherwise. A failed download is
#            reported on stderr but does not stop the loop.
#######################################
blogs_download()
{
  local index_file=csdntb2
  local line blogs_url
  local i=1

  if [[ ! -r "$index_file" ]]; then
    printf 'blogs_download: cannot read %s\n' "$index_file" >&2
    return 1
  fi

  # -r keeps backslashes literal. The default IFS still trims leading and
  # trailing whitespace from each line. The || clause picks up a final line
  # that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    # blogs_name=${line%%@*}
    blogs_url=${line##*@}

    # Skip blank entries without spending an output number on them.
    [[ -n "$blogs_url" ]] || continue

    # Quoted so a URL containing spaces or glob characters stays one argument.
    # Best effort: report a failed download and carry on with the next one.
    curl "$blogs_url" -o "$i" \
      || printf 'blogs_download: failed to fetch %s\n' "$blogs_url" >&2

    i=$((i + 1))
    sleep 2 # pause between requests
  done < "$index_file"

  return 0
}
# Entry point: build the post index (csdntb2) first, then fetch every post
# listed in it.
table_contents_download
blogs_download
15 grep " ]' '{print $3 "@"$2}' | sed -e 's/href=\"/@/' -e 's/\" target/@/'|sort -u |awk -F@ '{print $1"@" $3}'>csdntb2
16 }
17
# Associative array, presumably meant to hold table-of-contents entries.
# Declared with an explicit empty initializer instead of a trailing "[]"
# subscript, which is not part of the declaration syntax.
# NOTE(review): nothing in the visible script reads or writes this array;
# confirm it is still needed.
declare -A table_contents=()
#######################################
# Download every blog post listed in the index file, one at a time.
# Each line of ./csdntb2 is read as "<anything>@<url>": only the text after
# the last "@" is used. The Nth attempted download (counting from 1) is saved
# to a file named N in the current directory.
# Globals:   none
# Arguments: none
# Inputs:    ./csdntb2
# Outputs:   files 1, 2, 3, ... in the current directory; warnings on stderr
# Returns:   1 if ./csdntb2 cannot be read, 0 otherwise. A failed download is
#            reported on stderr but does not stop the loop.
#######################################
blogs_download()
{
  local index_file=csdntb2
  local line blogs_url
  local i=1

  if [[ ! -r "$index_file" ]]; then
    printf 'blogs_download: cannot read %s\n' "$index_file" >&2
    return 1
  fi

  # -r keeps backslashes literal. The default IFS still trims leading and
  # trailing whitespace from each line. The || clause picks up a final line
  # that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    # blogs_name=${line%%@*}
    blogs_url=${line##*@}

    # Skip blank entries without spending an output number on them.
    [[ -n "$blogs_url" ]] || continue

    # Quoted so a URL containing spaces or glob characters stays one argument.
    # Best effort: report a failed download and carry on with the next one.
    curl "$blogs_url" -o "$i" \
      || printf 'blogs_download: failed to fetch %s\n' "$blogs_url" >&2

    i=$((i + 1))
    sleep 2 # pause between requests
  done < "$index_file"

  return 0
}
# Entry point: build the post index (csdntb2) first, then fetch every post
# listed in it.
table_contents_download
blogs_download
来自“ITPUB博客”,链接:http://blog.itpub.net/29038506/viewspace-766782/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/29038506/viewspace-766782/