解题报告

 #!/bin/bash 
 #function: crawl the webpage, and get urls
 #data: 2015/5/19
 #author: Aleda
 
 #$1 is the website
 
 function curlLinks()
 {
         # Scrape every downloaded index page and append the bookmark-link
         # hrefs to the "links" file.
         # Fix: start at i=1 — the download loop below writes pages/page1..page62,
         # so the old 0-based loop always tried to read a nonexistent pages/page0.
         for ((i=1; i<63; i++))
         do
                 grep -i 'rel="bookmark" title=' "pages/page$i" | awk '{print $2}' | awk -F '"' '{print $2}' >> links
         done
 }
 
 function uniqLinks()
 {
         # Sort the scraped links and drop duplicates into newLinks.
         sort -u links > newLinks
 }
 
 function delSpace()
 {
         # Remove blank and whitespace-only lines from newLinks.
         # NOTE: the output name "tureLinks" (sic) is the script's final
         # artifact and is kept unchanged.
         sed '/^[[:space:]]*$/d' newLinks > tureLinks
 }
 
 function finalize()
 {
         # Delete the intermediate files, leaving only the cleaned output.
         rm links newLinks
 }
 # $1 is the website base URL; index pages live at ${1}/page/<n>/
 pages="${1}/page"

 # Fix: the output directory was never created, so every `curl -o pages/pageN`
 # below failed before anything could be scraped.
 mkdir -p pages

 # Download index pages 1..62, then extract, dedupe and clean the links.
 for ((i=1; i<63; i++))
 do
         curl -o "pages/page$i" "${pages}/$i/"
 done

 curlLinks

 uniqLinks

 delSpace

 finalize



 1 #!/bin/bash
  2 
  3 #function: crawl the www.bokra.net
  4 #data: 2014/5/20
  5 #author: Aleda
  6 
  7 #create folders
  8 
  9 function createFolder()
 10 {
 11         for ((i=1; i<=5; i++))
 12         do
 13                 if [ -e "page$i" ]; then
 14                         echo "page${i} exies!"
 15                 else
 16                         flag=`mkdir "page${i}"`
 17                 fi
 18         done
 19 }
 20 
 21 function crawlPages()
 22 {
 23         case ${1} in
 24                 1)
 25                         url="http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html"
 26                         for ((i=1; i<=34; i++))
 27                         do
 28                                 flag=`curl -o "page1/pages${i}" "${url}/${i}"`
 29                         done
 30                         #solve the web pages
 31                         for ((i=1; i<=34; i++))
 32                         do
 33                                 flag=`cat "page1/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links`
 34                         done
 35                         ;;
 36                 2)
 37                         url="http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html"
 38                         for ((i=1; i<=3; i++))
 39                         do
 40                                 flag=`curl -o "page2/pages${i}" "${url}/${i}"`
 41                         done
 42                         #solve the web pages
 43                         for ((i=1; i<=3; i++))
 44                         do
 45                                 flag=`cat "page2/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links`
 46                         done
 47                         ;;
 48                 3)
 49                         url="http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html"
 50                         for ((i=1; i<=2; i++))
 51                         do
 52                                 flag=`curl -o "page3/pages${i}" "${url}/${i}"`
 53                         done
 54                         #solve the web pages
 55                         for ((i=1; i<=2; i++))
 56                         do
 57                                 flag=`cat "page3/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links`
 58                         done
 59                         ;;
 60                 4)
 61                         url="http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html"
 62                         for ((i=1; i<=15; i++))
 63                         do
 64                                 flag=`curl -o "page4/pages${i}" "${url}/${i}"`
 65                         done
 66                         for ((i=1; i<=15; i++))
 67                         do
 68                                 flag=`cat "page4/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links`
 69                         done
 70                         ;;
 71                 5)
 72                         url="http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html"
 73                         flag=`cd page5`
 74                         for ((i=1; i<=1; i++))
 75                         do
 76                                 flag=`curl -o "page5/pages${i}" "${url}/${i}"`
 77                         done
 78                         for ((i=1; i<=1; i++))
 79                         do
 80                                 flag=`cat "page5/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links`
 81                         done
 82                         ;;
 83                 *)
 84                         echo "Usage: 1, 2, 3, 4, 5"
 85                         exit 1
 86         esac
 87 
 88 }
 89 
 90 function crawling()
 91 {
 92         if [ -e links ]; then
 93                 flag=`cat /dev/null links`
 94         else
 95                 flag=`touch links`
 96         fi
 97         for ((ii=1; ii<=5; ii++))
 98         do
 99                 crawlPages ${ii}
100 #               echo "${ii}..................................."
101         done
102 }
103 
104 function uniqLinks()
105 {
106         flag=`sort -u links > tureLinks`
107         flag=`rm links`
108 }
109 
110 #createFolder
111 
112 crawling
113 
114 uniqLinks

Update: the revised script below extends the crawler with a second-level ("deep") crawl of every collected link, plus an optional cleanup step.

#!/bin/bash
#function: crawl the www.bokra.net
#data: 2014/5/20
#author: Aleda

#create folders

function createFolder()
{
        # Create the page1..page5 download directories, skipping any that
        # already exist.
        for ((i=1; i<=5; i++))
        do
                if [ -e "page$i" ]; then
                        echo "page${i} exists!"   # fixed message typo: "exies"
                else
                        mkdir "page${i}"
                fi
        done
}

function crawlPages()
{
        # Download and scrape one category of www.bokra.net.
        #   $1 - category number (1-5); anything else prints usage, exits 1.
        # Appends the extracted hrefs to the "links" file.

        # _fetchCategory URL COUNT DIR: download COUNT pages of URL into
        # DIR/pagesN, then append the scraped hrefs to "links".
        _fetchCategory()
        {
                local url=$1 count=$2 dir=$3 i
                for ((i=1; i<=count; i++))
                do
                        curl -o "${dir}/pages${i}" "${url}/${i}"
                done
                for ((i=1; i<=count; i++))
                do
                        grep -i 'class="pic"' "${dir}/pages${i}" | awk '{print $3}' | awk -F '"' '{print $2}' >> links
                done
        }

        # NOTE(review): URLs kept byte-for-byte; the m~X runs look like
        # mojibake from the original post — confirm against the live site.
        # Removed: `` flag=`cd page5` `` — it ran in a subshell, a no-op.
        case ${1} in
                1) _fetchCategory "http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html" 34 page1 ;;
                2) _fetchCategory "http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html" 3 page2 ;;
                3) _fetchCategory "http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html" 2 page3 ;;
                4) _fetchCategory "http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html" 15 page4 ;;
                5) _fetchCategory "http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html" 1 page5 ;;
                *)
                        echo "Usage: 1, 2, 3, 4, 5"
                        exit 1
                        ;;
        esac
}

function crawling()
{
        # Crawl all five categories, starting from an empty "links" file.
        # Fix: the old `` flag=`cat /dev/null links` `` only printed the file
        # into the captured variable — it never truncated it, so stale links
        # accumulated across runs. `: > links` truncates (and creates) it.
        : > links
        for ((ii=1; ii<=5; ii++))
        do
                crawlPages "${ii}"
        done
}

function uniqLinks()
{
        # Deduplicate the scraped links into trueLinks — the exact file
        # readLines consumes. Fix: the old output name "tureLinks" was a
        # typo, so the deep crawl never found its input.
        sort -u links > trueLinks
        rm -f links
}

function readLines()
{
        # Download every URL listed in trueLinks into deepPages/pages1..N,
        # spoofing a Googlebot user agent.
        if [ -e deepPages ]; then
                echo "File exists!"
        else
                mkdir deepPages
        fi
        local i=1 line
        # IFS= read -r: keep whitespace and backslashes in URLs intact
        # (plain `read` strips/mangles them).
        while IFS= read -r line
        do
                curl -A Googlebot -o "deepPages/pages${i}" "$line"
                i=$((i+1))
        done < trueLinks
}

function getLinks()
{
        # Extract the video-page hrefs from every downloaded deepPages file
        # and append them to examLinks.
        # Generalized: iterate the files that actually exist instead of the
        # old hard-coded 1..2530 loop (which broke for other link counts and
        # spewed `cat` errors for missing files).
        local f
        for f in deepPages/pages*
        do
                [ -e "$f" ] || continue   # unmatched glob guard
                grep -i '<div class="pic[t]*' "$f" | awk '{print $3}' | awk -F '"' '{print $2}' >> examLinks
        done
}

function deepCrawl()
{
        # Second pass: fetch every collected link, scrape the results, and
        # emit the sorted, deduplicated "Links" file.
        readLines
        getLinks
        sort -u examLinks > Links
}

function finalize()
{
        # Remove all intermediate artifacts, keeping only the final "Links".
        # Fix: the old plain `rm deepPages` always failed — deepPages is a
        # directory; recurse instead.
        rm -rf -- deepPages
        rm -f -- examLinks trueLinks
        rm -rf -- page[1-9]
}

# Fix: createFolder must run before crawlPages writes into page1..page5;
# it was commented out, so the curl downloads had nowhere to land.
createFolder

crawling

uniqLinks

deepCrawl
# finalize left disabled on purpose: keeps intermediates for inspection.
#finalize


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值