#!/bin/bash
#function: crawl the webpage, and get urls
#date: 2015/5/19
#author: Aleda
#$1 is the website
function curlLinks()
{
    # Extract the bookmark-link URLs from each downloaded listing page and
    # append them to the "links" file.
    # BUG FIX: the download loop fetches pages 1..62, but this loop started
    # at 0, so "pages/page0" never existed and cat printed an error each run.
    local i
    for ((i=1; i<63; i++)); do
        [ -e "pages/page$i" ] || continue
        # Field 2 is the href="..." attribute; the 2nd quote-delimited
        # token of that field is the bare URL.
        grep -i 'rel="bookmark" title=' "pages/page$i" \
            | awk '{print $2}' | awk -F '"' '{print $2}' >> links
    done
}
function uniqLinks()
{
    # Sort the harvested links and drop duplicates, producing newLinks.
    # (flag only captures the — empty — stdout of the redirected command.)
    flag=$(sort -u links > newLinks)
}
function delSpace()
{
    # Remove blank / whitespace-only lines from newLinks, writing the final
    # list to "tureLinks" (sic — output filename kept as in the original).
    flag=$(sed '/^[[:space:]]*$/d' newLinks > tureLinks)
}
function finalize()
{
    # Delete the intermediate files; tureLinks is the surviving output.
    flag=$(rm links)
    flag=$(rm newLinks)
}
# ---- main ----
# $1 is the website base URL; listing pages are fetched from ${1}/page/N/.
pages="${1}/page"
# BUG FIX: curl -o does not create directories, so make pages/ first.
mkdir -p pages
# push pages into file: download listing pages 1..62 into pages/pageN
for ((i=1; i<63; i++)); do
    curl -o "pages/page$i" "${pages}/$i/"
done
curlLinks
uniqLinks
delSpace
finalize
#!/bin/bash

#function: crawl the www.bokra.net
#date: 2014/5/20
#author: Aleda

#create folders

function createFolder()
{
    # Create working directories page1..page5 if not already present.
    # FIX: stripped the pasted listing line-numbers that made this invalid
    # shell, and fixed the "exies!" message typo.
    local i
    for ((i=1; i<=5; i++)); do
        if [ -e "page$i" ]; then
            echo "page${i} exists!"
        else
            mkdir "page${i}"
        fi
    done
}

function crawlPages()
{
    # Download one video-category listing and append the link URLs found in
    # it to the "links" file.
    # $1 - category number 1..5; anything else prints usage and exits 1.
    # FIX: stripped the pasted listing line-numbers; collapsed five
    # copy-pasted case arms into data (url, page count) + shared loops;
    # dropped the no-op `cd page5` (a cd inside backticks runs in a
    # subshell and affects nothing).
    local url n i
    case ${1} in
        1) url="http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html"; n=34 ;;
        2) url="http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html"; n=3 ;;
        3) url="http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html"; n=2 ;;
        4) url="http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html"; n=15 ;;
        5) url="http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html"; n=1 ;;
        *) echo "Usage: 1, 2, 3, 4, 5"; exit 1 ;;
    esac
    # Fetch every listing page of this category.
    for ((i=1; i<=n; i++)); do
        curl -o "page${1}/pages${i}" "${url}/${i}"
    done
    # Solve the web pages: pull the 2nd quoted token of field 3 from every
    # line carrying class="pic" (assumes the href is the 3rd attribute).
    for ((i=1; i<=n; i++)); do
        grep -i 'class="pic"' "page${1}/pages${i}" \
            | awk '{print $3}' | awk -F '"' '{print $2}' >> links
    done
}

function crawling()
{
    # Harvest links from all five categories into the "links" file.
    # BUG FIX (besides stripping the pasted listing numbers): the original
    # ran `cat /dev/null links`, which only *prints* the files — it never
    # emptied links.  Truncate (and create if missing) explicitly instead.
    : > links
    local ii
    for ((ii=1; ii<=5; ii++)); do
        crawlPages ${ii}
    done
}

function uniqLinks()
{
    # De-duplicate the harvested links into the final output file, then
    # drop the raw list.  FIX: stripped the pasted listing line-numbers.
    # NOTE(review): the output name "tureLinks" (sic) is kept — it is this
    # script's final artifact and nothing downstream reads it.
    sort -u links > tureLinks
    rm links
}

# ---- main ----
# FIX: stripped the pasted listing line-numbers that made these lines
# invalid shell.  createFolder stays commented out as in the original.
#createFolder

crawling

uniqLinks
# ---- update: revised version of the crawler follows (adds a deep-crawl pass) ----
#!/bin/bash
#function: crawl the www.bokra.net
#date: 2014/5/20
#author: Aleda
#create folders
function createFolder()
{
    # Create working directories page1..page5 if not already present.
    # FIX: "exies!" message typo corrected to "exists!".
    local i
    for ((i=1; i<=5; i++)); do
        if [ -e "page$i" ]; then
            echo "page${i} exists!"
        else
            mkdir "page${i}"
        fi
    done
}
function crawlPages()
{
    # Download one video-category listing and append the link URLs found in
    # it to the "links" file.
    # $1 - category number 1..5; anything else prints usage and exits 1.
    # FIX: collapsed five copy-pasted case arms into data (url, page count)
    # plus shared loops; dropped the no-op `cd page5` — a cd inside
    # backticks runs in a subshell and affects nothing.
    local url n i
    case ${1} in
        1) url="http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html"; n=34 ;;
        2) url="http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html"; n=3 ;;
        3) url="http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html"; n=2 ;;
        4) url="http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html"; n=15 ;;
        5) url="http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html"; n=1 ;;
        *) echo "Usage: 1, 2, 3, 4, 5"; exit 1 ;;
    esac
    # Fetch every listing page of this category into page$1/pagesN.
    for ((i=1; i<=n; i++)); do
        curl -o "page${1}/pages${i}" "${url}/${i}"
    done
    # Solve the web pages: pull the 2nd quoted token of field 3 from every
    # line carrying class="pic" (assumes the href is the 3rd attribute).
    for ((i=1; i<=n; i++)); do
        grep -i 'class="pic"' "page${1}/pages${i}" \
            | awk '{print $3}' | awk -F '"' '{print $2}' >> links
    done
}
function crawling()
{
    # Harvest links from all five categories into the "links" file.
    # BUG FIX: the original ran `cat /dev/null links`, which only *prints*
    # the files — it never emptied links, so old links from a previous run
    # survived.  Truncate (and create if missing) explicitly instead.
    : > links
    local ii
    for ((ii=1; ii<=5; ii++)); do
        crawlPages ${ii}
    done
}
function uniqLinks()
{
    # De-duplicate the harvested links, then drop the raw list.
    # BUG FIX: the original wrote "tureLinks", but readLines reads
    # "trueLinks" (and finalize removes "trueLinks"), so the deep-crawl
    # stage could never find its input.  Write the intended name.
    sort -u links > trueLinks
    rm links
}
function readLines()
{
    # Fetch every URL listed in trueLinks (one per line) into
    # deepPages/pagesN, spoofing the Googlebot user agent.
    # Leaves the line counter in the global "i" (as the original did).
    if [ -e deepPages ]; then
        echo "File exists!"
    else
        mkdir deepPages
    fi
    i=1
    # FIX: IFS= and -r keep each URL byte-for-byte — the original `read
    # line` interpreted backslashes and trimmed surrounding whitespace.
    while IFS= read -r line; do
        curl -A Googlebot -o "deepPages/pages${i}" "$line"
        i=$((i+1))
    done < trueLinks
}
function getLinks()
{
    # Scrape link URLs out of the downloaded deep pages into examLinks.
    # $1 - optional number of pages to scan (defaults to the original
    #      hard-coded 2530, so existing callers behave unchanged).
    local n i
    n=${1:-2530}
    for ((i=1; i<=n; i++)); do
        # 2nd quoted token of field 3 on lines opening a "pic"/"pict" div.
        grep -i '<div class="pic[t]*' "deepPages/pages${i}" \
            | awk '{print $3}' | awk -F '"' '{print $2}' >> examLinks
    done
}
function deepCrawl()
{
    # Second crawl pass: download every collected link (readLines), scrape
    # the final URLs out of those pages (getLinks), then de-duplicate the
    # result into "Links".
    readLines
    getLinks
    flag=$(sort -u examLinks > Links)
}
function finalize()
{
    # Remove every intermediate file and directory; "Links" is the only
    # surviving output.
    rm deepPages/pages*
    # BUG FIX: the original ran `rm deepPages`, which fails on a
    # directory; rmdir removes it once the pages* files are gone.
    rmdir deepPages
    rm examLinks
    rm trueLinks
    rm -r page[1-9]
}
# Entry point: createFolder is left commented out (directories assumed
# present from an earlier run).
#createFolder
# First pass: fetch the category listing pages and collect raw links.
crawling
# De-duplicate the raw links.
# NOTE(review): uniqLinks writes "tureLinks" but readLines (inside
# deepCrawl) reads "trueLinks" — likely a typo; confirm before running.
uniqLinks
# Second pass: fetch each linked page and extract the final URLs.
deepCrawl
# Cleanup left disabled so intermediate files can be inspected.
#finalize