package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
// main prompts for the first and last page to crawl, validates the input and
// hands the range to Working.
//
// The program stops with a message, without crawling anything, when either
// number cannot be read or when the range violates 1 <= start <= end.
func main() {
	var start, end int

	fmt.Println("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Println("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取终止页失败:", err)
		return
	}

	// Enforce the constraints the prompts promise; a start below 1 would
	// otherwise turn into a negative offset in the page URL.
	if start < 1 || end < start {
		fmt.Println("页码范围无效: 需要满足 1 <= 起始页 <= 终止页")
		return
	}

	Working(start, end)
}
// Working crawls pages start through end (inclusive) concurrently.
//
// It launches one SpiderPage goroutine per page, all sharing a single
// unbuffered channel, and then receives one value from that channel for every
// goroutine it launched, printing a completion line for each value received.
// It returns once all of those values have arrived.
func Working(start, end int) {
	fmt.Printf("正在爬取第%d页到第%d页...\n", start, end)

	done := make(chan int)

	// Fan out: one goroutine per page, counting how many were launched.
	pending := 0
	for page := start; page <= end; page++ {
		go SpiderPage(page, done)
		pending++
	}

	// Fan in: wait for exactly as many completion signals as were launched.
	for ; pending > 0; pending-- {
		finished := <-done
		fmt.Printf("第%d个网页已经爬取完成\n", finished)
	}
}
// Patterns used by SpiderPage to pull film data out of a listing page.
// They are compiled once at package scope instead of on every page fetch.
var (
	// spiderFilmNameRe captures the alt text of the poster image.
	spiderFilmNameRe = regexp.MustCompile(`<img width="100" alt="(.*?)"`)
	// spiderFilmScoreRe captures the contents of the rating_num span.
	spiderFilmScoreRe = regexp.MustCompile(`<span class="rating_num" property="v:average">(?s:(.*?))</span>`)
	// spiderFilmNumRe captures the text preceding "人评价" inside a span.
	spiderFilmNumRe = regexp.MustCompile(`<span>(.*?)人评价</span>`)
)

// SpiderPage downloads listing page i of the Douban Top 250 (requested with
// start offset (i-1)*25), extracts film names, scores and rating counts with
// regular expressions, and passes them to Save2File.
//
// i is sent on ch exactly once when the function finishes, whether or not the
// download succeeded. A failed download is reported on stdout and no file is
// written for that page.
func SpiderPage(i int, ch chan int) {
	// Always signal completion: the caller receives one value per page and
	// would block forever if a failed page never reported back.
	defer func() { ch <- i }()

	url := "https://movie.douban.com/top250?start=" + strconv.Itoa((i-1)*25) + "&filter="
	result, err := httpGet(url)
	if err != nil {
		fmt.Println("HttpGet err", err)
		return
	}

	filmName := spiderFilmNameRe.FindAllStringSubmatch(result, -1)
	filmScore := spiderFilmScoreRe.FindAllStringSubmatch(result, -1)
	filmNum := spiderFilmNumRe.FindAllStringSubmatch(result, -1)

	Save2File(i, filmName, filmScore, filmNum)
}
// Save2File writes the scraped data for page idx to the file "第 <idx> 页.txt"
// in the current working directory, truncating any existing file of that name.
//
// filmName, filmScore and fileNum are submatch results in the shape returned
// by regexp.FindAllStringSubmatch: element [1] of each entry is the captured
// value. The file starts with a header row, followed by one row per film with
// the three values separated by three tabs.
//
// If the three slices differ in length, only as many rows as the shortest one
// provides are written and a notice is printed, instead of indexing past the
// end of a slice. Create, write and close failures are printed to stdout;
// nothing is returned.
func Save2File(idx int, filmName, filmScore, fileNum [][]string) {
	f, err := os.Create("第 " + strconv.Itoa(idx) + " 页.txt")
	if err != nil {
		fmt.Println("Create err:", err)
		return
	}
	// A failed Close on a freshly written file can mean lost data, so report it.
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Println("Close err:", err)
		}
	}()

	// The three patterns are matched independently, so their result counts
	// are not guaranteed to agree; clamp to the shortest to stay in bounds.
	n := len(filmName)
	if len(filmScore) < n {
		n = len(filmScore)
	}
	if len(fileNum) < n {
		n = len(fileNum)
	}
	if n != len(filmName) || n != len(filmScore) || n != len(fileNum) {
		fmt.Printf("Save2File: page %d has mismatched match counts (%d/%d/%d), writing %d rows\n",
			idx, len(filmName), len(filmScore), len(fileNum), n)
	}

	if _, err := f.WriteString("电影名称" + "\t\t\t" + "评分" + "\t\t\t" + "评价人数" + "\n"); err != nil {
		fmt.Println("Write err:", err)
		return
	}
	for i := 0; i < n; i++ {
		row := filmName[i][1] + "\t\t\t" + filmScore[i][1] + "\t\t\t" + fileNum[i][1] + "\n"
		if _, err := f.WriteString(row); err != nil {
			fmt.Println("Write err:", err)
			return
		}
	}
}
// httpGet performs an HTTP GET on url and returns the whole response body as
// a string.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when reading the body fails part-way through. On error
// the returned string is empty, so a truncated page is never mistaken for a
// complete one.
func httpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error page would otherwise be parsed as if it were a listing page.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// Accumulate into a byte slice; appending to a string on every chunk
	// would re-copy everything read so far.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// A Read may return data together with an error, so keep the bytes
		// before looking at the error.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			return "", readErr
		}
	}
	return string(body), nil
}