// Example: crawl a web page for image links, then download each image into the local img directory.
package main
import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
)
var (
chanPageUrls chan string // data channel: carries scraped image URLs from crawlers to downloaders
waitGroup sync.WaitGroup // waits for every crawler/monitor/downloader goroutine to finish
chanTask chan string // monitor channel: one completion message per crawled page
)
const (
PageReg = `https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))` // regex matching image links (jpg/png/jpeg/gif/bmp)
)
// main wires up the crawler pipeline: three page-crawler goroutines feed
// image URLs into chanPageUrls, a monitor goroutine closes that channel once
// all crawlers have reported in, and two downloader goroutines drain it.
func main() {
	// Channels shared by all goroutines.
	chanPageUrls = make(chan string, 10000)
	chanTask = make(chan string, 3)

	// One crawler goroutine per listing page (pages 1-3).
	for page := 1; page <= 3; page++ {
		waitGroup.Add(1)
		go GetPageUrls("https://www.umei.cc/touxiangtupian/katongtouxiang/"+strconv.Itoa(page)+".htm", PageReg)
	}

	// Monitor goroutine: closes chanPageUrls when every crawler is done.
	waitGroup.Add(1)
	go CheckOk()

	// Two downloader goroutines consume image URLs until the channel closes.
	for worker := 0; worker < 2; worker++ {
		waitGroup.Add(1)
		go DownloadFiles()
	}

	// Block until the whole pipeline has drained.
	waitGroup.Wait()
}
//迭代写入数据
func DownloadFiles() {
// fmt.Println(chanPageUrls)
for url := range chanPageUrls {
filename := GetFilename(url)
ok := DownloadFile(url, filename)
if ok {
fmt.Printf("%s下载成功\n", filename)
} else {
fmt.Printf("%s下载失败\n", filename)
}
}
waitGroup.Done()
}
//写数据到目标文件夹下
// DownloadFile fetches url and writes the response bytes into the img
// directory under filename. It returns true on success, false when the
// fetch produced no data or the write failed.
func DownloadFile(url, filename string) bool {
	pageBytes := GetPageBytes(url)
	// Treat a failed/empty fetch as a failure instead of silently writing an
	// empty file and reporting success.
	if len(pageBytes) == 0 {
		return false
	}
	path := "/Users/fhy/Desktop/gowork/src/reptile/img/" + filename
	// A single boolean expression replaces the redundant if/else on err.
	return ioutil.WriteFile(path, pageBytes, 0666) == nil
}
//下载文件名称
// GetFilename derives a local file name for an image URL: the URL's last
// path segment prefixed with the current nanosecond timestamp so that
// concurrently downloaded files cannot collide.
func GetFilename(url string) (filename string) {
	base := url[strings.LastIndex(url, "/")+1:]
	return strconv.Itoa(int(time.Now().UnixNano())) + "_" + base
}
//监控爬虫
func CheckOk() {
var count int
for {
check := <-chanTask
count++
fmt.Printf("%s完成爬虫\n", check)
if count == 3 {
close(chanPageUrls)
break
}
}
waitGroup.Done()
}
//爬数据放入chan
func GetPageUrls(url, regSomething string) {
urls := GetContent(url, regSomething)
for _, val := range urls {
chanPageUrls <- val
}
chanTask <- url
waitGroup.Done()
}
//得到匹配正则的数据
// GetContent downloads url and returns every substring of the page that
// matches the regSomething pattern (whole matches only).
func GetContent(url, regSomething string) (urls []string) {
	pageStr := string(GetPageBytes(url))
	re := regexp.MustCompile(regSomething)
	// FindAllString already yields the whole-match strings, so the old
	// FindAllStringSubmatch + v[0] extraction loop is unnecessary.
	urls = re.FindAllString(pageStr, -1)
	// Bug fix: the progress line previously lacked a trailing newline, so
	// successive reports ran together on one line.
	fmt.Printf("共找到%d条结果\n", len(urls))
	return
}
//爬原始数据ing
// GetPageBytes fetches url over HTTP and returns the raw response body, or
// nil if the request or the body read fails (the error is logged).
func GetPageBytes(url string) []byte {
	resp, err := http.Get(url)
	if err != nil {
		// Bug fix: the original only logged the error and then deferred
		// resp.Body.Close() on a nil resp, panicking on any failed request.
		HandleErr(err)
		return nil
	}
	defer resp.Body.Close()
	pageBytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		HandleErr(err)
		return nil
	}
	return pageBytes
}
//错误error处理
// HandleErr prints err to stdout when it is non-nil; nil errors are ignored.
// Note: this only reports the error — callers must still decide how to recover.
func HandleErr(err error) {
	if err == nil {
		return
	}
	fmt.Println(err)
}