使用准备
1.安装Golang
2.下载爬虫包
go get -v github.com/hunterhug/marmot/expert
go get -v github.com/hunterhug/marmot/miner
go get -v github.com/hunterhug/parrot/util
程序
该程序只能抓取 HTML 中 src 属性以 http(s):// 协议头开头的图片；其他形式（如 data-src 延迟加载属性, 或混淆在 JS 代码中的图片地址）无法抓取
package main
import (
"errors"
"fmt"
"net/url"
"strings"
"github.com/hunterhug/marmot/expert"
"github.com/hunterhug/marmot/miner"
"github.com/hunterhug/parrot/util"
)
// MinerNum is the number of concurrent miner workers used to speed up
// crawling. NOTE(review): not referenced in the visible portion of this
// file — presumably consumed by the download logic further down; confirm.
var MinerNum = 5
// ProxyAddress, when set to a proxy URL string
// (e.g. "socks5://127.0.0.1:1080"), routes all requests through that
// proxy; the nil default means a direct connection. It is passed to
// miner.New in CatchPicture below.
var ProxyAddress interface{}
// main runs an interactive loop: it prompts for a page URL and a target
// directory, then crawls that page for images via CatchPicture. Errors
// are printed but do not stop the loop; the program runs until killed.
func main() {
	// Uncomment to route every request through a proxy.
	// ProxyAddress = "socks5://127.0.0.1:1080"
	fmt.Println(`Welcome: Input "url" and picture keep "dir"`)
	for {
		fmt.Println("---------------------------------------------")
		// Named pageURL (not url) so it does not shadow the imported
		// net/url package.
		pageURL := util.Input(`URL(Like: "http://publicdomainarchive.com")`, "http://publicdomainarchive.com")
		dir := util.Input(`DIR(Default: "./picture")`, "./picture")
		fmt.Printf("You will keep %s picture in dir %s\n", pageURL, dir)
		fmt.Println("---------------------------------------------")
		// Crawl the page; report failures and keep prompting.
		err := CatchPicture(pageURL, dir)
		if err != nil {
			fmt.Println("Error:" + err.Error())
		}
	}
}
// Come on!
func CatchPicture(picture_url string, dir string) error {
// Check valid
_, err := url.Parse(picture_url)
if err != nil {
return err
}
// Make dir!
err = util.MakeDir(dir)
if err != nil {
return err
}
// New a worker to get url
worker, _ := miner.New(ProxyAddress)
result, err := worker.SetUrl(picture_url).SetUa(miner.RandomUa()).Get()
if err != nil {
return err
}
// Find all pictu