golang版本的图片爬虫,效果如下
github链接地址:https://github.com/lengyue1084/sspider
里面有可执行文件,使用了go的协程以及一些特性,新手可以拿来学习...
当然你也可以爬一些你想爬的图片,找到图片列表初始页,就可以一页一页爬取了。
编译后的可执行文件下载地址:https://download.csdn.net/download/lengyue1084/12926501
执行界面如下图
采集结果如下图
说明:代码如下,有兴趣的可以拿去玩一下,参考了一些网上的代码,按照如下规则输入命令都可以采集(建议修改一下里面的demo网址,或者协程设置少一点,不要把人家的站搞挂了)。
###########################################:
胖达图片采集器v0.1
该版本支持采集列表规则,页码{{n}}为变量
如:《https://www.58pic.com/collect/fav-{{n}}.html》
其中n为页码,起始页码一般默认为1
###########################################:
命令列表:
1、设置采集前缀 如:https://www.58pic.com/collect/fav-
2、设置采集后缀 如:.html,根据实际情况设置,默认为空
3、设置起始页码 如:1,默认值为1
4、设置最大页码 如:999,根据需要采集的页面列表设置,默认为999
5、设置最大线程数默认8
6、开始采集
8、退出程序
代码如下:
package main
import (
"bufio"
"fmt"
"github.com/PuerkitoBio/goquery"
"io"
"log"
"math/rand"
"net/http"
"os"
"path"
"strconv"
"strings"
"sync"
"time"
)
var MaxGoroutineNum = 8
var Scheme = "https"
var SaveFolder = "/images/"
var StartUrl = "https://www.58pic.com/collect/fav-1.html"
//var StartUrlPre = "https://www.58pic.com/collect/fav-"
//var StartUrlend =".html"
var StartUrlPre = "https://www.58pic.com/collect/fav-1"
var StartUrlend =".html"
var CurentPageNum = 1
var MaxPageNum = 999
var pageUrlChan = make(chan string, 50)
var headers = map[string][]string{
"Accept": []string{"text/html,application/xhtml+xml,application/xml", "q=0.9,image/webp,*/*;q=0.8"},
"Accept-Encoding": []string{"gzip, deflate, sdch"},
"Accept-Language": []string{"zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4"},
"Accept-Charset": []string{"utf-8"},
"Connection": []string{"keep-alive"},
"DNT": []string{"1"},
"Host": []string{"www.58pic.com"},
"Referer": []string{"https://www.58pic.com/collect/fav-1.html"},
"Upgrade-Insecure-Requests": []string{"1"},
"User-Agent": []string{"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"},
}
var wg sync.WaitGroup
func main() {
// 从标准输入流中接收输入数据
input := bufio.NewScanner(os.Stdin)
fmt.Printf("###########################################:\n")
fmt.Printf("胖达图片采集器v0.1\n")
fmt.Printf("该版本支持采集列表规则,页码{{n}}为变量\n如:《https://www.58pic.com/collect/fav-{{n}}.html》\n其中n为页码,起始页码一般默认为1\n")
fmt.Printf("###########################################:\n")
fmt.Printf("命令列表:\n1、设置采集前缀 如:https://www.58pic.com/collect/fav-\n")
fmt.Printf("2、设置采集后缀 如:.html,根据实际情况设置,默认为空\n")
fmt.Printf("3、设置起始页码 如:1,默认值为1\n")
fmt.Printf("4、设置最大页码 如:999,根据需要采集的页面列表采集设置默认为999\n")
fmt.Printf("5、设置最大线程数默认8 \n")
fmt.Printf("6、开始采集 \n")
fmt.Printf("8、退出程序 \n")
fmt.Printf("请输入命令:\n")
// 逐行扫描
for input.Scan() {
cmd := input.Text()
// 输入bye时 结束
switch cmd {
case "1":
fmt.Printf("请输入采集前缀:")
for input.Scan(){
s := input.Text()
StartUrlPre = s
fmt.Println("您输入的值为采集前缀为:",s)
fmt.Printf("请输入命令:\n")
break;
}
case "2":
fmt.Printf("请输入采集后缀:")
for input.Scan(){
s := input.Text()
StartUrlend = s
fmt.Println("您输入的值为采集后缀为:",s)
fmt.Printf("请输入命令:\n")
break;
}
case "3":
fmt.Printf("设置起始页码:")
for input.Scan(){
s := input.Text()
CurentPageNum, _ = strconv.Atoi(s)
fmt.Println("您输入起始页码默认为:",s)
fmt.Printf("请输入命令:\n")
break;
}
case "4":
fmt.Printf("设置最大页码:")
for input.Scan(){
s := input.Text()
MaxPageNum, _ = strconv.Atoi(s)
fmt.Println("您输入的最大页码为:",s)
fmt.Printf("请输入命令:\n")
break;
}
case "5":
fmt.Printf("设置最大线程数:")
for input.Scan(){
s := input.Text()
MaxPageNum, _ = strconv.Atoi(s)
fmt.Println("您输入的最大线程数为:",s)
fmt.Printf("请输入命令:\n")
break;
}
case "6":
goto start
case "8":
fmt.Printf("程序即将退出~\n")
CountTime(5)
os.Exit(0)
default:
fmt.Printf("请输入命令前的序号比如 1~\n")
}
}
//fmt.Printf("Please type in something:\n")
start:
//判断存储文件夹是否存在
if err := MkDirForImages(); err != nil {
fmt.Println(err.Error())
}
wg.Add(MaxGoroutineNum)
for i := 0; i < MaxGoroutineNum; i++ {
go getAndSaveImages(i)
}
scheme := StartUrlPre[0:5]
if scheme != Scheme{
Scheme = "http"
}
fmt.Println(Scheme)
setNextPageUrl()
wg.Wait()
}
func MkDirForImages() error {
rootpath, _ := os.Getwd()
SaveFolder = rootpath + SaveFolder + time.Now().Format("2006-01-02") + "/"
f, err := os.Open(SaveFolder)
defer f.Close()
if err != nil && os.IsExist(err) {
return nil
}
if err := os.MkdirAll(SaveFolder, 0777); err != nil {
fmt.Println("创建文件夹失败:", err)
return err
}
os.Chmod(SaveFolder, 0777)
fmt.Println("文件夹创建成功:", SaveFolder)
return nil
}
func getAndSaveImages(i int) {
//out := <- pageUrlChan
//log.Println(out)
for url := range pageUrlChan {
fmt.Println("携程开启:", i)
fmt.Println("携程开启爬取的url:", url)
getPageUrlV2(url)
//out := <- pageUrlChan
//log.Println(out)
}
wg.Done()
}
func getPageUrlV2(url string) {
// Request the HTML page.
res, err := http.Get(url)
if err != nil {
log.Println("抓取分页请求失败:",err)
return
//log.Fatal("请求失败:",err)
}
defer res.Body.Close()
if res.StatusCode == 200 {
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
log.Println("读取分页html失败",err)
}
// Find the review items
doc.Find("img").Each(func(i int, s *goquery.Selection) {
// For each item found, get the band and title
imageSrc, _ := s.Attr("src")
if imageSrc != "" {
str := ""
imageSrc = strings.Replace(imageSrc, " ", "", -1)
if imageSrc[0:4] != "http" && imageSrc[0:5] != "https" {
if imageSrc[0:2] != "//" && imageSrc[0:3]!= "://"{
str = Scheme + "://"
}else if imageSrc[0:2] == "//" {
str = Scheme + ":"
}else if imageSrc[0:2] == "://" {
str = Scheme
}
}
imageSrc = str + imageSrc
fmt.Printf("获取图片地址: %d: %s\n", i, imageSrc)
saveImage(imageSrc)
}
})
}
return
log.Println("status code error: %d %s", res.StatusCode, res.Status)
//log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
// Load the HTML document
}
func saveImage(imageUrl string) {
res := getReponseWithGlobalHeaders(imageUrl)
if err := recover(); res == nil {
log.Println("Skip panic2", err)
return
}
defer func() {
if err := res.Body.Close(); err != nil {
fmt.Println(err)
}
}()
// 获取图片扩展名
fileNameExt := path.Ext(imageUrl)
if fileNameExt != ".png" || fileNameExt != ".jpg" || fileNameExt != ".bmp" || fileNameExt != ".gif" {
fileNameExt = ".jpg"
}
// 图片保存的全路径
savePath := path.Join(SaveFolder, strconv.Itoa(rand.Int())+fileNameExt)
log.Println("savePath", savePath)
imageWriter, _ := os.OpenFile(savePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
length, _ := io.Copy(imageWriter, res.Body)
fmt.Println(savePath + " image saved! " + strconv.Itoa(int(length)) + " bytes." + imageUrl)
}
func getReponseWithGlobalHeaders(url string) *http.Response {
req, _ := http.NewRequest("GET", url, nil)
if headers != nil && len(headers) != 0 {
for k, v := range headers {
for _, val := range v {
req.Header.Add(k, val)
}
}
}
res, err := http.DefaultClient.Do(req)
if err != nil {
if err := recover(); err != nil {
log.Println("Skip panic", err)
}
log.Println(err)
}
return res
}
func setNextPageUrl() {
defer close(pageUrlChan)
for {
fmt.Println(CurentPageNum)
pageUrlChan <- (StartUrlPre + strconv.Itoa(CurentPageNum) + StartUrlend)
CurentPageNum++
if CurentPageNum == MaxPageNum {
break
}
}
}
func CountTime(num int) {
if num > 0 {
fmt.Printf("倒计时:%d秒\n",num)
time.Sleep(time.Duration(1) * time.Second)
CountTime(num - 1)
}
}
func Dump(s string) {
fmt.Println("您输入的值为:",s)
}