用Go 写一个简易版单机爬虫_go 简单爬虫-CSDN博客

本文链接：https://blog.csdn.net/weixin_42012911/article/details/106781244

前言

爬虫一般是一个递归过程：从一个种子页面开始爬取，对爬取到的信息进行处理，再把该页面中的其他 URL 连同对应的解析器一起放入任务队列排队。每一种类型的页面都应该配置一个不同的解析器。
在这里插入图片描述

package engine

// Request is one unit of crawl work: a URL together with the function that
// parses the content fetched from that URL.
type Request struct {
	Url        string
	ParserFunc func([]byte) ParseResult // function that processes the content of Url
}

// ParseResult is what a parser returns. Requests holds the URLs, each paired
// with its parser, that can be crawled next; Items holds the values that were
// extracted.
type ParseResult struct {
	Requests []Request
	Items    []interface{} // any type
}

// NilParser is a placeholder parser for pages whose handling has not been
// decided yet. It ignores its input and returns an empty ParseResult.
func NilParser(_ []byte) ParseResult {
	var empty ParseResult
	return empty
}

在这里插入图片描述
下面以爬取一个相亲网站为例。这是一个模拟的相亲网站，如果大家需要可以私聊我，我发给大家。

获取页面信息

package fetcher
import (
	"bufio"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"time"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// fetchTimeout bounds a whole request, including reading the body, so that a
// single unresponsive server cannot stall the crawler forever.
const fetchTimeout = 30 * time.Second

// httpClient is shared by every Fetch call. It replaces http.Get, which goes
// through http.DefaultClient and therefore has no timeout at all.
var httpClient = &http.Client{Timeout: fetchTimeout}

// Fetch downloads url and returns its body decoded to UTF-8.
//
// It returns an error when the request fails or times out, when the response
// status is not 200 OK, or when reading the body fails.
func Fetch(url string) ([]byte, error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}

	// Dependencies:
	//   go get golang.org/x/text
	//   go get golang.org/x/net/html

	// Wrap the body in a bufio.Reader so that determineEncoding can peek at
	// the first bytes without consuming them; the full body is still read
	// from the same reader below.
	r := bufio.NewReader(resp.Body)
	e := determineEncoding(r)

	// Decode the body from the detected encoding while reading it.
	utf8Reader := transform.NewReader(r, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}

// determineEncoding guesses the character encoding of the content behind r by
// peeking at up to its first 1024 bytes. Peeking does not consume the bytes,
// so the caller can still read the whole stream from r afterwards.
//
// Peek reports an error (io.EOF) whenever fewer than 1024 bytes are available,
// but it still returns the bytes it did read. Those bytes are used for
// detection, so pages shorter than 1024 bytes are detected too. Only when
// nothing at all could be read is the error logged and UTF-8 returned.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	head, err := r.Peek(1024)
	if err != nil && len(head) == 0 {
		log.Printf("Fetcher error : %v", err)
		return unicode.UTF8
	}
	// Only the encoding is needed; its name and the certainty flag are unused.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

解析器

城市列表解析器

package parser

import (
	"fmt"
	"learn2/crawler/zhenai/engine"
	"regexp"
)

// cityListRe matches one city link on the city-list page. Capture group 1 is
// the city's URL and capture group 2 is the city's name.
const cityListRe = `<a href="(http://localhost:8080/mock/www.zhenai.com/zhenghun/[0-9a-z]*)"[^>]*>([^<]+)</a>`

// cityListMatcher is cityListRe compiled once at package initialisation
// rather than on every ParseCityList call.
var cityListMatcher = regexp.MustCompile(cityListRe)

// ParseCityList parses the city-list page in contents.
//
// For every city link found it prints the city name and URL, adds the city
// name to the result's Items, and adds a Request for the city's URL that is to
// be parsed by ParseCity.
func ParseCityList(contents []byte) engine.ParseResult {
	matches := cityListMatcher.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for _, m := range matches {
		fmt.Printf("City : %s, URL : %s", m[2], m[1])
		fmt.Println()

		result.Items = append(result.Items, string(m[2]))
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: ParseCity,
		})
	}
	return result
}

城市解析器

package parser

import (
	"learn2/crawler/zhenai/engine"
	"regexp"
)

// cityRe matches one user link on a city page. Capture group 1 is the URL of
// the user's profile and capture group 2 is the user's name.
const cityRe = `<a href="(http://localhost:8080/mock/album.zhenai.com/u/[0-9]*)">([^<]+)</a>`

// cityMatcher is cityRe compiled once at package initialisation rather than on
// every ParseCity call.
var cityMatcher = regexp.MustCompile(cityRe)

// ParseCity parses the city page in content.
//
// For every user link found it adds the user's name to the result's Items and
// adds a Request for the user's profile URL.
func ParseCity(content []byte) engine.ParseResult {
	matches := cityMatcher.FindAllSubmatch(content, -1)
	result := engine.ParseResult{}
	for _, match := range matches {
		// name is declared inside the loop body so that each closure below
		// captures its own copy.
		name := string(match[2])
		result.Items = append(result.Items, name)
		result.Requests = append(result.Requests, engine.Request{
			Url: string(match[1]),
			// ParseProfile also needs the user's name, which does not fit the
			// ParserFunc signature, so the name is bound through a closure
			// instead of changing the Request struct.
			ParserFunc: func(bytes []byte) engine.ParseResult {
				return ParseProfile(bytes, name)
			},
		})
	}
	return result
}

用户解析器

package parser

import (
	"learn2/crawler/model"
	"learn2/crawler/zhenai/engine"
	"regexp"
	"strconv"
)

// Patterns that extract single fields from a user profile page. In each
// pattern, capture group 1 is the field's value.
var (
	ageRe    = regexp.MustCompile(`<span class="label">年龄：</span>([0-9]+)岁`)
	incomeRe = regexp.MustCompile("<span class=\"label\">月收入：</span>([0-9\\-]+)元</td>")
)

// ParseProfile builds a model.Profile for the user called name from the
// profile page in content and returns it as the only item of the result.
// The result carries no further requests.
func ParseProfile(content []byte, name string) engine.ParseResult {
	profile := model.Profile{Name: name}

	// Age keeps its zero value when the page has no age or the captured text
	// is not a valid integer.
	if age, err := strconv.Atoi(getResult(ageRe, content)); err == nil {
		profile.Age = age
	}
	profile.Income = getResult(incomeRe, content)

	return engine.ParseResult{Items: []interface{}{profile}}
}

// getResult returns the text of capture group 1 from the first match of re in
// content. It returns "" when re does not match or has no capture group.
// A captured value is also echoed with the builtin print, followed by two
// spaces.
func getResult(re *regexp.Regexp, content []byte) string {
	groups := re.FindSubmatch(content)
	if len(groups) < 2 {
		return ""
	}
	captured := string(groups[1])
	print(captured, "  ")
	return captured
}

引擎

package engine

import (
	"learn2/crawler/fetcher"
	"log"
)

// Run crawls starting from seeds.
//
// Requests are processed one at a time in first-in, first-out order: the URL
// is fetched, the body is handed to the request's ParserFunc, the requests the
// parser returns are appended to the queue, and every returned item is logged.
// Run returns once the queue is empty.
//
// A request whose fetch fails is logged and skipped; its parser is not called.
func Run(seeds ...Request) {
	// Copy the seeds so the queue never aliases the caller's slice.
	var requests []Request
	requests = append(requests, seeds...)

	for len(requests) > 0 {
		// Take the request at the head of the queue.
		r := requests[0]
		requests = requests[1:]

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher :error fetching url %s :%v", r.Url, err)
			// Without this, the parser would be run on a nil body.
			continue
		}

		parseResult := r.ParserFunc(body)
		// Queue the newly discovered URLs.
		requests = append(requests, parseResult.Requests...)

		// Handle the extracted values; for now they are only logged.
		for _, item := range parseResult.Items {
			log.Printf("Got item %v", item)
		}
	}
}

关于测试

package parser

import (
	"io/ioutil"
	"testing"
)

// TestParseCityList checks ParseCityList against a saved copy of the city-list
// page.
//
// Tests should generally not fetch from the network, because the fetch itself
// would then influence the result. The page was instead fetched once, e.g. with
//
//	fetcher.Fetch("http://localhost:8080/mock/www.zhenai.com/zhenghun")
//
// and stored locally in citylist_test_data.html.
func TestParseCityList(t *testing.T) {
	content, err := ioutil.ReadFile("citylist_test_data.html")
	if err != nil {
		t.Fatalf("reading test fixture: %v", err)
	}

	expectedUrls := []string{
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/aba",
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/akesu",
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/alashanmeng",
	}
	expectedCities := []string{
		"阿坝", "阿克苏", "阿拉善盟",
	}

	result := ParseCityList(content)
	const resultSize = 470

	if len(result.Requests) != resultSize {
		t.Errorf("result should have %d requests; but had %d", resultSize, len(result.Requests))
	}
	if len(result.Items) != resultSize {
		t.Errorf("result should have %d items; but had %d", resultSize, len(result.Items))
	}

	// Stop here if the result is too short for the element checks below, so
	// the test fails with a message instead of an index-out-of-range panic.
	if len(result.Requests) < len(expectedUrls) {
		t.Fatalf("need at least %d requests to compare; got %d", len(expectedUrls), len(result.Requests))
	}
	if len(result.Items) < len(expectedCities) {
		t.Fatalf("need at least %d items to compare; got %d", len(expectedCities), len(result.Items))
	}

	for i, url := range expectedUrls {
		if result.Requests[i].Url != url {
			t.Errorf("expected url #%d: %s; but was %s", i, url, result.Requests[i].Url)
		}
	}
	for i, city := range expectedCities {
		if result.Items[i] != city {
			t.Errorf("expected city #%d: %s; but was %s", i, city, result.Items[i])
		}
	}
}

其他的例子类似

main函数

package main

import (
	"learn2/crawler/zhenai/engine"
	"learn2/crawler/zhenai/parser"
)

// main starts the crawler with a single seed: the city-list page, parsed by
// parser.ParseCityList.
func main() {
	seed := engine.Request{
		Url:        "http://localhost:8080/mock/www.zhenai.com/zhenghun",
		ParserFunc: parser.ParseCityList,
	}
	engine.Run(seed)
}