Crawler Project Introduction
- Has a reasonable amount of complexity
- The project's complexity can be adjusted flexibly
- Balances the emphasis between the Go language and crawling itself
- General-purpose crawlers, e.g. Baidu, Google
- Focused crawlers, which extract structured data from the internet
  - Turn web pages into data
- Go crawler libraries/frameworks
  - henrylee2cn/pholcus
  - gocrawl
  - colly
  - hu17889/go_spider
- This project writes a crawler without using an off-the-shelf crawler library/framework
- Uses Elasticsearch as the data store
- Uses Go's standard template library for the HTTP data-presentation part
Legal Risks of Crawling
- The robots.txt protocol
  - Not technically binding
  - Legally, only a reference
- Judgment tends to be outcome-oriented
- Use common sense
- Keep the QPS (requests sent per second) low; see the rate-limiting sketch below
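One common way to cap QPS in Go is a shared ticker that every fetch must wait on. A minimal sketch; the 100ms interval (roughly 10 QPS) and the placeholder URLs are arbitrary illustrations, not values from this project:

package main

import (
    "fmt"
    "time"
)

func main() {
    // One tick every 100ms caps the crawler at roughly 10 requests/second,
    // no matter how many goroutines share the limiter channel.
    limiter := time.Tick(100 * time.Millisecond)
    urls := []string{"u1", "u2", "u3"} // placeholders for real URLs
    for _, u := range urls {
        <-limiter // block until the next tick before fetching
        fmt.Println("fetching", u)
    }
}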
Choosing a New Crawl Target
- Crawl relatively low-value data from high-traffic sites
  - Finance, sports, news, products, ...
  - e.g. crawling car model data from 爱卡汽车网
- The project is designed so that a new target needs only incremental changes:
  - a new parser and new configuration
Single-Task Crawler
regex/main.go (for testing regular expressions)
package main

import (
    "fmt"
    "regexp"
)

const text = `
My email is ccmouse@gmail.com@123.com
email2 is abc@def.rg
email3 is kkk@qq.com.cn`

func main() {
    re := regexp.MustCompile(`([a-zA-Z0-9]+)@([a-zA-Z0-9]+)(\.[a-zA-Z0-9.]+)`)
    // match := re.FindString(text)
    match := re.FindAllString(text, -1)
    match2 := re.FindAllStringSubmatch(text, -1)
    fmt.Println(match)
    fmt.Println(match2)
    for _, m := range match2 {
        fmt.Println(m)
    }
}
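For reference, with this input the program should print roughly the following. Note that the first match stops at ccmouse@gmail.com because @ is not allowed inside the domain groups, and the last match's third group greedily captures .com.cn:

[ccmouse@gmail.com abc@def.rg kkk@qq.com.cn]
[[ccmouse@gmail.com ccmouse gmail .com] [abc@def.rg abc def .rg] [kkk@qq.com.cn kkk qq .com.cn]]
[ccmouse@gmail.com ccmouse gmail .com]
[abc@def.rg abc def .rg]
[kkk@qq.com.cn kkk qq .com.cn]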
crawler/engine/engine.go
package engine

import (
    "log"

    "muke/15/5/crawler/fetch"
)

// Run drives the single-task crawl loop: it keeps a FIFO queue of requests,
// fetches each URL in turn, and feeds follow-up requests back into the queue.
func Run(seeds ...Request) {
    var requests []Request
    requests = append(requests, seeds...)
    for len(requests) > 0 {
        r := requests[0]
        requests = requests[1:]
        log.Printf("fetching %s", r.URL)
        body, err := fetch.Fetch(r.URL)
        if err != nil {
            log.Printf("Fetch: error fetching url %s: %v", r.URL, err)
            continue
        }
        parseResult := r.ParserFunc(body)
        requests = append(requests, parseResult.Requests...)
        for _, item := range parseResult.Items {
            log.Printf("Got item %v", item)
        }
    }
}
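The entry point is not shown in this section. Below is a minimal sketch of a hypothetical crawler/main.go that drives the single-task engine, assuming the muke/15/5/crawler import path used above and the mock seed URL from the tests. It also illustrates the incremental-change point from earlier: supporting a new site is just a new seed Request with its own ParserFunc.

package main

import (
    "muke/15/5/crawler/engine"
    "muke/15/5/crawler/zhenai/parser"
)

func main() {
    engine.Run(engine.Request{
        URL:        "http://localhost:8080/mock/www.zhenai.com/zhenghun",
        ParserFunc: parser.ParseCityList,
    })
}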
crawler/engine/types.go
package engine

// Request pairs a URL with the parser for that page type.
type Request struct {
    URL        string
    ParserFunc func([]byte) ParseResult
}

// ParseResult is what every parser returns: follow-up requests plus extracted items.
type ParseResult struct {
    Requests []Request
    Items    []interface{}
}

// NilParse is a no-op parser for pages we do not want to follow further.
func NilParse([]byte) ParseResult {
    return ParseResult{}
}
crawler/fetch/fetch.go
package fetch

import (
    "fmt"
    "io/ioutil"
    "net/http"
)

// Fetch performs an HTTP GET and returns the response body.
func Fetch(url string) ([]byte, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
    }
    return ioutil.ReadAll(resp.Body)
}
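Fetch assumes the body is already UTF-8, which holds for the mock server and matches the parser contract stated at the end of this section. Live sites that serve GBK or other encodings would need decoding first. A minimal sketch using golang.org/x/net/html/charset and golang.org/x/text; determineEncoding and utf8Body are illustrative helpers, not part of the file above:

package fetch

import (
    "bufio"
    "io"

    "golang.org/x/net/html/charset"
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// determineEncoding peeks at the first bytes of the body and guesses the
// page encoding; on a short read it falls back to assuming UTF-8.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
    head, err := r.Peek(1024)
    if err != nil {
        return unicode.UTF8
    }
    e, _, _ := charset.DetermineEncoding(head, "")
    return e
}

// utf8Body wraps an HTTP body so reads come out as UTF-8.
func utf8Body(body io.Reader) io.Reader {
    r := bufio.NewReader(body)
    return transform.NewReader(r, determineEncoding(r).NewDecoder())
}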
crawler/model/profile.go
package model

// Profile is one user profile extracted from a zhenai.com detail page.
type Profile struct {
    Name       string
    Gender     string
    Age        int
    Height     int
    Weight     int
    Income     string
    Marriage   string
    Education  string
    Occupation string
    Jiguan     string // 籍贯: native place
    Xinzuo     string // 星座: zodiac sign
    House      string
    Car        string
}
crawler/engine/concurrent.go
package engine

import (
    "log"
)

// ConcurrentEngine runs WorkerCount workers that receive requests from a
// Scheduler and send parse results back on a shared channel.
type ConcurrentEngine struct {
    Scheduler   Scheduler
    WorkerCount int
}

// Scheduler decides how submitted requests are distributed to ready workers.
type Scheduler interface {
    Submit(Request)
    WorkerReady(chan Request)
    Run()
}

// Run starts the scheduler and the workers, submits the seeds, then loops
// forever collecting results and resubmitting follow-up requests.
func (c *ConcurrentEngine) Run(seeds ...Request) {
    out := make(chan ParseResult)
    c.Scheduler.Run()
    for i := 0; i < c.WorkerCount; i++ {
        createWorker(out, c.Scheduler)
    }
    for _, seed := range seeds {
        c.Scheduler.Submit(seed)
    }
    itemCount := 0
    for {
        result := <-out
        for _, item := range result.Items {
            log.Printf("Got item %d: %v", itemCount, item)
            itemCount++
        }
        for _, request := range result.Requests {
            c.Scheduler.Submit(request)
        }
    }
}

func createWorker(out chan ParseResult, s Scheduler) {
    in := make(chan Request)
    go func() {
        for {
            // Tell the scheduler this worker is ready for another request.
            s.WorkerReady(in)
            request := <-in
            result, err := work(request)
            if err != nil {
                continue // drop failed requests
            }
            out <- result
        }
    }()
}
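Two things referenced above are not defined in this section: the work function called by each worker, and a concrete Scheduler. Below is a minimal sketch of both, consistent with the interfaces above. The QueuedScheduler name and its channel-of-channels design are assumptions suggested by the WorkerReady(chan Request) method, not code reproduced from the project:

package engine

import "muke/15/5/crawler/fetch"

// work fetches one request and runs its parser; the concurrent engine
// simply drops the request if fetching fails.
func work(r Request) (ParseResult, error) {
    body, err := fetch.Fetch(r.URL)
    if err != nil {
        return ParseResult{}, err
    }
    return r.ParserFunc(body), nil
}

// QueuedScheduler queues both pending requests and idle workers, matching
// them up in a single goroutine so Submit never blocks the engine loop.
type QueuedScheduler struct {
    requestChan chan Request
    workerChan  chan chan Request
}

func (s *QueuedScheduler) Submit(r Request)           { s.requestChan <- r }
func (s *QueuedScheduler) WorkerReady(w chan Request) { s.workerChan <- w }

func (s *QueuedScheduler) Run() {
    s.requestChan = make(chan Request)
    s.workerChan = make(chan chan Request)
    go func() {
        var requestQ []Request
        var workerQ []chan Request
        for {
            var activeRequest Request
            var activeWorker chan Request
            if len(requestQ) > 0 && len(workerQ) > 0 {
                activeRequest = requestQ[0]
                activeWorker = workerQ[0]
            }
            // Sending on a nil activeWorker blocks forever, so that case is
            // disabled until both a request and a worker are queued.
            select {
            case r := <-s.requestChan:
                requestQ = append(requestQ, r)
            case w := <-s.workerChan:
                workerQ = append(workerQ, w)
            case activeWorker <- activeRequest:
                requestQ = requestQ[1:]
                workerQ = workerQ[1:]
            }
        }
    }()
}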
crawler/zhenai/parser/citylist_test.go
package parser

import (
    "testing"

    "muke/15/5/crawler/fetch"
)

func TestParseCityList(t *testing.T) {
    body, err := fetch.Fetch("http://localhost:8080/mock/www.zhenai.com/zhenghun")
    if err != nil {
        t.Fatal(err)
    }
    result := ParseCityList(body)

    const resultSize = 470
    expectedUrls := []string{
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/aba",
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/akesu",
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/alashanmeng",
    }
    expectedCities := []string{
        "city : 阿坝", "city : 阿克苏", "city : 阿拉善盟",
    }

    if len(result.Requests) != resultSize {
        t.Errorf("result should have %d requests; but had %d", resultSize, len(result.Requests))
    }
    for i, url := range expectedUrls {
        if result.Requests[i].URL != url {
            t.Errorf("expected url #%d: %s; but was %s", i, url, result.Requests[i].URL)
        }
    }
    if len(result.Items) != resultSize {
        t.Errorf("result should have %d items; but had %d", resultSize, len(result.Items))
    }
    for i, city := range expectedCities {
        if result.Items[i].(string) != city {
            t.Errorf("expected city #%d: %s; but was %s", i, city, result.Items[i].(string))
        }
    }
}
crawler/zhenai/parser/citylist.go
package parser

import (
    "regexp"

    "muke/15/5/crawler/engine"
)

// Matches city links such as:
// <a href="http://localhost:8080/mock/www.zhenai.com/zhenghun/aba" class="">阿坝</a>
// Against the live site the pattern would be:
// const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`
const cityListRe = `<a href="(http://localhost:8080/mock/www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`

// ParseCityList emits one item and one Request per city link; each Request
// will be parsed by ParseCity.
func ParseCityList(contents []byte) engine.ParseResult {
    re := regexp.MustCompile(cityListRe)
    all := re.FindAllSubmatch(contents, -1)
    result := engine.ParseResult{}
    for _, m := range all {
        result.Items = append(result.Items, "city : "+string(m[2]))
        result.Requests = append(result.Requests, engine.Request{
            URL:        string(m[1]),
            ParserFunc: ParseCity,
        })
    }
    return result
}
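ParseCity, which ParseCityList installs as the parser for each city page, is not shown in this section. A minimal sketch consistent with its use here; the profile-link regex is an assumption modeled on the live site's album.zhenai.com URLs and would need the same localhost mock rewrite as cityListRe:

package parser

import (
    "regexp"

    "muke/15/5/crawler/engine"
)

// profileRe is a hypothetical pattern for user-profile links on a city page.
var profileRe = regexp.MustCompile(`<a href="(http://album.zhenai.com/u/[0-9]+)"[^>]*>([^<]+)</a>`)

// ParseCity emits one item and one profile Request per user link.
func ParseCity(contents []byte) engine.ParseResult {
    matches := profileRe.FindAllSubmatch(contents, -1)
    result := engine.ParseResult{}
    for _, m := range matches {
        name := string(m[2]) // copy so each closure captures its own name
        result.Items = append(result.Items, "User "+name)
        result.Requests = append(result.Requests, engine.Request{
            URL: string(m[1]),
            ParserFunc: func(c []byte) engine.ParseResult {
                return ParseProfile(c, name)
            },
        })
    }
    return result
}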
crawler/zhenai/parser/profile.go
package parser

import (
    "regexp"
    "strconv"

    "muke/15/5/crawler/engine"
    "muke/15/5/crawler/model"
)

var (
    ageRe        = regexp.MustCompile(`<td><span class="label">年龄:</span>(\d+)岁</td>`)
    marriageRe   = regexp.MustCompile(`<td><span class="label">婚况:</span>([^<]+)</td>`)
    heightRe     = regexp.MustCompile(`<td><span class="label">身高:</span>(\d+)CM</td>`)
    weightRe     = regexp.MustCompile(`<td><span class="label">体重:</span><span field="">(\d+)KG</span></td>`)
    genderRe     = regexp.MustCompile(`<td><span class="label">性别:</span><span field="">([^<]+)</span></td>`)
    incomeRe     = regexp.MustCompile(`<td><span class="label">月收入:</span>([^<]+)</td>`)
    educationRe  = regexp.MustCompile(`<td><span class="label">学历:</span>([^<]+)</td>`)
    occupationRe = regexp.MustCompile(`<td><span class="label">职业: </span>([^<]+)</td>`)
    jiguanRe     = regexp.MustCompile(`<td><span class="label">籍贯:</span>([^<]+)</td>`)
    xinzuoRe     = regexp.MustCompile(`<td><span class="label">星座:</span><span field="">([^<]+)</span></td>`)
    houseRe      = regexp.MustCompile(`<td><span class="label">住房条件:</span><span field="">([^<]+)</span></td>`)
    carRe        = regexp.MustCompile(`<td><span class="label">是否购车:</span><span field="">([^<]+)</span></td>`)
)

// ParseProfile extracts a model.Profile from a user detail page. The name is
// passed in by the caller because it appears on the city page, not here.
func ParseProfile(contents []byte, name string) engine.ParseResult {
    profile := model.Profile{Name: name}

    // Numeric fields: only set when the matched text parses cleanly.
    if age, err := strconv.Atoi(extractString(contents, ageRe)); err == nil {
        profile.Age = age
    }
    if height, err := strconv.Atoi(extractString(contents, heightRe)); err == nil {
        profile.Height = height
    }
    if weight, err := strconv.Atoi(extractString(contents, weightRe)); err == nil {
        profile.Weight = weight
    }

    // String fields.
    profile.Marriage = extractString(contents, marriageRe)
    profile.Gender = extractString(contents, genderRe)
    profile.Income = extractString(contents, incomeRe)
    profile.Education = extractString(contents, educationRe)
    profile.Occupation = extractString(contents, occupationRe)
    profile.Jiguan = extractString(contents, jiguanRe)
    profile.Xinzuo = extractString(contents, xinzuoRe)
    profile.House = extractString(contents, houseRe)
    profile.Car = extractString(contents, carRe)

    return engine.ParseResult{
        Items: []interface{}{profile},
    }
}

// extractString returns the first capture group of re in contents, or "".
func extractString(contents []byte, re *regexp.Regexp) string {
    match := re.FindSubmatch(contents)
    if len(match) >= 2 {
        return string(match[1])
    }
    return ""
}
crawler/zhenai/parser/profile_test.go
package parser

import (
    "io/ioutil"
    "testing"

    "muke/15/5/crawler/model"
)

func TestParseProfile(t *testing.T) {
    contents, err := ioutil.ReadFile("profile_test_data.html")
    if err != nil {
        t.Fatal(err)
    }
    result := ParseProfile(contents, "安静的雪")
    if len(result.Items) != 1 {
        t.Errorf("Items should contain 1 element; but was %v", result.Items)
    }
    profile := result.Items[0].(model.Profile)
    expected := model.Profile{
        Name:       "安静的雪",
        Gender:     "女",
        Age:        34,
        Height:     162,
        Weight:     57,
        Income:     "3001-5000元",
        Marriage:   "离异",
        Education:  "大学本科",
        Occupation: "人事/行政",
        Jiguan:     "山东菏泽",
        Xinzuo:     "牡羊座",
        House:      "已购房",
        Car:        "未购车",
    }
    if profile != expected {
        t.Errorf("expected %v; but was %v", expected, profile)
    }
}
- Fetch and print the detailed profiles of all users on the first page of every city
- The corresponding data in the HTML can be matched with CSS selectors, XPath, or regular expressions
- The parser
  - Input: UTF-8 encoded text
  - Output: a list of Request{URL, corresponding Parser} and a list of Items