Go Knowledge Points: Crawler in Practice (Single-Task Version)

Crawler project introduction
  • Has a reasonable degree of complexity
  • The complexity can be dialed up or down flexibly
  • Balances how much attention goes to the language vs. the crawler itself
  • General-purpose crawlers, e.g. Baidu, Google
  • Focused crawlers, which extract structured data from the web
    • Turning web pages into data
  • Go crawler libraries/frameworks:
    • henrylee2cn/pholcus
    • gocrawl
    • colly
    • hu17889/go_spider
  • This project builds a crawler without using any of these off-the-shelf libraries/frameworks
  • Uses Elasticsearch for data storage
  • Uses Go's standard template library for the HTTP data-display part
Legal risks of crawling
  • The robots protocol (robots.txt)
  • It is not technically binding
  • Legally it serves only as a reference
  • What matters in practice is the outcome
  • Apply common sense
  • Keep QPS (requests sent per second) polite; a throttling sketch follows this list
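
A simple way to keep QPS in check in Go is a shared ticker that every outgoing request must consume first. The sketch below is purely illustrative (the 100ms interval, the politeFetch wrapper, and the example URLs are all assumptions, not part of the project):

package main

import (
	"fmt"
	"net/http"
	"time"
)

// rateLimiter ticks ten times per second, capping us at ~10 QPS.
var rateLimiter = time.Tick(100 * time.Millisecond)

// politeFetch blocks until the next tick before issuing the request.
func politeFetch(url string) (*http.Response, error) {
	<-rateLimiter
	return http.Get(url)
}

func main() {
	for _, url := range []string{"http://example.com/a", "http://example.com/b"} {
		resp, err := politeFetch(url)
		if err != nil {
			fmt.Println(err)
			continue
		}
		resp.Body.Close()
		fmt.Println(url, resp.Status)
	}
}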
Choosing a new crawl target
  • Crawl relatively cheap data from high-traffic sites
  • Finance, sports, news, products, ...
  • e.g. crawling car-model data from the Xcar auto site (爱卡汽车网)
  • The project is designed so that a new target only needs incremental changes:
  • a new parser and a new configuration
The single-task crawler

regex/main.go (for testing regular expressions)

package main

import (
	"fmt"
	"regexp"
)

const text = `
My email is ccmouse@gmail.com@123.com
email2 is abc@def.rg
email3 is kkk@qq.com.cn`

func main() {
	// Three capture groups: user, host, and the dotted domain suffix.
	re := regexp.MustCompile(`([a-zA-Z0-9]+)@([a-zA-Z0-9]+)(\.[a-zA-Z0-9.]+)`)
	// match := re.FindString(text) // would return only the first whole match
	match := re.FindAllString(text, -1)          // all whole matches
	match2 := re.FindAllStringSubmatch(text, -1) // whole matches plus capture groups
	fmt.Println(match)
	fmt.Println(match2)
	for _, m := range match2 {
		fmt.Println(m)
	}
}
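
For the text above, only the three well-formed addresses match. FindAllString returns the whole matches, while FindAllStringSubmatch also exposes the three capture groups, so the final loop prints:

[ccmouse@gmail.com ccmouse gmail .com]
[abc@def.rg abc def .rg]
[kkk@qq.com.cn kkk qq .com.cn]

Note how the first match recovers ccmouse@gmail.com out of the malformed ccmouse@gmail.com@123.com: the character classes exclude '@', so the match stops at .com.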

crawler/engine/engine.go

package engine

import (
	"log"
	"muke/15/5/crawler/fetch"
)

// Run drives the single-task crawl loop: it keeps a queue of requests,
// fetches each URL in turn, and feeds parsed follow-up requests back in.
func Run(seeds ...Request) {
	var requests []Request
	requests = append(requests, seeds...)

	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		log.Printf("fetching %s", r.URL)

		body, err := fetch.Fetch(r.URL)
		if err != nil {
			log.Printf("Fetch: error fetching url %s: %v", r.URL, err)
			continue
		}
		parseResult := r.ParserFunc(body)
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			log.Printf("Got item %v", item)
		}
	}
}
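
The notes never show the crawler's entry point. A minimal main.go consistent with the import paths used throughout, seeded with the mock city-list URL from citylist_test.go below, might look like this:

crawler/main.go (sketch)

package main

import (
	"muke/15/5/crawler/engine"
	"muke/15/5/crawler/zhenai/parser"
)

func main() {
	// Seed the engine with the city-list page; every subsequent request
	// is generated by the parsers themselves.
	engine.Run(engine.Request{
		URL:        "http://localhost:8080/mock/www.zhenai.com/zhenghun",
		ParserFunc: parser.ParseCityList,
	})
}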

crawler/engine/types.go

package engine

// Request pairs a URL with the parser that knows how to handle its response.
type Request struct {
	URL        string
	ParserFunc func([]byte) ParseResult
}

// ParseResult is what a parser returns: follow-up requests plus extracted items.
type ParseResult struct {
	Requests []Request
	Items    []interface{}
}

// NilParse is a placeholder ParserFunc for pages we fetch but do not parse.
func NilParse([]byte) ParseResult {
	return ParseResult{}
}

crawler/fetch/fetch.go

package fetch

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// Fetch issues a GET request and returns the raw response body.
func Fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}
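
The parsers below assume UTF-8 input, which the local mock site provides. Real pages are often served in other encodings (zhenai.com, for instance, serves GBK), so a production fetcher would typically sniff the charset and convert. A sketch of that conversion using golang.org/x/net/html/charset and golang.org/x/text/transform; the FetchUTF8 name is hypothetical:

package fetch

import (
	"bufio"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/transform"
)

// FetchUTF8 is a hypothetical variant of Fetch that decodes the body to UTF-8.
func FetchUTF8(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}
	r := bufio.NewReader(resp.Body)
	utf8Reader := transform.NewReader(r, determineEncoding(r).NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}

// determineEncoding peeks at the first KB of the body and guesses its charset.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	b, err := r.Peek(1024)
	if err != nil && err != io.EOF {
		return encoding.Nop // cannot peek: fall back to no conversion
	}
	e, _, _ := charset.DetermineEncoding(b, "")
	return e
}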

crawler/model/profile.go

package model

// Profile is the structured record extracted from one user's profile page.
type Profile struct {
	Name       string
	Gender     string
	Age        int
	Hight      int // height in cm
	Weight     int // kg
	Income     string
	Marriage   string
	Education  string
	Occupation string
	Jiguan     string // 籍贯: native place
	Xinzuo     string // 星座: zodiac sign
	House      string
	Car        string
}

crawler/engine/concurrent.go

package engine

import (
	"log"
)

// ConcurrentEngine fans requests out to WorkerCount workers via a Scheduler.
type ConcurrentEngine struct {
	Scheduler   Scheduler
	WorkerCount int
}

// Scheduler decides how requests are distributed to ready workers.
type Scheduler interface {
	Submit(Request)
	WorkerReady(chan Request)
	Run()
}

// Run starts the scheduler and the worker pool, submits the seed requests,
// and then loops forever collecting results and resubmitting new requests.
func (c *ConcurrentEngine) Run(seeds ...Request) {
	out := make(chan ParseResult)
	c.Scheduler.Run()

	for i := 0; i < c.WorkerCount; i++ {
		createWorker(out, c.Scheduler)
	}

	for _, seed := range seeds {
		c.Scheduler.Submit(seed)
	}

	itemCount := 0
	for {
		result := <-out
		for _, item := range result.Items {
			log.Printf("Got item %d: %v", itemCount, item)
			itemCount++
		}

		for _, request := range result.Requests {
			c.Scheduler.Submit(request)
		}
	}

}

// createWorker starts a goroutine that repeatedly tells the scheduler it is
// ready, waits for a request, works on it, and sends the result out.
func createWorker(out chan ParseResult, s Scheduler) {
	in := make(chan Request)
	go func() {
		for {
			// tell the scheduler this worker is ready for the next request
			s.WorkerReady(in)
			request := <-in
			result, err := work(request) // work is not shown in these notes; see the sketch below
			if err != nil {
				continue
			}
			out <- result
		}
	}()
}
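
createWorker calls a work function that is not defined anywhere in these notes. A plausible definition, mirroring the fetch-then-parse step of the single-task Run loop above (an assumption, not verbatim course code; it requires crawler/engine to import muke/15/5/crawler/fetch, as engine.go does):

func work(r Request) (ParseResult, error) {
	// fetch the page, then hand the body to the request's own parser
	body, err := fetch.Fetch(r.URL)
	if err != nil {
		log.Printf("Fetch: error fetching url %s: %v", r.URL, err)
		return ParseResult{}, err
	}
	return r.ParserFunc(body), nil
}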

crawler/zhenai/parser/citylist_test.go

package parser

import (
	"muke/15/5/crawler/fetch"
	"testing"
)

func TestParseCityList(t *testing.T) {
	body, err := fetch.Fetch("http://localhost:8080/mock/www.zhenai.com/zhenghun")
	if err != nil {
		t.Fatal(err)
	}

	result := ParseCityList(body)

	// verify the result against known values from the mock page
	const resultSize = 470
	expectedUrls := []string{
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/aba",
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/akesu",
		"http://localhost:8080/mock/www.zhenai.com/zhenghun/alashanmeng",
	}
	expectedCities := []string{
		"city : 阿坝", "city : 阿克苏", "city : 阿拉善盟",
	}
	if len(result.Requests) != resultSize {
		t.Errorf("result should have %d requests; but had %d", resultSize, len(result.Requests))
	}
	for i, url := range expectedUrls {
		if result.Requests[i].URL != url {
			t.Errorf("expected url #%d: %s; but was %s", i, url, result.Requests[i].URL)
		}
	}
	if len(result.Items) != resultSize {
		t.Errorf("result should have %d items; but had %d", resultSize, len(result.Items))
	}
	for i, city := range expectedCities {
		if result.Items[i].(string) != city {
			t.Errorf("expected city #%d: %s; but was %s", i, city, result.Items[i].(string))
		}
	}
}

crawler/zhenai/parser/citylist.go

package parser

import (
	"muke/15/5/crawler/engine"
	"regexp"
)

// Sample of the target markup:
// <a href="http://localhost:8080/mock/www.zhenai.com/zhenghun/aba" class="">阿坝</a>
// Against the real site the pattern would be:
// const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`
const cityListRe = `<a href="(http://localhost:8080/mock/www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`

// ParseCityList extracts every city link from the city-list page, emitting a
// "city : <name>" item and a follow-up Request handled by ParseCity for each.
func ParseCityList(contents []byte) engine.ParseResult {
	re := regexp.MustCompile(cityListRe)
	all := re.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for _, v := range all {
		// v[1] is the city URL, v[2] is the city name
		result.Items = append(result.Items, "city : "+string(v[2]))
		result.Requests = append(result.Requests, engine.Request{
			URL:        string(v[1]),
			ParserFunc: ParseCity,
		})
	}
	return result
}
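
ParseCityList hands each city page to ParseCity, which is not listed in these notes. A minimal sketch of what it needs to do: find each user link on the city page and bridge ParseProfile's extra name argument into the one-argument ParserFunc with a closure. The file name and the profile-link URL pattern below are assumptions about the mock site:

crawler/zhenai/parser/city.go (sketch)

package parser

import (
	"muke/15/5/crawler/engine"
	"regexp"
)

// Assumed profile-link markup on a city page; adjust to the real pages.
var profileRe = regexp.MustCompile(`<a href="(http://localhost:8080/mock/album.zhenai.com/u/[0-9]+)"[^>]*>([^<]+)</a>`)

// ParseCity extracts user links from a city page and schedules each
// profile page with a parser that remembers the user's name.
func ParseCity(contents []byte) engine.ParseResult {
	all := profileRe.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for _, m := range all {
		name := string(m[2]) // declared per iteration so each closure sees its own copy
		result.Items = append(result.Items, "user : "+name)
		result.Requests = append(result.Requests, engine.Request{
			URL: string(m[1]),
			ParserFunc: func(c []byte) engine.ParseResult {
				return ParseProfile(c, name)
			},
		})
	}
	return result
}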

crawler/zhenai/parser/profile.go

package parser

import (
	"muke/15/5/crawler/engine"
	"muke/15/5/crawler/model"
	"regexp"
	"strconv"
)

var (
	ageRe        = regexp.MustCompile(`<td><span class="label">年龄:</span>(\d+)岁</td>`)
	marriageRe   = regexp.MustCompile(`<td><span class="label">婚况:</span>([^<]+)</td>`)
	hightRe      = regexp.MustCompile(`<td><span class="label">身高:</span>(\d+)CM</td>`)
	weightRe     = regexp.MustCompile(`<td><span class="label">体重:</span><span field="">(\d+)KG</span></td>`)
	genderRe     = regexp.MustCompile(`<td><span class="label">性别:</span><span field="">([^<]+)</span></td>`)
	incomeRe     = regexp.MustCompile(`<td><span class="label">月收入:</span>([^<]+)</td>`)
	educationRe  = regexp.MustCompile(`<td><span class="label">学历:</span>([^<]+)</td>`)
	occupationRe = regexp.MustCompile(`<td><span class="label">职业: </span>([^<]+)</td>`)
	jiguanRe     = regexp.MustCompile(`<td><span class="label">籍贯:</span>([^<]+)</td>`)
	xinzuoRe     = regexp.MustCompile(`<td><span class="label">星座:</span><span field="">([^<]+)</span></td>`)
	houseRe      = regexp.MustCompile(`<td><span class="label">住房条件:</span><span field="">([^<]+)</span></td>`)
	carRe        = regexp.MustCompile(`<td><span class="label">是否购车:</span><span field="">([^<]+)</span></td>`)
)

// ParseProfile extracts one user's structured Profile from a profile page.
// The user's name comes from the link text on the city page, so it is passed
// in rather than parsed out of the contents.
func ParseProfile(contents []byte, name string) engine.ParseResult {
	profile := model.Profile{}
	profile.Name = name

	// numeric fields: leave the field zero when the page has no parsable value
	if age, err := strconv.Atoi(extractString(contents, ageRe)); err == nil {
		profile.Age = age
	}
	if hight, err := strconv.Atoi(extractString(contents, hightRe)); err == nil {
		profile.Hight = hight
	}
	if weight, err := strconv.Atoi(extractString(contents, weightRe)); err == nil {
		profile.Weight = weight
	}

	// string fields
	profile.Marriage = extractString(contents, marriageRe)
	profile.Gender = extractString(contents, genderRe)
	profile.Income = extractString(contents, incomeRe)
	profile.Education = extractString(contents, educationRe)
	profile.Occupation = extractString(contents, occupationRe)
	profile.Jiguan = extractString(contents, jiguanRe)
	profile.Xinzuo = extractString(contents, xinzuoRe)
	profile.House = extractString(contents, houseRe)
	profile.Car = extractString(contents, carRe)

	return engine.ParseResult{
		Items: []interface{}{profile},
	}
}

// extractString returns re's first capture group in contents, or "" when
// there is no match.
func extractString(contents []byte, re *regexp.Regexp) string {
	match := re.FindSubmatch(contents)
	if len(match) >= 2 {
		return string(match[1])
	}
	return ""
}

crawler/zhenai/parser/profile_test.go

package parser

import (
	"io/ioutil"
	"muke/15/5/crawler/model"
	"testing"
)

func TestParseProfile(t *testing.T) {
	contents, err := ioutil.ReadFile("profile_test_data.html")
	if err != nil {
		t.Fatal(err)
	}
	result := ParseProfile(contents, "安静的雪")
	if len(result.Items) != 1 {
		t.Errorf("Items should contain 1 element; but was %v", result.Items)
	}

	profile := result.Items[0].(model.Profile)
	expected := model.Profile{
		Name:       "安静的雪",
		Gender:     "女",
		Age:        34,
		Hight:      162,
		Weight:     57,
		Income:     "3001-5000元",
		Marriage:   "离异",
		Education:  "大学本科",
		Occupation: "人事/行政",
		Jiguan:     "山东菏泽",
		Xinzuo:     "牡羊座",
		House:      "已购房",
		Car:        "未购车",
	}

	if profile != expected {
		t.Errorf("expect %v; but was %v", expected, profile)
	}
}
  • Fetch and print the detailed profiles of the users on the first page of every city
  • CSS selectors, XPath, or regular expressions can all match the target data in the HTML (this project uses regular expressions; a CSS-selector sketch follows this list)
  • The parser abstraction:
    • input: UTF-8 encoded text
    • output: a list of Request{URL, corresponding parser} and a list of Items
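
For comparison with the regex-based ParseCityList, here is what the same extraction could look like with CSS selectors via github.com/PuerkitoBio/goquery (a third-party library not used in this project; the ParseCityListCSS name and the selector are illustrative):

package parser

import (
	"bytes"

	"github.com/PuerkitoBio/goquery"
	"muke/15/5/crawler/engine"
)

// ParseCityListCSS is a hypothetical CSS-selector variant of ParseCityList.
func ParseCityListCSS(contents []byte) (engine.ParseResult, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(contents))
	if err != nil {
		return engine.ParseResult{}, err
	}
	result := engine.ParseResult{}
	// select every anchor whose href contains the city-list path
	doc.Find(`a[href*="/zhenghun/"]`).Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok {
			return
		}
		result.Items = append(result.Items, "city : "+s.Text())
		result.Requests = append(result.Requests, engine.Request{
			URL:        href,
			ParserFunc: ParseCity,
		})
	})
	return result, nil
}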