Crawler Project Introduction
- Has a reasonable amount of complexity
- The project's complexity can be adjusted flexibly
- Balances the emphasis between the Go language and crawling itself
- General-purpose crawlers, e.g. Baidu, Google
- Focused crawlers, which extract structured data from the internet
  - Turn web pages into data
- Go crawler libraries/frameworks
  - henrylee2cn/pholcus
  - gocrawl
  - colly
  - hu17889/go_spider
- This project writes a crawler without using an off-the-shelf crawler library/framework
- Uses Elasticsearch as the data store
- Uses Go's standard template library for the HTTP data-presentation part
Legal Risks of Crawling
- The robots.txt protocol
  - Not technically binding
  - Legally, only a reference
- Judgment tends to be outcome-oriented
- Use common sense
- Keep the QPS (requests sent per second) low; see the rate-limiting sketch below
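One common way to cap QPS in Go is a shared ticker that every fetch must wait on. A minimal sketch; the 100ms interval (roughly 10 QPS) and the placeholder URLs are arbitrary illustrations, not values from this project:

package main

import (
    "fmt"
    "time"
)

func main() {
    // One tick every 100ms caps the crawler at roughly 10 requests/second,
    // no matter how many goroutines share the limiter channel.
    limiter := time.Tick(100 * time.Millisecond)
    urls := []string{"u1", "u2", "u3"} // placeholders for real URLs
    for _, u := range urls {
        <-limiter // block until the next tick before fetching
        fmt.Println("fetching", u)
    }
}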
Choosing a New Crawl Target
- Crawl relatively low-value data from high-traffic sites
  - Finance, sports, news, products, ...
  - e.g. crawling car model data from 爱卡汽车网
- The project is designed so that a new target needs only incremental changes:
  - a new parser and new configuration
Single-Task Crawler
regex/main.go (for testing regular expressions)
package main

import (
    "fmt"
    "regexp"
)

const text = `
My email is ccmouse@gmail.com@123.com
email2 is abc@def.rg
email3 is kkk@qq.com.cn`

func main() {
    re := regexp.MustCompile(`([a-zA-Z0-9]+)@([a-zA-Z0-9]+)(\.[a-zA-Z0-9.]+)`)
    // match := re.FindString(text)
    match := re.FindAllString(text, -1)
    match2 := re.FindAllStringSubmatch(text, -1)
    fmt.Println(match)
    fmt.Println(match2)
    for _, m := range match2 {
        fmt.Println(m)
    }
}
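For reference, with this input the program should print roughly the following. Note that the first match stops at ccmouse@gmail.com because @ is not allowed inside the domain groups, and the last match's third group greedily captures .com.cn:

[ccmouse@gmail.com abc@def.rg kkk@qq.com.cn]
[[ccmouse@gmail.com ccmouse gmail .com] [abc@def.rg abc def .rg] [kkk@qq.com.cn kkk qq .com.cn]]
[ccmouse@gmail.com ccmouse gmail .com]
[abc@def.rg abc def .rg]
[kkk@qq.com.cn kkk qq .com.cn]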
crawler/engine/engine.go
package engine

import (
    "log"

    "muke/15/5/crawler/fetch"
)

// Run drives the single-task crawl loop: it keeps a FIFO queue of requests,
// fetches each URL in turn, and feeds follow-up requests back into the queue.
func Run(seeds ...Request) {
    var requests []Request
    requests = append(requests, seeds...)
    for len(requests) > 0 {
        r := requests[0]
        requests = requests[1:]
        log.Printf("fetching %s", r.URL)
        body, err := fetch.Fetch(r.URL)
        if err != nil {
            log.Printf("Fetch: error fetching url %s: %v", r.URL, err)
            continue
        }
        parseResult := r.ParserFunc(body)
        requests = append(requests, parseResult.Requests...)
        for _, item := range parseResult.Items {
            log.Printf("Got item %v", item)
        }
    }
}
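The entry point is not shown in this section. Below is a minimal sketch of a hypothetical crawler/main.go that drives the single-task engine, assuming the muke/15/5/crawler import path used above and the mock seed URL from the tests. It also illustrates the incremental-change point from earlier: supporting a new site is just a new seed Request with its own ParserFunc.

package main

import (
    "muke/15/5/crawler/engine"
    "muke/15/5/crawler/zhenai/parser"
)

func main() {
    engine.Run(engine.Request{
        URL:        "http://localhost:8080/mock/www.zhenai.com/zhenghun",
        ParserFunc: parser.ParseCityList,
    })
}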
crawler/engine/types.go
package engine

// Request pairs a URL with the parser for that page type.
type Request struct {
    URL        string
    ParserFunc func([]byte) ParseResult
}

// ParseResult is what every parser returns: follow-up requests plus extracted items.
type ParseResult struct {
    Requests []Request
    Items    []interface{}
}

// NilParse is a no-op parser for pages we do not want to follow further.
func NilParse([]byte) ParseResult {
    return ParseResult{}
}
crawler/fetch/fetch.go
package fetch

import (
    "fmt"
    "io/ioutil"
    "net/http"
)

// Fetch performs an HTTP GET and returns the response body.
func Fetch(url string) ([]byte, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
    }
    return ioutil.ReadAll(resp.Body)
}
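Fetch assumes the body is already UTF-8, which holds for the mock server and matches the parser contract stated at the end of this section. Live sites that serve GBK or other encodings would need decoding first. A minimal sketch using golang.org/x/net/html/charset and golang.org/x/text; determineEncoding and utf8Body are illustrative helpers, not part of the file above:

package fetch

import (
    "bufio"
    "io"

    "golang.org/x/net/html/charset"
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// determineEncoding peeks at the first bytes of the body and guesses the
// page encoding; on a short read it falls back to assuming UTF-8.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
    head, err := r.Peek(1024)
    if err != nil {
        return unicode.UTF8
    }
    e, _, _ := charset.DetermineEncoding(head, "")
    return e
}

// utf8Body wraps an HTTP body so reads come out as UTF-8.
func utf8Body(body io.Reader) io.Reader {
    r := bufio.NewReader(body)
    return transform.NewReader(r, determineEncoding(r).NewDecoder())
}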
crawler/model/profile.go
package model

// Profile is one user profile extracted from a zhenai.com detail page.
type Profile struct {
    Name       string
    Gender     string
    Age        int
    Height     int
    Weight     int
    Income     string
    Marriage   string
    Education  string
    Occupation string
    Jiguan     string // 籍贯: native place
    Xinzuo     string // 星座: zodiac sign
    House      string
    Car        string
}
crawler/engine/concurrent.go
package engine

import (
    "log"
)

// ConcurrentEngine runs WorkerCount workers that receive requests from a
// Scheduler and send parse results back on a shared channel.
type ConcurrentEngine struct {
    Scheduler   Scheduler
    WorkerCount int
}

// Scheduler decides how submitted requests are distributed to ready workers.
type Scheduler interface {
    Submit(Request)
    WorkerReady(chan Request)
    Run()
}

// Run starts the scheduler and the workers, submits the seeds, then loops
// forever collecting results and resubmitting follow-up requests.
func (c *ConcurrentEngine) Run(seeds ...Request) {
    out := make(chan ParseResult)
    c.Scheduler.Run()
    for i := 0; i < c.WorkerCount; i++ {
        createWorker(out, c.Scheduler)
    }
    for _, seed := range seeds {
        c.Scheduler.Submit(seed)
    }
    itemCount := 0
    for {
        result := <-out
        for _, item := range result.Items {
            log.Printf("Got item %d: %v", itemCount, item)
            itemCount++
        }
        for _, request := range result.Requests {
            c.Scheduler.Submit(request)
        }
    }
}

func createWorker(out chan ParseResult, s Scheduler) {
    in := make(chan Request)
    go func() {
        for {
            // Tell the scheduler this worker is ready for another request.
            s.WorkerReady(in)
            request := <-in
            result, err := work(request)
            if err != nil {
                continue // drop failed requests
            }
            out <- result
        }
    }()
}
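Two things referenced above are not defined in this section: the work function called by each worker, and a concrete Scheduler. Below is a minimal sketch of both, consistent with the interfaces above. The QueuedScheduler name and its channel-of-channels design are assumptions suggested by the WorkerReady(chan Request) method, not code reproduced from the project:

package engine

import "muke/15/5/crawler/fetch"

// work fetches one request and runs its parser; the concurrent engine
// simply drops the request if fetching fails.
func work(r Request) (ParseResult, error) {
    body, err := fetch.Fetch(r.URL)
    if err != nil {
        return ParseResult{}, err
    }
    return r.ParserFunc(body), nil
}

// QueuedScheduler queues both pending requests and idle workers, matching
// them up in a single goroutine so Submit never blocks the engine loop.
type QueuedScheduler struct {
    requestChan chan Request
    workerChan  chan chan Request
}

func (s *QueuedScheduler) Submit(r Request)           { s.requestChan <- r }
func (s *QueuedScheduler) WorkerReady(w chan Request) { s.workerChan <- w }

func (s *QueuedScheduler) Run() {
    s.requestChan = make(chan Request)
    s.workerChan = make(chan chan Request)
    go func() {
        var requestQ []Request
        var workerQ []chan Request
        for {
            var activeRequest Request
            var activeWorker chan Request
            if len(requestQ) > 0 && len(workerQ) > 0 {
                activeRequest = requestQ[0]
                activeWorker = workerQ[0]
            }
            // Sending on a nil activeWorker blocks forever, so that case is
            // disabled until both a request and a worker are queued.
            select {
            case r := <-s.requestChan:
                requestQ = append(requestQ, r)
            case w := <-s.workerChan:
                workerQ = append(workerQ, w)
            case activeWorker <- activeRequest:
                requestQ = requestQ[1:]
                workerQ = workerQ[1:]
            }
        }
    }()
}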
crawler/zhenai/parser/citylist_test.go
package parser

import (
    "testing"

    "muke/15/5/crawler/fetch"
)

func TestParseCityList(t *testing.T) {
    body, err := fetch.Fetch("http://localhost:8080/mock/www.zhenai.com/zhenghun")
    if err != nil {
        t.Fatal(err)
    }
    result := ParseCityList(body)

    const resultSize = 470
    expectedUrls := []string{
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/aba",
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/akesu",
        "http://localhost:8080/mock/www.zhenai.com/zhenghun/alashanmeng",
    }
    expectedCities := []string{
        "city : 阿坝", "city : 阿克苏", "city : 阿拉善盟",
    }

    if len(result.Requests) != resultSize {
        t.Errorf("result should have %d requests; but had %d", resultSize, len(result.Requests))
    }
    for i, url := range expectedUrls {
        if result.Requests[i].URL != url {
            t.Errorf("expected url #%d: %s; but was %s", i, url, result.Requests[i].URL)
        }
    }
    if len(result.Items) != resultSize {
        t.Errorf("result should have %d items; but had %d", resultSize, len(result.Items))
    }
    for i, city := range expectedCities {
        if result.Items[i].(string) != city {
            t.Errorf("expected city #%d: %s; but was %s", i, city, result.Items[i].(string))
        }
    }
}
crawler/zhenai/parser/citylist.go
package parser

import (
    "regexp"

    "muke/15/5/crawler/engine"
)

// Matches city links such as:
// <a href="http://localhost:8080/mock/www.zhenai.com/zhenghun/aba" class="">阿坝</a>
// Against the live site the pattern would be:
// const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`
const cityListRe = `<a href="(http://localhost:8080/mock/www.zhenai.com/zhenghun/[a-z0-9]*)"[^>]*>([^<]*)</a>`

// ParseCityList emits one item and one Request per city link; each Request
// will be parsed by ParseCity.
func ParseCityList(contents []byte) engine.ParseResult {
    re := regexp.MustCompile(cityListRe)
    all := re.FindAllSubmatch(contents, -1)
    result := engine.ParseResult{}
    for _, m := range all {
        result.Items = append(result.Items, "city : "+string(m[2]))
        result.Requests = append(result.Requests, engine.Request{
            URL:        string(m[1]),
            ParserFunc: ParseCity,
        })
    }
    return result
}
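ParseCity, which ParseCityList installs as the parser for each city page, is not shown in this section. A minimal sketch consistent with its use here; the profile-link regex is an assumption modeled on the live site's album.zhenai.com URLs and would need the same localhost mock rewrite as cityListRe:

package parser

import (
    "regexp"

    "muke/15/5/crawler/engine"
)

// profileRe is a hypothetical pattern for user-profile links on a city page.
var profileRe = regexp.MustCompile(`<a href="(http://album.zhenai.com/u/[0-9]+)"[^>]*>([^<]+)</a>`)

// ParseCity emits one item and one profile Request per user link.
func ParseCity(contents []byte) engine.ParseResult {
    matches := profileRe.FindAllSubmatch(contents, -1)
    result := engine.ParseResult{}
    for _, m := range matches {
        name := string(m[2]) // copy so each closure captures its own name
        result.Items = append(result.Items, "User "+name)
        result.Requests = append(result.Requests, engine.Request{
            URL: string(m[1]),
            ParserFunc: func(c []byte) engine.ParseResult {
                return ParseProfile(c, name)
            },
        })
    }
    return result
}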
crawler/zhenai/parser/profile.go
package parser

import (
    "regexp"
    "strconv"

    "muke/15/5/crawler/engine"
    "muke/15/5/crawler/model"
)

var (
    ageRe        = regexp.MustCompile(`<td><span class="label">年龄:</span>(\d+)岁</td>`)
    marriageRe   = regexp.MustCompile(`<td><span class="label">婚况:</span>([^<]+)</td>`)
    heightRe     = regexp.MustCompile(`<td><span class="label">身高:</span>(\d+)CM</td>`)
    weightRe     = regexp.MustCompile(`<td><span class="label">体重:</span><span field="">(\d+)KG</span></td>`)
    genderRe     = regexp.MustCompile(`<td><span class="label">性别:</span><span field="">([^<]+)</span></td>`)
    incomeRe     = regexp.MustCompile(`<td><span class="label">月收入:</span>([^<]+)</td>`)
    educationRe  = regexp.MustCompile(`<td><span class="label">学历:</span>([^<]+)</td>`)
    occupationRe = regexp.MustCompile(`<td><span class="label">职业: </span>([^<]+)</td>`)
    jiguanRe     = regexp.MustCompile(`<td><span class="label">籍贯:</span>([^<]+)</td>`)
    xinzuoRe     = regexp.MustCompile(`<td><span class="label">星座:</span><span field="">([^<]+)</span></td>`)
    houseRe      = regexp.MustCompile(`<td><span class="label">住房条件:</span><span field="">([^<]+)</span></td>`)
    carRe        = regexp.MustCompile(`<td><span class="label">是否购车:</span><span field="">([^<]+)</span></td>`)
)

// ParseProfile extracts a model.Profile from a user detail page. The name is
// passed in by the caller because it appears on the city page, not here.
func ParseProfile(contents []byte, name string) engine.ParseResult {
    profile := model.Profile{Name: name}

    // Numeric fields: only set when the matched text parses cleanly.
    if age, err := strconv.Atoi(extractString(contents, ageRe)); err == nil {
        profile.Age = age
    }
    if height, err := strconv.Atoi(extractString(contents, heightRe)); err == nil {
        profile.Height = height
    }
    if weight, err := strconv.Atoi(extractString(contents, weightRe)); err == nil {
        profile.Weight = weight
    }

    // String fields.
    profile.Marriage = extractString(contents, marriageRe)
    profile.Gender = extractString(contents, genderRe)
    profile.Income = extractString(contents, incomeRe)
    profile.Education = extractString(contents, educationRe)
    profile.Occupation = extractString(contents, occupationRe)
    profile.Jiguan = extractString(contents, jiguanRe)
    profile.Xinzuo = extractString(contents, xinzuoRe)
    profile.House = extractString(contents, houseRe)
    profile.Car = extractString(contents, carRe)

    return engine.ParseResult{
        Items: []interface{}{profile},
    }
}

// extractString returns the first capture group of re in contents, or "".
func extractString(contents []byte, re *regexp.Regexp) string {
    match := re.FindSubmatch(contents)
    if len(match) >= 2 {
        return string(match[1])
    }
    return ""
}
crawler/zhenai/parser/profile_test.go
package parser

import (
    "io/ioutil"
    "testing"

    "muke/15/5/crawler/model"
)

func TestParseProfile(t *testing.T) {
    contents, err := ioutil.ReadFile("profile_test_data.html")
    if err != nil {
        t.Fatal(err)
    }
    result := ParseProfile(contents, "安静的雪")
    if len(result.Items) != 1 {
        t.Errorf("Items should contain 1 element; but was %v", result.Items)
    }
    profile := result.Items[0].(model.Profile)
    expected := model.Profile{
        Name:       "安静的雪",
        Gender:     "女",
        Age:        34,
        Height:     162,
        Weight:     57,
        Income:     "3001-5000元",
        Marriage:   "离异",
        Education:  "大学本科",
        Occupation: "人事/行政",
        Jiguan:     "山东菏泽",
        Xinzuo:     "牡羊座",
        House:      "已购房",
        Car:        "未购车",
    }
    if profile != expected {
        t.Errorf("expected %v; but was %v", expected, profile)
    }
}
- Fetch and print the detailed profiles of all users on the first page of every city
- The corresponding data in the HTML can be matched with CSS selectors, XPath, or regular expressions
- The parser
  - Input: UTF-8 encoded text
  - Output: a list of Request{URL, corresponding Parser} and a list of Items