由于 Go 对爬虫不是非常友好,这里抓取部分仍使用比较熟悉的 Python 爬虫。
首先go使用了amqp这个库:
package main
import (
"encoding/json"
"fmt"
"github.com/streadway/amqp"
)
// One is a single scraped link: a display name and its URL.
// The json tags match the keys emitted by the Python scraper
// ({"name": ..., "href": ...}).
type One struct {
Name string `json:"name"`
Href string `json:"href"`
}
// main publishes a "DoSpider" trigger message to the "go" queue, then
// blocks until one JSON reply arrives on the "python" queue, decodes it
// into []One and prints it.
func main() {
	conn, err := amqp.Dial("amqp://root:root@localhost:5672/admin") // connect to the broker
	if err != nil {
		panic(err)
	}
	defer conn.Close() // release the TCP connection when main returns

	ch, err := conn.Channel() // open a channel on the connection
	if err != nil {
		panic(err)
	}
	defer ch.Close()

	q, err := ch.QueueDeclare( // declare the queue the Python side consumes from
		"go",
		false, // durable
		false, // auto-delete
		false, // exclusive
		false, // no-wait
		nil,   // arguments
	)
	if err != nil {
		panic(err)
	}

	err = ch.Publish( // tell the Python side to start the spider
		"",     // default exchange
		q.Name, // routing key = queue name
		false,  // mandatory
		false,  // immediate
		amqp.Publishing{
			ContentType: "text/plain", // fixed: was the typo "text/plian"
			Body:        []byte("DoSpider"),
		},
	)
	if err != nil {
		panic(err)
	}
	fmt.Println("Send to python!")

	q1, err := ch.QueueDeclare( // declare the queue the Python side replies on
		"python",
		false,
		false,
		false,
		false,
		nil,
	)
	if err != nil { // previously discarded with _
		panic(err)
	}

	msgs, err := ch.Consume( // receive messages coming back from Python
		q1.Name,
		"",    // consumer tag (server-generated)
		true,  // auto-ack
		false, // exclusive
		false, // no-local
		false, // no-wait
		nil,   // arguments
	)
	if err != nil { // previously discarded with _
		panic(err)
	}

	done := make(chan struct{})
	go func() {
		for d := range msgs {
			var ones []One
			// Body is a JSON array produced by the Python scraper;
			// check the decode error BEFORE printing the result.
			if err := json.Unmarshal(d.Body, &ones); err != nil {
				panic(err)
			}
			fmt.Println(ones)
			done <- struct{}{} // unblock main after the first delivery
		}
	}()
	<-done
}
python端使用了pika:
import pika
from lxml import etree
import requests
import json
url = "http://www.quanben.co/sort/9_1.html"
def ss():
    """Scrape the book listing page and return the results as a JSON string.

    Returns:
        str: a JSON array of objects like {"name": ..., "href": ...}.
        RabbitMQ stores message bodies as bytes, so the caller publishes
        this serialized JSON rather than a Python list.
    """
    resp = requests.get(url)
    # Fail loudly on HTTP errors instead of parsing an error page.
    resp.raise_for_status()
    html = etree.HTML(resp.content)
    links = html.xpath("//div[@id='content']/div[@class='Sum Search_line']/ul/h2/a")
    items = []
    for link in links:
        name = link.xpath("./text()")[0]
        href = link.xpath("./@href")[0]
        # Plain dict literal; the original shadowed the built-in `dict`.
        items.append({"name": name, "href": href})
    return json.dumps(items)
def consume(ch):
    """Block until one trigger message arrives on the 'go' queue.

    Declares the queue (idempotent), consumes with auto-ack, prints the
    first message received, then stops consuming so the caller can proceed.

    Args:
        ch: an open pika blocking channel.
    """
    ch.queue_declare(queue='go')

    def callback(channel, method, properties, body):
        # `channel` is the same channel object as the outer `ch`.
        print(" Received %r" % body)
        channel.stop_consuming()

    # pika >= 1.0 API: the queue is passed first / by keyword, the handler
    # via on_message_callback, and `no_ack` was renamed to `auto_ack`.
    # The original positional form basic_consume(callback, queue=..., no_ack=...)
    # only works on pika 0.x and raises TypeError on current releases.
    ch.basic_consume(queue='go', on_message_callback=callback, auto_ack=True)
    ch.start_consuming()
def produce(ch, lists):
    """Publish the scraped JSON payload to the 'python' queue.

    Args:
        ch: an open pika blocking channel.
        lists: the JSON string produced by ss().
    """
    # Declaring the queue is idempotent; it guarantees the target exists.
    ch.queue_declare(queue='python')
    # Default exchange routes directly to the queue named by routing_key.
    ch.basic_publish(exchange='', routing_key='python', body=lists)
if __name__ == '__main__':
    # Connect to RabbitMQ on the 'admin' virtual host with root credentials.
    credent = pika.PlainCredentials("root", "root")
    connect = pika.BlockingConnection(
        pika.ConnectionParameters("localhost", 5672, 'admin', credent))
    try:
        ch = connect.channel()
        consume(ch)          # wait for the Go side's "DoSpider" trigger
        lists = ss()         # scrape the site and serialize to JSON
        print(lists)
        produce(ch, lists)   # send the JSON back to the Go side
    finally:
        # Close the connection even if scraping or publishing raises;
        # the original leaked it on any intermediate exception.
        connect.close()