服务器端简单例子:
package main
import (
"fmt"
"io"
"log"
"net/http"
)
// hello world, the web server
// w: 给客户端回复数据, req: 读取客户端发送的数据
// HelloServer handles /hello.
// w writes the reply back to the client; req carries what the client sent.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	// Dump the interesting parts of the incoming request for inspection.
	for _, part := range []interface{}{req.Method, req.Header, req.Body, req.URL} {
		fmt.Println(part)
	}
	// Reply to the client.
	io.WriteString(w, "hello, world!\n")
	w.Write([]byte("lisa"))
}
// Entry point of the demo HTTP server.
func main() {
	// Register the handler; it is invoked automatically for every
	// client request to /hello.
	http.HandleFunc("/hello", HelloServer)
	// Bind and listen; ListenAndServe only returns on failure.
	if err := http.ListenAndServe(":12345", nil); err != nil {
		log.Fatal("ListenAndServe: ", err)
	}
}
客户端简单例子:
package main
import (
	"fmt"
	"io"
	"net/http"
)
// Entry point of the demo HTTP client: fetch a page and print its
// status, headers, and full body.
func main() {
	respon, err := http.Get("http://www.baidu.com")
	if err != nil {
		fmt.Println("http.get:", err)
		return
	}
	// Release the underlying connection when done.
	defer respon.Body.Close()

	fmt.Println("status:", respon.Status)
	fmt.Println("header:", respon.Header)
	fmt.Println("StatusCode:", respon.StatusCode)

	// Body is a stream and must be read; printing the reader value
	// itself (the old commented-out line) only shows the struct.
	var tmp string
	buf := make([]byte, 1024*4)
	for {
		n, rErr := respon.Body.Read(buf)
		// Read may return n > 0 together with an error; keep the data
		// before deciding what the error means.
		if n > 0 {
			tmp += string(buf[:n])
		}
		if rErr == io.EOF { // normal end of body — not an error
			break
		}
		if rErr != nil {
			// BUG FIX: the original treated every n == 0 return —
			// including plain EOF — as an error and printed it.
			fmt.Println("Body.Read err:", rErr)
			break
		}
	}
	fmt.Println("body:", tmp)
}
贴吧单线程爬取例子:
package main
import (
"fmt"
"net/http"
"os"
"strconv"
)
// Entry point: ask the user for an inclusive page range, then crawl it.
func main() {
	start, end := 0, 0
	fmt.Println("请输入起始页(>=1):")
	fmt.Scan(&start)
	fmt.Println("请输入终止页(>=1):")
	fmt.Scan(&end)
	DoWork(start, end)
}
// DoWork crawls tieba pages start..end (inclusive) one by one and
// saves each page's HTML to "<page>.html" in the current directory.
// Failures on one page are reported and the loop moves on.
func DoWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d 页面\n", start, end)
	for i := start; i <= end; i++ {
		// Each tieba listing page advances pn by 50.
		url := "http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		fmt.Println(url)
		resp, err := SpiderPage(url)
		if err != nil {
			fmt.Println("SpiderPage:", err)
			continue // keep going with the remaining pages
		}
		// Persist the page content; Create truncates any existing file.
		fileName := strconv.Itoa(i) + ".html"
		f, fErr := os.Create(fileName)
		if fErr != nil {
			fmt.Println("os.Create:", fErr)
			continue // keep going with the remaining pages
		}
		// BUG FIX: the original ignored the write error, so a failed
		// write (e.g. full disk) went completely unnoticed.
		if _, wErr := f.WriteString(resp); wErr != nil {
			fmt.Println("WriteString:", wErr)
		}
		f.Close()
	}
}
// 爬取
// SpiderPage downloads url and returns the whole response body as a
// string; err is non-nil when the HTTP request itself fails.
func SpiderPage(url string) (resp string, err error) {
	resp1, err1 := http.Get(url)
	if err1 != nil {
		// BUG FIX: the original wrote `err = err` (a self-assignment
		// no-op), so HTTP failures were returned as success with an
		// empty body.
		err = err1
		return
	}
	defer resp1.Body.Close()

	// Read the body in 4 KiB chunks; Read reports n == 0 at EOF or on
	// a read failure, which ends the loop either way.
	buf := make([]byte, 1024*4)
	for {
		n, bErr := resp1.Body.Read(buf)
		if n == 0 { // end of stream or a read problem
			fmt.Println("resp.Body.Read:", bErr)
			break
		}
		resp += string(buf[:n])
	}
	return
}
贴吧多线程爬虫简单例子:
package main
import (
"fmt"
"net/http"
"os"
"strconv"
)
// page is an unbuffered completion channel: each SpiderPage goroutine
// sends its page number here when it finishes, and DoWork receives once
// per page so main cannot exit before all crawls are done.
var page = make(chan int)
// Entry point: ask the user for an inclusive page range, then crawl it.
func main() {
	start, end := 0, 0
	fmt.Println("请输入起始页(>=1):")
	fmt.Scan(&start)
	fmt.Println("请输入终止页(>=1):")
	fmt.Scan(&end)
	DoWork(start, end)
}
func DoWork(start, end int) {
//提示信息
fmt.Printf("正在爬取 %d 到 %d 页面\n", start, end)
for i := start; i <= end; i++ {
// 封装函数, 开协成
go SpiderPage(i)
// 管道阻塞
// fmt.Printf("第%d页爬完了\n",
}
// 记住一定要单独接收管道消息!!!
for i := start; i <= end; i++ {
//管道阻塞
fmt.Printf("第%d页爬完了\n",
}
}
func SpiderPage(i int) {
url := "http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
// 爬取内容
fmt.Println(url)
resp, err := GetUrlContent(url)
if err != nil {
fmt.Println("SpiderPage:", err)
return
}
// 获取到内容然后写入文件
fileName := strconv.Itoa(i) + ".html"
// 新建文件(会覆盖)
f, fErr := os.Create(fileName)
if fErr != nil {
fmt.Println("os.Create:", fErr)
return
}
// 写入信息
f.WriteString(resp)
f.Close()
// 爬完一个就给管道传递数据
page
}
// 爬取
// GetUrlContent downloads url and returns the whole response body as a
// string; err is non-nil when the HTTP request itself fails.
func GetUrlContent(url string) (resp string, err error) {
	resp1, err1 := http.Get(url)
	if err1 != nil {
		// BUG FIX: the original wrote `err = err` (a self-assignment
		// no-op), so HTTP failures were returned as success with an
		// empty body.
		err = err1
		return
	}
	defer resp1.Body.Close()

	// Read the body in 4 KiB chunks; Read reports n == 0 at EOF or on
	// a read failure, which ends the loop either way.
	buf := make([]byte, 1024*4)
	for {
		n, bErr := resp1.Body.Read(buf)
		if n == 0 { // end of stream or a read problem
			fmt.Println("resp.Body.Read:", bErr)
			break
		}
		resp += string(buf[:n])
	}
	return
}
多线程爬取多个页面例子:
package main
import (
"fmt"
"net/http"
"os"
"strings"
//"os"
"regexp"
"strconv"
)
// page is an unbuffered completion channel: each SpiderPage goroutine
// sends its page number here when it finishes, and DoWork receives once
// per page so main cannot exit before all crawls are done.
var page = make(chan int)
// Entry point: ask the user for an inclusive page range, then hand the
// range to the worker function.
func main() {
	start, end := 0, 0
	fmt.Println("请输入起始页(>=1):")
	fmt.Scan(&start)
	fmt.Println("请输入终止页(>=1):")
	fmt.Scan(&end)
	DoWork(start, end) // crawl the whole range
}
func DoWork(start, end int) {
//提示信息
fmt.Printf("正在爬取 %d 到 %d 页面\n", start, end)
for i := start; i <= end; i++ {
// 定义一个函数,爬取主页面
go SpiderPage(i)
}
// 记住一定要单独接收管道消息!!!
for i := start; i <= end; i++ {
//管道阻塞
fmt.Printf("第%d页爬完了\n",
}
}
func SpiderPage(i int) {
url := "https://www.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
// 爬取内容
fmt.Println(url)
resp, err := GetUrlContent(url)
if err != nil {
fmt.Println("GetUrlContent:", err)
return
}
// 正则表达式取内容
re := regexp.MustCompile(`
fmt.Println("regexp.MustCompile err")
joyUrls := re.FindAllStringSubmatch(resp, -1) //-1代表所有
fileTitle := make([]string, 0)
fileContent := make([]string, 0)
for _, data := range joyUrls {
title, content, err := SpiderOneJoy(data[1])
fmt.Println("SpiderOneJoy err", err)
// fmt.Printf("title=#%v#", title)
// fmt.Printf("content=#%v#", content)
fileTitle = append(fileTitle, title)
fileContent = append(fileContent, content)
StoreJoyToFile(i, fileTitle, fileContent)
// StoreJoyToFile writes the collected titles and contents for listing
// page i into "<i>.txt", one joke per delimiter-separated block.
// NOTE(review): the source for this block was garbled by extraction;
// it appeared to index the slices with the page number i, which is
// clearly wrong — the obvious intent is to iterate the pairs.
func StoreJoyToFile(i int, fileTitle, fileContent []string) {
	f, err := os.Create(strconv.Itoa(i) + ".txt")
	if err != nil {
		fmt.Println("os.create err", err)
		return
	}
	defer f.Close()
	// The two slices are filled in lockstep by the caller; guard
	// against a length mismatch anyway.
	n := len(fileTitle)
	if len(fileContent) < n {
		n = len(fileContent)
	}
	for k := 0; k < n; k++ {
		f.WriteString(fileTitle[k] + "\n")
		f.WriteString(fileContent[k] + "\n")
		f.WriteString("\n===============\n")
	}
}
func SpiderOneJoy(url string) (title string, content string, err error) {
resp1, err1 := GetUrlContent(url)
(?s:(.*?))
`)err = fmt.Errorf("%s", "regexp.MustCompile title err")
tmpTitle := re1.FindAllStringSubmatch(resp1, 1) //1过滤一个
for _, data := range tmpTitle {
//title = strings.Replace(title, "\n", "", -1)
//title = strings.Replace(title, "\r", "", -1)
//title = strings.Replace(title, " ", "", -1)
title = strings.Replace(title, "\t", "", -1)