天涯爬虫php,天涯爬虫

package main

import (

"fmt"

"io"

"net/http"

"os"

"strconv"

"strings"

"time"

"github.com/PuerkitoBio/goquery"

"github.com/sirupsen/logrus"

)

var log = logrus.StandardLogger()

func init() {

initLog()

}

func main() {

start()

}

func start() {

var fileName string

var baseUrl string

var duration int32

prePrintInfo()

fmt.Println("请输入生成文件名称并回车:")

fmt.Scanln(&fileName)

fmt.Println("请输入帖子地址并回车:")

fmt.Scanln(&baseUrl)

fmt.Println("每页爬取间隔时间并回车")

fmt.Scanln(&duration)

run(fileName, baseUrl, duration)

}

func run(fileName string, url string, duration int32) {

fi := createFile(fileName)

defer func() {

if err := fi.Close(); err != nil {

panic(err)

}

}()

baseUrl, totalPage := parsingTopicInfo(url)

log.Infoln("总计:", totalPage, "页")

for i := 1; i <= totalPage; i++ {

log.Println("开始解析:第", i, "页")

parsing(baseUrl, i, fi)

time.Sleep(time.Duration(duration) * time.Millisecond)

log.Println("完成解析:第", i, "页")

}

}

//解析网页

func parsing(baseUrl string, pageNumber int, file *os.File) {

var text strings.Builder

var url = baseUrl + strconv.Itoa(pageNumber) + ".shtml"

createPageHeader(pageNumber, url, &text)

doc := httpGetUtil(url)

doc.Find(".atl-item").Each(func(i int, selection *goquery.Selection) {

doParsing(selection, &text)

})

file.WriteString(text.String())

}

func doParsing(selection *goquery.Selection, text *strings.Builder) {

parsingFloor(selection, text)

parsingAuthorAndReplayTime(selection, text)

parsingContent(selection, text)

}

//解析url前缀,总页码

func parsingTopicInfo(url string) (string, int) {

doc := httpGetUtil(url)

var totalPage string

var baseUrl string

doc.Find(".atl-pages").Each(func(i int, selection *goquery.Selection) {

if i == 0 {

selection.Find("a")

size := selection.Find("a").Size()

selection.Find("a").Each(func(i int, selection *goquery.Selection) {

if i == size-2 {

totalPage = selection.Text()

baseUrl, _ = selection.Attr("href")

}

})

}

})

baseUrl = strings.Replace(baseUrl, ".shtml", "", -1)

baseUrl = strings.Replace(baseUrl, totalPage, "", -1)

baseUrl = "http://bbs.tianya.cn" + baseUrl

count, _ := strconv.Atoi(totalPage)

return baseUrl, count

}

//生成每页信息

func createPageHeader(pageNumber int, url string, text *strings.Builder) {

text.WriteString("\r\n\r\n")

text.WriteString("第")

text.WriteString(strconv.Itoa(pageNumber))

text.WriteString("页 链接:")

text.WriteString(url)

text.WriteString("\r\n\r\n")

}

//解析回帖作者,时间

func parsingAuthorAndReplayTime(selection *goquery.Selection, text *strings.Builder) {

selection.Find(".atl-head").Find(".atl-info").Find("span").Each(func(i int, selection *goquery.Selection) {

if i == 0 {

str, _ := selection.Find("a").Attr("uname")

str = "作者:" + str

text.WriteString(str)

text.WriteString(" ")

} else if i == 1 {

str, _ := selection.Html()

text.WriteString(str)

text.WriteString("\r\n")

}

})

}

//解析楼层

func parsingFloor(selection *goquery.Selection, text *strings.Builder) {

selection.Find(".atl-reply").Find("span").Each(func(i int, selection *goquery.Selection) {

if i == 0 {

text.WriteString(selection.Text())

text.WriteString(" ")

}

})

}

//解析回帖内容

func parsingContent(selection *goquery.Selection, text *strings.Builder) {

replacer := strings.NewReplacer("\n", "", " ", "", " ", "")

content := selection.Find(".atl-con-bd").Find(".bbs-content").Text()

contentStr := replacer.Replace(content)

text.WriteString(contentStr)

text.WriteString("\r\n\r\n")

}

//http请求

func httpGetUtil(url string) (*goquery.Document) {

res, err := http.Get(url)

if err != nil {

log.Fatal("请求地址发生错误:", err, ",请检查地址是否正确")

}

defer res.Body.Close()

if res.StatusCode != 200 {

log.Fatalf("请求地址发生错误:%d %s", res.StatusCode, res.Status, "请检查地址是否正确")

}

doc, err := goquery.NewDocumentFromReader(res.Body)

if err != nil {

log.Fatalf(err.Error())

}

return doc

}

func createFile(fileName string) *os.File {

fi, err := os.Create(fileName + ".txt")

if err != nil {

log.Fatal(err)

}

fi.WriteString("============================")

line(fi, 1)

fi.WriteString("创建时间:" + time.Now().Format("2006-01-02 15:04:05"))

line(fi, 1)

fi.WriteString("作者:ananan")

line(fi, 1)

fi.WriteString("代码地址:github.com/Ouyangan/tianya-bbs-crawler")

line(fi, 1)

fi.WriteString("联系方式:981017952@qq.com")

line(fi, 1)

fi.WriteString("============================")

line(fi, 1)

return fi

}

//换行

func line(fi *os.File, count int) {

for i := 0; i < count; i ++ {

fi.WriteString("\r\n")

}

}

//log初始化

func initLog() {

log.Level = logrus.DebugLevel

log.Formatter = &logrus.TextFormatter{

TimestampFormat: "2006-01-02 15:04:05",

ForceColors: false,

FullTimestamp: true,

}

file, err := os.Create("tianya-bbs-crawler.log")

if err != nil {

log.Warn("创建日志异常,", err.Error())

log.Out = os.Stdout

} else {

mw := io.MultiWriter(file, os.Stdout)

log.Out = mw

}

}

//打印简介

func prePrintInfo() {

fmt.Println("作者:ananan")

fmt.Println("代码地址:github.com/Ouyangan/tianya-bbs-crawler")

fmt.Println("联系方式:981017952@qq.com")

fmt.Println("===========使用介绍===========")

fmt.Println("请依次输入三个参数:")

fmt.Println("1.生成文件名称")

fmt.Println("2.帖子地址")

fmt.Println("3.每页间隔爬取时间(毫秒) 推荐1000毫秒以上,防止被封:")

fmt.Println("输入完成后将会在同级目录下生成txt文件,请等待任务完成")

fmt.Println()

fmt.Println("参考示例:")

fmt.Println()

fmt.Println("天涯")

fmt.Println("http://bbs.tianya.cn/post-house-252774-1.shtml")

fmt.Println("1000")

fmt.Println("=============================")

fmt.Println()

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值