Golang爬虫

最新推荐文章于 2022-11-25 23:49:12 发布

转音视频的老王

最新推荐文章于 2022-11-25 23:49:12 发布

阅读量411

点赞数 2

分类专栏： Golang

本文链接：https://blog.csdn.net/naiwenw/article/details/79281130

版权

Golang 专栏收录该内容

4 篇文章 1 订阅

订阅专栏

### Golang爬虫库

goquery
### goquery安装
在cmd中执行下面的go命令：

 go get github.com/PuerkitoBio/goquery

### 内涵段子

// GetJokes3 loads http://neihanshequ.com/ and, for every element matching
// ".site-width .options", prints the data-text, data-url and data-pic
// attributes of its ".share-wrapper" descendant followed by the text of
// ".digg-wrapper .digg".
//
// If the page cannot be loaded it prints "document error" and returns
// without printing anything else.
func GetJokes3() {
	q, err := goquery.NewDocument("http://neihanshequ.com/")
	if err != nil {
		fmt.Println("document error")
		// Without a document there is nothing to query; continuing would
		// dereference an unusable q.
		return
	}
	q.Find(".site-width .options").Each(func(i int, selection *goquery.Selection) {
		// All three share attributes are read from the same node, so look it
		// up once.
		share := selection.Find(".share-wrapper")

		// The "exists" flag is deliberately ignored: a missing attribute is
		// printed as an empty string.
		text, _ := share.Attr("data-text")
		u, _ := share.Attr("data-url")
		p, _ := share.Attr("data-pic")
		d := selection.Find(".digg-wrapper .digg").Text()

		fmt.Println("开始了，结果是", text)
		fmt.Println("开始了，结果是", u)
		fmt.Println("开始了，结果是", p)
		fmt.Println("开始了，结果是", d)
	})
}

### 中文幽默王

// GetJokes5 visits http://www.haha365.com/joke/index_0.htm through
// index_9.htm and prints the text of every "#endtext" element on each page.
//
// A page that cannot be downloaded or parsed is reported on stdout and
// skipped; the remaining pages are still processed.
func GetJokes5() {
	for i := 0; i < 10; i++ {
		u := fmt.Sprintf("http://www.haha365.com/joke/index_%d.htm", i)
		fmt.Println("地址是：", u)
		if err := printJokesPage(u); err != nil {
			fmt.Println("抓取失败：", u, err)
		}
	}
}

// printJokesPage downloads the page at u, converts it from GBK to UTF-8,
// parses it with goquery and prints the text of every "#endtext" element.
// It returns the first error encountered while fetching, reading or parsing.
func printJokesPage(u string) error {
	resp, err := http.Get(u)
	if err != nil {
		return err
	}
	// Deferred inside this helper so the body is closed once per page rather
	// than piling up until the whole crawl finishes.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %s", resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	// Per the original author's note the site is GBK-encoded while goquery
	// works on UTF-8, so decode the bytes from GBK first.
	str := mahonia.NewDecoder("gbk").ConvertString(string(body))

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(str))
	if err != nil {
		return err
	}

	doc.Find("#endtext").Each(func(_ int, selection *goquery.Selection) {
		// Unescape the extracted text rather than the raw markup: unescaping
		// the whole document before parsing could turn escaped text such as
		// "&lt;" into real tags and corrupt the structure. Doubly-escaped
		// entities (e.g. "&amp;ldquo;") are still resolved here.
		fmt.Println(html.UnescapeString(selection.Text()))
	})
	return nil
}

### 糗事百科

// GetJokes2 loads https://www.qiushibaike.com/hot/ and prints one record per
// "#content-left .article" element: its 1-based position, the joke text, the
// comment name and text, the absolute link, the image URL when the article
// has a ".thumb img" with a non-empty src, and the vote count.
//
// If the page cannot be loaded the process is terminated via log.Fatal.
func GetJokes2() {
	fmt.Println("开始爬取糗事百科热点笑话...")
	js, err := goquery.NewDocument("https://www.qiushibaike.com/hot/")
	if err != nil {
		log.Fatal(err)
	}
	js.Find("#content-left .article").Each(func(i int, contentSelection *goquery.Selection) {
		// An empty src (or no image at all) means a text-only joke.
		img, _ := contentSelection.Find(".thumb img").Attr("src")

		content := contentSelection.Find(".content span").Text()
		// A missing href leaves the link as the bare site root.
		href, _ := contentSelection.Find(".contentHerf").Attr("href")
		commentName := contentSelection.Find(".cmtMain .cmt-name").Text()
		commentCont := contentSelection.Find(".cmtMain .main-text").Text()

		hotsArt := HotsContent{}
		hotsArt.num = i + 1
		hotsArt.url = "https://www.qiushibaike.com" + href
		// Newlines are stripped so each field prints on a single line.
		hotsArt.content = strings.Replace(content, "\n", "", -1)
		hotsArt.comment = strings.Replace(commentName+commentCont, "\n", "", -1)

		fmt.Println("第", hotsArt.num, "个笑话:")
		fmt.Println("\t", hotsArt.content)
		fmt.Println("\t 最热评论:" + hotsArt.comment)
		fmt.Println("\t 地址", hotsArt.url)
		// The image line is the only difference between jokes with and
		// without a picture.
		if len(img) != 0 {
			fmt.Println("\t图片地址", img)
		}
		count := contentSelection.Find(".stats .stats-vote .number").Text()
		fmt.Println("\t点赞数", count)
		fmt.Println("======================================================")
	})
}