// For study purposes only; do not use this program to break the law.
package main
import (
"bufio"
"fmt"
"github.com/PuerkitoBio/goquery"
"net/http"
"os"
"runtime"
"sort"
"strconv"
"strings"
"time"
)
// titleUrl pairs the title of a scraped link with its href. The url field is
// appended to baseUrl by writeContent to build the page address.
type titleUrl struct {
	title, url string
}
var (
	// needFilter lists the article titles that must be skipped.
	// It is empty (but non-nil) by default; add titles here to exclude them.
	needFilter = []string{}

	// baseUrl is the address of the site to crawl.
	baseUrl = ""
)
func main() {
// 排序 方便二分查找
if needFilter != nil {
sort.Strings(needFilter)
}
url := baseUrl + "" // 网站目录地址
ch := make(chan titleUrl)
progress := make(chan bool, 1)
numCpu := runtime.NumCPU()
fmt.Println("当前核数量:" + strconv.Itoa(numCpu))
doc := httpGet(url)
// html class
dom := doc.Find("")
dom.Each(func(index int, selection *goquery.Selection) {
i := 0
// 获取每一章节的路径 和 标题
go func() {
for {
href, status := selection.Find("a").Eq(i).Attr("href")
title := selection.Find("a").Eq(i).Text()
title = strings.Replace(title, "\n", "", -1)
title = strings.Replace(title, "\r", "", -1)
title = strings.Replace(title, "\t", "", -1)
title = strings.Replace(title, "", "", -1)
i++
if status != true {
break
}
if needFilter != nil && !checkString(title) {
continue
}
mapping := titleUrl{url: href, title: title}
ch <- mapping
fmt.Println("href : " + href + ",title : " + title)
}
}()
})
go func() {
j := 0
ForEnd:
for {
select {
case v := <-ch:
writeContent(v)
//case <-progress:
// // 跳出循环
// close(ch)
// break ForEnd
default:
if j >= 20 {
progress<-true
fmt.Println("协程结束")
break ForEnd
}
j++
fmt.Println("阻塞中!")
time.Sleep(1000 * time.Millisecond)
}
}
}()
<-progress
}
// checkString filters titles: it returns false when title is listed in
// needFilter and true otherwise. The lookup is a binary search, so needFilter
// must already be sorted.
func checkString(title string) bool {
	pos := sort.SearchStrings(needFilter, title)
	listed := pos < len(needFilter) && needFilter[pos] == title
	return !listed
}
// writeContent downloads the page at baseUrl+channel.url and stores the text
// matched by the content selector in ./text/<title>.txt, replacing any
// previous file of that name.
//
// Nothing is written when the page cannot be fetched or the selector matches
// no text. Failures are reported on standard output, and the success message
// is printed only after the data has been flushed and the file closed.
func writeContent(channel titleUrl) {
	content := httpGet(baseUrl + channel.url)
	if content == nil {
		fmt.Println(channel.url + "获取失败!")
		return
	}
	text := content.Find("div.readAreaBox.content .p").Text()
	if text == "" {
		return
	}

	// A scraped title may contain path separators, which would make the
	// path point into another directory or make file creation fail.
	safeTitle := strings.Map(func(r rune) rune {
		if r == '/' || r == '\\' {
			return '_'
		}
		return r
	}, channel.title)
	filePath := "./text/" + safeTitle + ".txt"

	// The output directory may not exist on a first run.
	if err := os.MkdirAll("./text", 0755); err != nil {
		fmt.Println(filePath + "写入失败: " + err.Error())
		return
	}

	// os.Create truncates an existing file, so no separate removal is needed.
	f, err := os.Create(filePath)
	if err != nil {
		fmt.Println(filePath + "写入失败: " + err.Error())
		return
	}

	w := bufio.NewWriter(f)
	_, err = w.WriteString(text)
	if err == nil {
		err = w.Flush()
	}
	// Close unconditionally, but keep the first error encountered.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		fmt.Println(filePath + "写入失败: " + err.Error())
		return
	}
	fmt.Println(filePath + "写入成功!")
}
// checkFile reports whether fileName exists. Only a "does not exist" error
// from os.Stat yields false; success and every other Stat error yield true.
func checkFile(fileName string) bool {
	_, statErr := os.Stat(fileName)
	return !os.IsNotExist(statErr)
}
// throwError reports a non-nil error on standard error and does nothing for
// a nil one. Despite its name it neither panics nor stops the program; the
// caller carries on afterwards.
func throwError(nil2 error) {
	if nil2 == nil {
		return
	}
	// The builtin println would print the interface value as a pair of
	// addresses, so format the error through fmt to show its message.
	fmt.Fprintln(os.Stderr, nil2)
}
// httpGet issues an HTTP GET for url with a browser-like User-Agent and
// parses the response body into a goquery document.
//
// It returns nil when the request cannot be built or sent, or when the body
// cannot be parsed; the failure is reported through throwError. A non-2xx
// status is reported as well, but the body is still parsed and returned.
func httpGet(url string) *goquery.Document {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		throwError(fmt.Errorf("building request for %q: %v", url, err))
		return nil
	}
	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")

	// Bound the whole exchange so an unresponsive server cannot hang the crawl.
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil here, so the body must not be touched.
		throwError(fmt.Errorf("requesting %q: %v", url, err))
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		throwError(fmt.Errorf("requesting %q: unexpected status %s", url, resp.Status))
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		throwError(fmt.Errorf("parsing %q: %v", url, err))
		return nil
	}
	return doc
}