本程序用到了 goquery 和 regexp 两个包。
goquery 用法示例:
dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))
if err!=nil{
fmt.Println("HttpGet err :",err)
}
dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
// if selection != nil {
title += selection.Text() + "\r"
titleS = append(titleS,selection.Text())
// }
})
regexp用法:
// fmt.Println(video_cont)
rel2 := regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)
if rel2 == nil {
fmt.Println("准备好了12")
}
arr2 := rel2.FindAllStringSubmatch(video_cont,-1) //获取的数据是二维的切片
package main
import (
	"database/sql"
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	_ "github.com/go-sql-driver/mysql"
)
// type collectionmwd struct {
// ID int64 `db:"id"`
// title string `db:title`
// cover string `db:"cover"` //由于在mysql的users表中name没有设置为NOT NULL,所以name可能为null,在查询过程中会返回nil,如果是string类型则无法接收nil,但sql.NullString则可以接收nil值
// videoUrl string `db:"videourl"`
// }
// MySQL connection settings; DB() formats them into the data source name
// "USERNAME:PASSWORD@NETWORK(SERVER:PORT)/DATABASE".
// NOTE(review): credentials are hard-coded in source — consider reading them
// from the environment or a config file.
const (
// USERNAME is the MySQL account name.
USERNAME = "root"
// PASSWORD is the password for USERNAME.
PASSWORD = "root"
// NETWORK is the network type placed before the address in the DSN.
NETWORK = "tcp"
// SERVER is the database host.
SERVER = "localhost"
// PORT is the database port.
PORT = 3306
// DATABASE is the schema name selected by the DSN.
DATABASE = "guanfu_school"
)
// main asks the user for the first and last year to crawl and hands the range
// to Dowork.
//
// Input that cannot be parsed as an integer, or a range whose start is greater
// than its end, is reported and the program returns without crawling (the
// previous version ignored Scan errors and crawled with zero values).
func main() {
	var start, end int

	fmt.Printf("请输入起始页:(2013开始,2019结束)")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Printf("请输入结束页:(2013开始,2019结束)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	// Dowork iterates start..end inclusive, so an inverted range would do nothing;
	// tell the user instead of exiting silently.
	if start > end {
		fmt.Printf("起始页 %d 大于结束页 %d,没有可爬取的数据\n", start, end)
		return
	}

	Dowork(start, end)
}
// Patterns used to scrape the listing and detail pages. They are compiled once
// at package scope; regexp.MustCompile panics on an invalid pattern and never
// returns nil, so the call sites need no nil checks.
var (
	coverImgRe   = regexp.MustCompile(`<img src="(.*?)"`)
	detailLinkRe = regexp.MustCompile(`<li><a href="(.*?)" `)
	videoURLRe   = regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)
)

// Dowork crawls the programme listing for every year in [start, end] and, for
// each year, listing pages 1 through 3.
//
// For each listing page it collects the titles, the cover image URLs and the
// video URLs found on the linked detail pages. When all three are non-empty it
// writes them to "<year>_<page>.txt" in the working directory and passes the
// three slices to insertData together with the year.
//
// A failed listing download stops the remaining pages of that year; a listing
// page that cannot be parsed is skipped.
func Dowork(start, end int) {
	fmt.Println("正在爬取数据")

	for i := start; i <= end; i++ {
		year := strconv.Itoa(i)

		for j := 1; j <= 3; j++ {
			page := strconv.Itoa(j)
			url := fmt.Sprintf("http://vod.gxtv.cn/program/28/%s/%s.html", year, page)

			result, err := HttpGet(url)
			if err != nil {
				fmt.Println("HttpGet err :", err)
				break
			}

			dom, err := goquery.NewDocumentFromReader(strings.NewReader(result))
			if err != nil {
				// Without a document there is nothing to select from; using dom
				// here would dereference nil.
				fmt.Println("goquery parse err :", err)
				continue
			}

			var titleS []string
			dom.Find(".Programlist .Cont ul p").Each(func(_ int, selection *goquery.Selection) {
				titleS = append(titleS, selection.Text())
			})
			title := joinWithSuffix(titleS, "\r")
			fmt.Println("title:", title)
			fmt.Println("titleS", titleS)

			imgS := extractCovers(result)
			img := joinWithSuffix(imgS, "\r")
			fmt.Println("img :", img)
			fmt.Println("imgS :", imgS)

			pathS := collectVideoURLs(result)
			path := joinWithSuffix(pathS, "\n")
			fmt.Println("path:", path)
			fmt.Println("pathS:", pathS)

			// Only pages that yielded all three kinds of data are persisted.
			if title == "" || img == "" || path == "" {
				continue
			}

			fileName := year + "_" + page + ".txt"
			f, err := os.Create(fileName)
			if err != nil {
				fmt.Println("os create error", err)
				continue
			}
			_, werr := f.WriteString(title + img + path)
			// Close before the database work so the handle is not held open, and
			// surface a close failure if the write itself succeeded.
			if cerr := f.Close(); werr == nil {
				werr = cerr
			}
			if werr != nil {
				fmt.Println("写入文件失败:", fileName, werr)
			}

			insertData(imgS, titleS, pathS, year)
		}
	}
}

// joinWithSuffix concatenates items, appending suffix after every item
// (including the last one).
func joinWithSuffix(items []string, suffix string) string {
	var sb strings.Builder
	for _, item := range items {
		sb.WriteString(item)
		sb.WriteString(suffix)
	}
	return sb.String()
}

// extractCovers returns the src of every <img> tag in listHTML except the first
// two matches.
// NOTE(review): the first two images are presumably page chrome rather than
// programme covers — confirm against the live page.
func extractCovers(listHTML string) []string {
	matches := coverImgRe.FindAllStringSubmatch(listHTML, -1)
	fmt.Println("imgMap:", matches)

	var covers []string
	for k, m := range matches {
		if k > 1 {
			covers = append(covers, m[1])
		}
	}
	return covers
}

// collectVideoURLs follows every detail-page link found in listHTML and returns
// the video URLs matched on those pages, in link order. A detail page that
// cannot be downloaded is reported and skipped.
// NOTE(review): a skipped or non-matching detail page leaves the returned slice
// shorter than the title/cover slices, so entries may no longer line up by index.
func collectVideoURLs(listHTML string) []string {
	var urls []string
	for _, link := range detailLinkRe.FindAllStringSubmatch(listHTML, -1) {
		fmt.Println("url=", link[1])

		detailURL := "http://vod.gxtv.cn" + link[1]
		detail, err := HttpGet(detailURL)
		if err != nil {
			fmt.Println("HttpGet err :", detailURL, err)
			continue
		}

		// Each match is [whole match, captured url].
		for _, m := range videoURLRe.FindAllStringSubmatch(detail, -1) {
			fmt.Println("高清视频链接是", m)
			urls = append(urls, m[1])
		}
	}
	return urls
}
// HttpGet issues an HTTP GET for url and returns the complete response body as
// a string.
//
// It returns a non-nil error when the request itself fails or when the body
// cannot be read to the end (for example when the connection drops mid-body);
// in both cases result is empty. The HTTP status code is not inspected, so the
// body of a non-2xx response is returned like any other.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// io.Copy reads until EOF and reports any other read error, which the
	// hand-written loop used to swallow; the Builder avoids re-copying the
	// accumulated string on every chunk.
	var body strings.Builder
	if _, copyErr := io.Copy(&body, resp.Body); copyErr != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, copyErr)
	}
	return body.String(), nil
}
// insertData inserts one row into collectionmwd per index, taking the cover
// from img, the title from title and the video URL from url, with year stored
// in the years column.
//
// The three slices are expected to be parallel. If their lengths differ, only
// the first min(len) entries are inserted and a warning is printed, instead of
// indexing past the end of the shorter slices. Failures are printed; a failed
// row does not stop the remaining rows. If the database handle cannot be
// obtained nothing is inserted.
func insertData(img []string, title []string, url []string, year string) {
	db, err := DB()
	if err != nil {
		fmt.Println("数据库连接失败:", err)
		return
	}
	// A handle is opened on every call, so release it when this call is done.
	defer db.Close()

	fmt.Println("img的的长度是:", len(img))
	fmt.Println("title的的长度是:", len(title))
	fmt.Println("url", len(url))

	// Insert only as many rows as every slice can supply.
	rows := len(img)
	if len(title) < rows {
		rows = len(title)
	}
	if len(url) < rows {
		rows = len(url)
	}
	if rows != len(img) || rows != len(title) || rows != len(url) {
		fmt.Printf("img/title/url 长度不一致,只插入前 %d 条\n", rows)
	}

	for i := 0; i < rows; i++ {
		result, err := db.Exec("insert INTO collectionmwd(cover,title,videoUrl,years) values(?,?,?,?)", img[i], title[i], url[i], year)
		if err != nil {
			fmt.Printf("Insert into 数据 failed err %d:%v\n", i, err)
			continue
		}
		if _, err := result.LastInsertId(); err != nil {
			fmt.Printf("get lastInsertID failed :%v\n", err)
		}
	}
}
// DB builds the MySQL data source name from the package constants and opens a
// database handle for it with the "mysql" driver.
//
// On failure the error is printed and returned alongside whatever handle
// sql.Open produced; on success a progress message is printed.
func DB() (DB *sql.DB, err error) {
	// DSN layout: user:password@network(host:port)/database
	source := fmt.Sprintf("%s:%s@%s(%s:%d)/%s", USERNAME, PASSWORD, NETWORK, SERVER, PORT, DATABASE)

	if DB, err = sql.Open("mysql", source); err != nil {
		fmt.Printf("Open mysql failed,err:%v\n", err)
		return DB, err
	}

	fmt.Println("数据库运行到这里了")
	return DB, err
}