golang+docker+postgres爬取豆瓣电影评论

golang+docker+postgres爬取豆瓣电影评论

直接上代码了

环境:用 Docker 安装 PostgreSQL 镜像,数据库 IDE 使用 Navicat Premium 12。

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	_ "reflect"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"github.com/go-xorm/xorm"
	_ "github.com/lib/pq"
)

// DouBanTheDarkKnight is one scraped short comment. It is the record type that
// httpDo produces and that InsertDB hands to xorm for storage.
//
// All four fields are kept as raw strings exactly as captured from the page:
//   - Id:          link text of the commenter's profile anchor
//   - Grade:       text captured after "allstar" in the rating span's class,
//     or the literal "NULL" when the comment carries no rating
//   - Remark_time: value of the comment-time span's title attribute
//   - Content:     text of the span with class "short"
type DouBanTheDarkKnight struct {
	Id, Grade, Remark_time, Content string
}

// Patterns used to pick comments out of a listing page. They are compiled once
// at package scope instead of on every httpDo call.
var (
	// One comment block. "." does not match a newline, so the page must have
	// its newlines stripped before matching. Per the original author's note,
	// ending the pattern on "</p>" was the variant that matched reliably.
	commentItemRe = regexp.MustCompile(`<div class="comment-item" data-cid="(.*?)</p>`)
	// Commenter link: group 1 is the profile slug, group 2 the link text.
	commentUserRe = regexp.MustCompile(`<a href="https://www.douban.com/people/(.*?)/" class="">(.*?)</a>`)
	// Rating: group 1 is whatever follows "allstar" in the class attribute.
	commentGradeRe = regexp.MustCompile(`<span class="allstar(.*?) rating"`)
	// Timestamp: group 1 is the title attribute of the comment-time span.
	commentTimeRe = regexp.MustCompile(`<span class="comment-time " title="(.*?)">`)
	// Comment text: group 1 is the body of the "short" span.
	commentContentRe = regexp.MustCompile(`<span class="short">(.*?)</span>`)
)

// httpDo downloads one page of short comments, starting at offset count, and
// returns the comments parsed from it.
//
// Any failure (building the request, performing it, a non-200 status, reading
// the body) is logged and yields a nil slice, so callers always get a usable,
// possibly empty, result instead of a nil-pointer panic.
func httpDo(count int) []DouBanTheDarkKnight {
	// Movie comment listing. Original author's note: the first 200 comments
	// are served without a cookie, later ones need one, and only about 500
	// are reachable.
	url := "https://movie.douban.com/subject/1851857/comments?start=" + strconv.Itoa(count) + "&limit=20&sort=new_score&status=P"

	// A GET request carries no body, so none is attached and no Content-Type
	// is set.
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("httpDo(%d): building request: %v", count, err)
		return nil
	}
	// The browser identification header is "User-Agent"; a header named
	// "Agent" is not recognised and leaves Go's default agent in place.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0")
	// Original author's notes: the cookie is session-bound (issued at login,
	// invalid after logout) and only the dbcl2 part changes between logins.
	// Obtaining it through a simulated login was not implemented, and a POST
	// with the cookie gets redirected to the personal home page instead of the
	// requested page.
	// NOTE(review): this is a hard-coded session credential; consider loading
	// it from configuration instead of source.
	req.Header.Set("Cookie", "ll=108288; bid=FVlP52FmKzM; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1563431839%2C%22https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Ftn%3Dmonline_3_dg%26ie%3Dutf-8%26wd%3Ddouban%22%5D; _pk_id.100001.4cf6=dd579b35d3bc52ce.1563431839.1.1563433653.1563431839.; _pk_ses.100001.4cf6=*; __yadk_uid=Eg92GOAas2yYcKDklDJPQynEq2L2QpvN; __utma=30149280.1864121473.1563431842.1563431842.1563431842.1; __utmb=30149280.5.10.1563431842; __utmc=30149280; __utmz=30149280.1563431842.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1399805799.1563431842.1563431842.1563431842.1; __utmb=223695111.0.10.1563431842; __utmc=223695111; __utmz=223695111.1563431842.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=douban; _vwo_uuid_v2=D917E008A586E87EF9F386F7B5CE93636|c31639c2cdeb29b50367bd1c4dd4cd53; trc_cookie_storage=taboola%2520global%253Auser-id%3D0fd46ccd-91e6-4f3b-92ec-e78d47d676e8-tuct4299933; dbcl2=151879300:l6O+QrfUy20; ck=v9_0; push_noty_num=0; push_doumail_num=0; douban-profile-remind=1; __utmv=30149280.15187")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp is nil here; touching resp.Body would panic.
		log.Printf("httpDo(%d): performing request: %v", count, err)
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("httpDo(%d): unexpected status %s", count, resp.Status)
		return nil
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Printf("httpDo(%d): reading body: %v", count, err)
		return nil
	}

	// Strip newlines so a comment block that spans several lines can be
	// matched by the single-line patterns above.
	page := strings.Replace(string(body), "\n", "", -1)
	return parseComments(page)
}

// parseComments extracts every comment block from a newline-free listing page.
// A block without a rating gets the grade "NULL"; a block missing the user,
// time or text is logged and skipped rather than indexed blindly.
func parseComments(page string) []DouBanTheDarkKnight {
	var users []DouBanTheDarkKnight
	for _, item := range commentItemRe.FindAllString(page, -1) {
		who := commentUserRe.FindStringSubmatch(item)
		when := commentTimeRe.FindStringSubmatch(item)
		text := commentContentRe.FindStringSubmatch(item)
		if who == nil || when == nil || text == nil {
			log.Printf("parseComments: skipping comment block with missing fields")
			continue
		}

		grade := "NULL"
		if g := commentGradeRe.FindStringSubmatch(item); g != nil {
			grade = g[1]
		}

		// The user pattern has two groups; the display name is the second.
		fmt.Println(who[2], grade, when[1], text[1])
		users = append(users, DouBanTheDarkKnight{
			Id:          who[2],
			Grade:       grade,
			Remark_time: when[1],
			Content:     text[1],
		})
	}
	return users
}

// getDBEngine opens the PostgreSQL connection used by the scraper, verifies it
// with a ping, makes sure the table for DouBanTheDarkKnight exists, and returns
// the ready engine.
//
// Every failure is fatal: the program exits through log.Fatal rather than
// returning an engine that cannot store results.
func getDBEngine() *xorm.Engine {
	engine, err := xorm.NewEngine("postgres", "user=postgres password=password dbname=postgres host=localhost port=54321 sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}

	// Echo generated SQL; handy while developing.
	engine.ShowSQL()

	if err = engine.Ping(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("connect postgresql success")

	// Original author's note: normally the table would be created directly in
	// PostgreSQL. The result of creating it here used to be discarded, which
	// hid the failure until the first insert.
	if err = engine.CreateTables(DouBanTheDarkKnight{}); err != nil {
		log.Fatalf("creating table: %v", err)
	}
	return engine
}

// InsertDB stores users through engine in a single Insert call and reports the
// outcome on standard output: the affected-row count, or "false" when that
// count is zero. An insert error is logged and does not stop the report.
func InsertDB(users []DouBanTheDarkKnight, engine *xorm.Engine) {
	// The whole slice goes in as one multi-row insert.
	affected, insertErr := engine.Insert(users)
	if insertErr != nil {
		log.Println(insertErr)
	}

	switch affected {
	case 0:
		fmt.Println("false")
	default:
		fmt.Println(affected)
	}
}

func main() {
	var wg sync.WaitGroup
	var users []DouBanTheDarkKnight
	engine := getDBEngine()
	for i := 0; i <= 480; i += 20 {
		wg.Add(1)
		go func(id int) {//一定得传参,不然数据会用相同的i,可能是因为用的地址
			defer wg.Done()
			users = httpDo(id)//不传参直接用i会少数据
			InsertDB(users, engine)//开发时要一个一个数据插,不然不知道哪里出错
		}(i)
	}
	wg.Wait()
}

在这里插入图片描述

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值