golang+docker+postgres爬取豆瓣电影评论
直接上代码。
用 docker 安装 pg(PostgreSQL)镜像,数据库 IDE 使用 Navicat Premium 12。
package main
import (
"fmt"
"io/ioutil"
"log"
"net/http"
_ "reflect"
"regexp"
"strconv"
"strings"
"sync"
"github.com/go-xorm/xorm"
_ "github.com/lib/pq"
)
// DouBanTheDarkKnight is one scraped comment row. httpDo fills it from the
// comments page and InsertDB hands slices of it to xorm for insertion.
// All fields are kept as the raw strings captured from the HTML.
type DouBanTheDarkKnight struct {
	// Id is the text of the commenter's profile link (the second capture of
	// the user pattern in httpDo), not the profile path segment.
	Id string
	// Grade is the text following "allstar" in the rating span's class, or
	// the literal "NULL" when the comment carries no rating span.
	Grade string
	// Remark_time is the title attribute of the comment-time span.
	Remark_time string
	// Content is the text inside the comment's "short" span.
	Content string
}
// Patterns used to scrape one comments page. They are compiled once at
// package scope instead of on every httpDo call.
var (
	// commentItemRe matches one comment block, from its opening div up to the
	// first closing </p>. Original author's note: anchoring the tail on text
	// containing '-' failed to match, which is why </p> is used.
	commentItemRe = regexp.MustCompile(`<div class="comment-item" data-cid="(.*?)</p>`)
	// commentUserRe: capture 1 is the profile path segment, capture 2 is the
	// link text.
	commentUserRe = regexp.MustCompile(`<a href="https://www.douban.com/people/(.*?)/" class="">(.*?)</a>`)
	// commentGradeRe captures whatever follows "allstar" in the rating class.
	commentGradeRe = regexp.MustCompile(`<span class="allstar(.*?) rating"`)
	// commentTimeRe captures the title attribute of the comment-time span.
	commentTimeRe = regexp.MustCompile(`<span class="comment-time " title="(.*?)">`)
	// commentTextRe captures the text of the "short" span.
	commentTextRe = regexp.MustCompile(`<span class="short">(.*?)</span>`)
)

// httpDo downloads the comments page that starts at offset count and returns
// the comments parsed from it.
//
// On any failure (building the request, the round trip, a non-200 status, or
// reading the body) the error is logged and nil is returned, so callers always
// get a slice they can range over. Comment items that lack a user, time or
// text fragment are skipped rather than causing an index-out-of-range panic.
func httpDo(count int) []DouBanTheDarkKnight {
	page, err := fetchCommentPage(count)
	if err != nil {
		log.Printf("httpDo: start=%d: %v", count, err)
		return nil
	}
	return parseComments(page)
}

// fetchCommentPage requests one page of comments (20 per page) beginning at
// offset count and returns the raw response body as a string.
func fetchCommentPage(count int) (string, error) {
	// Movie URL. Original author's note: the first 200 comments can be read
	// without a cookie, later ones need it, and only about 500 are reachable.
	pageURL := "https://movie.douban.com/subject/1851857/comments?start=" + strconv.Itoa(count) + "&limit=20&sort=new_score&status=P"

	// The request body and form Content-Type are kept exactly as before.
	req, err := http.NewRequest("GET", pageURL, strings.NewReader("name=cjb"))
	if err != nil {
		return "", fmt.Errorf("building request: %v", err)
	}
	// The browser identification belongs in "User-Agent"; the previous header
	// name "Agent" is not a header servers look at.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0")
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	// Original author's notes on the cookie: it is tied to a login session
	// (a new one is issued per login and logging out invalidates it), and only
	// the dbcl2 value changes between logins. Obtaining it via a simulated
	// login was not implemented, and sending the cookie with a POST redirected
	// to the personal home page instead of the requested page.
	// NOTE(review): this is a hard-coded session credential; consider loading
	// it from configuration instead of source.
	req.Header.Set("Cookie", "ll=108288; bid=FVlP52FmKzM; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1563431839%2C%22https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Ftn%3Dmonline_3_dg%26ie%3Dutf-8%26wd%3Ddouban%22%5D; _pk_id.100001.4cf6=dd579b35d3bc52ce.1563431839.1.1563433653.1563431839.; _pk_ses.100001.4cf6=*; __yadk_uid=Eg92GOAas2yYcKDklDJPQynEq2L2QpvN; __utma=30149280.1864121473.1563431842.1563431842.1563431842.1; __utmb=30149280.5.10.1563431842; __utmc=30149280; __utmz=30149280.1563431842.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1399805799.1563431842.1563431842.1563431842.1; __utmb=223695111.0.10.1563431842; __utmc=223695111; __utmz=223695111.1563431842.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=douban; _vwo_uuid_v2=D917E008A586E87EF9F386F7B5CE93636|c31639c2cdeb29b50367bd1c4dd4cd53; trc_cookie_storage=taboola%2520global%253Auser-id%3D0fd46ccd-91e6-4f3b-92ec-e78d47d676e8-tuct4299933; dbcl2=151879300:l6O+QrfUy20; ck=v9_0; push_noty_num=0; push_doumail_num=0; douban-profile-remind=1; __utmv=30149280.15187")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		// resp must not be touched here; return before the deferred Close.
		return "", fmt.Errorf("client.Do: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status %s", resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body: %v", err)
	}
	return string(body), nil
}

// parseComments extracts every comment item found in page. Each extracted
// comment is also printed to standard output, one per line.
func parseComments(page string) []DouBanTheDarkKnight {
	// Strip newlines first: "." in the patterns does not match "\n", so a
	// fragment spanning several lines would otherwise be missed.
	flat := strings.Replace(page, "\n", "", -1)

	var users []DouBanTheDarkKnight
	for _, item := range commentItemRe.FindAllString(flat, -1) {
		who := commentUserRe.FindStringSubmatch(item)
		when := commentTimeRe.FindStringSubmatch(item)
		text := commentTextRe.FindStringSubmatch(item)
		if who == nil || when == nil || text == nil {
			log.Println("parseComments: skipping comment item with missing user, time or text")
			continue
		}

		// A comment without a rating span is stored with the placeholder "NULL".
		grade := "NULL"
		if m := commentGradeRe.FindStringSubmatch(item); m != nil {
			grade = m[1]
		}

		// The user pattern has two captures; the link text is the second one.
		fmt.Println(who[2], grade, when[1], text[1])
		users = append(users, DouBanTheDarkKnight{
			Id:          who[2],
			Grade:       grade,
			Remark_time: when[1],
			Content:     text[1],
		})
	}
	return users
}
// getDBEngine opens the PostgreSQL connection through xorm, verifies it with
// a ping, asks xorm to create the table for DouBanTheDarkKnight, and returns
// the engine.
//
// Failing to construct the engine or to ping the database terminates the
// program via log.Fatal. A failure to create the table is logged and the
// engine is still returned.
//
// NOTE(review): the connection string, including the password, is hard-coded.
func getDBEngine() *xorm.Engine {
	engine, err := xorm.NewEngine("postgres", "user=postgres password=password dbname=postgres host=localhost port=54321 sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}

	// Echo the SQL xorm runs; handy while learning and debugging.
	engine.ShowSQL()

	if err = engine.Ping(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("connect postgresql success")

	// Original author's note: normally the table would be created directly
	// in PostgreSQL rather than from the program.
	var user DouBanTheDarkKnight
	if err = engine.CreateTables(user); err != nil {
		// Previously this error was assigned and never looked at.
		log.Printf("create table: %v", err)
	}
	return engine
}
// InsertDB passes the whole users slice to engine.Insert in a single call and
// reports the outcome on standard output: the returned row count, or the word
// "false" when that count is zero. An insert error is logged and does not stop
// the report from being printed.
//
// Original author's note: this inserts many rows in one go; inserting one row
// at a time is easier to debug during development.
func InsertDB(users []DouBanTheDarkKnight, engine *xorm.Engine) {
	inserted, insertErr := engine.Insert(users)
	if insertErr != nil {
		log.Println(insertErr)
	}

	switch inserted {
	case 0:
		fmt.Println("false")
	default:
		fmt.Println(inserted)
	}
}
func main() {
var wg sync.WaitGroup
var users []DouBanTheDarkKnight
engine := getDBEngine()
for i := 0; i <= 480; i += 20 {
wg.Add(1)
go func(id int) {//一定得传参,不然数据会用相同的i,可能是因为用的地址
defer wg.Done()
users = httpDo(id)//不传参直接用i会少数据
InsertDB(users, engine)//开发时要一个一个数据插,不然不知道哪里出错
}(i)
}
wg.Wait()
}