这几天看了go语言,练习一下写法,结合项目会比较有趣,碰到的问题也会比较多。
Go爬虫
使用Go爬豆瓣电影Top250 并写入数据库
#####先看效果
准备工作
mysql(如未安装,也可以不要安装,直接看代码,打印控制台即可)
go 环境, 略,baidu并配置环境变量
开发工具:当前使用的是LiteIDE,好像还有更好用的开发工具(IDE)
mysql
如何安装,略
数据库结构如下
-- Recreate the `film` table from scratch; an existing table (and its rows) is dropped first.
DROP TABLE IF EXISTS `film`;
CREATE TABLE `film` (
-- Auto-incrementing surrogate primary key.
`id` int(255) NOT NULL AUTO_INCREMENT,
-- Film title (required).
`name` varchar(255) NOT NULL,
-- Link to the film's detail page.
`detail` varchar(255) DEFAULT NULL,
-- Average rating; defaults to 0.
`score` float DEFAULT '0',
-- Number of user ratings; defaults to 0.
`commentCount` int(11) DEFAULT '0',
-- URL of the poster image.
`icon` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
-- AUTO_INCREMENT=175 makes the next generated id start at 175
-- (presumably left over from a dump of an already populated table).
) ENGINE=InnoDB AUTO_INCREMENT=175 DEFAULT CHARSET=utf8;
##清空表并重置自增id;只有学习时经常删表重建才需要,否则可以跳过
truncate table `film`;
当前代码连接数比较多… 需要修改mysql最大连接数
mysql -u root
show variables like "max_connections";
set GLOBAL max_connections=1000;
go代码
main.go
// FilmProject project main.go
package main
import (
"fmt"
"time"
)
// main is the program entry point: it prints a greeting, runs the full
// crawl through getAllFilm, and prints "end" once that call returns.
func main() {
fmt.Println("Hello World!")
getAllFilm()
// Alternative entry point that handles only the first page; left disabled.
// getFirstFilmList()
fmt.Println("end")
}
// getFirstFilmList fetches only the first result page, prints the parsed
// films, and stores each of them through DbInsert.
//
// If the fetch fails, the error is reported and nothing is printed or stored.
func getFirstFilmList() {
	list, err := GetFilm(true)
	if err != nil {
		// Report the actual cause and stop; there is nothing to insert.
		fmt.Println("err ", err)
		return
	}
	fmt.Println(list)
	// Ranging over a nil or empty slice is a no-op, so no extra guard is needed.
	for _, v := range list {
		DbInsert(v)
	}
}
// getAllFilm crawls the first page plus nine follow-up pages, collects every
// parsed film into one slice, hands the slice to printFilmList, and finally
// prints how many films were gathered.
//
// A page that fails to download is reported and skipped; the crawl continues
// with the next iteration.
func getAllFilm() {
	const count = 10 // total number of page requests, including the first one

	list, err := GetFilm(true)
	if err != nil {
		fmt.Println("- -- ", err)
	}
	for i := 1; i < count; i++ {
		// time.Sleep takes a time.Duration measured in nanoseconds; a bare
		// 2000 would pause for only 2µs, so spell the 2-second delay out.
		time.Sleep(2 * time.Second)
		fmt.Println("开始第", i, "页")
		l, e := GetFilm(false)
		if e != nil {
			fmt.Println("第", i, "页抓取失败:", e)
			continue
		}
		list = append(list, l...)
	}
	printFilmList(list)
	fmt.Println("size: ", len(list))
}
// printFilmList walks the slice in order; for every film it prints the
// film via its String method and then stores it with DbInsert.
func printFilmList(list []FilmBean) {
	for i := 0; i < len(list); i++ {
		film := list[i]
		film.String()
		DbInsert(film)
	}
}
RequestParseUtils.go
请求/解析film,并返回
// RequestParseUtils
package main
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"time"
	//"strings"
)
// forgeHeaders lists the browser-like headers that sendHttp adds to every
// outgoing request.
var forgeHeaders = map[string]string{
	"Host":                      "movie.douban.com",
	"Connection":                "keep-alive",
	"Cache-Control":             "max-age=0",
	"Upgrade-Insecure-Requests": "1",
	"User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
	"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	"Referer":                   "https://movie.douban.com/top250",
}

// Earlier pattern experiments, kept for reference:
// urlTitleRe = regexp.MustCompile(`img alt="(.*?)" src=`)
// urlTitleRe = regexp.MustCompile(`<a href="(.*?)"> </a>`)
// urlTitleRe = regexp.MustCompile(`<a href="(.*?)"> <img [^>]* alt="(.*?)" src="(.*?)" [^>]*></a>`)

// curPage is the zero-based index of the page GetFilm requests next.
var curPage = 0

// pageUrl is the format string for a result page; its %d verb receives the
// start offset that GetFilm computes as curPage*25.
var pageUrl = "https://movie.douban.com/top250?start=%d&filter="
/*主要用于请求,并解析,返回数据*/
// FilmBean is one film entry extracted from a result page.
type FilmBean struct {
	// Text fields: detail is the link to the film's page, icon the poster
	// image URL, and name the film title.
	detail, icon, name string

	// score is the average rating shown on the page.
	score float32

	// commentCount is the number of people who rated the film.
	commentCount int
}
// String writes the film to standard output: an empty line first, then a
// single unterminated line with the name, score, comment count, detail
// link and icon URL. It returns nothing, so FilmBean does not satisfy
// fmt.Stringer.
func (fb FilmBean) String() {
	fmt.Println()
	fmt.Printf(
		"name:%s,score %f,comment %d, detail: %s, icon: %s",
		fb.name, fb.score, fb.commentCount, fb.detail, fb.icon,
	)
}
// GetFilm downloads and parses the page selected by the package-level
// curPage counter.
//
// With isFirst set, the counter is rewound to 0 before the request, so the
// first page is fetched. Otherwise the counter is used as it stands. The
// counter advances by one only after a page was downloaded and parsed; on a
// download error the error is returned together with a nil slice and the
// counter is left untouched.
func GetFilm(isFirst bool) (arr []FilmBean, err error) {
	if !isFirst {
		fmt.Print("第几页")
	} else {
		curPage = 0
	}

	// Each page holds 25 entries, so the start offset is the page index times 25.
	target := fmt.Sprintf(pageUrl, curPage*25)
	fmt.Printf("url: %s", target)

	body, reqErr := sendHttp(target)
	if reqErr != nil {
		return nil, reqErr
	}

	films := parseFilmFormResp(body)
	curPage++
	return films, nil
}
// sendHttp performs a GET request against url with the headers from
// forgeHeaders and returns the response body as a string.
//
// An error is returned when the request cannot be built or sent, when the
// server answers with a status other than 200 OK, or when the body cannot
// be read. In every error case the returned string is empty.
func sendHttp(url string) (resp string, err error) {
	// A bounded timeout keeps a stalled connection from hanging the crawl forever.
	client := http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest("GET", url, nil)
	fmt.Println("发送请求")
	if err != nil {
		fmt.Print("出错了")
		return "", err
	}
	for k, v := range forgeHeaders {
		req.Header.Add(k, v)
	}
	response, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()
	// Error pages (rate limiting, blocking, server faults) carry no film
	// data; surface them as errors instead of handing their HTML to the parser.
	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, response.Status)
	}
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// Patterns used by parseFilmFormResp, compiled once at package scope
// instead of on every call.
var (
	// filmLinkRe captures, per film, the detail link (1), the title taken
	// from the image's alt text (2) and the image URL (3).
	filmLinkRe = regexp.MustCompile(`<a href="(.*?)">[^>]*<img [^>]* alt="(.*?)" src="(.*?)"[^>]*>[^>]*</a>`)
	// filmCommentRe captures the number of people who rated the film.
	filmCommentRe = regexp.MustCompile(`<span>(.*?)人评价</span>`)
	// filmScoreRe captures the average rating.
	filmScoreRe = regexp.MustCompile(`property="v:average">(.*?)</span>`)
)

// parseFilmFormResp extracts the films contained in the HTML text resp.
//
// Link/title/image matches, rating matches and rating-count matches are
// collected independently and paired by position. When a film has no rating
// or count at its position, or the captured text is not a number, the
// corresponding field keeps its zero value instead of causing a panic.
// The result is nil when resp contains no film entry.
func parseFilmFormResp(resp string) (arr []FilmBean) {
	fmt.Println("parse-------------")
	links := filmLinkRe.FindAllStringSubmatch(resp, -1)
	comments := filmCommentRe.FindAllStringSubmatch(resp, -1)
	scores := filmScoreRe.FindAllStringSubmatch(resp, -1)
	if len(links) == 0 {
		return nil
	}

	// Debug output: echo the first entry before the per-film lines.
	first := links[0]
	fmt.Printf("url: %s, name:%s, img:%s", first[1], first[2], first[3])
	fmt.Println("")

	films := make([]FilmBean, 0, len(links))
	for index, item := range links {
		fb := FilmBean{detail: item[1], name: item[2], icon: item[3]}
		// The three match lists can differ in length; guard every lookup.
		if index < len(scores) {
			if v, err := strconv.ParseFloat(scores[index][1], 32); err == nil {
				fb.score = float32(v)
			}
		}
		if index < len(comments) {
			if n, err := strconv.Atoi(comments[index][1]); err == nil {
				fb.commentCount = n
			}
		}
		fmt.Printf("url: %s, name:%s, img:%s", item[1], item[2], item[3])
		fmt.Println("", index)
		films = append(films, fb)
	}
	return films
}
FilmDb.go
// FilmDb
package main
import (
"database/sql"
"fmt"
_ "github.com/go-sql-driver/mysql"
)
// DbInsert stores one film as a new row of the `film` table and prints the
// id of the inserted row.
//
// Every call opens its own database handle. The handle and the prepared
// statement are released before returning, so repeated calls no longer pile
// up open connections on the server. Any database error panics via checkErr.
func DbInsert(f FilmBean) {
	fmt.Println("")
	db, err := sql.Open("mysql", "root:@/go_film?charset=utf8")
	checkErr(err)
	// sql.Open returns a connection pool; without Close its connections stay open.
	defer db.Close()

	stmt, err := db.Prepare(`INSERT film (name,detail,score,commentCount,icon) values (?,?,?,?,?)`)
	checkErr(err)
	// Deferred calls run in reverse order, so the statement is closed before the pool.
	defer stmt.Close()

	res, err := stmt.Exec(f.name, f.detail, f.score, f.commentCount, f.icon)
	checkErr(err)
	id, err := res.LastInsertId()
	checkErr(err)
	fmt.Println(id)
}
// checkErr panics with err when err is non-nil and does nothing otherwise.
func checkErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}
注意:
如果不接入mysql,把main里面 DbInsert(v) 、DbInsert(bean) 删除即可
如接入mysql, 注意上面的 import _ "github.com/go-sql-driver/mysql"
非windows系统,根据网上使用git安装mysql驱动
此处对windows 手动安装说明
参考 https://www.cnblogs.com/wangqishu/p/5147108.html
下载包
https://github.com/go-sql-driver/mysql/releases
当前最新版本,mysql-1.4.1
查看自己的gopath 如:D:\devTools\go
在 GOPATH 的 src/ 目录下创建 github.com\go-sql-driver\mysql 目录,对应import的结构;解压刚下载的zip 到该目录; (应该也可以直接把mysql解压到src下,import改成mysql,不带前面路径)
例:D:\devTools\go\src\github.com\go-sql-driver\mysql