首先需要安装谷歌浏览器 Chrome(双核浏览器不行),然后下载与浏览器版本相对应的 webdriver(ChromeDriver)。
webdriver文件下载地址
http://chromedriver.storage.googleapis.com/index.html (注意:浏览器版本和 driver 的版本要对应上)
先创建配置文件,代码如下:
package driver
import (
"os/exec"
"strconv"
)
const (
	// chromePath is the location of the Google Chrome executable that
	// StartChrome launches. It is machine-specific.
	chromePath = `C:\Program Files\Google\Chrome\Application\chrome.exe`

	// chromePort is the base remote-debugging port; each Driver adds its
	// index to it.
	chromePort = 9222
)
// StartChrome launches the Chrome browser with notifications disabled and the
// remote-debugging port (chromePort plus the driver's index) enabled.
//
// The started command is stored in d.cmd. The returned error is the one
// reported by starting the process; the method does not wait for Chrome to
// exit.
func (d *Driver) StartChrome() error {
	debugPort := strconv.Itoa(chromePort + d.index)
	args := []string{
		`--disable-notifications`,
		`--remote-debugging-port=` + debugPort,
		// `--user-data-dir=` + chromeDir, // currently disabled
	}
	d.cmd = exec.Command(chromePath, args...)
	return d.cmd.Start()
}
// StopChrome kills the Chrome process started by StartChrome.
//
// It is a no-op returning nil when no process has been started (d.cmd or its
// Process is nil), so it is safe to call during cleanup of a partially
// started Driver. Otherwise it returns the error from killing the process.
func (d *Driver) StopChrome() error {
	if d.cmd == nil || d.cmd.Process == nil {
		return nil
	}
	return d.cmd.Process.Kill()
}
然后是我封装的一些抓取方法:
package driver
import (
"fmt"
"log"
"os/exec"
"strconv"
"time"
"github.com/tebeka/selenium"
"github.com/tebeka/selenium/chrome"
)
// Driver bundles everything needed for one browser automation instance: the
// Chrome process, the ChromeDriver service and the WebDriver session.
type Driver struct {
	Count int // instance count (not read or written by the methods in this package excerpt)

	index   int                // offset added to chromePort and seleniumPort for this instance
	cmd     *exec.Cmd          // Chrome process, set by StartChrome
	driver  selenium.WebDriver // WebDriver session, set by Start
	service *selenium.Service  // ChromeDriver service, set by StartService
}
// CheckRule describes how to extract one value from a page: Selector is the
// CSS selector used to locate the element and Attribute names what to read
// from it (the special value "text" means the element's text; see Parse).
type CheckRule struct {
	// ID is the primary key (per the gorm tag).
	ID int `gorm:"primary_key" json:"id"`
	// TypeID presumably groups rules by type; its meaning is not shown here.
	TypeID int `json:"type_id"`
	// Name is the rule's name.
	Name string `json:"name"`
	// Selector is passed to Driver.Find as a CSS selector.
	Selector string `json:"selector"`
	// Attribute is the attribute to read, or "text" for the element text.
	Attribute string `json:"attribute"`
	// CreateTime and UpdateTime are record timestamps.
	CreateTime time.Time `json:"create_time"`
	UpdateTime time.Time `json:"update_time"`
}
// NewDriver creates a Driver with index 0 and starts it.
//
// On success it returns the started Driver and a nil error. If Start fails,
// it returns a nil Driver together with that error.
func NewDriver() (driver *Driver, err error) {
	d := &Driver{index: 0}
	if err = d.Start(); err != nil {
		return nil, err
	}
	return d, nil
}
// Start launches Chrome, starts the ChromeDriver service and then opens a
// WebDriver session that targets the debug address of the Chrome instance
// started here.
//
// If a later step fails, whatever the earlier steps started is shut down
// again, so a failed Start does not leave a Chrome or ChromeDriver process
// running. Every failure is logged and the error of the failing step is
// returned; cleanup errors are only logged.
func (d *Driver) Start() (err error) {
	if err = d.StartChrome(); err != nil {
		log.Println(err)
		return err
	}

	// Presumably gives Chrome time to open its debug port before it is used.
	time.Sleep(time.Second)

	if err = d.StartService(); err != nil {
		log.Println(err)
		// The browser is already running; do not leak it.
		if killErr := d.StopChrome(); killErr != nil {
			log.Println("chrome.Stop():", killErr)
		}
		return err
	}

	caps := selenium.Capabilities{
		"browserName": "chrome",
	}
	caps.AddChrome(chrome.Capabilities{
		DebuggerAddr: `127.0.0.1:` + strconv.Itoa(chromePort+d.index),
		W3C:          true,
	})

	d.driver, err = selenium.NewRemote(caps, fmt.Sprintf(`http://localhost:%d/wd/hub`, seleniumPort+d.index))
	if err != nil {
		log.Println(err)
		// Neither the service nor the browser is usable without a session.
		if stopErr := d.StopService(); stopErr != nil {
			log.Println("service.Stop():", stopErr)
		}
		if killErr := d.StopChrome(); killErr != nil {
			log.Println("chrome.Stop():", killErr)
		}
		return err
	}
	return nil
}
// Stop shuts down the WebDriver session and the ChromeDriver service.
//
// With a session present, the current window is closed and the session is
// quit; if quitting fails, the Chrome process is killed as a fallback. When
// no session exists (for example because Start failed part-way), those steps
// are skipped instead of panicking on the nil session, and a Chrome process
// that was started is killed directly.
//
// Failures of the individual steps are logged. Only the error from stopping
// the service is returned.
func (d *Driver) Stop() (err error) {
	if d.driver != nil {
		if err = d.driver.Close(); err != nil {
			log.Println("WebDriver.Close():", err)
		}
		if err = d.driver.Quit(); err != nil {
			log.Println("WebDriver.Quit():", err)
			if err = d.StopChrome(); err != nil {
				log.Println("chrome.Stop():", err)
			}
		}
	} else if d.cmd != nil && d.cmd.Process != nil {
		// No session to quit, but the browser process exists: kill it.
		if err = d.StopChrome(); err != nil {
			log.Println("chrome.Stop():", err)
		}
	}

	if err = d.StopService(); err != nil {
		log.Println("service.Stop():", err)
	}
	return err
}
// Parse extracts a value from the current page according to rule.
//
// The element is located with rule.Selector (via Find). If rule.Attribute is
// "text" the element's text is returned; otherwise the named attribute is
// returned. A lookup failure yields an empty value and the lookup error.
func (d *Driver) Parse(rule CheckRule) (value string, err error) {
	elem, findErr := d.Find(rule.Selector)
	if findErr != nil {
		return "", findErr
	}

	switch rule.Attribute {
	case "text":
		return elem.Text()
	default:
		return elem.GetAttribute(rule.Attribute)
	}
}
// ParseEnd finishes a parsing round by closing windows until a single one is
// left, and then switching to that remaining window.
//
// Windows are closed from the end of the handle list. Any error is logged;
// a failure to close a window or to re-read the handle list stops the loop,
// so a window that cannot be closed does not cause an endless retry.
func (d *Driver) ParseEnd() {
	tabs, err := d.driver.WindowHandles()
	if err != nil {
		log.Println(err)
		return
	}

	for len(tabs) >= 2 {
		last := tabs[len(tabs)-1]
		if err = d.driver.CloseWindow(last); err != nil {
			// Retrying would just hit the same window again.
			log.Println(err)
			break
		}
		tabs, err = d.driver.WindowHandles()
		if err != nil {
			log.Println(err)
			break
		}
	}

	if len(tabs) == 1 {
		if err = d.driver.SwitchWindow(tabs[0]); err != nil {
			log.Println(err)
		}
	}
}
// Get asks the underlying WebDriver session to load url and returns the
// error it reports.
func (d *Driver) Get(url string) error {
	err := d.driver.Get(url)
	return err
}
// Wait waits for the page to be ready, using a five-second timeout.
//
// Readiness is approximated by the page title being readable: the wait
// condition reports true once the session's Title call succeeds and reports
// the error if that call fails.
func (d *Driver) Wait() error {
	titleReadable := func(wd selenium.WebDriver) (bool, error) {
		if _, err := wd.Title(); err != nil {
			return false, err
		}
		return true, nil
	}
	return d.driver.WaitWithTimeout(titleReadable, 5*time.Second)
}
// Title returns the title reported by the underlying WebDriver session.
func (d *Driver) Title() (value string, err error) {
	value, err = d.driver.Title()
	return value, err
}
// Source returns the page source reported by the underlying WebDriver
// session.
func (d *Driver) Source() (value string, err error) {
	value, err = d.driver.PageSource()
	return value, err
}
// Switch switches the session to the newly opened window, taken to be the
// last entry in the list of window handles.
//
// It returns the error from listing the handles or from switching windows,
// and an error (rather than panicking) when the handle list is empty.
func (d *Driver) Switch() error {
	handles, err := d.driver.WindowHandles()
	if err != nil {
		return err
	}
	if len(handles) == 0 {
		return fmt.Errorf("no window handles available to switch to")
	}
	return d.driver.SwitchWindow(handles[len(handles)-1])
}
// CurrentURL returns the current URL reported by the underlying WebDriver
// session.
func (d *Driver) CurrentURL() (string, error) {
	current, err := d.driver.CurrentURL()
	return current, err
}
// Find looks up a single element, treating value as a CSS selector.
func (d *Driver) Find(value string) (selenium.WebElement, error) {
	return d.FindBy(selenium.ByCSSSelector, value)
}
// Finds looks up all elements matching value, treated as a CSS selector.
func (d *Driver) Finds(value string) ([]selenium.WebElement, error) {
	elems, err := d.driver.FindElements(selenium.ByCSSSelector, value)
	return elems, err
}
// FindBy looks up a single element using the given lookup strategy by (for
// example selenium.ByCSSSelector) and the strategy-specific value.
func (d *Driver) FindBy(by string, value string) (selenium.WebElement, error) {
	elem, err := d.driver.FindElement(by, value)
	return elem, err
}
然后是服务启动文件:
package driver
import (
"log"
"github.com/tebeka/selenium"
)
const (
	// seleniumPath is the location of the ChromeDriver executable. This path
	// will be different on your system, e.g. `D:\chromedriver\chromedriver.exe`.
	seleniumPath = `F:\谷歌浏览器\chromedriver.exe`

	// seleniumPort is the base port of the ChromeDriver service; each Driver
	// adds its index to it.
	seleniumPort = 9515
)
// StartService starts a ChromeDriver service from seleniumPath on port
// seleniumPort plus the driver's index and stores it in d.service.
//
// A failure is logged and returned.
func (d *Driver) StartService() error {
	port := seleniumPort + d.index

	// No service options are set. For debugging, selenium.Output(os.Stderr)
	// can be appended here to send debug information to STDERR, and
	// selenium.SetDebug(true) can be called.
	opts := make([]selenium.ServiceOption, 0)

	svc, err := selenium.NewChromeDriverService(seleniumPath, port, opts...)
	d.service = svc
	if err != nil {
		log.Println(err)
	}
	return err
}
// StopService stops the ChromeDriver service started by StartService.
//
// It does nothing and returns nil when no service is set. If stopping fails,
// the error is returned and the service reference is kept; on success the
// reference is cleared.
func (d *Driver) StopService() error {
	svc := d.service
	if svc == nil {
		return nil
	}
	if err := svc.Stop(); err != nil {
		return err
	}
	d.service = nil
	return nil
}
这样就可以使用 webdriver 抓取网站了。
示例:
d, err := dr.NewDriver()
if err != nil {
	// NewDriver returns a nil *Driver on failure, so stop here instead of
	// dereferencing it below.
	panic(err)
}
// Shut down the browser session and the ChromeDriver service when done, so
// no Chrome or ChromeDriver process is left running.
defer d.Stop()

if err := d.Get("xxx"); err != nil {
	panic(err)
}
后续会更新如何使用 driver 抓取数据。