PHP lareal_Go语言开发的网站模板爬虫 Lea Web Template Spider

最新推荐文章于 2021-03-29 09:07:16 发布

weixin_39805734

最新推荐文章于 2021-03-29 09:07:16 发布

阅读量149

点赞数

文章标签： PHP lareal

本文链接：https://blog.csdn.net/weixin_39805734/article/details/111776839

版权

http://themeforest.net 上有很多设计精良的网站模板, 平时没事就去那里找漂亮的模板收藏, 但都要收费. 不过既然那些模板都有 Demo, 写个爬虫不就可以全部下载了吗? 于是就快速用 PHP 写了一个模板下载程序, 但速度欠佳, 不支持多线程. 又因为之前断断续续学了 Go 语言, 干脆用它重写了一遍, 使用 Goroutine 之后速度快了很多.

正在学习Go语言的同学们, 看到模板想下载的同学们可以用用.

附主要代码:

package lealife

import (

"io/ioutil"

"net/http"

"strings"

"regexp"

"log"

"os"

"path/filepath"

"lealife/util"

"sync"

)

// LeaSpider crawls one website and mirrors its pages and assets into a local
// directory.
type LeaSpider struct {
	// Location of the site being crawled.
	indexUrl      string
	scheme        string // "http://" or "https://"
	host          string // e.g. www.lealife.com or lealife.com
	schemeAndHost string // e.g. http://lealife.com

	// Local output settings.
	targetPath         string   // root directory that files are written under
	defaultFilename    string   // file name generated for directory-like URLs
	noChildrenFileExts []string // extensions that are saved without being scanned for links

	// Bookkeeping of visited URLs (keys are scheme-less URLs).
	hadDoneUrl   map[string]bool // URLs already fetched
	exceptionUrl map[string]bool // URLs whose fetch failed and should be retried

	t int // HTML is only scanned for links while t == 1

	// Concurrency control.
	goroutineNum int             // number of crawl goroutines currently running
	lock         *sync.Mutex     // guards goroutineNum
	w            sync.WaitGroup  // tracks outstanding crawl goroutines
	ch           chan bool       // buffered channel used as a goroutine-count limiter
}

// NewLeaSpider returns a spider with its default settings: output under "D:",
// "index.html" as the generated file name for directory URLs, HTML link
// scanning enabled (t == 1), and a 1000-slot channel that bounds the number
// of concurrently running crawl goroutines. It also redirects the standard
// logger through setLogOutputWriter.
func NewLeaSpider() *LeaSpider {
	lea := &LeaSpider{
		targetPath:         "D:",
		defaultFilename:    "index.html",
		t:                  1,
		lock:               new(sync.Mutex),
		noChildrenFileExts: []string{".js", ".ico", ".png", ".jpg", ".gif"},
		ch:                 make(chan bool, 1000),
		hadDoneUrl:         make(map[string]bool, 1000),
		exceptionUrl:       make(map[string]bool, 1000),
	}
	lea.setLogOutputWriter()
	return lea
}

// Fetch is the crawl entry point. It mirrors the site reachable from url into
// targetPath and blocks until every crawl goroutine has finished; URLs whose
// download failed are then retried once via doExceptionUrl.
//
// url may be given with or without a scheme; parseUrl decides which scheme is
// recorded on the spider.
func (this *LeaSpider) Fetch(url, targetPath string) {
	url = strings.TrimSpace(url)
	this.parseUrl(url)

	// Prepare the output directory.
	this.doTargetPath(targetPath)

	// Work with scheme-less URLs from here on (a.com, a.com/index.html).
	// TrimPrefix only removes the scheme when it is really present; slicing by
	// len(scheme) would chop the host off a URL passed without a scheme.
	url = strings.TrimPrefix(url, this.scheme)

	this.goDo(url, false)
	this.w.Wait()

	// Retry the URLs that failed during the concurrent pass.
	this.doExceptionUrl()
}

// goDo processes url in a new goroutine and recursively schedules every child
// URL that processing discovers.
//
// this.ch is used as a counting semaphore: a slot is taken before the
// goroutine starts (blocking the caller while all slots are busy) and given
// back as soon as do returns. The slot is released before the children are
// scheduled so that a goroutine never holds a slot while waiting for another
// one, which would deadlock once the channel is full.
func (this *LeaSpider) goDo(url string, needException bool) {
	this.w.Add(1)

	// Acquire a slot.
	this.ch <- true

	this.lock.Lock()
	this.goroutineNum++
	log.Println("当前共运行", this.goroutineNum, "goroutine")
	this.lock.Unlock()

	go func() {
		defer this.w.Done()

		children := this.do(url, needException)

		this.lock.Lock()
		this.goroutineNum--
		log.Println("当前共运行", this.goroutineNum, " goroutine")
		this.lock.Unlock()

		// Release the slot before spawning children (see function comment).
		<-this.ch

		for _, cUrl := range children {
			this.goDo(cUrl, false)
		}
	}()
}

// do downloads one URL, stores it under the target directory and returns the
// child URLs found in it that still have to be processed.
//
// url has no scheme (e.g. a.com/b/c/d). A URL without a file extension is
// treated as a page and is stored under a generated file name (see genUrl).
//
// needException is true when do is called for the retry pass over previously
// failed URLs. In that mode a failed download is not recorded again, which
// keeps the retry pass from looping forever; the (possibly empty) content is
// processed as-is.
//
// do runs concurrently from goDo, so the writes to hadDoneUrl and
// exceptionUrl are guarded by this.lock; unguarded concurrent map writes
// abort the program.
func (this *LeaSpider) do(url string, needException bool) (children []string) {
	url = this.trimUrl(url)
	if this.isNotNeedUrl(url, needException) {
		return nil
	}

	// Skip files that already exist locally.
	// e.g. url = a.com/a/?id=12 is stored as a.com/a/index.html?id=12
	genUrl := this.genUrl(url)
	if this.isExists(genUrl) {
		return nil
	}

	fullUrl := this.scheme + url
	if needException {
		log.Println("正在处理 `异常` " + fullUrl)
	} else {
		log.Println("正在处理 " + fullUrl)
	}

	content, err := this.getContent(fullUrl)
	if !needException && (err != nil || content == "") {
		this.lock.Lock()
		this.exceptionUrl[url] = true
		this.lock.Unlock()
		return nil
	}

	this.lock.Lock()
	this.hadDoneUrl[url] = true
	this.lock.Unlock()

	// The extension is taken after dropping the query string (a.css?v=1.3).
	ext := strings.ToLower(filepath.Ext(this.trimQueryParams(url)))
	switch {
	case ext == ".css":
		// CSS may reference further assets: url(../../img/search-icon.png)
		return this.doCSS(url, content)
	case util.InArray(this.noChildrenFileExts, ext):
		// Scripts and images are saved without being scanned for links;
		// writeFile logs its own failures.
		this.writeFile(url, content)
		return nil
	case this.t == 1:
		// Everything else is handled as HTML: collect href/src targets.
		return this.doHTML(url, genUrl, content)
	}
	return nil
}

// 处理css

func (this *LeaSpider) doCSS(url, content string) (children []string) {

children = nil

// 保存该文件

if !this.writeFile(url, content) {

return;

}

regular := "(?i)url\$(.+?)\$"

reg := regexp.MustCompile(regular)

re := reg.FindAllStringSubmatch(content, -1)

log.Println(url + " 含有: ");

log.Println(re);

baseDir := filepath.Dir(url)

for _, each := range re {

cUrl := this.trimUrl(each[1])

// 这里, goDo会申请资源, 导致doCSS一直不能释放资源

children = append(children, this.cleanUrl(baseDir + "/" + cUrl))

}

return

}

// url : a.com/a/b/d.html

// a.com/a/b/c genFilename: c_leaui_index.html

// 生成子的相对目录有用

func (this *LeaSpider) doHTML(pUrl, realPUrl, content string) (children []string) {

regular := "(?i)(src=|href=)[\"']([^#].*?)[\"']"

reg := regexp.MustCompile(regular)

re := reg.FindAllStringSubmatch(content, -1)

log.Println(pUrl + " => " + realPUrl);

log.Println(pUrl + " 含有: ");

//log.Println(re);

baseDir := filepath.Dir(realPUrl)

for _, each := range re {

// 为了完整替换

// 只替换src=""里的会有子串的问题, 一个url是另一个url子串

rawFullUrl := each[0] // src='http://www.uiueux.com/wp/webzine/wp-content/themes/webzine/js/googlefont.js.php?ver=1.6.4'

rawFullUrlPrefix := each[1]; // src=

// http://a.com/, /a/b/c/d.html, /a/b.jgp

// 如果是/a/b.jpg, 那么是相对host的, 而不是本文件的路径

rawCUrl := each[2]

cUrl := rawCUrl; // strings.TrimRight(rawCUrl, "/") // 万一就是/呢?

// 如果一个链接以//开头, 那么省略了http:, 如果以/开头, 则相对于host

prefixNotHttp := false

if strings.HasPrefix(cUrl, "//") {

cUrl = this.scheme + util.Substring(cUrl, 2)

prefixNotHttp = true

} else if strings.HasPrefix(cUrl, "/") {

cUrl = this.schemeAndHost + cUrl

}

// 如果这个url是一个目录, 新建一个文件

// 如果这个url是以http://a.com开头的, host是一样的,

// 那么content的url是相对于该url

// 生成的url, 如果是目录, 会生成一个文件

cRealUrl, ok := this.getRalativeUrl(realPUrl, cUrl)

// 错误, 不是本页面, 本host的页面

if ok == -1 {

// 如果之前//替换成了http://

if prefixNotHttp {

content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix + "\"" + cRealUrl + "\"", -1)

}

continue

}

// 表示已处理过, 是相对目录了, 必须把内容的替换掉

// 但要处理的还是之前的链接http://

if ok == 1 {

cRealUrl = strings.Trim(cRealUrl, "/")

// 把//变成/

for strings.Index(cRealUrl, "//") != -1 {

cRealUrl = strings.Replace(cRealUrl, "//", "/", -1)

}

log.Println(rawCUrl + " >>>>>> " + cRealUrl)

content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix + "\"" + cRealUrl + "\"", -1)

cUrl = strings.Replace(cUrl, this.scheme, "", 1) // 把sheme去掉, do

children = append(children, cUrl) // 不需要clean

} else {

children = append(children, this.cleanUrl(baseDir + "/" + cRealUrl))

}

// 把content保存起来

if !this.writeFile(realPUrl, content) {

return;

}

// this.t++

// return

return

}

// getRalativeUrl computes how the page stored at realPUrl should link to cUrl.
//
// realPUrl is the stored path of the current page (a.com/b/c/index.html, not
// a.com/b/c). cUrl is the link target.
//
// The second result tells the caller what happened:
//
//	 1  cUrl starts with this.scheme+this.host; url is the path relative to
//	    the current page ("#" when both are the same file). The path may
//	    contain empty segments that the caller has to collapse.
//	 0  cUrl is not absolute; it is returned unchanged.
//	-1  cUrl is an http(s) URL of another host; it is returned unchanged.
func (this *LeaSpider) getRalativeUrl(realPUrl, cUrl string) (url string, ok int) {
	sameSite := this.scheme + this.host
	if !strings.HasPrefix(cUrl, sameSite) {
		if strings.HasPrefix(cUrl, "http://") || strings.HasPrefix(cUrl, "https://") {
			return cUrl, -1
		}
		return cUrl, 0
	}

	// Directory-like targets get a generated file name first.
	childPath := this.genUrl(cUrl)

	// Reduce both sides to host-relative paths.
	parentPath := this.trimUrl(strings.Replace(realPUrl, this.host, "", 1))
	childPath = this.trimUrl(strings.Replace(childPath, sameSite, "", 1))
	if parentPath == childPath {
		return "#", 1
	}

	parentParts := strings.Split(parentPath, "/")
	childParts := strings.Split(childPath, "/")
	log.Println(parentParts)
	log.Println(childParts)

	// Blank out the leading segments both paths share.
	shared := 0
	for shared < len(childParts) && shared < len(parentParts) && childParts[shared] == parentParts[shared] {
		childParts[shared] = ""
		shared++
	}

	// One "../" per remaining directory level of the current page.
	for ups := len(parentParts) - shared - 1; ups > 0; ups-- {
		url += "../"
	}
	url += strings.Join(childParts, "/")
	return url, 1
}

// trimUrl strips decoration from both ends of url: first surrounding white
// space, then double quotes, single quotes, forward slashes and backslashes,
// in that order. Each character class is trimmed once, so the order matters.
func (this *LeaSpider) trimUrl(url string) string {
	url = strings.TrimSpace(url)
	for _, cutset := range []string{`"`, `'`, `/`, `\`} {
		url = strings.Trim(url, cutset)
	}
	return url
}

// 处理异常

func (this *LeaSpider) doExceptionUrl() {

if(len(this.exceptionUrl) > 0) {

log.Println("正在处理异常Url....");

for url, _ := range this.exceptionUrl {

this.do(url, true)

}

// 如果url是 a.com/b/c/d

// 生成一个文件a.com/b/c/d/d_leaui_index.html

// 返回 d_leaui_index.html

// 如果不是一个目录, 返回""

func (this *LeaSpider) genFilename(url string) (string, bool) {

urlArr := strings.Split(url, "/")

if urlArr != nil {

last := urlArr[len(urlArr) - 1]

ext := strings.ToLower(filepath.Ext(last))

if ext == "" {

return this.defaultFilename, true // 需要append到url后面

} else if util.InArray([]string{".php", ".jsp", ".asp", ".aspx"}, ext) {

filename := filepath.Base(last) // a.php

filename = util.Substr(filename, 0, len(filename) - len(ext)) // a

return filename + ".html", false

}

return "", true;

}

// 生成真实的url

// 传来的url可能是http://a.com, 也可能是a.com

// getRelativeUrl传来的可以是http://a.com

// url = a.com/a/?id=12&id=1221, 那么genUrl=a.com/a/index.html?id=121

func (this *LeaSpider) genUrl(url string) string {

// 去掉?后面的

queryParam, fragment := "", "" // 包含?,#

pos := strings.Index(url, "?");

if pos != -1 {

queryParam = util.Substring(url, pos)

url = util.Substr(url, 0, pos);

} else {

pos = strings.Index(url, "#");

if pos != -1 {

fragment = util.Substring(url, pos)

url = util.Substr(url, 0, pos);

}

// 如果url == host

if url == this.host || url == this.schemeAndHost {

return url + "/" + this.defaultFilename + queryParam + fragment

}

genFilename, needApend := this.genFilename(url)

if genFilename != "" {

if needApend {

url += "/" + genFilename + queryParam + fragment

} else {

// 是a.php => a.html

urlArr := strings.Split(url, "/")

urlArr = urlArr[:len(urlArr)-1]

url = strings.Join(urlArr, "/") + "/" + genFilename

}

return url

}

// writeFile stores content under targetPath at the location named by url
// (query string and fragment removed), creating parent directories as needed.
// It reports whether the file was written; failures are logged.
//
// url comes from downloaded pages, so a path that would climb out of the
// target directory through ".." segments is refused.
func (this *LeaSpider) writeFile(url, content string) bool {
	// e.g. a.html?a=a11 => a.html
	url = this.trimQueryParams(url)

	rel := strings.Replace(filepath.Clean(url), "\\", "/", -1)
	if rel == ".." || strings.HasPrefix(rel, "../") {
		log.Println("写文件" + url + " 失败")
		return false
	}

	fullPath := this.targetPath + "/" + url
	dir := filepath.Dir(fullPath)
	log.Println("写目录", dir)
	if err := os.MkdirAll(dir, 0777); err != nil {
		log.Println("写目录"+dir+" 失败", err)
		return false
	}

	// WriteFile creates/truncates the file, reports write errors and closes
	// the handle on every path.
	if err := ioutil.WriteFile(fullPath, []byte(content), 0666); err != nil {
		log.Println("写文件"+fullPath+" 失败", err)
		return false
	}
	return true
}

// cleanUrl normalises url with filepath.Clean (resolving "." and ".."
// segments and duplicate separators) and then converts every backslash to a
// forward slash.
func (this *LeaSpider) cleanUrl(url string) string {
	cleaned := filepath.Clean(url)
	return strings.Replace(cleaned, `\`, "/", -1)
}

// trimQueryParams cuts url at the first '?' and then cuts what is left at the
// first '#', returning the part before them.
func (this *LeaSpider) trimQueryParams(url string) string {
	for _, marker := range []string{"?", "#"} {
		if cut := strings.Index(url, marker); cut != -1 {
			url = util.Substr(url, 0, cut)
		}
	}
	return url
}

// isExists reports, via util.IsExists, whether the local counterpart of url
// (e.g. a/b/c/d.html) is already present under the target directory.
func (this *LeaSpider) isExists(url string) bool {
	localPath := this.targetPath + "/" + url
	return util.IsExists(localPath)
}

// 不需要处理的url

// needException false 表示不要处理, 那么就要判断是否在其中

func (this *LeaSpider) isNotNeedUrl(url string, needException bool) bool {

if _, ok := this.hadDoneUrl[url]; ok {

return true

}

_, ok := this.exceptionUrl[url];

if !needException && ok {

return true

}

// http:\\/|https:\\/|

regular := "#|javascript:|mailto:|" class=|@.*?\\..+"

reg := regexp.MustCompile(regular)

if reg.MatchString(url) {

return true

}

if (strings.HasPrefix(url, "http:/") || strings.HasPrefix(url, "https:/")) &&

!strings.HasPrefix(url, this.scheme + this.host) {

return true

}

return false

}

// parseUrl derives scheme, host and schemeAndHost from the start URL.
//
// The scheme is "http://" when url starts with exactly that and "https://"
// in every other case. The host is whatever precedes the first "/" once the
// scheme has been removed (http://lealife.com/b/c => lealife.com).
func (this *LeaSpider) parseUrl(url string) {
	this.scheme = "https://"
	if strings.HasPrefix(url, "http://") {
		this.scheme = "http://"
	}

	rest := strings.Replace(url, this.scheme, "", 1)
	this.host = rest
	if slash := strings.Index(rest, "/"); slash != -1 {
		this.host = util.Substr(rest, 0, slash)
	}

	this.schemeAndHost = this.scheme + this.host
}

// getNoChildrenFileExts returns the list of file extensions that are saved
// without being scanned for further links. The spider's own slice is
// returned, not a copy.
func (this *LeaSpider) getNoChildrenFileExts() []string {
	exts := this.noChildrenFileExts
	return exts
}

// getContent downloads url with http.Get and returns the response body as a
// string.
//
// content is empty when the request failed, when the status is not 200 OK, or
// when reading the body failed. err carries the request or read error; a
// non-200 status alone leaves err nil, so callers also have to check for
// empty content.
func (this *LeaSpider) getContent(url string) (content string, err error) {
	var resp *http.Response
	resp, err = http.Get(url)

	hasBody := resp != nil && resp.Body != nil
	if hasBody {
		defer resp.Body.Close()
	} else {
		log.Println("ERROR " + url + " 返回为空 ")
	}

	if !hasBody || err != nil || resp.StatusCode != http.StatusOK {
		log.Println("ERROR " + url)
		log.Println(err)
		return "", err
	}

	data, readErr := ioutil.ReadAll(resp.Body)
	if readErr != nil {
		return "", readErr
	}
	return string(data), nil
}

// 生成存储位置

func (this *LeaSpider) doTargetPath(path string) {

path = strings.TrimRight(path, "/"); // 不能TrimLeft, 万一是linux呢?

path = strings.Trim(path, "\\");

if path != "" {

this.targetPath = path;

}

// 生成目录

if this.targetPath != "" {

os.MkdirAll(this.targetPath, 0777)

} else {

panic("存储位置异常")

}

// setLogOutputWriter redirects the standard logger to the spider's log file.
//
// The file is opened in append mode with regular permissions so that earlier
// runs are preserved and the file stays readable. When the file cannot be
// opened the error is reported and the logger keeps its current output
// instead of terminating the whole process.
func (this *LeaSpider) setLogOutputWriter() {
	const logPath = "C:/Users/Administrator/workspace/lea/log.txt"

	logfile, err := os.OpenFile(logPath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Printf("%s\r\n", err.Error())
		return
	}
	log.SetOutput(logfile)
}

weixin_39805734

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
PHP lareal_Go语言开发的网站模板爬虫 Lea Web Template Spider

http://themeforest.net 内有很多的设计很好的网站模板, 平时没事就在那找漂亮的模板收藏, 但都收费, 不过既然那些模板都有Demo写个爬虫不就可以全部下载? 于是就快速用PHP写了一个模板下载程序, 但速度欠佳, 不支持多线程. 又因之前断断续续学了Go语言, 干脆用它来重写了, 使用了Goroutine速度快了很多.正在学习Go语言的同学们, 看到模板想下载的同学们可以用用...
复制链接

扫一扫