package demo
import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"github.com/axgle/mahonia"
)
// ZolSpider groups the scraping routines in this file (page download,
// listing-link extraction and product-parameter printing) as methods.
type ZolSpider struct {
// indexUrl is not read or written by any method visible in this file.
indexUrl string
}
// ConvertToString decodes src using the mahonia decoder for srcCode, then runs
// the decoded text through the Translate method of the decoder for tagCode and
// returns the translated bytes as a string.
//
// The byte count and error reported by Translate are discarded.
//
// NOTE(review): presumably mahonia.NewDecoder yields nil for an unsupported
// charset name, in which case this method would panic — confirm.
func (s ZolSpider) ConvertToString(src string, srcCode string, tagCode string) string {
	decoded := mahonia.NewDecoder(srcCode).ConvertString(src)
	_, translated, _ := mahonia.NewDecoder(tagCode).Translate([]byte(decoded), true)
	return string(translated)
}
// readUrlBody issues a GET request for url with browser-like Accept,
// User-Agent and Referer headers and returns the response body converted from
// GBK to UTF-8 via ConvertToString.
//
// It returns an empty string and a non-nil error when the request cannot be
// built or sent, when the server answers with a status other than 200 OK, or
// when the body cannot be read.
func (s ZolSpider) readUrlBody(url string) (string, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", fmt.Errorf("building request for %q: %w", url, err)
	}
	request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36")
	request.Header.Set("Referer", "http://www.baidu.com")

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		// response is nil here, so it must not be touched.
		return "", fmt.Errorf("fetching %q: %w", url, err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetching %q: unexpected status %s", url, response.Status)
	}

	// The body is a stream and can only be consumed once.
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %q: %w", url, err)
	}
	return s.ConvertToString(string(body), "GBK", "UTF-8"), nil
}
// catchCategoryUrl downloads the page at url and extracts two lists from it:
// the href of every "更多参数>>" link (first result) and the text of every
// <b class="price-type"> element (second result).
//
// The two lists are collected independently and may differ in length. A
// download error is ignored; whatever body text was returned is scanned.
func (s ZolSpider) catchCategoryUrl(url string) ([]string, []string) {
	page, _ := s.readUrlBody(url)

	linkRe := regexp.MustCompile(`<a class="more" href="(.*?)" target="_blank">更多参数>><\/a>`)
	priceRe := regexp.MustCompile(`<b class="price-type">(.*?)</b>`)

	// firstGroups keeps only the first capture group of every match.
	firstGroups := func(matches [][]string) []string {
		out := make([]string, 0, len(matches))
		for _, m := range matches {
			out = append(out, m[1])
		}
		return out
	}

	links := firstGroups(linkRe.FindAllStringSubmatch(page, -1))
	prices := firstGroups(priceRe.FindAllStringSubmatch(page, -1))
	return links, prices
}
// catchProductInfo downloads the product page at url and prints what it can
// extract from it to standard output: first the raw `proName: '...'` and
// `manuName: '...'` snippets found in the page, then one "label : value" line
// for every <tr> of the detailed-parameters table that has at least two cells.
//
// Nothing is collected for the caller: the return value is always the empty
// string. If the page cannot be downloaded, the error is printed and nothing
// else is.
func (s ZolSpider) catchProductInfo(url string) string {
	body, err := s.readUrlBody(url)
	if err != nil {
		fmt.Println(err)
		return ""
	}

	// Cut out the parameter table, then strip markup that would otherwise
	// pollute the cell text.
	tableRe := regexp.MustCompile(`<div class="detailed-parameters">(?sU:.*)<td class="copytable" colspan="2">`)
	table := tableRe.FindString(body)
	noiseRe := regexp.MustCompile(`data-rel=\'(.*?)\'|<a(.*?)>(.*?)<\/a>`)
	table = noiseRe.ReplaceAllString(table, "")
	spanRe := regexp.MustCompile(`<span id="(.*?)">(.*?)</span>`)
	table = spanRe.ReplaceAllString(table, "$2")
	table = strings.Replace(table, "<br />", ",", -1)
	table = strings.Replace(table, "<span></i>>", "", -1)
	// NOTE(review): the search string at this step was an invisible whitespace
	// character in the original, presumably a decoded "&nbsp;" — confirm.
	// Both the entity and the U+00A0 character are turned into a plain space.
	table = strings.Replace(table, "&nbsp;", " ", -1)
	table = strings.Replace(table, "\u00a0", " ", -1)
	table = strings.Replace(table, " class=\"hover-edit-param\"", "", -1)
	table = strings.Replace(table, "<em class=\"edit-param\" data-role=\"user-login\" >纠错</em>", "", -1)

	nameRe := regexp.MustCompile(`proName\: \'(?sU:.*)\'`)
	// No leading "|" here: an empty first alternative matches the empty
	// string at offset 0, so FindString would never return the brand.
	brandRe := regexp.MustCompile(`manuName\: \'(?sU:.*)\'`)
	fmt.Println(nameRe.FindString(body))
	fmt.Println(brandRe.FindString(body))

	rowRe := regexp.MustCompile(`<tr>(?sU:.*)<\/tr>`)
	// [dh] rather than [d|h]: inside a character class "|" is a literal.
	cellRe := regexp.MustCompile(`<t[dh](.*?)>(?sU:.*)<\/t[dh]>`)
	// Unused alternative tag-stripping pattern, kept as found:
	// reg := regexp.MustCompile(`<!--[^>]+>|<iframe[\S\s]+?</iframe>|<a[^>]+>|<td>|</td>|<th>|</th>|</a>|<script[\S\s]+?</script>|<div class="hzh_botleft">[\S\s]+?</div>`)
	for _, row := range rowRe.FindAllString(table, -1) {
		cells := cellRe.FindAllString(row, -1)
		// A label/value pair needs two cells; rows with fewer (including
		// none at all) are skipped instead of indexing out of range.
		if len(cells) < 2 {
			continue
		}
		fmt.Println(trimHtml(cells[0]), ":", trimHtml(cells[1]))
	}
	return ""
}
// Patterns used by trimHtml, compiled once instead of on every call.
var (
	// htmlTagRe matches anything between a "<" and the nearest following ">".
	htmlTagRe = regexp.MustCompile(`\<[\S\s]+?\>`)
	// styleBlockRe matches a lower-case <style ...>...</style> block.
	styleBlockRe = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	// scriptBlockRe matches a lower-case <script ...>...</script> block.
	scriptBlockRe = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	// spaceRunRe matches two or more consecutive whitespace characters.
	spaceRunRe = regexp.MustCompile(`\s{2,}`)
)

// trimHtml reduces an HTML fragment to its text content.
//
// Tags are lower-cased first so that <STYLE> and <SCRIPT> blocks are caught by
// the lower-case patterns; those blocks are removed together with their
// contents. Every remaining tag is replaced by a newline, each run of two or
// more whitespace characters is collapsed into a single newline, and leading
// and trailing whitespace is trimmed from the result.
func trimHtml(src string) string {
	// Lower-case every HTML tag.
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop STYLE blocks.
	src = styleBlockRe.ReplaceAllString(src, "")
	// Drop SCRIPT blocks.
	src = scriptBlockRe.ReplaceAllString(src, "")
	// Replace everything inside angle brackets with a newline.
	src = htmlTagRe.ReplaceAllString(src, "\n")
	// Collapse consecutive whitespace/newlines.
	src = spaceRunRe.ReplaceAllString(src, "\n")
	return strings.TrimSpace(src)
}
// run reads the listing page at url, then calls catchProductInfo for at most
// the first four product links found on it. A row of asterisks carrying the
// 1-based product number is printed after each of the first three products.
//
// The price list returned by catchCategoryUrl is not used.
func (s ZolSpider) run(url string) {
	links, _ := s.catchCategoryUrl(url)

	limit := len(links)
	if limit > 4 {
		limit = 4
	}
	for n := 0; n < limit; n++ {
		s.catchProductInfo("http://detail.zol.com.cn" + links[n])
		if n < 3 {
			fmt.Println("****************************************************", n+1, "*******************************************")
		}
	}
}
// Spider runs a ZolSpider against a fixed ZOL listing URL.
func Spider() {
	const firstPage = "http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_1.html"
	// Commented-out settings, carried over unchanged; nothing here uses them:
	// $oldUrl = 'http://detail.zol.com.cn/history/subcate57_0_1_0_1_%d.html';
	// $maxPage = 104;
	// $oldPage = 200;
	crawler := ZolSpider{}
	crawler.run(firstPage)
}