Colly 学习笔记(一)——爬虫框架,抓取中金公司行业市盈率数据
Colly 学习笔记(一)——爬虫框架,抓取中金公司行业市盈率数据
Colly 学习笔记(二)——爬虫框架,抓取下载数据(上证A股数据下载)
Colly 学习笔记(三)——爬虫框架,抓取动态页面数据(上证A股动态数据抓取)
-
Colly 主体类是 Collector类,Collector管理网络通信,当Collector的任务运行时也负责执行注册的回调函数。
初始化过程如下:
// Create a collector with the default settings.
c := colly.NewCollector()
- 你可以注册不同的回调函数,通过Collector来控制任务或检索信息。
// Called before a request is sent.
c.OnRequest(func(req *colly.Request) {
	fmt.Println("Visiting", req.URL)
})

// Called after an error has been received.
c.OnError(func(_ *colly.Response, err error) {
	log.Println("Something went wrong:", err)
})

// Called once the response headers have been received.
c.OnResponseHeaders(func(resp *colly.Response) {
	fmt.Println("Visited", resp.Request.URL)
})

// Called once the response has been received.
c.OnResponse(func(resp *colly.Response) {
	fmt.Println("Visited", resp.Request.URL)
})

// Called after OnResponse for HTML content: follow every link on the page.
c.OnHTML("a[href]", func(link *colly.HTMLElement) {
	href := link.Attr("href")
	link.Request.Visit(href)
})

// Pick out specific elements of the page with a selector.
c.OnHTML("tr td:nth-of-type(1)", func(cell *colly.HTMLElement) {
	fmt.Println("First column of a table row:", cell.Text)
})

// Called after OnResponse for XML content.
c.OnXML("//h1", func(heading *colly.XMLElement) {
	fmt.Println(heading.Text)
})

// Called after the OnXML callbacks.
c.OnScraped(func(resp *colly.Response) {
	fmt.Println("Finished", resp.Request.URL)
})
-
示例
由于colly手册内容非常少,所以我这边写了一个简单的爬虫,用于抓取中金公司页面的市盈率数据,大家可以体验一下简单爬虫的开发流程。
首先打开中金公司页面,查看网页结构,获取目标单元格的selector
然后根据selector和表结构,进行数据抓取
#得到的selector如下 #这个是行业名称对应列的selector body > div.hysyl.i_content.w1200.mt-20 > div > div > div.j-tab-con.mb-15 > div:nth-child(1) > div.sCon > div:nth-child(1) > table > tbody > tr > td > table:nth-child(2) #这个是行业代码对应列的selector body > div.hysyl.i_content.w1200.mt-20 > div > div > div.j-tab-con.mb-15 > div:nth-child(1) > div.sCon > div:nth-child(1) > table > tbody > tr > td > table:nth-child(2) > tbody > tr > td:nth-child(1) #通过比对和整理得到如下selector,子行业类推得到 table > tbody > tr > td > table.list-div-table> tbody >tr
示例代码如下:
package main import ( "fmt" "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" log "github.com/sirupsen/logrus" "strconv" "time" ) //证监会行业市盈率 type EarningsOfIndustry struct { CodeOfIndustry string `json:"code_of_industry"` //行业代码 NameOfIndustry string `json:"name_of_industry"` //行业名称 NewData float64 `json:"new_data"` //最新数据 NumberOfStacks int `json:"number_of_stacks"` // 公司数量 NumberOfDeficit int `json:"number_of_deficit"` // 亏损公司数量 InOneMonth float64 `json:"in_one_month"` //近1个月 InThreeMonth float64 `json:"in_three_month"` //近3个月 InSixMonth float64 `json:"in_six_month"` //近6个月 InOneYear float64 `json:"in_one_year"` //近一年 SubIndustry []EarningsOfIndustry `json:"sub_industry"` //子行业 } func (receiver Collector) ScrapeEarnings(pType string, pData string) (error,[]*EarningsOfIndustry) { Url:="http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type="+pType+"&date="+pData receiver.MLog.GetLogHandle().WithFields(log.Fields{"URL":Url}).Info("URL...") c := colly.NewCollector(colly.UserAgent(RandomString()),colly.AllowURLRevisit()) EarningsOfIndustryList := make([]*EarningsOfIndustry, 0) c.OnRequest(func(r *colly.Request) { //r.Headers.Set("User-Agent", RandomString()) receiver.MLog.GetLogHandle().WithFields(log.Fields{"Request":fmt.Sprintf("%+v",*r),"Headers":fmt.Sprintf("%+v",*r.Headers)}).Info("Begin Visiting...") }) c.OnError(func(_ *colly.Response, err error) { receiver.MLog.GetLogHandle().WithFields(log.Fields{"error":err}).Info("Something went wrong:") }) //填入刚才获取query语句 c.OnHTML("table > tbody > tr > td > table.list-div-table> tbody >tr", func(e *colly.HTMLElement) { //link := e.Attr("href") item :=EarningsOfIndustry{ //获取每一行的对应列数据 CodeOfIndustry: e.ChildText("td:nth-child(1)"), NameOfIndustry: e.ChildText("td:nth-child(2)"), } EarningsOfIndustryList=append(EarningsOfIndustryList,&item) NewData, err := strconv.ParseFloat(e.ChildText("td:nth-child(3)"), 64) if err == nil { item.NewData = NewData } NumberOfStacks, err := 
strconv.ParseInt(e.ChildText("td:nth-child(4)"), 10,32) if err == nil { item.NumberOfStacks = int(NumberOfStacks) } NumberOfDeficit, err := strconv.ParseInt(e.ChildText("td:nth-child(5)"), 10,32) if err == nil { item.NumberOfDeficit = int(NumberOfDeficit) } InOneMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(6)"), 64) if err == nil { item.InOneMonth = InOneMonth } InThreeMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(7)"), 64) if err == nil { item.InThreeMonth = InThreeMonth } InSixMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(8)"), 64) if err == nil { item.InSixMonth = InSixMonth } InOneYear, err := strconv.ParseFloat(e.ChildText("td:nth-child(9)"), 64) if err == nil { item.InOneYear = InOneYear } item.SubIndustry = make([]EarningsOfIndustry, 0) //表中还有子行业,而且与行业的结构是同属于table结构,通过以下函数可以调用到对应的结构 e.DOM.Parent().Parent().Next().Find("table.list-div-table> tbody >tr").Each(func(i int, selection *goquery.Selection) { subItem :=EarningsOfIndustry{ CodeOfIndustry: e.ChildText("td:nth-child(1)"), NameOfIndustry: e.ChildText("td:nth-child(2)"), } NewData, err := strconv.ParseFloat(e.ChildText("td:nth-child(3)"), 64) if err == nil { item.NewData = NewData } NumberOfStacks, err := strconv.ParseInt(e.ChildText("td:nth-child(4)"), 10,32) if err == nil { item.NumberOfStacks = int(NumberOfStacks) } NumberOfDeficit, err := strconv.ParseInt(e.ChildText("td:nth-child(5)"), 10,32) if err == nil { item.NumberOfDeficit = int(NumberOfDeficit) } InOneMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(6)"), 64) if err == nil { item.InOneMonth = InOneMonth } InThreeMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(7)"), 64) if err == nil { item.InThreeMonth = InThreeMonth } InSixMonth, err := strconv.ParseFloat(e.ChildText("td:nth-child(8)"), 64) if err == nil { item.InSixMonth = InSixMonth } InOneYear, err := strconv.ParseFloat(e.ChildText("td:nth-child(9)"), 64) if err == nil { item.InOneYear = InOneYear } item.SubIndustry = 
append(item.SubIndustry,subItem) }) //receiver.MLog.GetLogHandle().WithFields(log.Fields{"Items":item}).Info("Find Item") }) //finish c.OnScraped(func(r *colly.Response) { //bData, _ := json.MarshalIndent(EarningsOfIndustryList, "", "\t") receiver.MLog.GetLogHandle().WithFields(log.Fields{"Number of Results":len(EarningsOfIndustryList)}).Info("Finish") }) c.Visit(Url) return nil,EarningsOfIndustryList } func (receiver Collector) ScrapeEarningsTest() error { now := time.Now() //zjh1:静态市盈率 //zjh2:滚动市盈率 //zjh3:市净率 //zjh4:股息率 var pType = []string{"zjh1","zjh2","zjh3","zjh4"} var pDate = fmt.Sprintf("%02d-%02d-%02d",now.Year(),now.Month(),now.Day()) for _ , v := range pType { err,res := receiver.ScrapeEarnings(v,pDate) receiver.MLog.GetLogHandle().WithFields(log.Fields{"Results":res,"err":err}).Info("Finish") } return nil } func main() { var c Collector c.MLog=receiver.MLog c.ScrapeEarningsTest() return }
运行结果如下: