Colly 学习笔记(二)——爬虫框架,抓取下载数据(上证A股数据下载)
Colly 学习笔记(一)——爬虫框架,抓取中金公司行业市盈率数据
Colly 学习笔记(二)——爬虫框架,抓取下载数据(上证A股数据下载)
Colly 学习笔记(三)——爬虫框架,抓取动态页面数据(上证A股动态数据抓取)
- 上一讲简单说明了网页数据抓取,通过一个简单的爬虫掌握了基本的数据抓取方法。但实际情况中常出现网页只提供文件下载、页面上没有数据的情况(如图),此时可以用 colly 直接下载数据文件。
遇到这种提供下载的数据,colly可以直接下载。
- 首先查看列表网页结构,如下图所示
#原始的获取链接
#A股
http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1
#B股
http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=2
#科创
http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=8
Header 设置如下:
c.OnRequest(func(r *colly.Request) {
// User-Agent is set on the collector instead (see full listing below).
//r.Headers.Set("User-Agent", RandomString())
r.Headers.Set("Host", "query.sse.com.cn")
r.Headers.Set("Connection", "keep-alive")
r.Headers.Set("Accept", "*/*")
r.Headers.Set("Origin", "http://www.sse.com.cn")
r.Headers.Set("Referer", "http://www.sse.com.cn/assortment/stock/list/share/")
// Critical headers — the server returns an error if they are missing.
r.Headers.Set("Accept-Encoding", "gzip, deflate")
r.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9")
})
编写抓取结果处理代码:
c.OnScraped(func(r *colly.Response) {
	// The response body is the downloaded file itself: one record per line,
	// fields separated by tabs.
	vList := strings.Split(string(r.Body), "\n")
	for _, row := range vList {
		rSplits := strings.Split(row, "\t")
		// Skip header, blank, or short rows.
		if len(rSplits) >= 5 {
			// BUG FIX: the original split the concatenation after the string
			// literal so "+" started the next line — invalid Go (semicolon
			// insertion). It also misplaced the query separator
			// ("index.shtmlproductId=?" instead of "index.shtml?productId=").
			Url := "http://www.sse.com.cn/assortment/stock/list/info/announcement/index.shtml?productId=" + rSplits[0]
			item := Stocks{
				Guid:     rSplits[0],
				Name:     rSplits[1],
				Code:     rSplits[2],
				Describe: rSplits[3],
				Date:     rSplits[4],
				Info:     Url,
			}
			StockList = append(StockList, &item)
		}
	}
})
完整代码如下:
package main
import (
"fmt"
"github.com/gocolly/colly"
log "github.com/sirupsen/logrus"
"strings"
)
// Stocks is one record of the SSE stock-list download file
// (tab-separated fields, in this column order).
type Stocks struct {
Guid string `json:"guid"`
// Name is the security's display name.
Name string `json:"name"`
// Code is the listed stock code.
Code string `json:"code"`
Describe string `json:"describe"`
// Date is the listing date as reported in the file.
Date string `json:"date"`
// Info is the derived announcement-page URL for this stock.
Info string `json:"info"`
}
// ScrapeStocks downloads the SSE stock-list file at url and parses each
// tab-separated line into a Stocks record.
//
// NOTE(review): the (error, []*Stocks) result order is kept for backward
// compatibility with existing callers, although Go convention is
// (result, error).
func (receiver Collector) ScrapeStocks(url string) (error, []*Stocks) {
	receiver.MLog.GetLogHandle().WithFields(log.Fields{"URL": url}).Info("URL...")

	// AllowURLRevisit lets the same download URL be fetched more than once.
	// The fixed Chrome User-Agent below is what the original effectively used:
	// it set a random UA via the constructor option and then immediately
	// overrode it with this constant, so the redundant option is dropped.
	c := colly.NewCollector(colly.AllowURLRevisit())
	c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"

	StockList := make([]*Stocks, 0)

	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("Host", "query.sse.com.cn")
		r.Headers.Set("Connection", "keep-alive")
		r.Headers.Set("Accept", "*/*")
		r.Headers.Set("Origin", "http://www.sse.com.cn")
		// Critical header — the server returns an error without a Referer.
		r.Headers.Set("Referer", "http://www.sse.com.cn/assortment/stock/list/share/")
		r.Headers.Set("Accept-Encoding", "gzip, deflate")
		r.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9")
		receiver.MLog.GetLogHandle().WithFields(log.Fields{"Request": fmt.Sprintf("%+v", *r), "Headers": fmt.Sprintf("%+v", *r.Headers)}).Info("Begin Visiting...")
	})
	c.OnError(func(_ *colly.Response, err error) {
		receiver.MLog.GetLogHandle().WithFields(log.Fields{"error": err}).Info("Something went wrong:")
	})
	c.OnResponse(func(r *colly.Response) {
		receiver.MLog.GetLogHandle().WithFields(log.Fields{"Headers": r.Headers}).Info("Receive Header")
	})

	// OnScraped fires after the download completes; the body is the file
	// itself (newline-separated rows, tab-separated fields).
	c.OnScraped(func(r *colly.Response) {
		for _, row := range strings.Split(string(r.Body), "\n") {
			rSplits := strings.Split(row, "\t")
			if len(rSplits) < 5 {
				continue // header, blank, or malformed row
			}
			// BUG FIX: the original built "index.shtmlproductId=?<id>" — the
			// query separator was misplaced.
			infoURL := "http://www.sse.com.cn/assortment/stock/list/info/announcement/index.shtml?productId=" + rSplits[0]
			item := Stocks{
				Guid:     rSplits[0],
				Name:     rSplits[1],
				Code:     rSplits[2],
				Describe: rSplits[3],
				Date:     rSplits[4],
				Info:     infoURL,
			}
			StockList = append(StockList, &item)
			receiver.MLog.GetLogHandle().WithFields(log.Fields{"item": item}).Info("Receive message ")
		}
	})

	// BUG FIX: the original discarded c.Visit's error and always returned nil,
	// so callers could never tell a failed download from an empty list.
	if err := c.Visit(url); err != nil {
		return err, nil
	}
	return nil, StockList
}
// ScrapeStocksTest downloads the three SSE stock lists:
// stockType=1 (A shares), stockType=2 (B shares), stockType=8 (STAR Market).
//
// BUG FIX: the original ignored both the error and the result of every
// ScrapeStocks call and unconditionally returned nil; errors now abort the
// sequence and propagate to the caller.
func (receiver Collector) ScrapeStocksTest() error {
	const base = "http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType="
	for _, stockType := range []string{"1", "2", "8"} {
		if err, _ := receiver.ScrapeStocks(base + stockType); err != nil {
			return err
		}
	}
	return nil
}
// main runs the SSE stock-list scrape.
//
// BUG FIX: the original read `receiver.MLog`, but no `receiver` exists at
// package scope, so the file did not compile.
// NOTE(review): c.MLog is left zero-valued here; it must be wired to a real
// logger before ScrapeStocks logs through it — confirm against the Collector
// definition (not visible in this listing).
func main() {
	var c Collector
	if err := c.ScrapeStocksTest(); err != nil {
		fmt.Println("scrape failed:", err)
	}
}
结果图如下: