"股票数据Scrapy爬虫"实例
下面是stocks.py文件源代码
import re

import scrapy


class StocksSpider(scrapy.Spider):
    name = "stocks"
    start_urls = ['https://quote.eastmoney.com/stocklist.html']

    def parse(self, response):
        # Walk every link on the stock list page, keep those containing
        # a stock code such as sh600000 or sz000001, and request the
        # corresponding detail page.
        for href in response.css('a::attr(href)').extract():
            try:
                stock = re.findall(r"[s][hz]\d{6}", href)[0]
                url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
                yield scrapy.Request(url, callback=self.parse_stock)
            except IndexError:
                # The link contains no stock code; skip it.
                continue

    def parse_stock(self, response):
        # Collect the key/value fields from the stock detail page.
        infoDict = {}
        stockInfo = response.css('.stock-bets')
        name = stockInfo.css('.bets-name').extract()[0]
        keyList = stockInfo.css('dt').extract()
        valueList = stockInfo.css('dd').extract()
        for i in range(len(keyList)):
            # Strip the surrounding <dt>/</dt> and <dd>/</dd> markup.
            key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]
            try:
                val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
            except IndexError:
                val = '--'
            infoDict[key] = val
        # '股票名称' means "stock name".
        infoDict.update(
            {'股票名称': re.findall(r'\s.*\(', name)[0].split()[0] +
                         re.findall(r'>.*<', name)[0][1:-1]})
        yield infoDict
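In parse(), the regular expression r"[s][hz]\d{6}" picks out Shanghai (sh...) and Shenzhen (sz...) stock codes from each link. A quick illustration (the sample link here is hypothetical, not taken from the original page):

import re

href = 'https://quote.eastmoney.com/sh600000.html'
print(re.findall(r"[s][hz]\d{6}", href))  # prints ['sh600000']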
Below is the source code of pipelines.py:

class BaidustocksPipeline(object):
    def process_item(self, item, spider):
        return item


class BaidustocksInfoPipeline(object):
    def open_spider(self, spider):
        # Called when the spider starts: open the output file.
        # encoding='utf-8' keeps the Chinese field names readable.
        self.f = open('BaiduStockInfo.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        # Called when the spider finishes: close the output file.
        self.f.close()

    def process_item(self, item, spider):
        # Write each scraped item as one dict-literal line.
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except Exception:
            pass
        return item
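As a variation (a sketch, not part of the original tutorial), the pipeline could write one JSON object per line instead of a Python dict literal, which is easier to parse later; the class name and output file name here are hypothetical:

import json

class BaidustocksJsonPipeline(object):
    def open_spider(self, spider):
        # Hypothetical output file, one JSON object per line.
        self.f = open('BaiduStockInfo.jsonl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese keys human-readable.
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item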
Below is the modified section of settings.py:

ITEM_PIPELINES = {
    'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}
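With the pipeline registered, the crawler is run from the project root with the standard Scrapy command (the spider name "stocks" comes from stocks.py above), and the results are written to BaiduStockInfo.txt:

scrapy crawl stocks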