Real-Time Stock Data Fetching with Python

**I've been tinkering with stocks lately and wanted to watch real-time price movements without relying on what the existing online trading apps show, so I decided to write my own little Python tool for grabbing current stock prices:**


1. Getting the data for all stocks listed on the Shanghai and Shenzhen exchanges

The plan is to scrape the names and codes of every stock on the Shanghai and Shenzhen exchanges from the Eastmoney website. There are plenty of ways to get this data; I used the Scrapy crawling framework (which I happened to be learning) and saved the results to a file called TTJJ.json. I created a new Scrapy project named TTJJ and added a user-agent middleware to avoid being banned by the server (not really necessary here, since my request rate is low enough that the server would not refuse it anyway). The project layout is as follows:
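
Roughly, it looks like this (a sketch; file names other than those shown in this post are the Scrapy defaults):

TTJJ/
    scrapy.cfg
    TTJJ/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            TTJJr.py            # the spider shown below
            UserAgentMiddle.py  # the custom user-agent middleware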


The main spider program, TTJJr, is shown below:

from scrapy.spider import Spider
from scrapy.selector import Selector
from TTJJ.items import TTjjItem
import re
from scrapy import log

class TTJJi(Spider):

    name = "TTJJ"
    allowed_domains = ['eastmoney.com']
    start_urls = ["http://quote.eastmoney.com/stocklist.html#sh"]

    def parse(self, response):

        sel = Selector(response)
        # The first <ul> under div.quotebody holds the Shanghai listings.
        cont = sel.xpath('//div[@class="qox"]/div[@class="quotebody"]/div/ul')[0].extract()

        for ii in re.findall(r'<li>.*?<a.*?target=.*?>(.*?)</a>', cont):
            # Each entry looks like "NAME(CODE)"; split it into the two fields
            # and yield a fresh item for every stock.
            item = TTjjItem()
            item["stockName"] = ii.split("(")[0].encode('utf-8')
            item["stockCode"] = ("sh" + ii.split("(")[1][:-1]).encode('utf-8')
            log.msg(ii.encode('utf-8'), level=log.INFO)
            yield item

        # The second <ul> holds the Shenzhen listings.
        cont1 = sel.xpath('//div[@class="qox"]/div[@class="quotebody"]/div/ul')[1].extract()

        for iii in re.findall(r'<li>.*?<a.*?target=.*?>(.*?)</a>', cont1):
            item = TTjjItem()
            item["stockName"] = iii.split("(")[0].encode('utf-8')
            item["stockCode"] = ("sz" + iii.split("(")[1][:-1]).encode('utf-8')
            yield item

I found a UserAgentMiddle middleware online; once it is registered in settings.py, requests no longer go out with the default User-Agent. The code is as follows:

#-*- coding:utf-8 -*-
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random as rd
from scrapy import log

class UserAgentMiddle(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = rd.choice(self.user_agent_list)
        if ua:
            # Print the User-Agent chosen for this request
            print "********Current UserAgent:%s************" % ua

            # and log it as well
            log.msg('Current UserAgent: ' + ua, level=log.INFO)
            request.headers.setdefault('User-Agent', ua)

    # A list of browser user-agent strings to rotate through; more can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [\
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
       ]

In addition, items.py defines the fields that the scraped data is stored in, the pipeline defines how each item is processed (here, appended to the JSON file), and a few configuration options live in settings.py. The code for each is shown below:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class TTjjItem(scrapy.Item):
    stockCode = scrapy.Field()   # e.g. "sh600000" or "sz000001"
    stockName = scrapy.Field()   # the stock's display name
   
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import json

class TtjjPipeline(object):

    def __init__(self):
        # Open the output file and write the opening of the JSON document.
        self.file = codecs.open("TTJJ.json", mode="wb", encoding='utf-8')
        self.file.write('{"hah":[')
        self.first = True

    def process_item(self, item, spider):
        # Write items separated by commas (none before the first one) so the
        # resulting array stays valid JSON.
        line = json.dumps(dict(item)).decode("unicode_escape")
        if not self.first:
            self.file.write(",")
        self.first = False
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Close the JSON array and object, and the file, when the spider finishes.
        self.file.write(']}')
        self.file.close()
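
With this pipeline the scraped items end up in TTJJ.json as a single object with one "hah" key holding the list of items, roughly like this (the two entries are only illustrative):

{"hah": [{"stockName": "平安银行", "stockCode": "sz000001"}, {"stockName": "浦发银行", "stockCode": "sh600000"}]}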

# -*- coding: utf-8 -*-

# Scrapy settings for TTJJ project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'TTJJ'

SPIDER_MODULES = ['TTJJ.spiders']
NEWSPIDER_MODULE = 'TTJJ.spiders'
DOWNLOAD_DELAY = 1  # wait one second between requests (the setting must be upper-case to take effect)
ITEM_PIPELINES={'TTJJ.pipelines.TtjjPipeline':300}
COOKIES_ENABLED=False
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'TTJJ (+http://www.yourdomain.com)'
# Disable the built-in UserAgentMiddleware and use the custom one instead
DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None,
        'TTJJ.spiders.UserAgentMiddle.UserAgentMiddle':400
    }
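
With all the pieces in place, the spider is started from the project directory with the standard Scrapy command (the name "TTJJ" comes from the spider's name attribute):

scrapy crawl TTJJ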

Running the spider yields the stock names and codes for both exchanges.
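
The script in part two below reads its codes from a one-line, comma-separated file called stockCodeList.txt; the post does not show how that file is produced from TTJJ.json, but a small helper along these lines would do it (a sketch that assumes the pipeline above was used to write TTJJ.json):

# Flatten TTJJ.json into the one-line stockCodeList.txt read by the quote script.
import json

with open("TTJJ.json") as f:
    data = json.load(f)

codes = [entry["stockCode"] for entry in data["hah"]]
with open("stockCodeList.txt", "w") as f:
    f.write(",".join(codes))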



2. Getting real-time trading data for the stocks

The data comes from the quote interface Sina provides at http://hq.sinajs.cn/list= (append the stock codes after list=). The data only updates between the market's opening and closing times; at any other time you get the values from that day's close. I did not do anything sophisticated with it: the script simply works out the ten stocks with the largest current gain and prints them. The main code is as follows:
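
A quick way to sanity-check the interface before wiring everything together is to request a single code and split the reply on commas (a minimal sketch; sh600000 is just an arbitrary example code, and the change here is computed against the previous close):

# Minimal check of the Sina quote interface (sketch; sh600000 is an arbitrary example code).
import urllib2

resp = urllib2.urlopen("http://hq.sinajs.cn/list=sh600000").read()
# The reply has the form: var hq_str_sh600000="name,open,prev_close,current,high,low,...";
fields = resp.split('"')[1].split(",")
change = (float(fields[3]) - float(fields[2])) / float(fields[2]) * 100
print "sh600000  current: %s  change vs. previous close: %.2f%%" % (fields[3], change)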

# -*- coding:utf-8 -*-
# bb[0]: stock name          bb[1]: today's open         bb[2]: previous close
# bb[3]: current price       bb[4]: today's high         bb[5]: today's low
# bb[6]: bid price           bb[7]: ask price            bb[8]: shares traded / 100
# bb[9]: turnover / 10,000   bb[10]: bid-1 volume        bb[11]: bid-1 price
# bb[12]: bid-2 volume       bb[13]: bid-2 price         bb[14]: bid-3 volume
# bb[15]: bid-3 price        bb[16]: bid-4 volume        bb[17]: bid-4 price
# bb[18]: bid-5 volume       bb[19]: bid-5 price         bb[20]: ask-1 volume
# bb[21]: ask-1 price        bb[22]: ask-2 volume        bb[23]: ask-2 price
# bb[24]: ask-3 volume       bb[25]: ask-3 price         bb[26]: ask-4 volume
# bb[27]: ask-4 price        bb[28]: ask-5 volume        bb[29]: ask-5 price
# bb[30]: date               bb[31]: time                bb[32]: unknown

import urllib2
import time
from stockSort import stocksort

stockDict = {}      # stock code -> list of percentage changes over time
stockTimeList = []  # quote timestamps

class updateData(object):
    def __init__(self):
        self.url = 'http://hq.sinajs.cn/list='

    def getData(self, stockID):
        dataList = {}
        try:
            request = urllib2.Request(self.url + str(stockID))
            response = urllib2.urlopen(request)
            contents = response.read()

            # One line per stock: var hq_str_shXXXXXX="field0,field1,...";
            for content in str(contents).splitlines():
                temp = content.split(",")
                if len(temp) < 32:
                    # Skip malformed or empty quote lines (e.g. unknown codes).
                    continue
                if float(temp[1]) != 0:
                    # Change in percent: (current - previous close) / today's open * 100
                    hehe = str((float(temp[3]) - float(temp[2])) / float(temp[1]) * 100)
                    dataList[str(temp[0][11:19])] = [hehe, temp[31]]
                else:
                    # An open price of 0 usually means the stock is suspended today.
                    dataList[str(temp[0][11:19])] = [0, temp[31]]
            return dataList

        except urllib2.URLError, e:
            print "Request failed: %s" % e
            return {}

    def getPart(self, stockID):

        bb = self.getData(stockID)
        if len(bb) > 0:
            for key in bb:
                stockDict[key].append(bb[key][0])
            # Record the timestamp of the last quote in this batch.
            stockTimeList.append(bb[key][1])

if __name__ == '__main__':

    ttjj = updateData()
    ff = open("stockCodeList.txt", "r")
    dataTemp = ff.readline().strip(",").split(",")
    ff.close()

    # The Sina interface only accepts a limited number of codes per request,
    # so the codes are queried in batches of at most 800.
    nameA = ""
    count = 0
    for stockI in dataTemp:
        stockDict[stockI] = []
        count += 1
        if count < 800:
            nameA += stockI + ","
        else:
            ttjj.getPart(nameA.strip(","))
            nameA = stockI + ","
            count = 1
    ttjj.getPart(nameA.strip(","))

    print stockDict
    while True:
        time.sleep(1)
        nameA = ""
        count = 0
        for stockI in dataTemp:
            count += 1
            if count < 800:
                nameA += stockI + ","
            else:
                ttjj.getPart(nameA.strip(","))
                nameA = stockI + ","
                count = 1
        ttjj.getPart(nameA.strip(","))

        # Print the ten stocks with the largest current gain.
        xx = stocksort(stockDict)
        print xx.getSort()[0:10]

The stocksort class lives in stockSort.py (imported at the top of the script above) and ranks all stocks by their most recent percentage change:

class stocksort(object):
    def __init__(self, stockDict):
        self.stockDict = stockDict

    def getSort(self):
        sortList = {}

        # Keep only stocks that actually received data, using the latest value.
        for key in self.stockDict:
            if len(self.stockDict[key]) != 0:
                # Convert to float so the sort is numeric rather than lexicographic.
                sortList[key] = float(self.stockDict[key][-1])

        # Sort by change, largest first.
        return sorted(sortList.items(), key=lambda x: x[1], reverse=True)
The program's output is shown below:


Since I ran this after the market had already closed, every row shows the same values (the data is no longer being updated). Each row lists the ten stocks with the largest gain at that moment: sz002703 is a stock code, and the number after it (9.99...) is its percentage gain. That is about all the tool does; this post is mainly a record of my first steps with web scraping.

Your support is my greatest encouragement!


