一个爬取沪深两市融资融券标的融资融券交易数据的小爬虫

1.学习初衷

本着紧跟时代进步步伐,坚决不拖社会主义建设后腿的想法,紧赶大数据、数据分析、机器学习的趋势......当然,前面只是瞎扯了。
最后我综合考虑R跟Python,最终还是选择走上了python的学习道路...

其实本意是:一是想结合本专业学习编程知识,往数据分析、机器学习等方向发展;二是的确对编程学习挺感兴趣。


2.小爬虫

      要想数据分析、机器学习之类,肯定得要有大量的数据,没毛病好吧~学完基础知识后当然从最开始的爬虫写起。好了,网上教程不少了吧,我就直接贴代码了。

1.SpiderInGui.py

#encoding:utf-8
#author:buracag_mc

from tkinter import *
from rzrq_spider import *
from guidemo import redirectedGuiFunc

def SpiderGui():
    """Button callback: start the crawler through the GUI stream redirector.

    Passes the crawler entry point ``repetrun`` to ``redirectedGuiFunc``;
    the value returned by that call is discarded.
    """
    crawl_entry = repetrun
    redirectedGuiFunc(crawl_entry)

if __name__ == '__main__':
    # Text shown at the top of the launcher window (assembled before any widget exists).
    intro_text = (
        "1.一个爬取沪深两市融资融券标的融资融券交易数据的小爬虫\n"
        "2.点击爬取后可爬取特定标的的数据,如直接按OK,则爬取所有标的\n\n\n"
        "注:\n"
        "不知为何原脚本现在的爬取过程有时会爬取不到数据,故效率会有所降低!"
    )

    root = Tk()
    root.title('小爬虫')

    intro_label = Label(root, text=intro_text)
    intro_label.pack()

    # One full-width button per action: start crawling, or leave the main loop.
    for caption, handler in (('点击爬取', SpiderGui), ('点击退出', root.quit)):
        Button(root, text=caption, command=handler).pack(fill=X)

    root.mainloop()







以上是一个简易的GUI界面。包括了:

1. 特定标的的输入(或默认爬取所有列表标的);
2.将函数中的标准输入、输出流重定向到弹出的GUI窗口中;
3.最后所有的数据保存到了运行目录下的ALL_rzrq_data文件夹中;
4.可以将其打包为可执行exe文件,用2.7版本打包成功过最初的版本。
其中用到了两个封装的模块:rzrq_spider和guidemo,后者参照了Mark Lutz著的Programming Python的相关内容。


2.rzrq_spider.py

以下是爬虫脚本:

# encoding:utf-8
# author:buracag_mc

import os,sys,time,requests,\
    csv,re,random,multiprocessing

# ========== Workaround (per the original author's note) for problems that
# ========== appear when a program using multiprocessing is packaged as an exe ==========
try:
    # Pick the module that provides the Popen class for the current platform.
    if sys.platform.startswith('win'):
        import multiprocessing.popen_spawn_win32 as forking
    else:
        import multiprocessing.popen_fork as forking
except ImportError:
    # Fallback when the modules above do not exist.
    # NOTE(review): presumably for Python 2, where this lived in
    # multiprocessing.forking -- confirm.
    import multiprocessing.forking as forking
if sys.platform.startswith('win'):

    # Popen subclass that sets the _MEIPASS2 environment variable to
    # sys._MEIPASS for the duration of the parent constructor call.
    class _Popen(forking.Popen):
        def __init__(self, *args, **kw):
            # Only act when the interpreter reports itself as frozen (bundled).
            if hasattr(sys, 'frozen'):
                os.putenv('_MEIPASS2', sys._MEIPASS)
            try:
                super(_Popen, self).__init__(*args, **kw)
            finally:
                # Remove the variable again whether or not the constructor succeeded.
                if hasattr(sys, 'frozen'):
                    if hasattr(os, 'unsetenv'):
                        os.unsetenv('_MEIPASS2')
                    else:
                        # No unsetenv on this platform: blank the value instead.
                        os.putenv('_MEIPASS2', '')
    # Replace the module's Popen with the patched subclass.
    forking.Popen = _Popen


# User-Agent header values; rzrq_spider picks one at random for every request.
user_agent = ['Mozilla/5.0 (Windows NT 10.0; WOW64)',
              'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

# Output directory for the per-stock CSV files, relative to the working directory.
# exist_ok=True replaces the exists()-then-mkdir() pair, which could fail when
# another process created the directory between the check and the call.
os.makedirs('ALL_rzrq_data', exist_ok=True)

def rzrq_spider(code):
    """Download the margin-trading history of one stock and save it as CSV.

    The data comes from the Eastmoney data-interface endpoint (found by the
    original author through packet capture). Codes whose first digit is below
    5 are requested with ``mkt=2``, all others with ``mkt=1``.
    NOTE(review): presumably 2 = Shenzhen and 1 = Shanghai -- confirm.

    Five fields of every record are kept and written, below a header row, to
    ``ALL_rzrq_data/<code>_.csv``; an existing file is overwritten.

    Args:
        code: Stock code as a string whose first character is a digit.

    Returns:
        ``None`` when the CSV file was written. ``code`` when no record could
        be parsed from the response or when any error occurred, so the caller
        can retry it later.

    Raises:
        ValueError: If the first character of ``code`` is not a digit (this
            check runs before the error-tolerant section).
    """
    market = '2' if int(code[0]) < 5 else '1'
    baseurl = ('http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx'
               '?type=FD&sty=MTE&mkt=' + market + '&code=' + code +
               '&st=0&sr=1&p=1&ps=1888')

    try:
        response = requests.get(baseurl,
                                headers={"User-Agent": random.choice(user_agent)},
                                timeout=45)
        # Capture the tail of every quoted record in the response body.
        records = re.findall(r'"\d{6},.*?_.*?,.*?,{1}(.*?)"', response.text)

        rows = []
        for record in records:
            # A captured record never contains a newline ('.' does not match
            # it), so a plain split yields the same comma-separated fields.
            fields = record.split(',')
            # The site delivers the fields in a shuffled order; put them in
            # the order of the CSV header below.
            rows.append([fields[1], fields[7], fields[6], fields[3], fields[2]])

        if not rows:
            # Nothing parsed: report the code as failed (no back-off here).
            return code

        # Mode 'w': the file is rewritten from scratch on every successful crawl.
        with open('ALL_rzrq_data/%s_.csv' % code, 'w', encoding='utf8', newline='') as outfile:
            writer = csv.writer(outfile)
            # Only these columns are exported.
            writer.writerow(['日期', '融资买入额(元)', '融资偿还额(元)', '融券卖出量(股)', '融券偿还量(股)'])
            writer.writerows(rows)
        return None
    except Exception:
        # Best effort: any network, parsing or file error marks the code as
        # failed after a short back-off. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        time.sleep(5)
        return code


def run(code_list):
    """Crawl every code in ``code_list`` once and keep only the failed ones.

    Each code is converted to ``str`` and handed to ``rzrq_spider`` in a pool
    of 4 worker processes. After all workers have finished, ``code_list`` is
    rewritten IN PLACE so that it holds only the codes that ``rzrq_spider``
    returned (i.e. the failures); the caller's list object is reused so it
    can simply be passed in again for the next retry round.

    Args:
        code_list: Mutable list of stock codes to crawl.

    Returns:
        The same ``code_list`` object, now containing only the failed codes
        (empty when everything succeeded).
    """
    pool = multiprocessing.Pool(processes=4)
    pending = [pool.apply_async(rzrq_spider, (str(stock),)) for stock in code_list]

    # No more tasks will be submitted; wait for all workers to finish.
    pool.close()
    pool.join()

    # Fetch every result exactly once; rzrq_spider returns None on success.
    outcomes = [task.get() for task in pending]
    # Slice assignment keeps the caller's list object and only replaces its
    # content after all results are known.
    code_list[:] = [code for code in outcomes if code is not None]

    print (u'本次有些不给力,暂时爬取失败的有:')
    print (code_list)
    print (u'请不要关闭程序!稍等一会继续爬取失败的标的...')
    return code_list

def repetrun():
    """Read the target codes from standard input and crawl until none fail.

    One line is read with ``input()``. If it is non-empty, every run of six
    digits in it is taken as a stock code; if it is empty, the built-in list
    of margin-trading targets below is used. ``run`` is then called on the
    list repeatedly; each call shrinks the list to the codes that failed,
    and the loop stops once a call returns an empty list.

    NOTE(review): a code that can never be fetched keeps this loop running
    forever -- consider a retry limit.
    """
    a = input()
    if a != "":
        result_stock = re.findall(r'\d{6}', a)
        print(u'爬取标的列表为:\n', result_stock)
    else:
        result_stock = ['000001','000002','000006','000009','000012','000027','000028','000031','000039','000043',
     '000046','000049','000050','000060','000061','000062','000063','000069','000078','000088',
     '000089','000099','000100','000151','000156','000157','000333','000338','000400','000401',
     '000402','000410','000413','000415','000417','000422','000423','000425','000426','000501',
     '000503','000506','000513','000516','000525','000528','000536','000537','000538','000539',
     '000540','000541','000543','000550','000551','000552','000554','000559','000563','000566',
     '000568','000572','000581','000592','000596','000598','000607','000616','000623','000625',
     '000630','000631','000650','000651','000655','000661','000667','000671','000680','000685',
     '000686','000690','000697','000709','000712','000718','000725','000728','000729','000731',
     '000732','000738','000739','000750','000758','000761','000762','000768','000776','000777',
     '000778','000780','000783','000786','000788','000789','000790','000792','000793','000800',
     '000801','000807','000811','000812','000816','000823','000825','000826','000829','000830',
     '000839','000848','000851','000858','000860','000868','000869','000876','000877','000878',
     '000883','000887','000895','000897','000898','000901','000905','000915','000917','000921',
     '000926','000930','000937','000938','000939','000960','000961','000963','000969','000970',
     '000973','000975','000977','000979','000983','000988','000989','000996','000997','000998',
     '000999','002001','002004','002005','002007','002008','002011','002016','002022','002023',
     '002024','002025','002028','002029','002030','002038','002041','002048','002049','002050',
     '002051','002055','002056','002063','002064','002065','002067','002070','002073','002078',
     '002079','002081','002091','002092','002093','002095','002104','002106','002108','002117',
     '002118','002128','002130','002138','002140','002142','002146','002148','002151','002152',
     '002153','002154','002158','002161','002176','002179','002181','002183','002185','002190',
     '002191','002202','002203','002204','002219','002221','002223','002229','002230','002233',
     '002236','002237','002241','002242','002244','002250','002251','002252','002261','002262',
     '002266','002267','002268','002269','002273','002275','002276','002277','002281','002285',
     '002287','002292','002293','002294','002299','002304','002307','002308','002310','002311',
     '002312','002313','002317','002318','002325','002340','002344','002353','002355','002368',
     '002369','002371','002375','002378','002385','002393','002396','002399','002400','002401',
     '002405','002407','002408','002410','002414','002415','002416','002419','002424','002428',
     '002429','002431','002437','002439','002440','002444','002450','002456','002460','002461',
     '002465','002467','002470','002474','002475','002476','002482','002490','002493','002500',
     '002501','002508','002518','002524','002556','002570','002571','002573','002574','002577',
     '002579','002594','002603','002642','002646','002648','002653','002673','002681','002701',
     '300001','300002','300003','300005','300010','300014','300015','300017','300020','300024',
     '300026','300027','300034','300039','300052','300053','300055','300058','300059','300065',
     '300070','300072','300074','300077','300079','300088','300090','300093','300104','300115',
     '300122','300124','300128','300133','300134','300146','300147','300152','300157','300168',
     '300170','300191','300199','300202','300203','300205','300212','300216','300226','300228',
     '300251','300253','300257','300273','300274','300315','300355','600000','600005','600006',
     '600007','600008','600009','600010','600011','600015','600016','600017','600018','600019',
     '600021','600022','600023','600026','600027','600028','600029','600030','600031','600036',
     '600037','600038','600039','600048','600050','600056','600058','600059','600060','600062',
     '600063','600066','600067','600068','600073','600077','600078','600079','600085','600086',
     '600088','600089','600094','600096','600098','600100','600104','600107','600108','600109',
     '600110','600111','600112','600113','600115','600116','600118','600119','600120','600123',
     '600125','600132','600135','600138','600139','600141','600143','600146','600149','600150',
     '600151','600153','600155','600157','600158','600160','600161','600166','600169','600170',
     '600171','600175','600176','600177','600183','600185','600186','600187','600188','600193',
     '600196','600197','600198','600199','600200','600201','600206','600208','600210','600216',
     '600218','600219','600220','600221','600222','600223','600225','600229','600237','600239',
     '600240','600251','600252','600256','600257','600259','600260','600261','600266','600267',
     '600270','600271','600276','600277','600285','600288','600289','600292','600293','600298',
     '600300','600307','600309','600312','600315','600316','600318','600320','600321','600323',
     '600325','600329','600330','600331','600332','600333','600335','600336','600340','600343',
     '600348','600350','600352','600354','600362','600363','600366','600369','600372','600373',
     '600376','600377','600380','600382','600383','600386','600387','600388','600389','600391',
     '600392','600395','600406','600409','600410','600415','600416','600418','600422','600425',
     '600426','600433','600435','600436','600446','600449','600456','600458','600459','600460',
     '600467','600470','600478','600481','600482','600483','600486','600489','600490','600491',
     '600495','600497','600498','600499','600500','600502','600503','600509','600515','600516',
     '600517','600518','600519','600521','600522','600523','600525','600526','600528','600535',
     '600536','600537','600543','600545','600547','600549','600551','600557','600559','600563',
     '600566','600568','600570','600572','600575','600578','600580','600582','600583','600584',
     '600585','600587','600588','600592','600594','600595','600596','600597','600600','600601',
     '600604','600606','600609','600611','600614','600616','600620','600624','600626','600633',
     '600635','600637','600639','600640','600642','600643','600645','600648','600649','600651',
     '600652','600653','600654','600655','600660','600661','600662','600663','600664','600667',
     '600668','600673','600674','600677','600680','600684','600688','600690','600692','600694',
     '600696','600699','600702','600703','600704','600705','600707','600711','600716','600717',
     '600718','600720','600728','600729','600730','600737','600739','600740','600741','600742',
     '600743','600744','600747','600748','600750','600751','600755','600756','600757','600759',
     '600761','600765','600770','600771','600775','600776','600777','600783','600787','600789',
     '600790','600795','600797','600800','600801','600802','600804','600805','600807','600808',
     '600809','600811','600815','600816','600820','600823','600825','600826','600827','600830',
     '600831','600835','600837','600838','600839','600844','600846','600851','600855','600859',
     '600863','600867','600868','600872','600873','600874','600875','600877','600879','600880',
     '600881','600884','600886','600887','600893','600894','600895','600900','600967','600970',
     '600971','600976','600978','600987','600993','600998','600999','601000','601001','601002',
     '601006','601009','601012','601018','601038','601088','601098','601099','601101','601106',
     '601111','601117','601118','601139','601158','601166','601168','601169','601179','601186',
     '601216','601218','601225','601231','601238','601258','601288','601311','601318','601328',
     '601333','601336','601369','601377','601388','601390','601398','601519','601555','601600',
     '601601','601607','601608','601618','601628','601633','601666','601668','601669','601678',
     '601688','601699','601717','601718','601727','601766','601777','601788','601789','601800',
     '601801','601808','601818','601857','601866','601872','601877','601880','601886','601888',
     '601898','601899','601901','601919','601928','601929','601933','601939','601958','601988',
     '601989','601991','601992','601996','601998','601999','603000','603993']
        print(u'默认爬取所有融资融券标的')
    print('已经开始工作了,请稍等片刻*.*')
    # run() shrinks result_stock in place to the failed codes and returns it,
    # so one call per round is enough. The original called run() a second
    # time inside the loop body and discarded that result.
    while run(result_stock):
        pass


if __name__ == "__main__":
    # Script entry point. freeze_support() is multiprocessing's hook for
    # programs frozen into a Windows executable and is called before any
    # worker pool is created by repetrun().
    multiprocessing.freeze_support()
    repetrun()

3.guidemo.py

以下是将输入和输出源映射到GUI 程序的脚本:
#encoding:utf-8
from tkinter import *
from tkinter.simpledialog import askstring
from tkinter.scrolledtext import ScrolledText    # or PP4E.Gui.Tour.scrolledtext

class GuiOutput:
    """File-like output object that shows written text in a scrolled text widget.

    The widget is created lazily: either at construction time when a parent
    is given, or on the first ``write`` call, in which case a new ``Toplevel``
    window is opened to hold it.
    """

    font = ('courier', 9, 'normal')              # class-level: shared by all instances

    def __init__(self, parent=None):
        self.text = None                         # the ScrolledText widget, once created
        if parent:
            self.popupnow(parent)                # create the widget inside parent right away

    def popupnow(self, parent=None):
        """Create the text widget in ``parent`` (or a new Toplevel); no-op if it exists."""
        if self.text:
            return
        self.text = ScrolledText(parent or Toplevel())
        self.text.config(font=self.font)
        self.text.pack()

    def write(self, text):
        """Append ``text`` to the widget, scroll to the end and refresh the display."""
        self.popupnow()
        self.text.insert(END, str(text))
        self.text.see(END)
        self.text.update()                       # redraw now so output appears while the caller is still running

    def writelines(self, lines):
        """Write every item of ``lines`` in order."""
        for line in lines:
            self.write(line)

    def flush(self):
        """Do nothing; present so the object can stand in for a stream.

        ``write`` already refreshes the widget after every insert. Without
        this method, code such as ``print(..., flush=True)`` would raise
        ``AttributeError`` while this object replaces ``sys.stdout``.
        """

class GuiInput:
    """File-like input object that obtains each line from a pop-up dialog."""

    def __init__(self):
        self.buff = ''                           # text fetched from the dialog but not yet consumed

    def inputLine(self):
        """Ask for one line; return it with a trailing newline, or '' when askstring returns None."""
        line = askstring(u'标的输入', u'如直接按OK,则默认爬取所有标的')
        if line is None:
            return ''                            # empty string signals end of input to the readers below
        return line + '\n'                       # otherwise add the end-of-line marker

    def read(self, bytes=None):
        """Read up to ``bytes`` characters of one line, or everything until end of input.

        With a truthy ``bytes`` the result is cut from the buffered line and
        never spans lines. Otherwise lines are requested until ``inputLine``
        returns '' and their concatenation is returned.
        """
        if not self.buff:
            self.buff = self.inputLine()
        if bytes:                                # partial read of the buffered line
            text = self.buff[:bytes]
            self.buff = self.buff[bytes:]
        else:
            line = self.buff
            # The buffered line is consumed here; clear it so that a later
            # read does not return the same line a second time.
            self.buff = ''
            pieces = []
            while line:
                pieces.append(line)
                line = self.inputLine()          # keep asking until end of input
            text = ''.join(pieces)
        return text

    def readline(self):
        """Return the buffered line if there is one, else ask for a new line ('' at end of input)."""
        text = self.buff or self.inputLine()
        self.buff = ''
        return text

    def readlines(self):
        """Return all lines up to end of input as a list."""
        lines = []
        while True:
            line = self.readline()
            if not line:
                break
            lines.append(line)
        return lines

def redirectedGuiFunc(func, *pargs, **kargs):
    """Call ``func`` with the standard streams mapped to GUI pop-ups.

    While ``func`` runs, ``sys.stdin`` is a new ``GuiInput`` (a dialog pops
    up whenever input is requested) and ``sys.stdout`` and ``sys.stderr``
    share one new ``GuiOutput`` (a window opens on the first write). The call
    is an ordinary synchronous call: this function returns only after
    ``func`` has returned.

    All three streams are put back afterwards, also when ``func`` raises;
    the exception then propagates to the caller.

    Args:
        func: Callable to run.
        *pargs: Positional arguments passed to ``func``.
        **kargs: Keyword arguments passed to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    import sys
    saveStreams = sys.stdin, sys.stdout, sys.stderr
    sys.stdin   = GuiInput()                     # pops up a dialog as needed
    sys.stdout  = GuiOutput()                    # new output window per call
    sys.stderr  = sys.stdout
    try:
        return func(*pargs, **kargs)
    finally:
        # Restore stderr as well; previously it stayed redirected forever,
        # and nothing was restored at all if func raised.
        sys.stdin, sys.stdout, sys.stderr = saveStreams

def redirectedGuiShellCmd(command):
    """Run a shell command and show its standard output in a new ``GuiOutput``.

    Lines are copied from the command's output pipe to the window one at a
    time until the pipe reports end of file; reading a line may block until
    the command produces output.

    SECURITY: ``command`` is executed by the shell through ``os.popen``;
    never pass text that comes from an untrusted source.

    Args:
        command: Shell command line to run.
    """
    import os
    # The with-block closes the pipe when done; the original never closed it.
    with os.popen(command, 'r') as pipe:
        output = GuiOutput()
        for line in pipe:
            output.write(line)

if __name__ == '__main__':
    # Self-test helpers for this module.
    def makeUpper():                             # uses the standard streams
        while True:
            try:
                line = input('Line? ')
            except EOFError:                     # input() raises this at end of input; other errors now surface
                break
            print(line.upper())
        print('end of file')

    def makeLower(input, output):                # uses explicit file-like objects
        while True:
            line = input.readline()
            if not line: break
            output.write(line.lower())
        print('end of file')

    root = Tk()
    # The demo buttons are currently disabled: the string literal below is a
    # no-op expression, so only an empty main window is shown.
    """
    Button(root, text='test streams',
           command=lambda: redirectedGuiFunc(makeUpper)).pack(fill=X)
    Button(root, text='test files  ',
           command=lambda: makeLower(GuiInput(), GuiOutput()) ).pack(fill=X)
    Button(root, text='test popen  ',
           command=lambda: redirectedGuiShellCmd('dir *')).pack(fill=X)
    """
    root.mainloop()

ok,继续学习吧!




评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值