1.学习初衷
本着紧跟时代进步步伐,坚决不拖社会主义建设后腿的想法,紧赶大数据、数据分析、机器学习的趋势......当然,前面只是瞎扯了。
综合考虑R跟Python之后,我最终还是选择走上了Python的学习道路...
其实本意是:一是想结合本专业再学习编程知识,往数据分析、机器学习等方向发展;二是的确对编程学习挺感兴趣。
2.小爬虫
要想数据分析、机器学习之类,肯定得要有大量的数据,没毛病好吧~学完基础知识后当然从最开始的爬虫写起。好了,网上教程不少了吧,我就直接贴代码了。
1.SpiderInGui.py
#encoding:utf-8
#author:buracag_mc
from tkinter import *
from rzrq_spider import *
from guidemo import redirectedGuiFunc
def SpiderGui():
    """Button callback: run the crawler entry point through the GUI stream redirector."""
    crawl_entry = repetrun
    redirectedGuiFunc(crawl_entry)
if __name__ == '__main__':
    # Launcher window: a description label followed by "crawl" and "quit" buttons.
    root = Tk()
    root.title('小爬虫')

    intro_text = ("1.一个爬取沪深两市融资融券标的融资融券交易数据的小爬虫\n"
                  "2.点击爬取后可爬取特定标的的数据,如直接按OK,则爬取所有标的\n\n\n"
                  ""
                  "注:\n"
                  "不知为何原脚本现在的爬取过程有时会爬取不到数据,故效率会有所降低!")
    intro_label = Label(root, text=intro_text)
    intro_label.pack()

    crawl_button = Button(root, text='点击爬取', command=SpiderGui)
    crawl_button.pack(fill=X)

    quit_button = Button(root, text='点击退出', command=root.quit)
    quit_button.pack(fill=X)

    root.mainloop()
以上是一个简易的GUI界面。包括了:
1.特定标的的输入(或默认爬取所有列表标的);
2.将函数中的输入、输出流映射到弹出的GUI窗口中;
3.最后所有的数据保存到了运行目录下的ALL_rzrq_data文件夹中;
4.可以将其打包为可执行exe文件,用2.7版本打包成功过最初的版本。
其中用到了两个封装的模块:rzrq_spider和guidemo,后者参照了Mark Lutz著的Programming Python的相关内容。
2.rzrq_spider.py
以下是爬虫脚本:
# encoding:utf-8
# author:buracag_mc
import os,sys,time,requests,\
csv,re,random,multiprocessing
# ===== Workaround for multiprocessing problems when the script is packaged as an exe =====
try:
    # Select the platform-specific module that provides the Popen class.
    if sys.platform.startswith('win'):
        import multiprocessing.popen_spawn_win32 as forking
    else:
        import multiprocessing.popen_fork as forking
except ImportError:
    # The popen_* modules are unavailable; fall back to the older module name.
    import multiprocessing.forking as forking

if sys.platform.startswith('win'):
    class _Popen(forking.Popen):
        """Popen subclass that sets ``_MEIPASS2`` while a child process is started.

        The environment variable is only touched when ``sys.frozen`` exists,
        i.e. presumably when running from a frozen (packaged) executable.
        """

        def __init__(self, *args, **kw):
            running_frozen = hasattr(sys, 'frozen')
            if running_frozen:
                # Hand the unpack directory to the child via the environment.
                os.putenv('_MEIPASS2', sys._MEIPASS)
            try:
                super(_Popen, self).__init__(*args, **kw)
            finally:
                if running_frozen:
                    # Remove the variable again; where os.unsetenv is missing
                    # the best we can do is blank it out.
                    if hasattr(os, 'unsetenv'):
                        os.unsetenv('_MEIPASS2')
                    else:
                        os.putenv('_MEIPASS2', '')

    # Install the patched class so multiprocessing uses it from now on.
    forking.Popen = _Popen
# Candidate User-Agent header values for outgoing HTTP requests.
user_agent = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
]
# Make sure the CSV output directory exists.  exist_ok=True replaces the
# separate exists()/mkdir() pair, which can fail with FileExistsError when
# several processes import this module at the same time.
os.makedirs('ALL_rzrq_data', exist_ok=True)
def _parse_rows(html):
    """Extract the exported columns of every record found in the response text."""
    rows = []
    # Each quoted record starts with a six-digit code; the captured group is
    # the comma-separated remainder of that record.
    for record in re.findall(r'"\d{6},.*?_.*?,.*?,{1}(.*?)"', html):
        fields = re.findall(r'(.*?),', record + ',')
        # The site delivers the columns in a different order than the CSV
        # header, so pick them out explicitly (date, financing buy, financing
        # repayment, securities sold, securities repaid).
        rows.append([fields[1], fields[7], fields[6], fields[3], fields[2]])
    return rows


def rzrq_spider(code):
    """Download the margin-trading history of one stock and save it as CSV.

    The data is requested from the eastmoney data interface and written to
    ``ALL_rzrq_data/<code>_.csv`` with a header row.

    Args:
        code: Stock code as a string; its first character must be a digit.

    Returns:
        ``None`` when the CSV file was written, otherwise ``code`` so the
        caller can retry it.  ``code`` is returned immediately when the
        response contains no records, and after a 5 second pause when the
        request, the parsing or the file write raised an exception.
    """
    # Codes whose first digit is below 5 are requested with mkt=2, all others
    # with mkt=1 (presumably Shenzhen vs. Shanghai -- confirm against the site).
    market = '2' if int(code[0]) < 5 else '1'
    baseurl = ('http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx?type=FD&sty=MTE&mkt='
               + market + '&code=' + code + '&st=0&sr=1&p=1&ps=1888')
    try:
        response = requests.get(baseurl, headers={"User-Agent": random.choice(user_agent)}, timeout=45)
        rows = _parse_rows(response.text)
        if not rows:
            # Nothing could be extracted from the response: report as failed.
            return code
        # Mode 'w' overwrites any file left over from an earlier run.
        with open('ALL_rzrq_data/%s_.csv' % code, 'w', encoding='utf8', newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(['日期', '融资买入额(元)', '融资偿还额(元)', '融券卖出量(股)', '融券偿还量(股)'])
            writer.writerows(rows)
    except Exception:
        # Best-effort crawl: back off briefly and hand the code back for a
        # retry.  KeyboardInterrupt/SystemExit are deliberately not caught.
        time.sleep(5)
        return code
    return None
def run(code_list):
    """Crawl every code in ``code_list`` with a pool of 4 worker processes.

    Args:
        code_list: Mutable list of stock codes.  It is rewritten IN PLACE so
            that afterwards it contains only the codes that failed.

    Returns:
        The same ``code_list`` object, now holding the failed codes (empty
        when every code succeeded).
    """
    # The context manager guarantees the pool is torn down even if collecting
    # a result raises.
    with multiprocessing.Pool(processes=4) as pool:
        pending = [pool.apply_async(rzrq_spider, (str(stock),)) for stock in code_list]
        pool.close()
        pool.join()
        # rzrq_spider returns None on success and the code itself on failure.
        outcomes = [task.get() for task in pending]

    code_list[:] = [code for code in outcomes if code is not None]

    # Only announce a retry when there really is something left to retry.
    if code_list:
        print(u'本次有些不给力,暂时爬取失败的有:')
        print(code_list)
        print(u'请不要关闭程序!稍等一会继续爬取失败的标的...')
    return code_list
def repetrun():
    """Read the stock codes to crawl from standard input and crawl until none fail.

    One line is read from standard input.  Every six-digit number found in it
    is crawled; an empty line selects the built-in default list.  ``run`` is
    then called repeatedly until it reports no failed codes.

    Returns:
        None.  Returns without crawling when standard input is at end of file
        (for example when a GUI prompt supplying the input was cancelled).
    """
    try:
        a = input()
    except EOFError:
        # No input at all: treat it as "cancelled" instead of crashing.
        return

    if a != "":
        result_stock = re.findall(r'\d{6}', a)
        print(u'爬取标的列表为:\n', result_stock)
    else:
        result_stock = ['000001','000002','000006','000009','000012','000027','000028','000031','000039','000043',
                        '000046','000049','000050','000060','000061','000062','000063','000069','000078','000088',
                        '000089','000099','000100','000151','000156','000157','000333','000338','000400','000401',
                        '000402','000410','000413','000415','000417','000422','000423','000425','000426','000501',
                        '000503','000506','000513','000516','000525','000528','000536','000537','000538','000539',
                        '000540','000541','000543','000550','000551','000552','000554','000559','000563','000566',
                        '000568','000572','000581','000592','000596','000598','000607','000616','000623','000625',
                        '000630','000631','000650','000651','000655','000661','000667','000671','000680','000685',
                        '000686','000690','000697','000709','000712','000718','000725','000728','000729','000731',
                        '000732','000738','000739','000750','000758','000761','000762','000768','000776','000777',
                        '000778','000780','000783','000786','000788','000789','000790','000792','000793','000800',
                        '000801','000807','000811','000812','000816','000823','000825','000826','000829','000830',
                        '000839','000848','000851','000858','000860','000868','000869','000876','000877','000878',
                        '000883','000887','000895','000897','000898','000901','000905','000915','000917','000921',
                        '000926','000930','000937','000938','000939','000960','000961','000963','000969','000970',
                        '000973','000975','000977','000979','000983','000988','000989','000996','000997','000998',
                        '000999','002001','002004','002005','002007','002008','002011','002016','002022','002023',
                        '002024','002025','002028','002029','002030','002038','002041','002048','002049','002050',
                        '002051','002055','002056','002063','002064','002065','002067','002070','002073','002078',
                        '002079','002081','002091','002092','002093','002095','002104','002106','002108','002117',
                        '002118','002128','002130','002138','002140','002142','002146','002148','002151','002152',
                        '002153','002154','002158','002161','002176','002179','002181','002183','002185','002190',
                        '002191','002202','002203','002204','002219','002221','002223','002229','002230','002233',
                        '002236','002237','002241','002242','002244','002250','002251','002252','002261','002262',
                        '002266','002267','002268','002269','002273','002275','002276','002277','002281','002285',
                        '002287','002292','002293','002294','002299','002304','002307','002308','002310','002311',
                        '002312','002313','002317','002318','002325','002340','002344','002353','002355','002368',
                        '002369','002371','002375','002378','002385','002393','002396','002399','002400','002401',
                        '002405','002407','002408','002410','002414','002415','002416','002419','002424','002428',
                        '002429','002431','002437','002439','002440','002444','002450','002456','002460','002461',
                        '002465','002467','002470','002474','002475','002476','002482','002490','002493','002500',
                        '002501','002508','002518','002524','002556','002570','002571','002573','002574','002577',
                        '002579','002594','002603','002642','002646','002648','002653','002673','002681','002701',
                        '300001','300002','300003','300005','300010','300014','300015','300017','300020','300024',
                        '300026','300027','300034','300039','300052','300053','300055','300058','300059','300065',
                        '300070','300072','300074','300077','300079','300088','300090','300093','300104','300115',
                        '300122','300124','300128','300133','300134','300146','300147','300152','300157','300168',
                        '300170','300191','300199','300202','300203','300205','300212','300216','300226','300228',
                        '300251','300253','300257','300273','300274','300315','300355','600000','600005','600006',
                        '600007','600008','600009','600010','600011','600015','600016','600017','600018','600019',
                        '600021','600022','600023','600026','600027','600028','600029','600030','600031','600036',
                        '600037','600038','600039','600048','600050','600056','600058','600059','600060','600062',
                        '600063','600066','600067','600068','600073','600077','600078','600079','600085','600086',
                        '600088','600089','600094','600096','600098','600100','600104','600107','600108','600109',
                        '600110','600111','600112','600113','600115','600116','600118','600119','600120','600123',
                        '600125','600132','600135','600138','600139','600141','600143','600146','600149','600150',
                        '600151','600153','600155','600157','600158','600160','600161','600166','600169','600170',
                        '600171','600175','600176','600177','600183','600185','600186','600187','600188','600193',
                        '600196','600197','600198','600199','600200','600201','600206','600208','600210','600216',
                        '600218','600219','600220','600221','600222','600223','600225','600229','600237','600239',
                        '600240','600251','600252','600256','600257','600259','600260','600261','600266','600267',
                        '600270','600271','600276','600277','600285','600288','600289','600292','600293','600298',
                        '600300','600307','600309','600312','600315','600316','600318','600320','600321','600323',
                        '600325','600329','600330','600331','600332','600333','600335','600336','600340','600343',
                        '600348','600350','600352','600354','600362','600363','600366','600369','600372','600373',
                        '600376','600377','600380','600382','600383','600386','600387','600388','600389','600391',
                        '600392','600395','600406','600409','600410','600415','600416','600418','600422','600425',
                        '600426','600433','600435','600436','600446','600449','600456','600458','600459','600460',
                        '600467','600470','600478','600481','600482','600483','600486','600489','600490','600491',
                        '600495','600497','600498','600499','600500','600502','600503','600509','600515','600516',
                        '600517','600518','600519','600521','600522','600523','600525','600526','600528','600535',
                        '600536','600537','600543','600545','600547','600549','600551','600557','600559','600563',
                        '600566','600568','600570','600572','600575','600578','600580','600582','600583','600584',
                        '600585','600587','600588','600592','600594','600595','600596','600597','600600','600601',
                        '600604','600606','600609','600611','600614','600616','600620','600624','600626','600633',
                        '600635','600637','600639','600640','600642','600643','600645','600648','600649','600651',
                        '600652','600653','600654','600655','600660','600661','600662','600663','600664','600667',
                        '600668','600673','600674','600677','600680','600684','600688','600690','600692','600694',
                        '600696','600699','600702','600703','600704','600705','600707','600711','600716','600717',
                        '600718','600720','600728','600729','600730','600737','600739','600740','600741','600742',
                        '600743','600744','600747','600748','600750','600751','600755','600756','600757','600759',
                        '600761','600765','600770','600771','600775','600776','600777','600783','600787','600789',
                        '600790','600795','600797','600800','600801','600802','600804','600805','600807','600808',
                        '600809','600811','600815','600816','600820','600823','600825','600826','600827','600830',
                        '600831','600835','600837','600838','600839','600844','600846','600851','600855','600859',
                        '600863','600867','600868','600872','600873','600874','600875','600877','600879','600880',
                        '600881','600884','600886','600887','600893','600894','600895','600900','600967','600970',
                        '600971','600976','600978','600987','600993','600998','600999','601000','601001','601002',
                        '601006','601009','601012','601018','601038','601088','601098','601099','601101','601106',
                        '601111','601117','601118','601139','601158','601166','601168','601169','601179','601186',
                        '601216','601218','601225','601231','601238','601258','601288','601311','601318','601328',
                        '601333','601336','601369','601377','601388','601390','601398','601519','601555','601600',
                        '601601','601607','601608','601618','601628','601633','601666','601668','601669','601678',
                        '601688','601699','601717','601718','601727','601766','601777','601788','601789','601800',
                        '601801','601808','601818','601857','601866','601872','601877','601880','601886','601888',
                        '601898','601899','601901','601919','601928','601929','601933','601939','601958','601988',
                        '601989','601991','601992','601996','601998','601999','603000','603993']
        print(u'默认爬取所有融资融券标的')
    print('已经开始工作了,请稍等片刻*.*')

    # run() shrinks result_stock in place to the codes that failed, so each
    # pass retries only the failures.  One run() call per pass is enough.
    # NOTE(review): a code that never yields data is retried forever.
    while run(result_stock):
        pass
if __name__ == "__main__":
    # freeze_support() is called before any crawling starts; it matters when
    # the script runs as a frozen Windows executable using multiprocessing.
    multiprocessing.freeze_support()
    repetrun()
3.guidemo.py
以下是将输入和输出源映射到GUI 程序的脚本:
#encoding:utf-8
from tkinter import *
from tkinter.simpledialog import askstring
from tkinter.scrolledtext import ScrolledText # or PP4E.Gui.Tour.scrolledtext
class GuiOutput:
    """Write-only file-like object that shows written text in a scrolled text widget.

    The widget is created lazily: either immediately when a ``parent`` is
    given to the constructor, or on the first ``write`` call, in which case
    it is placed in a new ``Toplevel`` window.
    """

    font = ('courier', 9, 'normal')  # class-level setting, applies to every instance

    def __init__(self, parent=None):
        self.text = None
        if parent:
            # Embed in the given parent right away instead of popping up later.
            self.popupnow(parent)

    def popupnow(self, parent=None):
        """Create the text widget once; later calls are no-ops."""
        if self.text:
            return
        self.text = ScrolledText(parent or Toplevel())
        self.text.config(font=self.font)
        self.text.pack()

    def write(self, text):
        """Append ``text`` to the widget and scroll to the end."""
        self.popupnow()
        self.text.insert(END, str(text))
        self.text.see(END)
        self.text.update()  # refresh the window so the text appears immediately

    def writelines(self, lines):
        """Write every item of ``lines`` in order."""
        for line in lines:
            self.write(line)

    def flush(self):
        """Do nothing; present so the object can stand in for ``sys.stdout``.

        ``write`` already refreshes the widget, and code that flushes the
        standard streams would otherwise fail with AttributeError.
        """
        return None
class GuiInput:
    """Read-only file-like object whose lines come from ``askstring`` dialogs."""

    def __init__(self):
        self.buff = ''  # text fetched from a dialog but not yet handed out

    def inputLine(self):
        """Ask for one line; return it with a trailing newline, or '' if cancelled."""
        line = askstring(u'标的输入', u'如直接按OK,则默认爬取所有标的')
        if line is None:
            return ''  # dialog cancelled: signal end of input
        return line + '\n'  # add the end-of-line marker

    def read(self, bytes=None):
        """Read up to ``bytes`` characters, or everything until a dialog is cancelled.

        With a truthy ``bytes`` the result is cut from the buffered line and
        the remainder stays buffered.  Otherwise dialogs keep popping up until
        one is cancelled and all entered lines are returned joined together.
        """
        if not self.buff:
            self.buff = self.inputLine()
        if bytes:
            text = self.buff[:bytes]
            self.buff = self.buff[bytes:]
            return text
        line = self.buff
        # The buffered line is consumed here; clear it so a later read does
        # not return it a second time.
        self.buff = ''
        chunks = []
        while line:
            chunks.append(line)
            line = self.inputLine()  # keep reading until cancelled
        return ''.join(chunks)

    def readline(self):
        """Return the buffered line if any, otherwise ask for a new one."""
        text = self.buff or self.inputLine()  # '' once the dialog is cancelled
        self.buff = ''
        return text

    def readlines(self):
        """Collect lines until a dialog is cancelled and return them as a list."""
        lines = []
        while True:
            line = self.readline()
            if not line:
                break
            lines.append(line)
        return lines
def redirectedGuiFunc(func, *pargs, **kargs):
    """Call ``func`` with stdin, stdout and stderr redirected to GUI objects.

    While ``func`` runs, ``sys.stdin`` is a ``GuiInput`` (pops up dialogs on
    demand) and ``sys.stdout``/``sys.stderr`` share one ``GuiOutput`` (opens
    an output window on first write).  The call is synchronous.

    Args:
        func: Callable to run.
        *pargs: Positional arguments passed to ``func``.
        **kargs: Keyword arguments passed to ``func``.

    Returns:
        Whatever ``func`` returns.  Exceptions raised by ``func`` propagate.
    """
    import sys
    saved_streams = sys.stdin, sys.stdout, sys.stderr
    gui_in, gui_out = GuiInput(), GuiOutput()
    sys.stdin = gui_in
    sys.stdout = gui_out
    sys.stderr = gui_out
    try:
        return func(*pargs, **kargs)
    finally:
        # Always put all three original streams back, including stderr and
        # including the case where func raised.
        sys.stdin, sys.stdout, sys.stderr = saved_streams
def redirectedGuiShellCmd(command):
    """Run a shell command and show its standard output in a new ``GuiOutput``.

    Args:
        command: Command line passed to ``os.popen``.  It is executed by the
            shell, so only pass trusted strings.
    """
    import os
    # The with-block closes the pipe when the output has been read; it was
    # previously left open.
    with os.popen(command, 'r') as pipe:
        output = GuiOutput()
        # readline() may block until the command produces a full line; an
        # empty string marks the end of the command's output.
        for line in iter(pipe.readline, ''):
            output.write(line)
if __name__ == '__main__':
    # Self-test helpers for the redirection utilities in this module.
    def makeUpper():
        """Echo lines from the standard streams in upper case until input ends."""
        while True:
            try:
                line = input('Line? ')
            except EOFError:
                # End of input is the normal way out of the loop.
                break
            print(line.upper())
        print('end of file')

    def makeLower(source, sink):
        """Copy lines from ``source`` to ``sink`` in lower case until ``source`` is empty."""
        while True:
            line = source.readline()
            if not line:
                break
            sink.write(line.lower())
        print('end of file')

    root = Tk()
    # The demo buttons are disabled: they sit inside a bare string literal,
    # so the window opens empty.
    """
    Button(root, text='test streams',
    command=lambda: redirectedGuiFunc(makeUpper)).pack(fill=X)
    Button(root, text='test files ',
    command=lambda: makeLower(GuiInput(), GuiOutput()) ).pack(fill=X)
    Button(root, text='test popen ',
    command=lambda: redirectedGuiShellCmd('dir *')).pack(fill=X)
    """
    root.mainloop()
ok,继续学习吧!