股票数据爬取
目的
拿到历史数据,结合经典的数据算法,来看看能不能找到一些数学规律。
准备
需要连接MySQL数据库,我用的是安卓机上的Termux,这样方便以后把代码放到手机上跑,还能结合itchat与微信交互(暂未开写)。
直接上代码
下面的脚本会把所有股票的历史行情数据下载下来,计算各项指标后写入MySQL。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020-03-13
import datetime
import json
import random
import sys
import threading
import time
from queue import Queue
import os
import pymysql
import requests
from warnings import filterwarnings
from tqdm import tqdm
from stock_fomula import StockFomula
from data_simulator import Simulator
# Absolute path of the running script and the directory that contains it.
# os.path.dirname is used instead of slicing on '/' so the result is also
# correct with Windows separators and for a script located in the root.
file_path = os.path.abspath(sys.argv[0])
abs_path = os.path.dirname(file_path)
class GetData:
    """Download daily K-line history from 10jqka and store it in MySQL.

    One instance owns a single pymysql connection and a lock. ``downloader``
    is the entry point called by worker threads; the other methods are the
    fetch / parse / persist steps it drives.
    """

    # Number of HTTP attempts ``get_raw`` makes before giving up on a URL.
    _HTTP_ATTEMPTS = 5
    # Seconds to wait between two HTTP attempts.
    _HTTP_RETRY_DELAY = 10
    # Extra ``data_to_mysql`` attempts made by ``downloader`` after the first.
    _DOWNLOAD_RETRIES = 50
    # Moving-average windows, in the order the ``ma`` result is indexed
    # (position 0 -> ma_5, ..., position 18 -> ma_120).
    _MA_PERIODS = tuple(range(5, 21)) + (30, 60, 120)
    # Column order of one ``stock_data`` row as written by ``write_to_mysql``.
    _STOCK_DATA_COLUMNS = (
        'id', 'stock_id', 'date', 'opening', 'high', 'low', 'closing',
        'dif', 'dea', 'macd_bar', 'k', 'd', 'j', 'up', 'mb', 'dn',
    ) + tuple('ma_%d' % period for period in _MA_PERIODS)
    _STOCK_DATA_DDL = '''
        CREATE TABLE stock_data (
            id int,
            stock_id varchar(8),
            date varchar(15),
            opening DECIMAL(10, 2),
            high DECIMAL(10, 2),
            low DECIMAL(10, 2),
            closing DECIMAL(10, 2),
            dif DECIMAL(10, 3),
            dea DECIMAL(10, 3),
            macd_bar DECIMAL(10, 3),
            k DECIMAL(10, 3),
            d DECIMAL(10, 3),
            j DECIMAL(10, 3),
            up DECIMAL(10, 3),
            mb DECIMAL(10, 3),
            dn DECIMAL(10, 3),
            ma_5 DECIMAL(10, 3),
            ma_6 DECIMAL(10, 3),
            ma_7 DECIMAL(10, 3),
            ma_8 DECIMAL(10, 3),
            ma_9 DECIMAL(10, 3),
            ma_10 DECIMAL(10, 3),
            ma_11 DECIMAL(10, 3),
            ma_12 DECIMAL(10, 3),
            ma_13 DECIMAL(10, 3),
            ma_14 DECIMAL(10, 3),
            ma_15 DECIMAL(10, 3),
            ma_16 DECIMAL(10, 3),
            ma_17 DECIMAL(10, 3),
            ma_18 DECIMAL(10, 3),
            ma_19 DECIMAL(10, 3),
            ma_20 DECIMAL(10, 3),
            ma_30 DECIMAL(10, 3),
            ma_60 DECIMAL(10, 3),
            ma_120 DECIMAL(10, 3),
            PRIMARY KEY (stock_id, date)
        );
        '''

    def __init__(self):
        """Open the MySQL connection and create the lock used by ``downloader``."""
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            # NOTE(review): the password used to be hard-coded only. It can now
            # be supplied through STOCK_DB_PASSWORD; the old value remains the
            # default so existing setups keep working.
            password=os.environ.get('STOCK_DB_PASSWORD', '1@Qwertyuiop'),
            database='stock',
            charset='utf8',
            # Long write timeout, passed through the public argument instead
            # of poking the private ``_write_timeout`` attribute afterwards.
            write_timeout=10000,
        )
        self.raw = ''
        # Serialises the fetch-and-store work done in ``downloader``.
        self.lock = threading.Lock()

    @staticmethod
    def get_headers():
        """Return the lines of ``sources/headers.csv`` (one User-Agent per line).

        The file is looked up in the parent of the script directory. Lines are
        returned unstripped, exactly as ``readlines`` yields them.

        :return: list of raw lines
        """
        upper_path = os.path.dirname(abs_path)
        with open(os.path.join(upper_path, "sources/headers.csv"), encoding='utf-8') as ua:
            return ua.readlines()

    def get_raw(self, url):
        """Fetch ``url`` with a random User-Agent and return the response text.

        The request is attempted up to ``_HTTP_ATTEMPTS`` times, waiting
        ``_HTTP_RETRY_DELAY`` seconds between attempts.

        :param url: address to download
        :return: response body as text, or None when every attempt failed
        """
        headers = {
            'Host': 'd.10jqka.com.cn',
            'Referer': 'http://stockpage.10jqka.com.cn/HQ_v4.html',
            'User-Agent': random.choice(self.get_headers()).strip()
        }
        for attempt in range(self._HTTP_ATTEMPTS):
            try:
                response = requests.get(url=url, headers=headers, timeout=60)
                try:
                    return response.text
                finally:
                    response.close()
            except requests.RequestException:
                # The first failure is silent; later ones report the retry count.
                if attempt > 0:
                    print('链接读取超时,开始重试,重试次数:%s' % attempt)
                # No point sleeping after the last attempt.
                if attempt < self._HTTP_ATTEMPTS - 1:
                    time.sleep(self._HTTP_RETRY_DELAY)
        return None

    def data_to_json(self, url):
        """Download ``url`` and split the JSONP-style payload into id and JSON.

        The text before the first ``{`` is expected to look like
        ``..._<stock_id>_01_all(``; the stock id is the third ``_``-separated
        field from the end, after dropping the character just before ``{``.

        :param url: address to download
        :return: ``(stock_id, json_str)``, or None when nothing usable came back
        """
        raw_data = self.get_raw(url)
        if not raw_data:
            return None
        start_index = raw_data.find('{')
        # Use the LAST closing brace so a nested object cannot truncate the JSON.
        end_index = raw_data.rfind('}')
        if start_index < 1 or end_index < start_index:
            return None
        try:
            stock_id = raw_data[: start_index - 1].split('_')[-3]
        except IndexError:
            return None
        return stock_id, raw_data[start_index: end_index + 1]

    @staticmethod
    def date_transfer(price, dates, year, priceFactor):
        """Join the packed price list and the date list into CSV-like rows.

        ``price`` holds four entries per date: a base value (the low) followed
        by the offsets of open, high and close from that base. Each value is
        divided by ``priceFactor``.

        :param price: flat list of numeric strings, four per date
        :param dates: list of date suffixes that are appended to ``year``
        :param year: year prefix for every date
        :param priceFactor: divisor applied to every price
        :return: list of ``'date,opening,high,low,closing'`` strings
        """
        result = []
        for i, day in enumerate(dates):
            base = float(price[i * 4])
            opening = base + float(price[i * 4 + 1])
            high = base + float(price[i * 4 + 2])
            closing = base + float(price[i * 4 + 3])
            fields = [str(year) + day]
            fields.extend(str(value / priceFactor) for value in (opening, high, base, closing))
            result.append(','.join(fields))
        return result

    @staticmethod
    def _parse_history(json_str):
        """Decode the payload and return ``(name, rows)``; raises on bad data."""
        json_raw = json.loads(json_str)
        name = json_raw['name']
        price_factor = json_raw['priceFactor']
        price = json_raw['price'].split(',')
        dates = json_raw['dates'].split(',')
        rows = []
        offset = 0
        # Each sortYear entry is (year, number of trading days in that year);
        # the price/date lists are consumed year by year in that order.
        for entry in json_raw['sortYear']:
            year, num = entry[0], entry[1]
            rows.extend(GetData.date_transfer(
                price[offset * 4: (offset + num) * 4],
                dates[offset: offset + num],
                year,
                price_factor,
            ))
            offset += num
        return name, rows

    def data_to_mysql(self, url, days=0):
        """Fetch one stock, compute its indicators and write them to MySQL.

        :param url: K-line data address of the stock
        :param days: number of most recent rows to write; 0 writes all rows
        :return: list of price rows on success, -1 when no usable data was
                 obtained (the caller retries on -1)
        """
        json_data = self.data_to_json(url)
        if not json_data:
            return -1
        stock_id, json_str = json_data
        try:
            name, all_data = self._parse_history(json_str)
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            # Malformed payload: report "no data" instead of killing the
            # worker thread while it holds the lock.
            return -1
        # Compute the indicator series.
        sf = StockFomula()
        macd = sf.macd(all_data)
        kdj = sf.kdj(all_data)
        boll = sf.boll(all_data)
        ma = sf.ma(all_data)
        self.write_to_mysql([all_data, macd, kdj, boll, ma], stock_id, name, days)
        return all_data

    def write_to_mysql(self, data, stock_id, name, days):
        """Write price rows and indicator values into MySQL.

        :param data: ``[price_rows, macd, kdj, boll, ma]``; the indicator
                     series are indexed by the same position as ``price_rows``
        :param stock_id: stock code
        :param name: stock name, stored in ``stock_name``
        :param days: number of most recent rows to write; 0 writes all rows
        :return: None
        """
        cursor = self.connect.cursor()
        try:
            # The original author noted IF NOT EXISTS did not work for this
            # table, so an "already exists" error is simply ignored.
            try:
                cursor.execute(self._STOCK_DATA_DDL)
            except pymysql.MySQLError:
                pass
            try:
                stock_name_create = 'CREATE TABLE IF NOT EXISTS stock_name' \
                                    '(stock_id VARCHAR(10) PRIMARY KEY , name VARCHAR(15)) CHARSET "utf8";'
                cursor.execute(stock_name_create)
            except pymysql.MySQLError:
                pass

            price_rows, macd_data, kdj_data, boll_data, ma_data = \
                data[0], data[1], data[2], data[3], data[4]
            total = len(price_rows)
            # days == 0 means "write everything".
            if days == 0:
                days = total
            # Clamp so that days > total cannot produce negative indices,
            # which would wrap around and write rows twice with negative ids.
            start = max(0, total - days)
            now_date = datetime.datetime.now().strftime('%Y%m%d')

            columns = self._STOCK_DATA_COLUMNS
            stock_data_add = 'REPLACE INTO stock_data (%s) VALUES (%s)' % (
                ', '.join(columns), ', '.join(['%s'] * len(columns)))

            for i in range(start, total):
                date_price = price_rows[i].split(',')
                date = date_price[0]
                # Today's row is not written.
                if date == now_date:
                    continue
                if not kdj_data:
                    return
                row = [i, stock_id, date]
                row.extend(float(date_price[n]) for n in (1, 2, 3, 4))
                # Index 0 of each indicator result is skipped; 1..3 hold the series.
                row.extend(macd_data[n][i] for n in (1, 2, 3))
                row.extend(kdj_data[n][i] for n in (1, 2, 3))
                row.extend(boll_data[n][i] for n in (1, 2, 3))
                row.extend(ma_data[n][i] for n in range(len(self._MA_PERIODS)))
                try:
                    self.connect.ping(reconnect=True)
                    cursor.execute(stock_data_add, row)
                    self.connect.commit()
                except Exception as ex:
                    # Deliberate best effort: one bad row must not stop the rest.
                    print(ex)
                    self.connect.rollback()

            # Record the stock name; an existing entry is left untouched.
            try:
                self.connect.ping(reconnect=True)
                stock_name_add = 'INSERT IGNORE INTO stock_name(stock_id, name) VALUES (%s, %s);'
                cursor.execute(stock_name_add, [stock_id, name])
                self.connect.commit()
            except Exception as ex:
                print(ex)
                self.connect.rollback()
        finally:
            # Close the cursor on every exit path, including early returns.
            cursor.close()

    def downloader(self, queue, days=0):
        """Take one stock code from ``queue`` and download/store its data.

        Returns immediately when the queue is empty. A failed download is
        retried up to ``_DOWNLOAD_RETRIES`` more times; a stock that still
        fails is appended to ``fail.txt``.

        :param queue: queue of stock codes shared between worker threads
        :param days: number of most recent rows to write; 0 writes all rows
        :return: None
        """
        from queue import Empty as QueueEmpty

        # get_nowait avoids the race between empty() and a blocking get()
        # when another thread takes the last item in between.
        try:
            stock_id = queue.get_nowait()
        except QueueEmpty:
            return
        url = 'http://d.10jqka.com.cn/v6/line/hs_' + stock_id + '/01/all.js'
        # The lock is held for the whole fetch-and-store sequence; ``with``
        # guarantees it is released even if an exception escapes.
        with self.lock:
            for _ in range(1 + self._DOWNLOAD_RETRIES):
                feedback = self.data_to_mysql(url, days)
                if feedback != -1:
                    break
            else:
                print("\033[01;32m%s重试失败\033[0m" % stock_id)
                with open('fail.txt', 'a') as fail_log:
                    fail_log.write(stock_id + '\n')
def main(thread_num, days=0):
    """Download every stock listed in ``all_stock_code.txt`` using worker threads.

    Codes are processed in batches of ``thread_num`` threads; each thread
    handles one code. After every batch the function pauses for a random
    1-3 seconds before starting the next one.

    :param thread_num: number of threads started per batch
    :param days: number of most recent rows to sync; 0 syncs everything
    :return: None
    """
    s = Simulator()
    # NOTE(review): the results of these two calls were never used; the calls
    # are kept in case Simulator relies on them for side effects - confirm
    # and remove if not.
    s.get_cur_date(timedelta=2)
    s.get_cur_date(timedelta=1)

    # Load the stock codes, ignoring blank lines so that no empty code is
    # queued (it would produce an invalid URL and skew the progress figure).
    with open(os.path.join(abs_path, 'all_stock_code.txt')) as asc:
        all_stock_code = [line.strip() for line in asc if line.strip()]
    size = len(all_stock_code)
    cur_time = datetime.datetime.now()
    print('股票数量:%d' % size, '开始时间:%s' % cur_time.strftime('%Y%m%d %H:%M:%S'), sep='\n')

    stock_code_queue = Queue()
    for code in all_stock_code:
        stock_code_queue.put(code)

    gd = GetData()
    batch_no = 0  # throttles how often the progress line is printed
    while not stock_code_queue.empty():
        batch_no += 1
        if batch_no % 14 == 0:
            print("下载进度%.2f%%" % ((size - stock_code_queue.qsize()) / size * 100))
        threads = [
            threading.Thread(target=gd.downloader, args=(stock_code_queue, days,))
            for _ in range(thread_num)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        time.sleep(random.random() * 2 + 1)

    cur_time = datetime.datetime.now()
    print('结束时间:%s' % cur_time.strftime('%Y%m%d %H:%M:%S'))
if __name__ == '__main__':
    # Silence pymysql warnings.
    filterwarnings('ignore', category=pymysql.Warning)
    # With a stock code on the command line, refresh only that stock (last 3
    # rows); otherwise sync every stock with 10 threads and the last 5 rows.
    # The argument is tested explicitly rather than by catching IndexError,
    # so an IndexError raised inside the download no longer starts a full sync.
    if len(sys.argv) > 1:
        gd = GetData()
        one_queue = Queue()
        one_queue.put(sys.argv[1])
        gd.downloader(one_queue, 3)
        print('股票%s信息更新完成' % sys.argv[1])
    else:
        main(10, 5)
# gd = GetData()
# url = 'http://d.10jqka.com.cn/v6/line/hs_' + '600016' + '/01/all.js'
# print(gd.get_raw(url))
# Today's data is available at http://d.10jqka.com.cn/v6/line/hs_600271/00/today.js
代码中引用到的headers.csv如下
Mozilla/5.0 (Windows NT 10.0; WOW64)
Mozilla/5.0 (Windows NT 6.3; WOW64)
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko
Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12
Opera/9.27 (Windows NT 5.2; U; zh-cn)
Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0
Opera/8.0 (Macintosh; PPC Mac OS X; U; en)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11