[Python + MySQL]多线程股票数据爬虫

股票数据爬取

目的

拿到历史数据,结合经典的数据算法,来看看能不能找到一些数学规律。

准备

需要连接MySQL数据库,我用的是安卓机上的Termux,这样方便以后把代码放到手机上跑,还能结合itchat与微信交互(暂未开写)。

直接上代码

下面的脚本会把所有股票的历史行情数据下载下来,计算MACD、KDJ、BOLL、MA等指标后写入MySQL。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020-03-13
import datetime
import json
import random
import sys
import threading
import time
from queue import Queue
import os
import pymysql
import requests
from warnings import filterwarnings
from tqdm import tqdm
from stock_fomula import StockFomula
from data_simulator import Simulator

# Absolute path of the running script and the directory that contains it.
# os.path.dirname is used instead of slicing at the last '/', so the result is
# also correct when the path separator is not '/' or the script sits in the
# filesystem root.
file_path = os.path.abspath(sys.argv[0])
abs_path = os.path.dirname(file_path)


class GetData:
    """Fetch per-stock price history from d.10jqka.com.cn and store it in MySQL.

    One instance owns one pymysql connection plus a lock. ``downloader`` is the
    thread entry point and holds the lock for its whole fetch-and-store cycle.
    """

    def __init__(self):
        """Open the MySQL connection and create the lock shared by workers."""
        # NOTE(review): host and credentials are hard-coded; consider loading
        # them from configuration instead of keeping them in source.
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='1@Qwertyuiop',
            database='stock',
            charset='utf8'
        )
        self.raw = ''
        # Extend the write timeout.
        self.connect._write_timeout = 10000
        # Thread lock taken by downloader().
        self.lock = threading.Lock()

    @staticmethod
    def get_headers():
        """Return the candidate User-Agent lines from ``sources/headers.csv``.

        The file is looked up in the parent of the script directory. Lines are
        returned unmodified (trailing newline included); blank lines are
        dropped so a random pick can never yield an empty User-Agent.

        :return: list of non-blank lines
        """
        upper_path = os.path.dirname(abs_path)
        with open(os.path.join(upper_path, "sources/headers.csv")) as ua:
            return [line for line in ua if line.strip()]

    def get_raw(self, url):
        """Download ``url`` and return the response body as text.

        Up to five attempts are made, ten seconds apart.

        :param url: address to fetch
        :return: response text, or None when every attempt failed
        """
        headers = {
            'Host': 'd.10jqka.com.cn',
            'Referer': 'http://stockpage.10jqka.com.cn/HQ_v4.html',
            'User-Agent': random.choice(self.get_headers()).strip()
        }
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                r = requests.get(url=url, headers=headers, timeout=60)
                try:
                    return r.text
                finally:
                    r.close()
            except Exception:
                # Deliberately broad: any failure of the request is treated as
                # "retry", matching the best-effort intent of this method.
                if attempt > 0:
                    print('链接读取超时,开始重试,重试次数:%s' % attempt)
                # No point in waiting after the final attempt.
                if attempt < max_attempts - 1:
                    time.sleep(10)
        return None

    def data_to_json(self, url):
        """Fetch ``url`` and split the response into stock id and JSON text.

        :param url: address to fetch
        :return: ``(stock_id, json_str)`` or None when nothing usable came back
        """
        raw_data = self.get_raw(url)
        if not raw_data:
            return None
        return self._split_jsonp(raw_data)

    @staticmethod
    def _split_jsonp(raw_data):
        """Split ``prefix_<id>_x_y({...})`` text into ``(id, '{...}')``.

        The stock id is the third-from-last ``_``-separated field of the text
        preceding the opening bracket. Returns None when the text does not
        have that shape.
        """
        start_index = raw_data.find('{')
        # rfind: the payload ends at the LAST closing brace, so nested objects
        # inside the JSON do not truncate it.
        end_index = raw_data.rfind('}')
        if start_index < 1 or end_index < start_index:
            return None
        # start_index - 1 drops the character right before '{'.
        fields = raw_data[: start_index - 1].split('_')
        if len(fields) < 3:
            return None
        return fields[-3], raw_data[start_index: end_index + 1]

    @staticmethod
    def date_transfer(price, dates, year, priceFactor):
        """Join the flat price list with its dates into CSV-style rows.

        ``price`` holds four numbers per date: the low, followed by the
        offsets of open, high and close relative to that low. Every value is
        divided by ``priceFactor``.

        :param price: flat sequence of numeric strings, four per date
        :param dates: sequence of date suffixes (strings) for one year
        :param year: year prepended to each date suffix
        :param priceFactor: divisor applied to every price
        :return: list of ``'<year><date>,<open>,<high>,<low>,<close>'`` strings
        """
        result = []
        for i, day in enumerate(dates):
            base = float(price[i * 4])
            opening = base + float(price[i * 4 + 1])
            high = base + float(price[i * 4 + 2])
            low = base
            closing = base + float(price[i * 4 + 3])
            result.append(','.join([
                str(year) + day,
                str(opening / priceFactor),
                str(high / priceFactor),
                str(low / priceFactor),
                str(closing / priceFactor),
            ]))
        return result

    def data_to_mysql(self, url, days=0):
        """Download one stock, compute its indicators and write them to MySQL.

        :param url: address of the stock's data file
        :param days: number of most recent rows to write; 0 means all rows
        :return: list of price rows on success, -1 when no usable data was
                 obtained (callers treat -1 as "retry")
        """
        json_data = self.data_to_json(url)
        if not json_data:
            return -1
        stock_id, json_str = json_data
        try:
            json_raw = json.loads(json_str)
            name = json_raw['name']
            sortYear = json_raw['sortYear']
            priceFactor = json_raw['priceFactor']
            price = json_raw['price'].split(',')
            dates = json_raw['dates'].split(',')
        except (ValueError, KeyError, TypeError, AttributeError):
            # A malformed or incomplete payload is reported through the same
            # -1 failure code as "no data" instead of killing the worker.
            return -1

        # sortYear entries are (year, row count); consume price/dates in order.
        all_data = []
        offset = 0
        for entry in sortYear:
            year = entry[0]
            num = entry[1]
            front_price = price[offset * 4: (offset + num) * 4]
            front_dates = dates[offset: offset + num]
            offset += num
            all_data.extend(self.date_transfer(front_price, front_dates, year, priceFactor))

        # Compute the indicator series.
        sf = StockFomula()
        macd = sf.macd(all_data)
        kdj = sf.kdj(all_data)
        boll = sf.boll(all_data)
        ma = sf.ma(all_data)
        self.write_to_mysql([all_data, macd, kdj, boll, ma], stock_id, name, days)
        return all_data

    @staticmethod
    def _row_range(total, days):
        """Return the indices of the last ``days`` rows out of ``total``.

        ``days == 0`` selects every row. The start is clamped at 0 so asking
        for more rows than exist never produces negative (wrapping) indices.
        """
        if days == 0:
            days = total
        return range(max(total - days, 0), total)

    def _ensure_tables(self, cursor):
        """Create the two tables, ignoring database errors from the DDL."""
        # The original author found IF NOT EXISTS unreliable for stock_data,
        # so an error from the CREATE is tolerated instead.
        stock_data_create = '''
                        CREATE TABLE stock_data (
                            id int,
                            stock_id varchar(8),
                            date varchar(15),
                            opening DECIMAL(10, 2),
                            high DECIMAL(10, 2),
                            low DECIMAL(10, 2),
                            closing DECIMAL(10, 2),
                            dif DECIMAL(10, 3),
                            dea DECIMAL(10, 3),
                            macd_bar DECIMAL(10, 3),
                            k DECIMAL(10, 3),
                            d DECIMAL(10, 3),
                            j DECIMAL(10, 3),
                            up DECIMAL(10, 3),
                            mb DECIMAL(10, 3),
                            dn DECIMAL(10, 3),
                            ma_5 DECIMAL(10, 3),
                            ma_6 DECIMAL(10, 3),
                            ma_7 DECIMAL(10, 3),
                            ma_8 DECIMAL(10, 3),
                            ma_9 DECIMAL(10, 3),
                            ma_10 DECIMAL(10, 3),
                            ma_11 DECIMAL(10, 3),
                            ma_12 DECIMAL(10, 3),
                            ma_13 DECIMAL(10, 3),
                            ma_14 DECIMAL(10, 3),
                            ma_15 DECIMAL(10, 3),
                            ma_16 DECIMAL(10, 3),
                            ma_17 DECIMAL(10, 3),
                            ma_18 DECIMAL(10, 3),
                            ma_19 DECIMAL(10, 3),
                            ma_20 DECIMAL(10, 3),
                            ma_30 DECIMAL(10, 3),
                            ma_60 DECIMAL(10, 3),
                            ma_120 DECIMAL(10, 3),
                            PRIMARY KEY (stock_id, date)
                            );
                    '''
        stock_name_create = 'CREATE TABLE IF NOT EXISTS stock_name' \
                            '(stock_id VARCHAR(10) PRIMARY KEY , name VARCHAR(15)) CHARSET "utf8";'
        for statement in (stock_data_create, stock_name_create):
            try:
                cursor.execute(statement)
            except pymysql.MySQLError:
                # Expected when the table already exists.
                pass

    def write_to_mysql(self, data, stock_id, name, days):
        """Write price rows, indicator values and the stock name to MySQL.

        :param data: ``[rows, macd, kdj, boll, ma]`` as built by data_to_mysql
        :param stock_id: stock code stored with every row
        :param name: stock name stored in ``stock_name``
        :param days: number of most recent rows to write; 0 means all rows
        :return: None
        """
        stock_data_add = 'REPLACE INTO stock_data (id, stock_id, date, opening, high, low, closing, dif, dea, ' \
                         'macd_bar, k, d, j, up, mb, dn, ma_5, ma_6, ma_7, ma_8, ma_9, ma_10, ma_11, ma_12, ma_13, ' \
                         'ma_14, ma_15, ma_16, ma_17, ma_18, ma_19, ma_20, ma_30, ma_60, ma_120) VALUES (' \
                         '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ' \
                         '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        stock_name_add = 'INSERT IGNORE INTO stock_name(stock_id, name) VALUES (%s, %s);'

        cursor = self.connect.cursor()
        try:
            self._ensure_tables(cursor)

            rows = data[0]
            macd_data = data[1]
            kdj_data = data[2]
            boll_data = data[3]
            ma_data = data[4]
            now_date = datetime.datetime.now().strftime('%Y%m%d')

            for i in self._row_range(len(rows), days):
                date_price = rows[i].split(',')
                date = date_price[0]
                # Rows dated today are not written.
                if date == now_date:
                    continue
                opening = float(date_price[1])
                high = float(date_price[2])
                low = float(date_price[3])
                closing = float(date_price[4])
                dif = macd_data[1][i]
                dea = macd_data[2][i]
                macd_bar = macd_data[3][i]
                if not kdj_data:
                    return
                k = kdj_data[1][i]
                d = kdj_data[2][i]
                j = kdj_data[3][i]
                up = boll_data[1][i]
                mb = boll_data[2][i]
                dn = boll_data[3][i]
                # ma_data[0..18] feed columns ma_5..ma_20, ma_30, ma_60, ma_120.
                ma_values = [ma_data[n][i] for n in range(19)]
                params = [i, stock_id, date, opening, high, low, closing,
                          dif, dea, macd_bar, k, d, j, up, mb, dn] + ma_values
                try:
                    self.connect.ping(reconnect=True)
                    cursor.execute(stock_data_add, params)
                    self.connect.commit()
                except Exception as ex:
                    print(ex)
                    self.connect.rollback()

            # Insert the name; an existing entry is left untouched.
            try:
                self.connect.ping(reconnect=True)
                cursor.execute(stock_name_add, [stock_id, name])
                self.connect.commit()
            except Exception as ex:
                print(ex)
                self.connect.rollback()
        finally:
            # Runs on the early return above as well.
            cursor.close()

    def downloader(self, queue, days=0):
        """Take one stock code from ``queue`` and download/store its data.

        Designed as a thread target. Returns immediately when the queue is
        empty. A failed download is retried up to 50 more times; a stock that
        still fails is appended to ``fail.txt``.

        :param queue: queue of stock codes shared between threads
        :param days: number of most recent rows to write; 0 means all rows
        :return: None
        """
        from queue import Empty

        # get_nowait avoids the empty()/get() race in which another thread
        # takes the last item and this one blocks forever.
        try:
            stock_id = queue.get_nowait()
        except Empty:
            return
        url = 'http://d.10jqka.com.cn/v6/line/hs_' + stock_id + '/01/all.js'
        # NOTE(review): the lock covers the network fetch too, so workers run
        # one at a time; narrowing it to the DB write would allow real overlap.
        with self.lock:
            feedback = self.data_to_mysql(url, days)
            # Retry on failure.
            count = 1
            while feedback == -1 and count <= 50:
                feedback = self.data_to_mysql(url, days)
                count += 1
            if feedback == -1:
                print("\033[01;32m%s重试失败\033[0m" % stock_id)
                with open('fail.txt', 'a') as fail_log:
                    fail_log.write(stock_id + '\n')


def main(thread_num, days=0):
    """Download every stock listed in ``all_stock_code.txt``.

    Codes are read from the file next to the script, queued, and processed in
    rounds: each round starts ``thread_num`` threads running
    ``GetData.downloader``, waits for all of them, then pauses 1-3 seconds.

    :param thread_num: number of threads started per round (must be >= 1)
    :param days: number of most recent rows to sync; 0 means sync everything
    :return: None
    :raises ValueError: if ``thread_num`` is smaller than 1
    """
    if thread_num < 1:
        # With no workers the queue would never drain and the loop never end.
        raise ValueError('thread_num must be at least 1')

    s = Simulator()
    # NOTE(review): neither value is used below. The calls are kept because
    # Simulator is defined elsewhere and its side effects are not visible here.
    last_2rd_day = s.get_cur_date(timedelta=2)
    last_day = s.get_cur_date(timedelta=1)

    # Build the queue of stock codes, skipping blank lines so that no request
    # is made for an empty code.
    with open(os.path.join(abs_path, 'all_stock_code.txt')) as asc:
        all_stock_code = [line.strip() for line in asc if line.strip()]
    size = len(all_stock_code)
    cur_time = datetime.datetime.now()
    print('股票数量:%d' % size, '开始时间:%s' % cur_time.strftime('%Y%m%d %H:%M:%S'), sep='\n')
    stock_code_queue = Queue()
    for code in all_stock_code:
        stock_code_queue.put(code)

    gd = GetData()
    flag = 0  # round counter; progress is printed every 14th round
    while not stock_code_queue.empty():
        flag += 1
        if flag % 14 == 0:
            print("下载进度%.2f%%" % ((size - stock_code_queue.qsize()) / size * 100))
        threads = []
        for _ in range(thread_num):
            t = threading.Thread(target=gd.downloader, args=(stock_code_queue, days,))
            threads.append(t)
            t.start()
        for thread in threads:
            thread.join()
        time.sleep(random.random() * 2 + 1)
    cur_time = datetime.datetime.now()
    print('结束时间:%s' % cur_time.strftime('%Y%m%d %H:%M:%S'))


if __name__ == '__main__':
    # Suppress pymysql warning output.
    filterwarnings('ignore', category=pymysql.Warning)
    # With a stock code on the command line, update only that stock (last 3
    # rows); otherwise sync the last 5 rows of every stock with 10 threads.
    # The argument count is checked explicitly so that an IndexError raised
    # inside the download itself is not mistaken for "no argument given".
    if len(sys.argv) > 1:
        gd = GetData()
        one_queue = Queue()
        one_queue.put(sys.argv[1])
        gd.downloader(one_queue, 3)
        print('股票%s信息更新完成' % sys.argv[1])
    else:
        main(10, 5)
    # Today's data can be fetched from
    # http://d.10jqka.com.cn/v6/line/hs_600271/00/today.js

代码中引用到的headers.csv(位于脚本上级目录的sources/下,每行一个User-Agent)内容如下:

Mozilla/5.0 (Windows NT 10.0; WOW64)
Mozilla/5.0 (Windows NT 6.3; WOW64)
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)
Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12
Opera/9.27 (Windows NT 5.2; U; zh-cn)
Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0
Opera/8.0 (Macintosh; PPC Mac OS X; U; en)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值