Scraping financial data with Python: a detailed example

To get some hands-on practice with Python scraping tools, I put together a small example. The main logic is as follows:

1. Fetch the basic information for every fund from the fund ranking page

2. Loop over the codes collected in step 1 and scrape the daily net-worth records from each fund's own page
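
Both scripts write to a local SQLite file (found.db). The table definitions are not shown here, so the snippet below is a minimal one-time setup sketch: the table and column names are taken from the INSERT/SELECT statements further down, while the column types and the isGet default are assumptions.

import sqlite3

conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
cursor = conn.cursor()
# fund list filled by getFoundCode.py; isGet marks whether a fund's history has already been scraped (assumed default 'no')
cursor.execute("""CREATE TABLE IF NOT EXISTS found_code (
                      foundId   TEXT PRIMARY KEY,
                      foundName TEXT,
                      foundCode TEXT,
                      isGet     TEXT DEFAULT 'no')""")
# daily net-worth records filled by the Selenium spider
cursor.execute("""CREATE TABLE IF NOT EXISTS found_netWorth_detail (
                      id         TEXT PRIMARY KEY,
                      foundCode  TEXT,
                      date       TEXT,
                      netWorth   TEXT,
                      growthRate TEXT)""")
conn.commit()
conn.close()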

Code:

  Fetch the fund codes and store them in the local database: getFoundCode.py

import requests
import demjson
import sqlite3
import uuid

def get_page(url, page_num):
    for i in range(1, page_num + 1):
        formdata = {'op': 'ph',
                    'dt': 'kf',
                    'ft': 'zs',
                    'rs': '',
                    'gs': 0,
                    'sc': 'zzf',
                    'st': 'desc',
                    'sd': '2018-11-11',
                    'ed': '2019-11-11',
                    'qdii': '',
                    'tabSubtype': ',,,,,',
                    'pi': i,
                    'pn': 50,
                    'dx': 1,
                    'v': 0.6213036119243205}
        try:
            r = requests.get(url, params=formdata)
            r.raise_for_status()
            print('request succeeded')
            text = r.text[15:-1]  # strip the JavaScript variable wrapper (first 15 chars and trailing ';') to leave the object literal
            b = demjson.decode(text)

            sqlCommand = "insert into found_code (foundId, foundName, foundCode) values "
            for item in b['datas']:
                rowSql = "('{}','{}','{}'),".format(uuid.uuid1(), item.split(',')[1], item.split(',')[0])
                sqlCommand += rowSql

            sqlCommand = sqlCommand[0:-1]
            cursor.execute(sqlCommand)

        except Exception as e:
            print('request failed:', e)

# initialize the database connection
conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db', check_same_thread=False)
cursor = conn.cursor()
url = 'http://fund.eastmoney.com/data/rankhandler.aspx'
get_page(url, 17)
conn.commit()
conn.close()
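
The multi-row INSERT above is built by string concatenation, which breaks if a fund name ever contains a single quote. As a sketch of a safer alternative (not what the original code does), the inner loop of get_page could be replaced with a parameterized executemany:

import uuid

def insert_codes(cursor, datas):
    # datas is b['datas'] from the rankhandler response; each item is a comma-separated
    # string whose first field is the fund code and second field is the fund name
    rows = [(str(uuid.uuid1()), item.split(',')[1], item.split(',')[0]) for item in datas]
    cursor.executemany(
        "insert into found_code (foundId, foundName, foundCode) values (?, ?, ?)",
        rows)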

The main program then calls mainSpider(), which loops over the stored fund codes and builds each fund's page URL.

For each code, beginSpider(code) calls initSpider(fundcode) to obtain the Selenium driver and the total number of pages.

getData is then called to page through the records and insert them into the local database.

# coding: utf-8
import time
import uuid

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from bs4 import BeautifulSoup
from threading import Thread, Lock
import os
import csv
import sqlite3

# The code below uses Selenium to scrape the HTML pages

# initialization function
def initSpider(fundcode):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get("http://fund.eastmoney.com/f10/jjjz_{0}.html".format(fundcode))  # the fund's net-worth history page to scrape

    # locate the "下一页" (next page) button; the label immediately before it holds the total page count
    getPage_text = driver.find_element_by_id("pagebar").find_element_by_xpath(
        "div[@class='pagebtns']/label[text()='下一页']/preceding-sibling::label[1]").get_attribute("innerHTML")
    # keep only the digits to get the total number of pages
    total_page = int("".join(filter(str.isdigit, getPage_text)))

    # return the driver together with the total page count
    return (driver, total_page)


# fetch the table data for a range of pages and store it
def getData(myrange, driver, lock, code):

    for x in myrange:
        # acquire the lock so only one thread drives the shared browser at a time
        lock.acquire()
        # open a database connection for this page
        conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
        cursor = conn.cursor()
        attempts = 0
        success = False

        tonum = driver.find_element_by_id("pagebar").find_element_by_xpath(
            "div[@class='pagebtns']/input[@class='pnum']")  # the page-number input box
        jumpbtn = driver.find_element_by_id("pagebar").find_element_by_xpath(
            "div[@class='pagebtns']/input[@class='pgo']")  # the "go to page" button

        tonum.clear()  # clear the page-number box
        tonum.send_keys(str(x))  # enter page x
        jumpbtn.click()  # jump to that page

        # wait until page x is marked as the current page
        WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id("pagebar").find_element_by_xpath(
            "div[@class='pagebtns']/label[@value={0} and @class='cur']".format(x)) != None)

        while attempts < 3 and not success:
            try:
                # collect the rows on this page (reset on every retry so a failed attempt does not duplicate rows)
                datalist = []
                rows = driver.find_element_by_id("jztable").find_elements(By.TAG_NAME, "tr")
                for row in rows:
                    para = {}
                    table_td_list = row.find_elements(By.TAG_NAME, "td")
                    if len(table_td_list) > 0:
                        para['id'] = uuid.uuid1()
                        para['foundCode'] = code
                        para['date'] = table_td_list[0].text
                        para['netWorth'] = table_td_list[1].text
                        para['growthRate'] = table_td_list[3].text[0:-1]
                        datalist.append(para)

                # build one multi-row INSERT and store the page
                sqlCommand = "insert into found_netWorth_detail (id, foundCode, date, netWorth, growthRate) values "
                for item in datalist:
                    rowSql = "('{}','{}','{}','{}','{}'),".format(item['id'], item['foundCode'], item['date'],
                                                                item['netWorth'],
                                                                item['growthRate'])
                    sqlCommand += rowSql

                sqlCommand = sqlCommand[0:-1]
                print(sqlCommand)
                cursor.execute(sqlCommand)
                cursor.execute("UPDATE found_code set isGet='yes' where foundCode=="+code)
                conn.commit()
                conn.close()
                print("保存完成")
                # release the lock
                lock.release()
                success = True
            except:
                attempts += 1
                time.sleep(1)
                if attempts == 3:
                    lock.release()
                    os._exit(0)  # give up: terminate the whole process after three failed attempts

# scrape one fund from start to finish
def beginSpider(code):
    # initialize the spider for this fund code
    (driver, total_page) = initSpider(code)
    # create the lock shared by the worker threads
    lock = Lock()

    r = range(1, int(total_page) + 1)
    step = 10
    range_list = [r[x:x + step] for x in range(0, len(r), step)]  # split the page numbers into chunks of 10
    thread_list = []
    for r in range_list:
        t = Thread(target=getData, args=(r, driver, lock, code))
        thread_list.append(t)
        t.start()
    for t in thread_list:
        t.join()  # required: wait for all threads to finish

def mainSpider():
    conn1 = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
    cursor1 = conn1.cursor()
    cursor1.execute("SELECT foundCode from found_code where isGet='no'")
    cd = cursor1.fetchall()
    for (row,) in cd:
        beginSpider(row)
    conn1.commit()
    conn1.close()
    print("抓取完成")

mainSpider()
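
After a run finishes, a quick way to check what was stored is to count the records per fund. This is just an inspection sketch reusing the table and column names from the scripts above.

import sqlite3

conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
cursor = conn.cursor()
cursor.execute("""SELECT foundCode, COUNT(*), MIN(date), MAX(date)
                  FROM found_netWorth_detail
                  GROUP BY foundCode""")
for code, n, first, last in cursor.fetchall():
    print(code, n, first, last)
conn.close()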
