Python新手入门爬虫练习——天天基金

Wnig

已于 2023-04-12 17:30:55 修改

阅读量1.1k

点赞数 1

分类专栏： Python 文章标签： python 爬虫 开发语言

于 2023-04-12 16:21:29 首次发布

本文链接：https://blog.csdn.net/qq_29217789/article/details/130102955

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

利用Python爬取天天基金的数据。
先打开天天基金网，然后选择感兴趣的主题基金，这里以煤炭为例。
进入页面之后，复制一个基金名称，在浏览器的控制台Network搜索这个名称。
点击我们搜索到的js，切换到Headers，这里有几个信息是我们需要的：General里的Request URL、Request Headers里的Cookie、Host、Referer、User-Agent。

import pandas as pd
import requests
import re
import json
import time
import math

# Request parameters copied from the browser's Network panel.
# FundInfo.get_fund_data puts 'isbuy', 'sort', 'sorttype' and 'callback' into
# the fund-list URL; FundInfo.get_fund_theme uses 'themecallback' for the theme
# URL; FundInfo.get_response sends 'cookie', 'host', 'referer' and 'useragent'
# as the Cookie / Host / Referer / User-Agent request headers.
# NOTE(review): the cookie is a captured browser session value — presumably it
# expires and has to be refreshed from the browser; confirm before reuse.
fund_obj = {
    'isbuy': 1,
    'sort': 'TRY',
    'sorttype': 'desc',
    'callback': 'jQuery18306680153539612363_1681216767557',
    'cookie': 'intellpositionL=1152px; em_hq_fls=js; intellpositionT=455px; qgqp_b_id=2c2759821488d533c53576ad44a41e5d; HAList=ty-1-000001-%u4E0A%u8BC1%u6307%u6570%2Cty-90-BK1036-%u534A%u5BFC%u4F53%2Ca-sh-603713-%u5BC6%u5C14%u514B%u536B%2Cf-0-000001-%u4E0A%u8BC1%u6307%u6570%2Ca-sh-600559-%u8001%u767D%u5E72%u9152%2Ca-sh-603195-%u516C%u725B%u96C6%u56E2%2Ca-sz-002236-%u5927%u534E%u80A1%u4EFD%2Ca-sh-600759-%u6D32%u9645%u6CB9%u6C14%2Cf-0-399439-%u56FD%u8BC1%u6CB9%u6C14%2Ca-sz-000049-%u5FB7%u8D5B%u7535%u6C60; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND0=null; EMFUND8=04-11%2015%3A07%3A30@%23%24%u534E%u5546%u4FE1%u7528%u589E%u5F3A%u503A%u5238A@%23%24001751; AUTH_FUND.EASTMONEY.COM_GSJZ=AUTH*TTJJ*TOKEN; EMFUND9=04-11 15:32:10@#$%u62DB%u5546%u4E2D%u8BC1%u767D%u9152%u6307%u6570%28LOF%29A@%23%24161725; st_si=97748096283622; st_pvi=48514956678552; st_sp=2019-09-26%2018%3A30%3A34; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=2023041120385018-112200312942-2229925924; st_asi=delete',
    'host': 'api.fund.eastmoney.com',
    'referer': 'http://fund.eastmoney.com/',
    'useragent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'themecallback': 'jQuery18306592616272463223_1681284141591',
}

class FundInfo:
    """Download the fund list of one topic page by page and export it to Excel.

    Attributes:
        pageIndex: 1-based index of the next page to request.
        pageSize: number of funds requested per page.
        totalPage: number of pages, filled in by ``get_total_page``.
        totalCount: number of funds, filled in by ``get_total_page``.
        dataList: one list of stringified values per fund collected so far.
        themeName: topic name, filled in by ``get_fund_theme``.
    """

    # Topic code used when the parameter dict carries no 'tp' entry; this is
    # the value both request URLs originally hard-coded.
    DEFAULT_TOPIC = '801950'

    # A response key is exported when it contains any of these substrings.
    _FIELD_TAGS = (
        'SHORTNAME', 'FCODE', 'DWJZ', 'RZDF', 'SYRQ', 'FTYPE', 'RELATION',
        'SYL_Z', 'SYL_Y', 'SYL_3Y', 'SYL_6Y', 'SYL_JN', 'SYL_1N', 'SYL_2N',
        'SYL_3N', 'SYL_LN', 'RATE', 'SOURCERATE', 'MINSG', 'ISSALES', 'ISBUY',
    )

    # Spreadsheet headers. They are positional: values are collected in the
    # order the keys appear in each fund record.
    # NOTE(review): assumes the API keeps returning the 21 selected keys in the
    # order these labels were written for — confirm against a live response.
    COLUMNS = ['基金代码', '', '日期', '原手续费', '净值', '日增长率', '手续费', 'ISBUY', '购买起点', '基金名称', '近1周','近1月','近3月','近6月','近1年','近2年','近3年','今年来','成立来', 'ISSALES','基金类型']

    def __init__(self, page_index=1, page_size=10, total_page=0, total_count=0):
        """Store the paging state and start with an empty result set."""
        self.pageIndex = page_index
        self.pageSize = page_size
        self.totalPage = total_page
        self.totalCount = total_count
        self.dataList = []
        self.themeName = ''

    def get_response(self, parame_obj, parame_url, timeout=10):
        """Send a GET request with the browser headers taken from ``parame_obj``.

        Args:
            parame_obj: dict providing 'cookie', 'host', 'referer', 'useragent'.
            parame_url: full request URL.
            timeout: seconds to wait for the server; without a timeout
                ``requests`` may block indefinitely on a stalled connection.

        Returns:
            The ``requests.Response`` object.
        """
        headers = {
            'Cookie': parame_obj['cookie'],
            'Host': parame_obj['host'],
            'Referer': parame_obj['referer'],
            'User-Agent': parame_obj['useragent'],
        }
        return requests.get(url=parame_url, headers=headers, timeout=timeout)

    @staticmethod
    def _parse_jsonp(callback, text):
        """Return the decoded JSON payload of a ``callback(...)`` JSONP body.

        Raises:
            ValueError: if ``text`` does not contain ``callback(...)``.
        """
        # Escape the callback name so it is matched literally, and use a raw
        # pattern so '\(' is a regex escape rather than an invalid string escape.
        match = re.search(re.escape(callback) + r'\((.*)\)', text, re.DOTALL)
        if match is None:
            raise ValueError(
                f'JSONP callback {callback!r} not found in response: {text[:200]!r}'
            )
        return json.loads(match.group(1))

    def get_fund_theme(self, parame_obj):
        """Fetch the topic details and store the topic name in ``themeName``.

        Args:
            parame_obj: request parameter dict; needs 'themecallback' and the
                header fields, and may carry an optional 'tp' topic code.
        """
        callback = parame_obj["themecallback"]
        topic = parame_obj.get('tp', self.DEFAULT_TOPIC)
        url = f'http://api.fund.eastmoney.com/ztjj//GetBKDetailInfoNew?callback={callback}&tp={topic}&_=1681284141906'
        response = self.get_response(parame_obj, url)
        data = response.text
        print('theme===', data)
        res = self._parse_jsonp(callback, data)

        self.themeName = res['Data']['INDEXNAME']

    def get_fund_data(self, parame_obj):
        """Fetch page ``self.pageIndex`` of the fund list and return it decoded.

        Args:
            parame_obj: request parameter dict; needs 'callback', 'sort',
                'sorttype', 'isbuy' and the header fields, and may carry an
                optional 'tp' topic code.

        Returns:
            The decoded JSON object of the response.
        """
        callback = parame_obj["callback"]
        topic = parame_obj.get('tp', self.DEFAULT_TOPIC)
        url = f'http://api.fund.eastmoney.com/ztjj/GetBKRelTopicFundNew?callback={callback}&sort={parame_obj["sort"]}&sorttype={parame_obj["sorttype"]}&pageindex={self.pageIndex}&pagesize={self.pageSize}&tp={topic}&isbuy={parame_obj["isbuy"]}'
        response = self.get_response(parame_obj, url)
        data = response.text
        print(data)

        return self._parse_jsonp(callback, data)

    def get_total_page(self, parame_res):
        """Set ``totalCount`` and ``totalPage`` from a decoded list response."""
        self.totalCount = int(parame_res['TotalCount'])
        self.totalPage = math.ceil(self.totalCount / self.pageSize)

    def get_fund_info(self, parame_res):
        """Append one row per fund in ``parame_res['Data']`` to ``dataList``.

        A value is kept, as a string, when its key contains one of
        ``_FIELD_TAGS``; values keep the key order of the record. A missing or
        null 'Data' entry adds no rows.
        """
        for fund in parame_res.get('Data') or []:
            row = [
                str(val)
                for key, val in fund.items()
                if any(tag in key for tag in self._FIELD_TAGS)
            ]
            self.dataList.append(row)

    def change_page(self, parame_obj, parame_res):
        """Collect every remaining page and write the rows to an .xlsx file.

        Args:
            parame_obj: request parameter dict (see ``get_fund_data``).
            parame_res: decoded response already fetched for the current
                ``pageIndex``; it is used for the first page so that page is
                not requested twice. Pass ``None`` to fetch it here instead.
        """
        self.get_fund_theme(parame_obj)
        timestamp = int(time.time())
        page_res = parame_res
        pages_done = 0
        while self.pageIndex <= self.totalPage:
            print(f'正在爬取第{self.pageIndex}页的数据内容')
            if page_res is None:
                # Pause between requests, then parse the page just fetched.
                # Previously the fetched page was discarded and the first
                # page was parsed again on every iteration.
                time.sleep(1)
                page_res = self.get_fund_data(parame_obj)
            self.get_fund_info(page_res)
            page_res = None
            pages_done += 1
            self.pageIndex += 1

        if pages_done:
            df = pd.DataFrame(self.dataList, columns=self.COLUMNS)
            df.to_excel(f'{self.themeName}基金{timestamp}.xlsx')

# Script entry: start from the default paging state (page 1, 10 funds per page).
fund_info = FundInfo()
# Fetch the current page once; its 'TotalCount' field drives the page count.
res = fund_info.get_fund_data(fund_obj)
fund_info.get_total_page(res)
# Run the export: resolves the theme name, loops over the pages and writes
# the collected rows to an .xlsx file in the working directory.
fund_info.change_page(fund_obj, res)

运行上面的代码，就可以在当前目录下得到一份以“主题名称+基金+时间戳”命名的 Excel（.xlsx）数据文件。

Wnig

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
Python新手入门爬虫练习——天天基金

点击我们搜索到的js，切换到Headers，这里有几个信息是我们需要的：General里的Request URL、Request Headers里的Cookie、Host、Referer、User-Agent。
复制链接

扫一扫