用Python来筛选基金（1）

我文非相

已于 2022-02-16 16:39:55 修改

阅读量481

点赞数

文章标签：python、开发语言、后端

于 2022-02-16 14:53:01 首次发布

本文链接：https://blog.csdn.net/awen1986531/article/details/122963260

版权

先通过天天基金网抓取所有基金数据。
代码如下：

#-*-coding:GBK -*- 
#********************
#微信&电话：13248260503
# 证券开户 研报收集
# 代码交流 数据分析
# 脚本开发 投资推荐
#********************
import urllib.request
import requests
import re
import random
import time
from urllib.parse import urlencode
import pandas as pd  #制表模块
from urllib.parse import urlparse

# Pool of User-Agent strings; one of them is picked below and sent with
# every HTTP request made through get_page().
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
# The random choice happens once, when this line runs, so every request in a
# single run of the script carries the same User-Agent.
headers = {'User-Agent':random.choice(my_headers)}
def get_page(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection; when the limit is exceeded ``requests`` raises a
            timeout exception instead.

    Returns:
        The decoded page text when the status code is 200; otherwise the
        literal marker string '爬取失败！' (callers receive a string either
        way).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        # Decode the raw bytes explicitly as UTF-8 to avoid garbled text.
        return response.content.decode("utf-8")
    return '爬取失败！'
# Compiled once at import time. Raw strings keep the regex escapes (\D, \d)
# from being read as (invalid) Python string escapes.
# Capture groups, in order: fund name, fund code, unit net value, the next
# <span> value, fund type, management company, the whole size field,
# '---' when the size is unavailable, the size number before '亿元',
# fund manager.
_FUND_TABLE_PATTERN = re.compile(
    r'.*?fname fl.*?>(.*?)\D(\d+)\D</a>'
    r'.*?单位净值.*?>(.*?)</span>'
    r'.*?<span.*?>(.*?)</span>'
    r'.*?基金类型：(.*?)</li>'
    r'.*?管&nbsp;理&nbsp;人：.*?>(.*?)</a>'
    r'.*?规&nbsp;&nbsp;&nbsp;&nbsp;模</a>：((---)|(.*?)亿元)'
    r'.*?基金经理：.*?>(.*?)</a>',
    re.S,
)


def parse_html(html_content):
    """Extract one 10-field tuple per fund from the table-view HTML.

    Args:
        html_content: HTML text of a result page in table view.

    Returns:
        A list of 10-tuples of strings, one per match, in the capture-group
        order documented on ``_FUND_TABLE_PATTERN``. Of the two size
        alternatives (groups 8 and 9) exactly one is non-empty; the other is
        ''. An empty list is returned when nothing matches.
    """
    return _FUND_TABLE_PATTERN.findall(html_content)
def parse_html_list(html_content_list):
    """Return the text found between '[' and the next ']' after a '{'.

    Args:
        html_content_list: Response text of the list-view request.

    Returns:
        A list with one string per match (the raw text inside the square
        brackets, quotes and commas included), or an empty list when the
        input contains no ``{ ... [ ... ]`` sequence.
    """
    # Raw string: '\[' is a regex escape, not a Python string escape.
    pattern = re.compile(r'.*?{.*?\[(.*?)].*?', re.S)
    return pattern.findall(html_content_list)
def parse_html_list_up(html_content_list_up):
    """Collect the contents of every double-quoted segment of the input.

    Args:
        html_content_list_up: Text containing "..."-quoted records.

    Returns:
        A list with the text between each pair of double quotes (quotes
        removed), in order of appearance; empty when there are none.
    """
    quoted_segment = re.compile('\"(.*?)\"', re.S)
    return quoted_segment.findall(html_content_list_up)
def parse_html1(html):
    """Find the number that follows each 'allPages' marker in *html*.

    Args:
        html: Response text to search.

    Returns:
        A list of digit strings, one per 'allPages' occurrence that is
        followed by a number; empty when the marker is absent.
    """
    # Raw string: '\d' is a regex escape, not a Python string escape.
    pattern = re.compile(r'.*?allPages.*?(\d+)', re.S)
    return pattern.findall(html)
# ---------------------------------------------------------------------------
# Script body: ask for an output name and a filter URL, turn the URL fragment
# into query parameters for FundGuideapi.aspx, download every result page in
# both its 'table' and 'list' forms, merge the two per fund and write the
# result to a CSV file.
# ---------------------------------------------------------------------------
ex_name = input('表格名称:') + '.csv'
url = input('输入网址:')
result = urlparse(url)
url_parse = list(result)  # ParseResult tuple -> list
url_parse_fragment = url_parse[-1]  # last component is the URL fragment
url_parse_fragment_l = url_parse_fragment.split(";")

# Each ';'-separated piece is split at its first two word characters: those
# two characters become the parameter name, the remainder its value.
new_data_list = []
for data in url_parse_fragment_l:
    parts = re.split(r'(\w{2})', data, maxsplit=1)
    if len(parts) < 3:
        # Empty or too-short piece (e.g. no fragment, trailing ';'): there is
        # no key to extract, so skip it instead of failing with IndexError.
        continue
    new_data_list.append(parts)
new_dict = {parts[1]: parts[2] for parts in new_data_list}

base_url = 'http://fund.eastmoney.com/data/FundGuideapi.aspx?'
new_url = base_url + urlencode(new_dict)
html = get_page(new_url)
page_counts = parse_html1(html)
if not page_counts:
    # Without a page count nothing can be downloaded; stop with a clear
    # message rather than an IndexError.
    raise SystemExit('未找到总页数（allPages），请检查输入的网址或网络连接')
all_page = int(page_counts[0])

columns = ['名称', '基金类型','管理人','规模（亿元）','基金经理','净值','日增长率','成立来','今年来','近一周','近一月','近三月','近六月','近一年','近两年','近三年','日期','手续费','购买起点','原手续费']
# Positions in the merged per-fund list (10 table fields followed by the
# comma-separated list fields):
#   0 name, 1 code, 2 unit net value, 3 change, 4 fund type, 5 management
#   company, 7-8 size, 9 fund manager, 12 letter code, 13 fund type,
#   14 year-to-date, 15 last week, 16 last month, 17 last 3 months,
#   18 last 6 months, 19 last year, 20 last 2 years, 21 last 3 years,
#   25 date, 26 net value, 27 daily growth rate, 29 fee,
#   30 minimum purchase (yuan), 32 original fee, 34 since inception.

# Rows are collected keyed by the fund-code label and turned into a DataFrame
# once at the end; a repeated label overwrites the earlier row in place.
rows = {}
for i in range(all_page):
    new_dict['pi'] = str(i + 1)

    new_dict['sh'] = 'table'
    new_url = base_url + urlencode(new_dict)
    html_content = get_page(new_url)
    result1 = parse_html(html_content)

    new_dict['sh'] = 'list'
    new_url = base_url + urlencode(new_dict)
    html_content = get_page(new_url)
    result_list = parse_html_list(html_content)
    result_list_up = parse_html_list_up(result_list[0])

    # enumerate() gives each table row its own position; looking the row up
    # with list.index() would return the first equal row instead and costs a
    # linear scan per row.
    for list_num, list_1 in enumerate(result1):
        print(list_num)
        # Pair the table row with the list-view record at the same position.
        list_1_2 = result_list_up[list_num].split(",")
        list_3 = list(list_1) + list_1_2
        print(list_3)
        daima = '代码：' + list_3[1]
        rows[daima] = [
            list_3[0],               # 名称
            list_3[4],               # 基金类型
            list_3[5],               # 管理人
            list_3[7] + list_3[8],   # 规模（亿元）: '---' or the number
            list_3[9],               # 基金经理
            list_3[2],               # 净值
            list_3[27] + '%',        # 日增长率
            list_3[34] + '%',        # 成立来
            list_3[14] + '%',        # 今年来
            list_3[15] + '%',        # 近一周
            list_3[16] + '%',        # 近一月
            list_3[17] + '%',        # 近三月
            list_3[18] + '%',        # 近六月
            list_3[19] + '%',        # 近一年
            list_3[20] + '%',        # 近两年
            list_3[21] + '%',        # 近三年
            list_3[25],              # 日期
            list_3[29],              # 手续费
            list_3[30] + '元',       # 购买起点
            list_3[32],              # 原手续费
        ]

shuju = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
shuju.to_csv(ex_name, encoding='utf-8')