1. Create the index
PUT fear_greed_stock
{
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "c_date": {
        "type": "date",
        "format": "yyyy-MM-dd"
      },
      "val": {
        "type": "float"
      },
      "tag": {
        "type": "keyword"
      },
      "type": {
        "type": "keyword"
      },
      "previous_close": {
        "type": "float"
      },
      "previous_1_week": {
        "type": "float"
      },
      "previous_1_month": {
        "type": "float"
      },
      "previous_1_year": {
        "type": "float"
      },
      "metric_value": {
        "type": "float"
      }
    }
  }
}
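With the mapping in place, a quick smoke test confirms documents land with the expected fields. This is a minimal sketch, reusing the es_client helper from utils.esutils that the script below imports, and assuming an 8.x elasticsearch-py client (older clients take a body= dict instead of keyword arguments); it fetches the most recent "overall" reading.

from utils.esutils import es_client  # project helper from this post; assumed to return an Elasticsearch client

# Pull the latest overall Fear & Greed record, newest first.
resp = es_client.search(
    index="fear_greed_stock",
    query={"term": {"type": "overall"}},
    sort=[{"c_date": {"order": "desc"}}],
    size=1,
)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["c_date"], hit["_source"]["val"], hit["_source"]["tag"])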
2. Core Python code
import requests
import json
from datetime import datetime
from elasticsearch import helpers
import time
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from utils.datautils import get_unique_id
from utils.esutils import es_client

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
INDEX_NAME = "fear_greed_stock"
def parse_timestamp(timestamp):
    """Parse a timestamp into 'YYYY-MM-DD'; accepts both ISO strings and millisecond epochs."""
    try:
        if isinstance(timestamp, str):
            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S+00:00").strftime("%Y-%m-%d")
        else:
            return datetime.fromtimestamp(int(timestamp) / 1000).strftime("%Y-%m-%d")
    except Exception as e:
        logger.error(f"Failed to parse timestamp: {timestamp}, error: {e}")
        return None
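A quick check of both accepted input shapes (the values here are illustrative, not real API output):

print(parse_timestamp("2024-05-01T00:00:00+00:00"))  # -> '2024-05-01'
print(parse_timestamp(1714521600000))                # -> '2024-05-01' on a UTC host; fromtimestamp() uses local time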
def process_fear_greed_data(data, es_client, index_name=INDEX_NAME):
    """Parse the Fear and Greed Index payload and bulk-index it into Elasticsearch."""
    actions = []
    processed_count = 0
    try:
        # Overall index reading.
        fear_greed = data.get('fear_and_greed', {})
        if not fear_greed:
            logger.warning("No fear_and_greed data found")
            return processed_count
        score = fear_greed.get('score')
        rating = fear_greed.get('rating')
        timestamp = fear_greed.get('timestamp')
        previous_close = fear_greed.get('previous_close')
        previous_1_week = fear_greed.get('previous_1_week')
        previous_1_month = fear_greed.get('previous_1_month')
        previous_1_year = fear_greed.get('previous_1_year')

        date = parse_timestamp(timestamp)
        if not date:
            logger.warning("Invalid timestamp, skipping record")
            return processed_count

        info = {
            "c_date": date,
            "val": score,
            "tag": rating,
            "previous_close": previous_close,
            "previous_1_week": previous_1_week,
            "previous_1_month": previous_1_month,
            "previous_1_year": previous_1_year,
            "type": "overall"
        }
        action = {
            "_op_type": "index",
            "_index": index_name,
            "_id": get_unique_id(f"{date}_overall"),
            "_source": info
        }
        actions.append(action)
        processed_count += 1

        # Sub-indicator readings.
        sub_metrics = [
            "market_momentum_sp500", "market_momentum_sp125", "stock_price_strength",
            "stock_price_breadth", "put_call_options", "market_volatility_vix",
            "market_volatility_vix_50", "junk_bond_demand", "safe_haven_demand"
        ]
        for metric in sub_metrics:
            metric_data = data.get(metric, {})
            metric_timestamp = metric_data.get('timestamp')
            metric_score = metric_data.get('score')
            metric_rating = metric_data.get('rating')
            metric_value = metric_data.get('data', [{}])[0].get('y') if metric_data.get('data') else None

            metric_date = parse_timestamp(metric_timestamp)
            if not metric_date:
                logger.warning(f"Invalid timestamp for {metric}, skipping")
                continue

            metric_info = {
                "c_date": metric_date,
                "val": metric_score,
                "tag": metric_rating,
                "metric_value": metric_value,
                "type": metric
            }
            action = {
                "_op_type": "index",
                "_index": index_name,
                "_id": get_unique_id(f"{metric_date}_{metric}"),
                "_source": metric_info
            }
            actions.append(action)
            processed_count += 1

            # Flush in batches to keep bulk requests bounded.
            if len(actions) >= 5000:
                try:
                    helpers.bulk(es_client, actions)
                    logger.info(f"Processed {processed_count} records")
                    actions.clear()
                    time.sleep(0.5)
                except Exception as e:
                    logger.error(f"Bulk indexing failed: {e}")
                    time.sleep(2)

        # Flush whatever is left.
        if actions:
            try:
                helpers.bulk(es_client, actions)
                logger.info(f"Processed {processed_count} records")
            except Exception as e:
                logger.error(f"Failed to index remaining records: {e}")
    except Exception as e:
        logger.error(f"Data processing failed: {e}")
    return processed_count
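For reference, the parser above only touches a handful of fields. A minimal synthetic payload in the shape it expects looks like this (all values are made up for illustration; calling it for real requires a running Elasticsearch behind es_client):

sample = {
    "fear_and_greed": {
        "score": 55.0, "rating": "greed",
        "timestamp": "2024-05-01T00:00:00+00:00",
        "previous_close": 54.0, "previous_1_week": 50.0,
        "previous_1_month": 45.0, "previous_1_year": 60.0,
    },
    "market_momentum_sp500": {
        "score": 70.0, "rating": "extreme greed",
        "timestamp": "2024-05-01T00:00:00+00:00",
        "data": [{"x": 1714521600000, "y": 5100.0}],
    },
    # ...the remaining sub-metrics follow the same shape
}
process_fear_greed_data(sample, es_client)  # -> 2 (one overall record + one sub-metric; absent metrics are skipped)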
def do():
    session = requests.Session()
    response = None
    try:
        date = datetime.now().strftime("%Y-%m-%d")
        url = f"https://production.dataviz.cnn.io/index/fearandgreed/graphdata/{date}"
        headers = {
            "Accept": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        proxies = {
            "http": "http://127.0.0.1:7890",
            "https": "http://127.0.0.1:7890"
        }
        # Retry transient failures, including 418s from bot detection.
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[418, 429, 500, 502, 503, 504])
        session.mount("https://", HTTPAdapter(max_retries=retries))

        response = session.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()
        data = response.json()
        processed_count = process_fear_greed_data(data, es_client, INDEX_NAME)
        logger.info(f"Processed {processed_count} records in total")
    except requests.exceptions.HTTPError as e:
        if response is not None and response.status_code == 418:
            logger.error("HTTP 418: the server flagged the request as a bot; switch proxies or add a delay before retrying")
        else:
            logger.error(f"Request failed: {e}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error while making the request: {e}")
    finally:
        session.close()
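Finally, a plain entry point makes the collector runnable as a one-shot script; scheduling (cron, APScheduler, and the like) can sit on top of this as needed.

if __name__ == "__main__":
    do()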