需要编写一段在AWS云上运行的Data Pipeline,它需要访问企业内部的API获取JSON格式的数据。企业有网关和防火墙,API使用公司的Okta身份认证,通过公司的域账号授权访问。现在需要创建一个专用的域账号,让Data Pipeline访问AWS Secrets Manager获取账号密码,然后通过配置访问公司内部API的数据。请写出所有的开发配置步骤,以及完成这一功能的全部Python源代码,需要确保安全性和可靠性。
该方案通过多层安全防护(网络隔离、最小权限、凭证加密)、完善的错误处理机制(自动令牌刷新、智能重试)和全面的监控告警体系,确保数据管道在企业安全策略下可靠运行。建议根据实际API规格调整参数验证和数据处理逻辑。
一、基础设施配置步骤
- 创建专用域账号
- 在企业Okta管理控制台创建Service Account
- 分配最小必要API访问权限
- 生成Client Credentials (client_id/client_secret)
- 网络连接配置
# 企业防火墙需开放以下访问:
Source: AWS VPC CIDR (或NAT Gateway IP)
Destination: 内部API端点
Port: 443
Protocol: HTTPS
# 推荐使用AWS Direct Connect或VPN建立私有连接
- AWS Secrets Manager配置
# Store the Okta domain, client credentials and API endpoint as a single JSON secret.
# NOTE(review): the values below are placeholders; a real client_secret passed inline
# via --secret-string may end up in shell history — consider file:// input instead.
aws secretsmanager create-secret --name prod/okta_api_creds \
--description "Okta Service Account for Data Pipeline" \
--secret-string '{
"okta_domain": "yourcompany.okta.com",
"client_id": "xxxxxxxxxx",
"client_secret": "yyyyyyyyyy",
"api_endpoint": "https://internal-api.yourcompany.com/v1/data"
}'
- IAM角色配置(CDK示例)
from aws_cdk import (
aws_iam as iam,
aws_secretsmanager as sm
)
# Execution role assumed by AWS Glue when it runs the data pipeline.
pipeline_role = iam.Role(
    scope=self,
    id="DataPipelineRole",
    assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
    managed_policies=[
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "service-role/AWSGlueServiceRole"
        )
    ],
)

# Import the existing secret by name and let CDK build the read grant.
# A secret imported by name only carries a partial ARN (without the random
# suffix Secrets Manager appends), so placing `secret_arn` directly in a
# policy statement would not match the real secret. grant_read() allows
# GetSecretValue/DescribeSecret and adds the suffix wildcard itself.
okta_secret = sm.Secret.from_secret_name_v2(
    self, "OktaCredsSecret", "prod/okta_api_creds"
)
okta_secret.grant_read(pipeline_role)
二、Python数据管道实现
import json
import logging
import os
from datetime import datetime, timedelta, timezone

import boto3
import requests
from botocore.exceptions import ClientError
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Logging configuration: module-level logger that emits INFO and above
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class SecureAPIClient:
    """Client for the internal API, authenticated with Okta client credentials.

    The Okta domain, client id/secret and API endpoint are read from AWS
    Secrets Manager (secret ``prod/okta_api_creds``). The credentials are
    exchanged for an OAuth2 access token, which is cached on the instance
    and refreshed shortly before it expires.
    """

    # Refresh a cached token this many seconds before its reported expiry.
    _EXPIRY_MARGIN_SECONDS = 300

    def __init__(self):
        """Create the HTTP session (with retries) and an empty token cache."""
        self.secret_name = "prod/okta_api_creds"
        self.region_name = os.getenv("AWS_REGION", "us-west-2")
        self.session = requests.Session()
        # Automatically retry transient 5xx responses with backoff.
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=frozenset(['POST', 'GET']),
        )
        self.session.mount('https://', HTTPAdapter(max_retries=retry))
        # Token cache, filled by _get_access_token().
        self._access_token = None
        self._token_expiry = None

    def _get_secrets(self):
        """Fetch and decode the credential secret from Secrets Manager.

        Returns:
            The secret's JSON document as a dict.

        Raises:
            botocore.exceptions.ClientError: If the secret cannot be read.
        """
        try:
            client = boto3.client(
                service_name='secretsmanager',
                region_name=self.region_name,
            )
            response = client.get_secret_value(SecretId=self.secret_name)
            return json.loads(response['SecretString'])
        except ClientError as e:
            logger.error(f"Secret retrieval failed: {e}")
            raise

    def _get_access_token(self, secrets=None):
        """Return a valid Okta access token, requesting a new one if needed.

        Args:
            secrets: Optional already-fetched secret dict; when omitted the
                secret is read from Secrets Manager.

        Returns:
            The bearer token string.

        Raises:
            requests.exceptions.RequestException: If the token request fails.
        """
        now = datetime.now(timezone.utc)
        if (
            self._access_token
            and self._token_expiry is not None
            and now < self._token_expiry
        ):
            return self._access_token

        if secrets is None:
            secrets = self._get_secrets()
        # NOTE(review): custom scopes such as "api.access" are normally issued
        # by an Okta custom authorization server
        # (/oauth2/<serverId>/v1/token) — confirm this URL for your org.
        token_url = f"https://{secrets['okta_domain']}/oauth2/v1/token"
        data = {
            "grant_type": "client_credentials",
            "client_id": secrets["client_id"],
            "client_secret": secrets["client_secret"],
            "scope": "api.access",
        }
        try:
            response = self.session.post(
                token_url,
                data=data,
                timeout=10,
                headers={"Content-Type": "application/x-www-form-urlencoded"},
            )
            response.raise_for_status()
            token_data = response.json()
            self._access_token = token_data['access_token']
            # Expire the cache a little early, but never by more than half
            # the lifetime: otherwise a token valid for <= 5 minutes would be
            # treated as expired the moment it is issued.
            lifetime = int(token_data['expires_in'])
            margin = min(self._EXPIRY_MARGIN_SECONDS, lifetime // 2)
            self._token_expiry = datetime.now(timezone.utc) + timedelta(
                seconds=max(lifetime - margin, 0))
            return self._access_token
        except requests.exceptions.RequestException as e:
            logger.error(f"Token request failed: {e}")
            raise

    def fetch_api_data(self, params=None):
        """Call the internal API and return its decoded JSON response.

        A 401 from the API triggers exactly one token refresh and retry.
        Failures of the token request itself (including a 401 for invalid
        client credentials) are not retried here and propagate to the caller.

        Args:
            params: Optional dict of query-string parameters.

        Returns:
            The decoded JSON body of the API response.

        Raises:
            botocore.exceptions.ClientError: If the secret cannot be read.
            requests.exceptions.RequestException: If the token or API
                request fails.
        """
        secrets = self._get_secrets()
        for attempt in range(2):
            # Obtained outside the try below so that a rejected token request
            # is never mistaken for an expired API token.
            token = self._get_access_token(secrets)
            headers = {
                "Authorization": f"Bearer {token}",
                "Accept": "application/json",
            }
            try:
                response = self.session.get(
                    secrets["api_endpoint"],
                    headers=headers,
                    params=params,
                    timeout=15,
                )
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                status = (
                    e.response.status_code if e.response is not None else None
                )
                if status == 401 and attempt == 0:
                    logger.warning("Token expired, attempting refresh...")
                    self._access_token = None  # force a token refresh
                    continue
                logger.error(f"API request failed: {e}")
                raise
        # Not reached: the second attempt always returns or raises.
# Usage example
def handler(event, context):
    """Entry point: fetch one date range of data from the internal API.

    Args:
        event: Invocation payload. When it is a dict, its optional
            ``startDate`` / ``endDate`` keys select the range to fetch;
            otherwise the sample range 2023-01-01 to 2023-01-31 is used.
        context: Runtime context object; not used.

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body``.

    Raises:
        Exception: Any failure is logged and re-raised to the caller.
    """
    payload = event if isinstance(event, dict) else {}
    params = {
        "startDate": payload.get("startDate", "2023-01-01"),
        "endDate": payload.get("endDate", "2023-01-31"),
    }
    client = SecureAPIClient()
    try:
        data = client.fetch_api_data(params=params)
        # Add data processing and storage logic here
        return {
            "statusCode": 200,
            "body": json.dumps({"message": "Data processed successfully"}),
        }
    except Exception as e:
        logger.error(f"Pipeline execution failed: {e}")
        raise


if __name__ == "__main__":
    # Local test: run the handler once with no event or context
    print(handler(None, None))
三、安全增强措施
- Secrets Manager加密
# Set the KMS key used to encrypt the secret.
# NOTE(review): alias/aws/secretsmanager looks like the AWS managed default key;
# pass a customer managed key here if tighter key control is wanted — confirm.
aws secretsmanager update-secret --secret-id prod/okta_api_creds \
--kms-key-id alias/aws/secretsmanager
- 网络传输安全
# Force TLS certificate verification in code
self.session.verify = True # enabled by default
# NOTE(review): this CA bundle path is the Debian/Ubuntu location — confirm it exists in the target runtime image
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
- IAM策略细化示例(JSON不支持注释,说明移至此处:192.168.1.0/24 为VPC CIDR,52.32.0.0/11 为AWS区域IP范围;账号ID与密钥ARN后缀为占位符,需替换为实际值。注意:aws:SourceIp 只匹配请求方的公网IP,经VPC终端节点访问时应改用 aws:SourceVpce 或 aws:VpcSourceIp)
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"secretsmanager:GetSecretValue",
"secretsmanager:DescribeSecret"
],
"Resource": "arn:aws:secretsmanager:us-west-2:1234567890:secret:prod/okta_api_creds-xxxxx",
"Condition": {
"IpAddress": {
"aws:SourceIp": [
"192.168.1.0/24",
"52.32.0.0/11"
]
}
}
}
]
}
四、监控与可靠性保障
- CloudWatch监控配置
# Create an alarm on the custom APISuccessRate metric: it fires when the
# 5-minute average stays below 95 for 3 consecutive periods and notifies the SNS topic.
# NOTE(review): no visible pipeline code publishes this metric — confirm it is emitted elsewhere.
aws cloudwatch put-metric-alarm \
--alarm-name APISuccessRate \
--metric-name "APISuccessRate" \
--namespace "Custom" \
--statistic Average \
--period 300 \
--threshold 95 \
--comparison-operator LessThanThreshold \
--evaluation-periods 3 \
--alarm-actions arn:aws:sns:us-west-2:1234567890:DataPipelineAlerts
- 重试机制增强
# Use exponential backoff; also retries HTTP 429 and honors the Retry-After header
retry = Retry(
total=5,
backoff_factor=0.5,
status_forcelist=[429, 500, 502, 503, 504],
allowed_methods=frozenset(['GET', 'POST']),
respect_retry_after_header=True
)
五、部署与维护
- CI/CD管道示例(AWS CodePipeline)
# CodePipeline stages: a CodeCommit source stage followed by a Lambda stage.
# NOTE(review): UserParameters is a setting of the Lambda Invoke action; confirm
# whether the second action should use category Invoke rather than Deploy.
Stages:
  - Name: Source
    Actions:
      - Name: SourceAction
        ActionTypeId:
          category: Source
          owner: AWS
          version: '1'
          provider: CodeCommit
        Configuration:
          RepositoryName: data-pipeline
          BranchName: main
  - Name: Deployment
    Actions:
      - Name: DeployLambda
        ActionTypeId:
          category: Deploy
          owner: AWS
          version: '1'
          provider: Lambda
        Configuration:
          FunctionName: DataPipelineFunction
          UserParameters: "prod"