按照官方文档的方法安装会遇到少许问题,下面的脚本解决了这些问题:
# Install chaosd into /usr/local/chaosd, put it on PATH and start its HTTP server.
# The steps are chained with && so a failed step stops the rest.
# Create the install directory (-p: succeed when it already exists, so re-runs work)
mkdir -p /usr/local/chaosd &&
# Version to install; "latest" is used here
export CHAOSD_VERSION=latest &&
# Download; note that a User-Agent header has to be sent
curl -fsSL -o chaosd.tar.gz -H 'User-Agent: Mozilla/5.0' \
  "https://mirrors.chaos-mesh.org/chaosd-${CHAOSD_VERSION}-linux-amd64.tar.gz" &&
# Unpack straight into the install directory, dropping the archive's top-level
# chaosd-* folder. This avoids the cd + "mv *" dance, which left the shell in
# an emptied directory and failed when the target already had content.
tar -zxvf chaosd.tar.gz -C /usr/local/chaosd --strip-components=1 &&
export PATH="/usr/local/chaosd:$PATH" &&
# Print the version (the reported version string looks wrong, though)
chaosd version &&
# Start the chaosd server
chaosd server --port 31767
chaosd服务启动后输出内容如下
Chaosd Server Version: version.Info{GitVersion:"v0.0.0-master+$Format:%h$", GitCommit:"$Format:%H$", BuildDate:"2022-05-05T10:22:09Z", GoVersion:"go1.16.2", Compiler:"gc", Platform:"linux/amd64"}
[2022/05/10 19:35:51.379 +08:00] [INFO] [cron.go:183] ["Starting Scheduler"]
[2022/05/10 19:35:51.379 +08:00] [INFO] [server.go:71] ["starting HTTP server"] [address=0.0.0.0:31767]
[GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
[GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
- using env: export GIN_MODE=release
- using code: gin.SetMode(gin.ReleaseMode)
[GIN-debug] GET /api/system/health --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).healthcheck-fm (4 handlers)
[GIN-debug] GET /api/system/version --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).version-fm (4 handlers)
[GIN-debug] GET /api/swagger/*any --> github.com/chaos-mesh/chaosd/pkg/swaggerserver.Handler.func1 (4 handlers)
[GIN-debug] POST /api/attack/process --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createProcessAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/stress --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createStressAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/network --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createNetworkAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/disk --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createDiskAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/clock --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createClockAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/jvm --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createJVMAttack-fm (4 handlers)
[GIN-debug] POST /api/attack/redis --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).createRedisAttack-fm (4 handlers)
[GIN-debug] DELETE /api/attack/:uid --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).recoverAttack-fm (4 handlers)
[GIN-debug] GET /api/experiments/ --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).listExperiments-fm (4 handlers)
[GIN-debug] GET /api/experiments/:uid/runs --> github.com/chaos-mesh/chaosd/pkg/server/httpserver.(*httpServer).listExperimentRuns-fm (4 handlers)
[GIN-debug] Listening and serving HTTP on 0.0.0.0:31767
[2022/05/10 19:35:51.380 +08:00] [INFO] [zapr.go:69] [start]
使用命令 chaosd attack host shutdown 来关闭该主机(这个功能无法通过 HTTP 接口执行),执行后输出如下:
Broadcast message from root@centos (Tue 2022-05-10 19:50:37 CST):
The system is going down for power-off at Tue 2022-05-10 19:51:37 CST!
Attack host successfully, uid: d95bcfe4-4726-4bdf-96fb-cf2ced872faf
使用HTTP来控制
# Address of the host running the chaosd server
export CHAOSD_SERVER=master.kamputer.online

# CPU stress test; the server's reply is kept in CHAOSD_RESPONSE
CHAOSD_RESPONSE=$(
  curl --request POST "${CHAOSD_SERVER}:31767/api/attack/stress" \
    --header "Content-Type:application/json" \
    --data '{"load":60, "action":"cpu","workers":4}'
)
# Observed on a 2-core CPU: no stress-ng process ended up above 50%

# Memory stress test
CHAOSD_RESPONSE=$(
  curl --request POST "${CHAOSD_SERVER}:31767/api/attack/stress" \
    --header "Content-Type:application/json" \
    --data '{ "action":"mem","size":"3GiB"}'
)
# A stress-ng process takes the requested memory and also uses a lot of CPU.
# If it cannot get enough memory (e.g. several stress tests are running),
# a new process is spawned to retry.
# Asking for more than the physical memory raises no error either; querying
# with chaosd on the server still shows the success status.

# Network delay: 1 second, on NIC eth1, for address 192.168.1.101.
# The ip may be omitted but the device is mandatory.
CHAOSD_RESPONSE=$(
  curl --request POST "${CHAOSD_SERVER}:31767/api/attack/network" \
    --header "Content-Type:application/json" \
    --data '{ "action":"delay","latency":"1s","device":"eth1","ip":"192.168.1.101"}'
)

# Network packet loss
CHAOSD_RESPONSE=$(
  curl --request POST "${CHAOSD_SERVER}:31767/api/attack/network" \
    --header "Content-Type:application/json" \
    --data '{ "action":"loss","percent":"30","device":"eth1"}'
)
# Stop the fault: show the saved server reply, pull the attack uid out of it
# and ask the chaosd server to delete (recover) that attack.
printf '%s\n' "$CHAOSD_RESPONSE"
# Quoted so the JSON reaches python unmodified; a reply without a uid yields an
# empty string instead of the literal "None".
CHAOSD_UID=$(printf '%s' "$CHAOSD_RESPONSE" \
  | python3 -c 'import json,sys; print(json.load(sys.stdin).get("uid") or "")')
if [ -n "$CHAOSD_UID" ]; then
  curl -X DELETE "$CHAOSD_SERVER:31767/api/attack/$CHAOSD_UID"
else
  # Do not send DELETE /api/attack/ with a bogus id
  echo 'no uid found in CHAOSD_RESPONSE; nothing to recover' >&2
fi
# Or wrap the two steps into a single command.
# A function is used instead of an alias: bash does not expand aliases in
# non-interactive shells, so an alias would be "command not found" in a script.
# Drop an alias of the same name left over from an earlier session, otherwise
# it would be expanded while the function definition below is being read.
unalias stop_chaosd_attack 2>/dev/null || true

#######################################
# Recover the attack whose reply is stored in CHAOSD_RESPONSE.
# Globals:   CHAOSD_RESPONSE (read) - JSON reply of the attack request
#            CHAOSD_SERVER (read)   - host of the chaosd server
# Arguments: none
# Outputs:   the server's reply to the DELETE request on stdout
# Returns:   curl's status; non-zero when no uid could be extracted
#######################################
stop_chaosd_attack() {
  local uid
  uid=$(printf '%s' "$CHAOSD_RESPONSE" \
    | python3 -c 'import json,sys; print(json.load(sys.stdin).get("uid") or "")') || return
  if [[ -z "$uid" ]]; then
    printf 'stop_chaosd_attack: no uid found in CHAOSD_RESPONSE\n' >&2
    return 1
  fi
  curl -X DELETE "$CHAOSD_SERVER:31767/api/attack/$uid"
}
stop_chaosd_attack
如果不小心没有记录 uid,可以到运行 chaosd 服务的机器上查询;加上 -s success 参数可以只列出正在执行的 attack:
# Run on the chaosd host: look up attacks, keeping only those in the
# "success" state (per these notes, the ones currently in effect)
chaosd search -s success
或者运行下面命令直接关掉
# Recover every attack that chaosd reports as "success".
# awk keeps the rows containing "success" and prints their first column
# (presumably the uid, since it is what gets handed to `chaosd recover`).
# xargs -n 1 runs one `chaosd recover <uid>` per attack instead of passing all
# uids to a single call; -r skips the call entirely when nothing matched.
chaosd search -s success | awk '/success/ {print $1}' | xargs -r -n 1 chaosd recover
使用yaml格式
# Address of the physical machine under test; the chaosd server must be running there
CHAOSD_SERVER=127.0.0.1
# NIC name on that machine, used when simulating network faults
CHAOSD_NODE_DEVICE=eth1
# Address whose network packets are affected
NETWORKCHAOS_TARGET=127.0.0.1
# Kubernetes namespace chaos-mesh is deployed in
CHAOS_NAMESPACE=chaos-testing
export CHAOSD_SERVER CHAOSD_NODE_DEVICE NETWORKCHAOS_TARGET CHAOS_NAMESPACE
# month/day/hour/minute stamp, used as a suffix for the manifest names
name_postfix=$(date +%m%d%H%M)
# CPU stress test: 4 workers with a load setting of 60 (percent), for 30 seconds.
# Writes the PhysicalMachineChaos manifest and applies it to the cluster.
# The YAML nesting is significant: name/namespace belong to metadata, and
# action/address/stress-cpu/duration belong to spec.
# Reads: name_postfix, CHAOS_NAMESPACE, CHAOSD_SERVER (expanded by the heredoc).
cat > physical-stress-cpu.yaml <<EOF
apiVersion: chaos-mesh.org/v1alpha1
kind: PhysicalMachineChaos
metadata:
  name: physical-stress-cpu-${name_postfix}
  namespace: ${CHAOS_NAMESPACE}
spec:
  action: stress-cpu
  address:
    - ${CHAOSD_SERVER}:31767
  stress-cpu:
    load: 60
    workers: 4
  duration: '30s'
EOF
kubectl apply -f physical-stress-cpu.yaml
# Memory stress test: occupy 2GiB of memory for 30 seconds.
# Writes the PhysicalMachineChaos manifest and applies it to the cluster.
# The YAML nesting is significant: name/namespace belong to metadata, and
# action/address/stress-mem/duration belong to spec.
# Reads: name_postfix, CHAOS_NAMESPACE, CHAOSD_SERVER (expanded by the heredoc).
cat > physical-stress-mem.yaml <<EOF
apiVersion: chaos-mesh.org/v1alpha1
kind: PhysicalMachineChaos
metadata:
  name: physical-stress-mem-${name_postfix}
  namespace: ${CHAOS_NAMESPACE}
spec:
  action: stress-mem
  address:
    - ${CHAOSD_SERVER}:31767
  stress-mem:
    size: 2GiB
  duration: '30s'
EOF
kubectl apply -f physical-stress-mem.yaml
# Network packet-loss test: 30% loss for 30 seconds.
# Writes the PhysicalMachineChaos manifest and applies it to the cluster.
# The YAML nesting is significant: name/namespace belong to metadata, and
# action/address/network-loss/duration belong to spec.
# Reads: name_postfix, CHAOS_NAMESPACE, CHAOSD_SERVER, CHAOSD_NODE_DEVICE,
# NETWORKCHAOS_TARGET (all expanded by the heredoc).
cat > physical-network-loss.yaml <<EOF
apiVersion: chaos-mesh.org/v1alpha1
kind: PhysicalMachineChaos
metadata:
  name: physical-network-loss-${name_postfix}
  namespace: ${CHAOS_NAMESPACE}
spec:
  action: network-loss
  address:
    - ${CHAOSD_SERVER}:31767
  network-loss:
    device: ${CHAOSD_NODE_DEVICE}
    ip-address: ${NETWORKCHAOS_TARGET}
    percent: '30'
  duration: '30s'
EOF
kubectl apply -f physical-network-loss.yaml
# Network delay test: 1 second (1000ms) of latency for 30 seconds.
# Writes the PhysicalMachineChaos manifest and applies it to the cluster.
# The YAML nesting is significant: name/namespace belong to metadata, and
# action/address/network-delay/duration belong to spec.
# Reads: name_postfix, CHAOS_NAMESPACE, CHAOSD_SERVER, CHAOSD_NODE_DEVICE,
# NETWORKCHAOS_TARGET (all expanded by the heredoc).
cat > physical-network-delay.yaml <<EOF
apiVersion: chaos-mesh.org/v1alpha1
kind: PhysicalMachineChaos
metadata:
  name: physical-network-delay-${name_postfix}
  namespace: ${CHAOS_NAMESPACE}
spec:
  action: network-delay
  address:
    - ${CHAOSD_SERVER}:31767
  network-delay:
    device: ${CHAOSD_NODE_DEVICE}
    ip-address: ${NETWORKCHAOS_TARGET}
    latency: '1000ms'
  duration: '30s'
EOF
kubectl apply -f physical-network-delay.yaml
# Simulate the node going down. Left commented out: the command runs
# `chaosd attack host shutdown` on the target over SSH, which powers it off.
#ssh $CHAOSD_SERVER "chaosd attack host shutdown"
在dashboard上查看结果
# Forward local port 2333 on all interfaces (0.0.0.0) to the chaos-dashboard service.
# The namespace follows CHAOS_NAMESPACE (where chaos-mesh is deployed) and
# falls back to chaos-testing when that variable is unset or empty.
kubectl port-forward -n "${CHAOS_NAMESPACE:-chaos-testing}" svc/chaos-dashboard 2333:2333 --address='0.0.0.0'
准备token
# Prepare a token for the dashboard: create a ServiceAccount, a ClusterRole
# that can read pods/namespaces and manage all chaos-mesh.org resources, and a
# ClusterRoleBinding tying the two together; then show the account's secret.
# The heredoc delimiter is quoted so the manifest is written literally, and
# the YAML nesting is significant (e.g. name/namespace belong to metadata,
# resources/verbs belong to their rule entry).
cat > rbac.yaml <<'EOF'
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: default
  name: account-cluster-manager-qbit
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: role-cluster-manager-qbit
rules:
- apiGroups: [""]
  resources: ["pods", "namespaces"]
  verbs: ["get", "watch", "list"]
- apiGroups:
  - chaos-mesh.org
  resources: [ "*" ]
  verbs: ["get", "list", "watch", "create", "delete", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: bind-cluster-manager-qbit
subjects:
- kind: ServiceAccount
  name: account-cluster-manager-qbit
  namespace: default
roleRef:
  kind: ClusterRole
  name: role-cluster-manager-qbit
  apiGroup: rbac.authorization.k8s.io
EOF
kubectl apply -f rbac.yaml
# NOTE(review): this relies on a token secret existing for the service
# account under this name (or name prefix) - confirm on the target cluster.
kubectl describe secret account-cluster-manager-qbit