I. OSD scale-out (rack- and bluestore-based)
1. System initialization
#!/bin/bash
sys_init(){
systemctl stop firewalld
systemctl disable firewalld
sed -i 's/^SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config
setenforce 0
systemctl disable NetworkManager
systemctl stop NetworkManager
timedatectl set-timezone Asia/Shanghai
yum -y install ntp ntpdate
ntpdate ntp1.aliyun.com #in production, use several internal NTP servers; clock drift between nodes will break the cluster
systemctl restart ntpd
systemctl enable ntpd
swapoff -a
echo '* soft nofile 65535' >>/etc/security/limits.conf
echo '* hard nofile 65535' >>/etc/security/limits.conf
echo 'kernel.pid_max = 4194303' >>/etc/sysctl.conf
echo 'vm.swappiness = 0' >>/etc/sysctl.conf
sysctl -p
}
ceph_optimization(){
#disable NUMA: https://blog.csdn.net/qq_34065508/article/details/103358812
#tuning reference: https://blog.csdn.net/cybertan/article/details/51131444
#read_ahead: prefetch data into RAM to speed up sequential disk reads
echo "8192" > /sys/block/sda/queue/read_ahead_kb
#I/O scheduler: use noop for SSDs, deadline for SATA/SAS
#echo "deadline" >/sys/block/sd[x]/queue/scheduler
#echo "noop" >/sys/block/sd[x]/queue/scheduler
echo "noop" >/sys/block/sda/queue/scheduler
echo "noop" >/sys/block/sdb/queue/scheduler
echo "deadline" >/sys/block/sdc/queue/scheduler
echo "deadline" >/sys/block/sdd/queue/scheduler
}
yum_config(){
mkdir -p /etc/yum.repos.d/backup-repo
mv /etc/yum.repos.d/*.repo /etc/yum.repos.d/backup-repo
curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
curl -o /etc/yum.repos.d/epel-7.repo http://mirrors.aliyun.com/repo/epel-7.repo
cat >>/etc/yum.repos.d/ceph.repo<<EOF
[Ceph]
name=Ceph packages for x86_64
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/x86_64/
gpgcheck=0
priority=1
[Ceph-noarch]
name=Ceph noarch packages
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/noarch
gpgcheck=0
priority=1
[ceph-source]
name=Ceph source packages
baseurl=http://mirrors.aliyun.com/ceph/rpm-nautilus/el7/SRPMS
gpgcheck=0
priority=1
EOF
yum makecache
yum -y install epel-release
}
main(){
sys_init
ceph_optimization
yum_config
}
main
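A minimal sketch for running this init script on a new node from the admin host (the script path and hostname below are examples, not from the original):
scp init.sh root@ceph04:/root/init.sh      # copy the script to the new node
ssh root@ceph04 'bash /root/init.sh'       # run it remotely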
2. Configure the NTP service
cat /etc/ntp.conf
server ntp.aliyun.com iburst
systemctl restart ntpd
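To confirm the node is actually syncing after restarting ntpd (an optional check, not in the original):
ntpq -p              # the selected upstream server should be marked with '*'
timedatectl status   # verify the time zone and that NTP synchronization is active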
3. Configure name resolution: update the hosts file on every node, or use a shared DNS
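A minimal /etc/hosts sketch; the IP addresses below are placeholders, not from the original:
cat >>/etc/hosts<<EOF
192.168.1.11 ceph01
192.168.1.12 ceph02
192.168.1.13 ceph03
192.168.1.14 ceph04
EOF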
4. Configure bluestore
On the OSD node:
pvcreate /dev/sdb
vgcreate bluestore_ssd /dev/sdb
for i in {c..d};do lvcreate -n blockdb_sd$i -L 2G bluestore_ssd; lvcreate -n blockwal_sd$i -L 2G bluestore_ssd; done
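To verify the volume group and logical volumes were created as expected (optional check):
vgs bluestore_ssd    # the volume group on the SSD
lvs bluestore_ssd    # should list blockdb_sdc/sdd and blockwal_sdc/sdd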
On the ceph admin node (the remaining steps are run from here):
5. Ceph data recovery policy
1> Scenario: fully protect client bandwidth
Completely disable data rebuild and migration:
ceph osd set norebalance
ceph osd set norecover
ceph osd set nobackfill
Re-enable data rebuild and migration during off-peak hours:
ceph osd unset norebalance
ceph osd unset norecover
ceph osd unset nobackfill
2> Scenario: prioritize client bandwidth
Throttle the recovery and backfill I/O bandwidth:
ceph tell osd.* injectargs '--osd-max-backfills 1'
ceph tell osd.* injectargs "--osd_recovery_max_active 1"
ceph tell osd.* injectargs '--osd-recovery-sleep 1'
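While throttled recovery runs, its progress can be watched (an optional check, not part of the original steps):
ceph -s        # shows recovering/backfilling PG counts and recovery vs client I/O
ceph pg stat   # summary of PG states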
Once recovery completes, restore the configuration:
ceph tell osd.* injectargs '--osd-max-backfills 1'
ceph tell osd.* injectargs "--osd_recovery_max_active 3"
ceph tell osd.* injectargs "--osd_recovery_sleep 0"
Default values:
ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok config show | grep -E "osd_max_backfills|osd_recovery_max_active|osd_recovery_max_single_start|osd_recovery_sleep|osd_recovery_op_priority"
"osd_max_backfills": "1",
"osd_recovery_max_active": "3",
"osd_recovery_max_single_start": "1",
"osd_recovery_op_priority": "3",
"osd_recovery_sleep": "0.000000",
6. Set up SSH trust with the admin (master) node
ssh-copy-id root@ceph04
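If the admin node does not have a key pair yet (an assumption), generate one before copying it:
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa   # create a passwordless key pair
ssh root@ceph04 hostname                   # after ssh-copy-id, verify passwordless login works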
7. Push the config file from the ceph admin node
ceph-deploy --overwrite-conf admin ceph04
8. Wipe the data disks
for i in {c..d};do ceph-deploy disk zap ceph04 /dev/sd$i; done
9. Create the OSDs
for i in {c..d};do ceph-deploy osd create ceph04 --data /dev/sd$i --block-db bluestore_ssd/blockdb_sd$i --block-wal bluestore_ssd/blockwal_sd$i; done
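After the loop finishes, the new OSDs can be verified (optional check):
ceph osd tree | grep -A4 ceph04   # the new OSDs should be up and under host ceph04
ceph-volume lvm list              # run on ceph04: shows the data/db/wal devices behind each OSD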
10. Move the host to rack02
ceph osd crush move ceph04 rack=rack02
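If the rack02 bucket does not exist yet in the CRUSH tree (an assumption), create it and attach it to the root first, then move the host:
ceph osd crush add-bucket rack02 rack     # create the rack bucket
ceph osd crush move rack02 root=default   # attach it under the default root
ceph osd crush move ceph04 rack=rack02    # then move the host as above
ceph osd tree                             # confirm the new layout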
II. Replacing a failed disk
1. Find OSDs that are down, check OSD latency (high latency may indicate bad sectors), and check dmesg for I/O errors
ceph osd tree|grep down
ceph osd perf
dmesg|grep I/O
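A SMART check can also help confirm a failing disk (requires the smartmontools package; not part of the original steps):
yum -y install smartmontools
smartctl -a /dev/sdd | grep -iE 'reallocated|pending|uncorrect'   # growing counts suggest a dying disk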
2. Completely disable data rebuild and migration
ceph osd set norebalance
ceph osd set norecover
ceph osd set nobackfill
3. Physically replace the failed disk
4. Remove the failed OSD from the cluster
ceph osd out osd.7
ceph osd crush rm osd.7
ceph osd rm osd.7
ceph auth rm osd.7
Unmount and remove the mount point:
umount -f /var/lib/ceph/osd/ceph-7
rm -rf /var/lib/ceph/osd/ceph-7
For bluestore deployments, the LVM volumes and any partitions left on the disk also need to be cleaned up
# identify the wal device and db device backing this OSD's block.db and block.wal
ceph-volume lvm list
lvremove /dev/bluestore_ssd/blockwal_sdd -y
lvremove /dev/bluestore_ssd/blockdb_sdd -y
lvremove /dev/ceph-07924eaf-0caf-4f1b-b4de-743ce5772211/osd-block-90ef2320-46ea-4826-9c9f-434e785a5db2 -y
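After removing the volumes, confirm nothing is still mapped to the old OSD (optional check):
ceph-volume lvm list   # osd.7 should no longer be listed
lsblk /dev/sdd         # the replacement disk should show no leftover LVM children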
5. Add the new disk
Create block-db and block-wal LVM volumes on the SSD for the replacement disk:
lvcreate -n blockdb_sdd -L 2G bluestore_ssd; lvcreate -n blockwal_sdd -L 2G bluestore_ssd
Wipe the sdd disk:
ceph-deploy disk zap ceph04 /dev/sdd
Create the OSD:
ceph-deploy osd create ceph04 --data /dev/sdd --block-db bluestore_ssd/blockdb_sdd --block-wal bluestore_ssd/blockwal_sdd
6. Re-enable data rebuild and migration
ceph osd unset norebalance
ceph osd unset norecover
ceph osd unset nobackfill
7. Ceph consistency check (resource- and time-intensive; not recommended in production)
for i in `ceph pg dump|grep active+clean|awk '{print $1}'`; do ceph pg deep-scrub ${i};done
III. Managing the Ceph daemons
1. Service management
1> Manage all Ceph services on a node
systemctl status ceph.target
2> Manage services by type
systemctl status ceph-osd.target
systemctl status ceph-mds.target
systemctl status ceph-radosgw.target
systemctl status ceph-mon.target
systemctl status ceph-mgr.target
systemctl status ceph-fuse.target
systemctl status ceph-crash.service
3> Manage individual daemons
systemctl status ceph-osd@0.service
systemctl status ceph-mon@ceph01.service
Other daemons follow the same pattern.
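For example, restarting a single OSD daemon and checking its recent log output (OSD id 0 is just an example):
systemctl restart ceph-osd@0.service
journalctl -u ceph-osd@0.service -n 50 --no-pager   # last 50 log lines of that daemon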
2. Ceph log analysis (see the Ceph logging and debugging documentation)
Debug logging is normally enabled through the Ceph configuration file and takes effect when the cluster starts. If you run into problems while bringing the cluster up, add the debug logging settings to the Ceph configuration file. The Ceph log files can be found under /var/log/ceph (the default location).
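A sketch of temporarily raising the debug level for one OSD at runtime and turning it back down afterwards (the subsystem and levels are examples):
ceph tell osd.0 injectargs '--debug_osd 10/10'   # raise OSD debug logging
# reproduce the problem, then check /var/log/ceph/ceph-osd.0.log
ceph tell osd.0 injectargs '--debug_osd 1/5'     # restore the lower level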
3. Monitoring cluster status
ceph -s
ceph -w
ceph health detail
ceph quorum_status
ceph osd stat
ceph osd df
4. Pool management
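A few commonly used pool commands as a sketch (the pool name rbd and the values are examples):
ceph osd pool ls detail     # list pools with their settings
ceph osd pool get rbd size  # read a single pool parameter
ceph osd pool set rbd size 3
ceph df                     # per-pool usage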
5. Adjusting cluster parameters:
1> Edit ceph.conf and push it to the nodes
2> Set values temporarily through the admin socket (config set), as shown below
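A minimal sketch of both approaches; the hostnames, daemon, parameter, and value are examples:
# 1> edit ceph.conf on the admin node, then push it and restart the affected daemons
ceph-deploy --overwrite-conf config push ceph01 ceph02 ceph03 ceph04
# 2> set a value temporarily through the local admin socket (lost on daemon restart)
ceph daemon osd.0 config set osd_max_backfills 2
ceph daemon osd.0 config get osd_max_backfills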
IV. CRUSH map
1. List the CRUSH rules
ceph osd crush rule ls
2. Show which CRUSH rule the rbd pool uses
ceph osd pool get rbd crush_rule
3. Show the details of a specific rule
ceph osd crush rule dump replicated_rule
4. Edit the CRUSH map
Export the binary CRUSH map:
ceph osd getcrushmap -o crushmap.bin
Decompile the binary file:
crushtool -d crushmap.bin -o crushmap.txt
After editing the CRUSH map as needed, recompile it back into a binary file:
crushtool -c crushmap.txt -o crushmap-new.bin
Apply the new CRUSH map:
ceph osd setcrushmap -i crushmap-new.bin
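Before applying, the new map can be dry-run tested with crushtool (an optional check; rule 0 and 3 replicas are examples):
crushtool -i crushmap-new.bin --test --rule 0 --num-rep 3 --show-statistics
crushtool -i crushmap-new.bin --test --rule 0 --num-rep 3 --show-bad-mappings   # should print nothing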
5. Notes on modifying the CRUSH map
1> Back up crushmap.bin before changing anything, so you can diff against it later and recover from mistakes
2> Large topology changes trigger massive data migration; plan them carefully up front
3> Hierarchy changes made by editing the CRUSH map will be overwritten when the OSDs restart; prevent this by setting osd crush update on start = false, pushing the config to all nodes, and restarting ceph-osd.target (see the sketch below)
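A sketch of the ceph.conf change referenced in 3> (the push command mirrors the one used above; hostnames are examples):
# ceph.conf
[osd]
osd crush update on start = false
# then push: ceph-deploy --overwrite-conf config push ceph01 ceph02 ceph03 ceph04
# and restart ceph-osd.target on each node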