If a mongo instance goes down unexpectedly while a balancing round is in progress, the automatic balancer can be left in a broken state: after the machine is restarted, the balancer's lock state may stay stuck at 2, and new balance requests are rejected because the lock can no longer be taken. The fix is as follows:
Connect to one of the mongos instances and run the following in the mongo shell:
use config;
// stop the balancer so no new round starts while we clean up
db.settings.update({_id:"balancer"}, {$set:{stopped:true}}, true);
// wait until the balancer lock itself is released (state back to 0)
while( db.locks.findOne({_id: "balancer"}).state ) { print("waiting..."); sleep(1000); }
// reset the stuck collection lock; put the namespace reported in the error log
// (here "client.client_browsing") in place of "unbalanced_collection_name"
db.locks.update({_id:"unbalanced_collection_name"}, {$set:{state:0}});
// re-enable the balancer once the lock has been cleared
db.settings.update({_id:"balancer"}, {$set:{stopped:false}}, true);
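Before running the lock reset, it helps to confirm which namespace's distributed lock is actually stuck. A minimal check against the same config database (field names match the lock documents shown in the logs below; substitute your own namespace for the placeholder above):

// list every distributed lock that is not free (state != 0);
// the _id of a stuck entry (e.g. "client.client_browsing") is the value
// to use in place of "unbalanced_collection_name" in the update above
db.locks.find({state: {$ne: 0}}).forEach(function(lock) {
    print(lock._id + "  state=" + lock.state + "  who=" + lock.who + "  why=" + lock.why);
});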
The mongos error log looks like this:
Tue Nov 29 14:43:39 [Balancer] moving chunk ns: client.client_browsing moving ( ns:client.client_browsing at: shard1:repl1/10.168.5.73:27017,10.168.5.74:27017 lastmod: 581|1 min: { _id: BinData } max: { _id: BinData }) shard1:repl1/10.168.5.73:27017,10.168.5.74:27017 -> shard3:repl3/10.168.5.77:27017,10.168.5.78:27017
Tue Nov 29 14:43:39 [Balancer] moveChunk result: { who: { _id: "client.client_browsing", process: "mongo-5-73:27017:1320397784:40341271", state: 2, ts: ObjectId('4eb3ad856fc0726d03471479'), when: new Date(1320398213973), who: "mongo-5-73:27017:1320397784:40341271:conn18:293570853", why: "migrate-{ _id: BinData }" }, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: BinData }", ok: 0.0 }
Tue Nov 29 14:43:39 [Balancer] balancer move failed: { who: { _id: "client.client_browsing", process: "mongo-5-73:27017:1320397784:40341271", state: 2, ts: ObjectId('4eb3ad856fc0726d03471479'), when: new Date(1320398213973), who: "mongo-5-73:27017:1320397784:40341271:conn18:293570853", why: "migrate-{ _id: BinData }" }, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: BinData }", ok: 0.0 } from: shard1 to: shard3 chunk: { _id: "client.client_browsing-_id_BinData(0, 3133313432363336373838643931616261646132373236343134336236363531)", lastmod: Timestamp 581000|1, ns: "client.client_browsing", min: { _id: BinData }, max: { _id: BinData }, shard: "shard1" }
Tue Nov 29 14:43:39 [Balancer] distributed lock 'balancer/mongo-5-70:30000:1320402611:1804289383' unlocked.
Tue Nov 29 14:43:50 [Balancer] distributed lock 'balancer/mongo-5-70:30000:1320402611:1804289383' acquired, ts : 4ed47f2582d8ce34035ba553
Tue Nov 29 14:43:50 [Balancer] chose [shard1] to [shard3] { _id: "client.client_browsing-_id_BinData(0, 3133313432363336373838643931616261646132373236343134336236363531)", lastmod: Timestamp 581000|1, ns: "client.client_browsing", min: { _id: BinData }, max: { _id: BinData }, shard: "shard1" }
Tue Nov 29 14:43:50 [Balancer] moving chunk ns: client.client_browsing moving ( ns:client.client_browsing at: shard1:repl1/10.168.5.73:27017,10.168.5.74:27017 lastmod: 581|1 min: { _id: BinData } max: { _id: BinData }) shard1:repl1/10.168.5.73:27017,10.168.5.74:27017 -> shard3:repl3/10.168.5.77:27017,10.168.5.78:27017
Tue Nov 29 14:43:50 [Balancer] moveChunk result: { who: { _id: "client.client_browsing", process: "mongo-5-73:27017:1320397784:40341271", state: 2, ts: ObjectId('4eb3ad856fc0726d03471479'), when: new Date(1320398213973), who: "mongo-5-73:27017:1320397784:40341271:conn18:293570853", why: "migrate-{ _id: BinData }" }, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: BinData }", ok: 0.0 }
Tue Nov 29 14:43:50 [Balancer] balancer move failed: { who: { _id: "client.client_browsing", process: "mongo-5-73:27017:1320397784:40341271", state: 2, ts: ObjectId('4eb3ad856fc0726d03471479'), when: new Date(1320398213973), who: "mongo-5-73:27017:1320397784:40341271:conn18:293570853", why: "migrate-{ _id: BinData }" }, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: BinData }", ok: 0.0 } from: shard1 to: shard3 chunk: { _id: "client.client_browsing-_id_BinData(0, 3133313432363336373838643931616261646132373236343134336236363531)", lastmod: Timestamp 581000|1, ns: "client.client_browsing", min: { _id: BinData }, max: { _id: BinData }, shard: "shard1" }
Tue Nov 29 14:43:50 [Balancer] distributed lock 'balancer/mongo-5-70:30000:1320402611:1804289383' unlocked.
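The "who" document embedded in the moveChunk errmsg identifies the stale lock holder: its _id is the collection namespace and the process/who fields point at the mongod that held the lock when it went down (mongo-5-73 above). Before resetting it, you can confirm that the same entry in config.locks is still stuck, for example:

// from a mongos, in the config database (namespace taken from the log above)
db.locks.findOne({_id: "client.client_browsing"});
// a stuck entry shows state: 2 and a process/who belonging to the crashed mongod;
// setting its state back to 0 is what releases the migration lock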
The mongod error log looks like this:
1323316497112), what: "moveChunk.from", ns: "client.client_browsing", details: { min: { _id: BinData }, max: { _id: BinData }, step1: 0, note: "aborted" } }
Thu Dec 8 11:55:01 [conn107] received moveChunk request: { moveChunk: "client.client_browsing", from: "repl1/10.168.5.73:27017,10.168.5.74:27017", to: "repl3/10.168.5.77:27017,10.168.5.78:27017", min: { _id: BinData }, max: { _id: BinData }, maxChunkSizeBytes: 209715200, shardId: "client.client_browsing-_id_BinData(0, 3133313432363336373838643931616261646132373236343134336236363531)", configdb: "10.168.5.70:20000,10.168.5.71:20000,10.168.5.72:20000" }
Thu Dec 8 11:55:01 [conn107] created new distributed lock for client.client_browsing on 10.168.5.70:20000,10.168.5.71:20000,10.168.5.72:20000 ( lock timeout : 900000, ping interval : 30000, process : 0 )
Thu Dec 8 11:55:01 [conn107] about to log metadata event: { _id: "mongo-5-73-2011-12-08T03:55:01-209523", server: "mongo-5-73", clientAddr: "10.168.5.71:52944", time: new Date(1323316501307), what: "moveChunk.from", ns: "client.client_browsing", details: { min: { _id: BinData }, max: { _id: BinData }, step1: 0, note: "aborted" } }
Thu Dec 8 11:55:04 [LockPinger] cluster 10.168.5.70:20000,10.168.5.71:20000,10.168.5.72:20000 pinged successfully at Thu Dec 8 11:55:04 2011 by distributed lock pinger '10.168.5.70:20000,10.168.5.71:20000,10.168.5.72:20000/mongo-5-73:27017:1320403511:980896278', sleeping for 30000ms
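Once the stuck lock has been reset and the balancer re-enabled, a quick sanity check (a sketch; the config.changelog collection is where the "about to log metadata event" entries above end up) is to confirm that new moveChunk events no longer end with note: "aborted" and that the balancer lock is released between rounds:

// in the config database on a mongos
db.locks.findOne({_id: "balancer"});   // should go back to state 0 when no round is running
db.changelog.find({what: /moveChunk/}).sort({time: -1}).limit(5).forEach(printjson);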