| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 
 | 方法一:
 # 1. 找到状态为 red 的索引
 curl -X GET "http://192.168.xxx.xx:30092/_cat/indices?v="
 
 # 2.
 状态为 red 的索引是无法对外提供服务的,说明有主分片没有分配到对应的机子上。
 
 找到 UNASSIGNED 状态的分片,_cat/shards 能够看到分片的分配情况
 curl -X GET "http://192.168.xxx.xx:30092/_cat/shards"
 index                            shard prirep state        docs   store   ip             node
 index                      1    p     STARTED     764505 338.6mb 172.xxx.xxx.174 Calypso
 index                      1    r     STARTED     764505 338.6mb 172.xxx.xxx.89  Savage Steel
 index                      2    p     STARTED     763750 336.6mb 172.xxx.xxx.174 Calypso
 index                      2    r     STARTED     763750 336.6mb 172.xxx.xxx.88  Temugin
 index                      3    p     STARTED     764537 340.2mb 172.xxx.xxx.89  Savage Steel
 index                      3    r     STARTED     764537 340.2mb 172.xxx.xxx.88  Temugin
 index                      4    p     STARTED     765476 339.3mb 172.xxx.xxx.89  Savage Steel
 index                      4    r     STARTED     765476 339.3mb 172.xxx.xxx.88  Temugin
 index                      0    p     UNASSIGNED
 index                      0    r     UNASSIGNED
 index 有一个主分片 0 和一个副本分片 0 处于 UNASSIGNED 状态,也就是没有分配到机子上,因为主分片没有分配到机子上,所以状态为 red。
 从 ip 列可以看出一共有三台机子,尾数分别为 174,89 以及 88。index 一共有 10 个分片(5 个主分片加 5 个副本分片),所以对应的 elasticsearch 配置为 index.number_of_shards: 5,index.number_of_replicas: 1。这 10 个分片可以按照 3,3,4 这样分配到三台不同的机子上。88 和 89 机子都已经各分配了 3 个分片,而 174 机子只分配了 2 个,所以可以将未分配的主分片分配到 174 机子上。
 
 # 3. 找出节点的 id,找到 174 机子对应的节点 id,后续重新分配主分片得要用到,174 机子对应的 id 为 Leivp0laTYSqvMVm49SulQ
 curl -X GET "http://172.xxx.xxx.174:9288/_nodes/process?v="
 {
 "cluster_name": "es2.3.2-titan-cl",
 "nodes": {
 "Leivp0laTYSqvMVm49SulQ": {
 "name": "Calypso",
 "transport_address": "172.xxx.xxx.174:9388",
 "host": "172.xxx.xxx.174",
 "ip": "172.xxx.xxx.174",
 "version": "2.3.2",
 "build": "b9e4a6a",
 "http_address": "172.xxx.xxx.174:9288",
 "process": {
 "refresh_interval_in_millis": 1000,
 "id": 32130,
 "mlockall": false
 }
 },
 "EafIS3ByRrm4g-14KmY_wg": {
 "name": "Savage Steel",
 "transport_address": "172.xxx.xxx.89:9388",
 "host": "172.xxx.xxx.89",
 "ip": "172.xxx.xxx.89",
 "version": "2.3.2",
 "build": "b9e4a6a",
 "http_address": "172.xxx.xxx.89:9288",
 "process": {
 "refresh_interval_in_millis": 1000,
 "id": 7560,
 "mlockall": false
 }
 },
 "tojQ9EiXS0m6ZP16N7Ug3A": {
 "name": "Temugin",
 "transport_address": "172.xxx.xxx.88:9388",
 "host": "172.xxx.xxx.88",
 "ip": "172.xxx.xxx.88",
 "version": "2.3.2",
 "build": "b9e4a6a",
 "http_address": "172.xxx.xxx.88:9288",
 "process": {
 "refresh_interval_in_millis": 1000,
 "id": 47701,
 "mlockall": false
 }
 }
 }
 }
 
 # 4. 或者为了简单也可以直接将该主分片放到 master 机子上,但是如果分片过于集中肯定会影响性能,同时会增加宕机后数据丢失的风险,所以建议根据机子目前分片的分布情况重新分配。
 curl -X GET "http://172.xxx.xxx.174:9288/_cat/master?v="
 id                     host          ip            node
 EafIS3ByRrm4g-14KmY_wg 172.xxx.xxx.89 172.xxx.xxx.89 Savage Steel
 
 # 5. 分配 UNASSIGNED 分片到机子
 得要找到 UNASSIGNED 状态的主分片才能够重新分配,如果重新分配的不是 UNASSIGNED 状态的主分片,例如我试图重新分配 shard 1,会出现如下错误
 curl -X POST -d '{
 "commands" : [ {
 "allocate" : {
 "index" : "index",
 "shard" : 1,
 "node" : "EafIS3ByRrm4g-14KmY_wg",
 "allow_primary" : true
 }
 }]
 }' "http://172.xxx.xxx.174:9288/_cluster/reroute"
 
 {
 "error": {
 "root_cause": [
 {
 "type": "remote_transport_exception",
 "reason": "[Savage Steel][172.xxx.xxx.89:9388][cluster:admin/reroute]"
 }
 ],
 "type": "illegal_argument_exception",
 "reason": "[allocate] failed to find [index][1] on the list of unassigned shards"
 },
 "status": 400
 }
 
 # 6. 重新分配 index shard 0 到某一台机子。使用 _cluster/reroute 的参数 allow_primary 得要小心,有概率会导致数据丢失:如果目标节点上没有该分片的数据,会强制分配一个空的主分片。
 curl -X POST -d '{
 "commands" : [ {
 "allocate" : {
 "index" : "index",
 "shard" : 0,
 "node" : "Leivp0laTYSqvMVm49SulQ",
 "allow_primary" : true
 }
 }]
 }' "http://172.xxx.xxx.174:9288/_cluster/reroute"
 
 {
 "acknowledged": true,
 .........
 "index": {
 "shards": {
 "0": [
 {
 "state": "INITIALIZING",
 "primary": true,
 "node": "Leivp0laTYSqvMVm49SulQ",
 "relocating_node": null,
 "shard": 0,
 "index": "index",
 "version": 1,
 "allocation_id": {
 "id": "wk5q0CryQpmworGFalfWQQ"
 },
 "unassigned_info": {
 "reason": "INDEX_CREATED",
 "at": "2017-03-23T12:27:33.405Z",
 "details": "force allocation from previous reason INDEX_REOPENED, null"
 }
 },
 {
 "state": "UNASSIGNED",
 "primary": false,
 "node": null,
 "relocating_node": null,
 "shard": 0,
 "index": "index",
 "version": 1,
 "unassigned_info": {
 "reason": "INDEX_REOPENED",
 "at": "2017-03-23T11:56:25.568Z"
 }
 }
 ]
 }
 }
 .............
 }
 
 # 7. 输出结果只罗列出了关键部分,主分片处于 INITIALIZING 状态,再看看索引的状态
 curl -X GET "http://172.xxx.xxx.174:9288/_cat/indices?v="
 
 green  open   index                          5   1    3058268        97588      2.6gb          1.3gb
 索引状态已经为 green,恢复正常使用。
 
 方法二:
 找一台空的机子,与现有的机子组成集群,由于新机子的加入,未分配的分片将会被重新分配,状态也就会恢复。等集群中所有索引的状态变为 green 就可以关闭新加入的机子。
 
 |