Wednesday, 2 September 2015

DB2 HADR - A Clash of Primaries

I had fun with DB2 this afternoon, following an unplanned restart of BOTH of my DB2 servers.

In essence, neither the primary nor the standby wanted to be ... the primary.

This is what I saw for one of the two databases: -

On original primary

db2pd -hadr -db cognos

Database Member 0 -- Database COGNOS -- Standby -- Up 0 days 00:11:28 -- Date 2015-09-02-18.53.28.860276

                            HADR_ROLE = STANDBY
                          REPLAY_TYPE = PHYSICAL
                        HADR_SYNCMODE = 
                           STANDBY_ID = 0
                        LOG_STREAM_ID = 0
                           HADR_STATE = REMOTE_CATCHUP_PENDING
                           HADR_FLAGS = 
                  PRIMARY_MEMBER_HOST = NULL
                     PRIMARY_INSTANCE = NULL
                       PRIMARY_MEMBER = NULL
                  STANDBY_MEMBER_HOST = bpm856
                     STANDBY_INSTANCE = db2inst1
                       STANDBY_MEMBER = 0
                  HADR_CONNECT_STATUS = DISCONNECTED
             HADR_CONNECT_STATUS_TIME = 02/09/2015 18:53:27.015115 (1441216407)
          HEARTBEAT_INTERVAL(seconds) = 15
                     HEARTBEAT_MISSED = 0
                   HEARTBEAT_EXPECTED = 0
                HADR_TIMEOUT(seconds) = 60
        TIME_SINCE_LAST_RECV(seconds) = 0
             PEER_WAIT_LIMIT(seconds) = 0
           LOG_HADR_WAIT_CUR(seconds) = 0.000
    LOG_HADR_WAIT_RECENT_AVG(seconds) = 0.000000
   LOG_HADR_WAIT_ACCUMULATED(seconds) = 0.000
                  LOG_HADR_WAIT_COUNT = 0
SOCK_SEND_BUF_REQUESTED,ACTUAL(bytes) = 0, 16384
SOCK_RECV_BUF_REQUESTED,ACTUAL(bytes) = 0, 87380
            PRIMARY_LOG_FILE,PAGE,POS = S0000000.LOG, 0, 0
            STANDBY_LOG_FILE,PAGE,POS = S0000004.LOG, 446, 80142794
                  HADR_LOG_GAP(bytes) = 0
     STANDBY_REPLAY_LOG_FILE,PAGE,POS = S0000004.LOG, 446, 80142794
       STANDBY_RECV_REPLAY_GAP(bytes) = 0
                     PRIMARY_LOG_TIME = NULL
                     STANDBY_LOG_TIME = 02/09/2015 09:00:32.000000 (1441180832)
              STANDBY_REPLAY_LOG_TIME = 02/09/2015 09:00:32.000000 (1441180832)
         STANDBY_RECV_BUF_SIZE(pages) = 16
             STANDBY_RECV_BUF_PERCENT = 0
           STANDBY_SPOOL_LIMIT(pages) = 25600
                STANDBY_SPOOL_PERCENT = 0
                   STANDBY_ERROR_TIME = NULL
                 PEER_WINDOW(seconds) = 0
             READS_ON_STANDBY_ENABLED = N


On original standby

db2pd -hadr -db cognos

Database Member 0 -- Database COGNOS -- Standby -- Up 0 days 00:12:52 -- Date 2015-09-02-18.55.07.690822

                            HADR_ROLE = STANDBY
                          REPLAY_TYPE = PHYSICAL
                        HADR_SYNCMODE = 
                           STANDBY_ID = 0
                        LOG_STREAM_ID = 0
                           HADR_STATE = LOCAL_CATCHUP
                           HADR_FLAGS = 
                  PRIMARY_MEMBER_HOST = NULL
                     PRIMARY_INSTANCE = NULL
                       PRIMARY_MEMBER = NULL
                  STANDBY_MEMBER_HOST = db2hadr
                     STANDBY_INSTANCE = db2inst1
                       STANDBY_MEMBER = 0
                  HADR_CONNECT_STATUS = DISCONNECTED
             HADR_CONNECT_STATUS_TIME = 02/09/2015 18:55:06.448385 (1441216506)
          HEARTBEAT_INTERVAL(seconds) = 15
                     HEARTBEAT_MISSED = 0
                   HEARTBEAT_EXPECTED = 0
                HADR_TIMEOUT(seconds) = 60
        TIME_SINCE_LAST_RECV(seconds) = 0
             PEER_WAIT_LIMIT(seconds) = 0
           LOG_HADR_WAIT_CUR(seconds) = 0.000
    LOG_HADR_WAIT_RECENT_AVG(seconds) = 0.000000
   LOG_HADR_WAIT_ACCUMULATED(seconds) = 0.000
                  LOG_HADR_WAIT_COUNT = 0
SOCK_SEND_BUF_REQUESTED,ACTUAL(bytes) = 0, 16384
SOCK_RECV_BUF_REQUESTED,ACTUAL(bytes) = 0, 87380
            PRIMARY_LOG_FILE,PAGE,POS = S0000000.LOG, 0, 0
            STANDBY_LOG_FILE,PAGE,POS = S0000006.LOG, 18, 86748034
                  HADR_LOG_GAP(bytes) = 0
     STANDBY_REPLAY_LOG_FILE,PAGE,POS = S0000006.LOG, 18, 86748034
       STANDBY_RECV_REPLAY_GAP(bytes) = 0
                     PRIMARY_LOG_TIME = NULL
                     STANDBY_LOG_TIME = 02/09/2015 16:22:27.000000 (1441207347)
              STANDBY_REPLAY_LOG_TIME = 02/09/2015 16:22:27.000000 (1441207347)
         STANDBY_RECV_BUF_SIZE(pages) = 16
             STANDBY_RECV_BUF_PERCENT = 0
           STANDBY_SPOOL_LIMIT(pages) = 25600
                STANDBY_SPOOL_PERCENT = 0
                   STANDBY_ERROR_TIME = NULL
                 PEER_WINDOW(seconds) = 0
             READS_ON_STANDBY_ENABLED = N

with messages such as: -

2015-09-02-18.53.02.766411+060 I15074291E494         LEVEL: Info
PID     : 2497                 TID : 139896769472256 PROC : db2sysc 0
INSTANCE: db2inst1             NODE : 000            DB   : COGNOS  
HOSTNAME: bpm856.uk.ibm.com
EDUID   : 45                   EDUNAME: db2hadrs.0.0 (COGNOS) 0
FUNCTION: DB2 UDB, High Availability Disaster Recovery, hdrSendHsMsgNoDefer, probe:30539
DATA #1 : <preformatted>
A HDR_MSG_NOTPRIMARY message was sent to db2hadr:cognos_hadr (192.168.33.100:60009)

2015-09-02-18.53.02.769628+060 I15074786E502         LEVEL: Info
PID     : 2497                 TID : 139896769472256 PROC : db2sysc 0
INSTANCE: db2inst1             NODE : 000            DB   : COGNOS  
HOSTNAME: bpm856.uk.ibm.com
EDUID   : 45                   EDUNAME: db2hadrs.0.0 (COGNOS) 0
FUNCTION: DB2 UDB, High Availability Disaster Recovery, hdrHandleHsAck, probe:43900
DATA #1 : <preformatted>
Handshake HDR_MSG_NOTPRIMARY message is received from db2hadr:cognos_hadr (192.168.33.100:60009)

2015-09-02-18.53.02.769830+060 I15075289E603         LEVEL: Error
PID     : 2497                 TID : 139896769472256 PROC : db2sysc 0
INSTANCE: db2inst1             NODE : 000            DB   : COGNOS  
HOSTNAME: bpm856.uk.ibm.com
EDUID   : 45                   EDUNAME: db2hadrs.0.0 (COGNOS) 0
FUNCTION: DB2 UDB, High Availability Disaster Recovery, hdrHandleHsAck, probe:43970
MESSAGE : ZRC=0x878001BA=-2021654086=HDR_ZRC_NOT_PRIMARY
          "The database being contacted as primary is not a primary"
DATA #1 : <preformatted>
HADR handshake with db2hadr:cognos_hadr (192.168.33.100:60009) failed.


in db2diag.log.

On "new" primary

db2 deactivate db cognos

DB20000I  The DEACTIVATE DATABASE command completed successfully.

db2 stop hadr on db cognos

DB20000I  The STOP HADR ON DATABASE command completed successfully.

db2 start hadr on db cognos as primary

SQL1117N  A connection to or activation of database "COGNOS" cannot be made because of ROLL-FORWARD PENDING.  SQLSTATE=57019

db2 rollforward database cognos to end of logs and complete

                                 Rollforward Status

 Input database alias                   = cognos
 Number of members have returned status = 1

 Member ID                              = 0
 Rollforward status                     = DB  working
 Next log file to be read               = S0000007.LOG
 Log files processed                    = S0000005.LOG - S0000006.LOG
 Last committed transaction             = 2015-09-02-16.39.40.000000 UTC

DB20000I  The ROLLFORWARD command completed successfully.


db2 start hadr on db cognos as primary

DB20000I  The START HADR ON DATABASE command completed successfully.

On "new" standby

db2 deactivate db cognos

DB20000I  The DEACTIVATE DATABASE command completed successfully.

db2 stop hadr on db cognos

DB20000I  The STOP HADR ON DATABASE command completed successfully.

db2 start hadr on db cognos as standby

DB20000I  The START HADR ON DATABASE command completed successfully.

Now things look clean and green .....

New "primary"

db2pd -hadr -db cognos

Database Member 0 -- Database COGNOS -- Active -- Up 0 days 00:00:54 -- Date 2015-09-02-19.25.29.081627

                            HADR_ROLE = PRIMARY
                          REPLAY_TYPE = PHYSICAL
                        HADR_SYNCMODE = SYNC
                           STANDBY_ID = 1
                        LOG_STREAM_ID = 0
                           HADR_STATE = PEER
                           HADR_FLAGS = 
                  PRIMARY_MEMBER_HOST = db2hadr
                     PRIMARY_INSTANCE = db2inst1
                       PRIMARY_MEMBER = 0
                  STANDBY_MEMBER_HOST = bpm856
                     STANDBY_INSTANCE = db2inst1
                       STANDBY_MEMBER = 0
                  HADR_CONNECT_STATUS = CONNECTED
             HADR_CONNECT_STATUS_TIME = 02/09/2015 19:24:36.897314 (1441218276)
          HEARTBEAT_INTERVAL(seconds) = 15
                     HEARTBEAT_MISSED = 0
                   HEARTBEAT_EXPECTED = 3
                HADR_TIMEOUT(seconds) = 60
        TIME_SINCE_LAST_RECV(seconds) = 8
             PEER_WAIT_LIMIT(seconds) = 0
           LOG_HADR_WAIT_CUR(seconds) = 0.000
    LOG_HADR_WAIT_RECENT_AVG(seconds) = 0.000000
   LOG_HADR_WAIT_ACCUMULATED(seconds) = 0.000
                  LOG_HADR_WAIT_COUNT = 0
SOCK_SEND_BUF_REQUESTED,ACTUAL(bytes) = 0, 19800
SOCK_RECV_BUF_REQUESTED,ACTUAL(bytes) = 0, 87380
            PRIMARY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
            STANDBY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
                  HADR_LOG_GAP(bytes) = 0
     STANDBY_REPLAY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
       STANDBY_RECV_REPLAY_GAP(bytes) = 0
                     PRIMARY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
                     STANDBY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
              STANDBY_REPLAY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
         STANDBY_RECV_BUF_SIZE(pages) = 4298
             STANDBY_RECV_BUF_PERCENT = 0
           STANDBY_SPOOL_LIMIT(pages) = 25600
                STANDBY_SPOOL_PERCENT = 0
                   STANDBY_ERROR_TIME = NULL
                 PEER_WINDOW(seconds) = 120
                      PEER_WINDOW_END = 02/09/2015 19:27:22.000000 (1441218442)
             READS_ON_STANDBY_ENABLED = N


New "standby"

db2pd -hadr -db cognos

Database Member 0 -- Database COGNOS -- Standby -- Up 0 days 00:13:07 -- Date 2015-09-02-19.26.59.577531

                            HADR_ROLE = STANDBY
                          REPLAY_TYPE = PHYSICAL
                        HADR_SYNCMODE = SYNC
                           STANDBY_ID = 0
                        LOG_STREAM_ID = 0
                           HADR_STATE = PEER
                           HADR_FLAGS = 
                  PRIMARY_MEMBER_HOST = db2hadr
                     PRIMARY_INSTANCE = db2inst1
                       PRIMARY_MEMBER = 0
                  STANDBY_MEMBER_HOST = bpm856
                     STANDBY_INSTANCE = db2inst1
                       STANDBY_MEMBER = 0
                  HADR_CONNECT_STATUS = CONNECTED
             HADR_CONNECT_STATUS_TIME = 02/09/2015 19:24:36.898265 (1441218276)
          HEARTBEAT_INTERVAL(seconds) = 15
                     HEARTBEAT_MISSED = 0
                   HEARTBEAT_EXPECTED = 9
                HADR_TIMEOUT(seconds) = 60
        TIME_SINCE_LAST_RECV(seconds) = 8
             PEER_WAIT_LIMIT(seconds) = 0
           LOG_HADR_WAIT_CUR(seconds) = 0.000
    LOG_HADR_WAIT_RECENT_AVG(seconds) = 0.000000
   LOG_HADR_WAIT_ACCUMULATED(seconds) = 0.000
                  LOG_HADR_WAIT_COUNT = 0
SOCK_SEND_BUF_REQUESTED,ACTUAL(bytes) = 0, 19800
SOCK_RECV_BUF_REQUESTED,ACTUAL(bytes) = 0, 87380
            PRIMARY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
            STANDBY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
                  HADR_LOG_GAP(bytes) = 0
     STANDBY_REPLAY_LOG_FILE,PAGE,POS = S0000007.LOG, 0, 90845934
       STANDBY_RECV_REPLAY_GAP(bytes) = 0
                     PRIMARY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
                     STANDBY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
              STANDBY_REPLAY_LOG_TIME = 02/09/2015 17:39:40.000000 (1441211980)
         STANDBY_RECV_BUF_SIZE(pages) = 4298
             STANDBY_RECV_BUF_PERCENT = 0
           STANDBY_SPOOL_LIMIT(pages) = 25600
                STANDBY_SPOOL_PERCENT = 0
                   STANDBY_ERROR_TIME = NULL
                 PEER_WINDOW(seconds) = 120
                      PEER_WINDOW_END = 02/09/2015 19:28:52.000000 (1441218532)
             READS_ON_STANDBY_ENABLED = N


No comments:

Note to self - use kubectl to query images in a pod or deployment

In both cases, we use JSON ... For a deployment, we can do this: - kubectl get deployment foobar --namespace snafu --output jsonpath="{...