[Linux-HA] Takeover not happening when service fails
Andrew Beekhof
beekhof at gmail.com
Fri Jul 11 00:59:02 MDT 2008
Here is your problem:
<nvpair id="cib-bootstrap-options-default-resource-failure-stickiness"
name="default-resource-failure-stickiness" value="0"/>
Grab
http://clusterlabs.org/mw/Image:Configuration_Explained.pdf
And look for the section on "Migration due to failure"
On Jul 11, 2008, at 7:11 AM, Max Deputter wrote:
> Hi Linux-HA guru, I'm running heartbeat 2.1.3 and I have a strange
> problem...
> My setup is as follow. I have two nodes and I want an active-passive
> setup with 2 Nginx load balancers. I tested my setup and it seems to
> work great. When lb1 fails, lb2 takes over and when lb1 comes back,
> lb2 goes back on hot-standby.
> Now when I voluntarily corrupt the nginx load balancer on
> lb1.awakers.biz, it seems that lb1 tries to restart it forver and
> lb2 does not take over, no matter what. I have written my own OCF
> and it seems to work ok:
> [root at lb2 ~]# ocf-tester -n nginx -o ip=127.0.0.1 /usr/lib/ocf/
> resource.d/maxdeputter/nginx
> Beginning tests for /usr/lib/ocf/resource.d/maxdeputter/nginx...
> /usr/lib/ocf/resource.d/maxdeputter/nginx passed all tests
>
> Here's my cib.xml file:
> <cib admin_epoch="0" epoch="1" num_updates="1" generated="false"
> have_quorum="false" ignore_dtd="false" num_peers="0"
> cib_feature_revision="2.0" cib-last-written="Fri Jul 11 13:55:55
> 2008">
> <configuration>
> <crm_config>
> <cluster_property_set id="cib-bootstrap-options">
> <attributes>
> <nvpair id="cib-bootstrap-options-symmetric-cluster"
> name="symmetric-cluster" value="true"/>
> <nvpair id="cib-bootstrap-options-no-quorum-policy"
> name="no-quorum-policy" value="stop"/>
> <nvpair id="cib-bootstrap-options-default-resource-
> stickiness" name="default-resource-stickiness" value="0"/>
> <nvpair id="cib-bootstrap-options-default-resource-failure-
> stickiness" name="default-resource-failure-stickiness" value="0"/>
> <nvpair id="cib-bootstrap-options-stonith-enabled"
> name="stonith-enabled" value="false"/>
> <nvpair id="cib-bootstrap-options-stonith-action"
> name="stonith-action" value="reboot"/>
> <nvpair id="cib-bootstrap-options-startup-fencing"
> name="startup-fencing" value="true"/>
> <nvpair id="cib-bootstrap-options-stop-orphan-resources"
> name="stop-orphan-resources" value="true"/>
> <nvpair id="cib-bootstrap-options-stop-orphan-actions"
> name="stop-orphan-actions" value="true"/>
> <nvpair id="cib-bootstrap-options-remove-after-stop"
> name="remove-after-stop" value="false"/>
> <nvpair id="cib-bootstrap-options-short-resource-names"
> name="short-resource-names" value="true"/>
> <nvpair id="cib-bootstrap-options-transition-idle-timeout"
> name="transition-idle-timeout" value="5min"/>
> <nvpair id="cib-bootstrap-options-default-action-timeout"
> name="default-action-timeout" value="20s"/>
> <nvpair id="cib-bootstrap-options-is-managed-default"
> name="is-managed-default" value="true"/>
> <nvpair id="cib-bootstrap-options-cluster-delay"
> name="cluster-delay" value="60s"/>
> <nvpair id="cib-bootstrap-options-pe-error-series-max"
> name="pe-error-series-max" value="-1"/>
> <nvpair id="cib-bootstrap-options-pe-warn-series-max"
> name="pe-warn-series-max" value="-1"/>
> <nvpair id="cib-bootstrap-options-pe-input-series-max"
> name="pe-input-series-max" value="-1"/>
> <nvpair id="cib-bootstrap-options-dc-version" name="dc-
> version" value="2.1.3-node:
> 552305612591183b1628baa5bc6e903e0f1e26a3"/>
> </attributes>
> </cluster_property_set>
> </crm_config>
> <nodes>
> <node id="197f587d-a76b-456c-9f69-c99275c411db"
> uname="lb1.awakers.biz" type="normal"/>
> <node id="5566227d-e446-4c54-a7f4-cd312e6e8b36"
> uname="lb2.awakers.biz" type="normal"/>
> </nodes>
> <resources>
> <group id="group_1">
> <primitive class="ocf" id="IPaddr_10_0_0_200"
> provider="heartbeat" type="IPaddr">
> <operations>
> <op id="IPaddr_10_0_0_200_mon" interval="5s"
> name="monitor" timeout="5s"/>
> </operations>
> <instance_attributes id="IPaddr_10_0_0_200_inst_attr">
> <attributes>
> <nvpair id="IPaddr_10_0_0_200_attr_0" name="ip"
> value="10.0.0.200"/>
> </attributes>
> </instance_attributes>
> </primitive>
> <primitive class="ocf" id="nginx_2" provider="maxdeputter"
> type="nginx">
> <operations>
> <op id="nginx_2_start" name="start" timeout="5s"/>
> <op id="nginx_2_stop" name="stop" timeout="3s"/>
> <op id="nginx_2_mon" interval="10s" name="monitor"
> timeout="3s">
> <instance_attributes id="monitor_10s">
> <attributes>
> <nvpair id="nginx_2-monitor-ip" name="ip"
> value="127.0.0.1"/>
> </attributes>
> </instance_attributes>
> </op>
> </operations>
> </primitive>
> </group>
> </resources>
> <constraints>
> <rsc_location id="rsc_location_group_1" rsc="group_1">
> <rule id="prefered_location_group_1" score="100">
> <expression attribute="#uname"
> id="prefered_location_group_1_expr" operation="eq"
> value="lb1.awakers.biz"/>
> </rule>
> </rsc_location>
> </constraints>
> </configuration>
> </cib>
>
> Here's the ha.cf
> logfile /var/log/ha-log
> logfacility local0
> keepalive 2
> deadtime 30
> initdead 30
> bcast eth0
> udpport 694
> auto_failback on
> node lb1.awakers.biz
> node lb2.awakers.biz
> crm respawn
>
> The log on lb1
> crmd[16206]: 2008/07/11_13:57:13 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=8, rc=7) complete
> crmd[16206]: 2008/07/11_13:57:13 info: do_lrm_rsc_op: Performing
> op=nginx_2_stop_0 key=3:4:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:13 info: rsc:nginx_2: stop
> crmd[16206]: 2008/07/11_13:57:13 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=8, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:13 info: process_lrm_event: LRM
> operation nginx_2_stop_0 (call=9, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:15 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=9:4:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:15 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[16765]: 2008/07/11_13:57:15 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[16765]: 2008/07/11_13:57:15 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> crmd[16206]: 2008/07/11_13:57:15 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=10, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:17 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_monitor_5000 key=1:4:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:17 info: do_lrm_rsc_op: Performing
> op=nginx_2_start_0 key=10:4:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:17 info: rsc:nginx_2: start
> crmd[16206]: 2008/07/11_13:57:17 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=6, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:17 info: process_lrm_event: LRM
> operation nginx_2_start_0 (call=12, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:17 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=11, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:18 info: do_lrm_rsc_op: Performing
> op=nginx_2_monitor_10000 key=2:4:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:18 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=13, rc=7) complete
> crmd[16206]: 2008/07/11_13:57:19 info: do_lrm_rsc_op: Performing
> op=nginx_2_stop_0 key=3:5:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:19 info: rsc:nginx_2: stop
> crmd[16206]: 2008/07/11_13:57:19 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=13, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:19 info: process_lrm_event: LRM
> operation nginx_2_stop_0 (call=14, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:21 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=9:5:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:21 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[16882]: 2008/07/11_13:57:22 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[16882]: 2008/07/11_13:57:22 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> crmd[16206]: 2008/07/11_13:57:22 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=15, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:23 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_monitor_5000 key=1:5:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:23 info: do_lrm_rsc_op: Performing
> op=nginx_2_start_0 key=10:5:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:23 info: rsc:nginx_2: start
> crmd[16206]: 2008/07/11_13:57:23 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=11, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:23 info: process_lrm_event: LRM
> operation nginx_2_start_0 (call=17, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:23 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=16, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:24 info: do_lrm_rsc_op: Performing
> op=nginx_2_monitor_10000 key=2:5:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:24 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=18, rc=7) complete
> crmd[16206]: 2008/07/11_13:57:25 info: do_lrm_rsc_op: Performing
> op=nginx_2_stop_0 key=3:6:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:25 info: rsc:nginx_2: stop
> crmd[16206]: 2008/07/11_13:57:25 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=18, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:25 info: process_lrm_event: LRM
> operation nginx_2_stop_0 (call=19, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:26 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=9:6:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:26 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[16999]: 2008/07/11_13:57:27 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[16999]: 2008/07/11_13:57:27 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> crmd[16206]: 2008/07/11_13:57:27 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=20, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:27 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_monitor_5000 key=1:6:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:27 info: do_lrm_rsc_op: Performing
> op=nginx_2_start_0 key=10:6:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:27 info: rsc:nginx_2: start
> crmd[16206]: 2008/07/11_13:57:27 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=16, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:27 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=21, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:28 info: process_lrm_event: LRM
> operation nginx_2_start_0 (call=22, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:29 info: do_lrm_rsc_op: Performing
> op=nginx_2_monitor_10000 key=2:6:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:29 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=23, rc=7) complete
> crmd[16206]: 2008/07/11_13:57:31 info: do_lrm_rsc_op: Performing
> op=nginx_2_stop_0 key=3:7:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:31 info: rsc:nginx_2: stop
> crmd[16206]: 2008/07/11_13:57:31 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=23, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:31 info: process_lrm_event: LRM
> operation nginx_2_stop_0 (call=24, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:32 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=9:7:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:32 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[17102]: 2008/07/11_13:57:33 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[17102]: 2008/07/11_13:57:33 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> crmd[16206]: 2008/07/11_13:57:33 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=25, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:33 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_monitor_5000 key=1:7:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:33 info: do_lrm_rsc_op: Performing
> op=nginx_2_start_0 key=10:7:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:33 info: rsc:nginx_2: start
> crmd[16206]: 2008/07/11_13:57:33 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=21, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:34 info: process_lrm_event: LRM
> operation nginx_2_start_0 (call=27, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:34 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=26, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:35 info: do_lrm_rsc_op: Performing
> op=nginx_2_monitor_10000 key=2:7:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> crmd[16206]: 2008/07/11_13:57:35 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=28, rc=7) complete
> crmd[16206]: 2008/07/11_13:57:36 info: do_lrm_rsc_op: Performing
> op=nginx_2_stop_0 key=3:8:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:36 info: rsc:nginx_2: stop
> crmd[16206]: 2008/07/11_13:57:36 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=28, rc=-2) Cancelled
> crmd[16206]: 2008/07/11_13:57:36 info: process_lrm_event: LRM
> operation nginx_2_stop_0 (call=29, rc=0) complete
> crmd[16206]: 2008/07/11_13:57:37 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=9:8:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[16203]: 2008/07/11_13:57:37 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[17221]: 2008/07/11_13:57:38 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[17221]: 2008/07/11_13:57:38 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> crmd[16206]: 2008/07/11_13:57:38 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=30, rc=0) complete
>
> And the on lb2
> tengine[8768]: 2008/07/11_21:29:55 info: te_pseudo_action: Pseudo
> action 12 fired and confirmed
> tengine[8768]: 2008/07/11_21:29:55 info: send_rsc_command:
> Initiating action 2: nginx_2_monitor_10000 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:29:55 info: match_graph_event: Action
> IPaddr_10_0_0_200_monitor_5000 (1) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:29:56 info: status_from_rc: Re-mapping
> op status to LRM_OP_ERROR for rc=7
> tengine[8768]: 2008/07/11_21:29:56 WARN: status_from_rc: Action
> monitor on lb1.awakers.biz failed (target: <null> vs. rc: 7): Error
> tengine[8768]: 2008/07/11_21:29:56 WARN: update_failcount: Updating
> failcount for nginx_2 on 197f587d-a76b-456c-9f69-c99275c411db after
> failed monitor: rc=7
> tengine[8768]: 2008/07/11_21:29:56 info: update_abort_priority:
> Abort priority upgraded to 1
> tengine[8768]: 2008/07/11_21:29:56 info: update_abort_priority:
> Abort action 0 superceeded by 2
> tengine[8768]: 2008/07/11_21:29:56 info: match_graph_event: Action
> nginx_2_monitor_10000 (2) confirmed on lb1.awakers.biz (rc=4)
> tengine[8768]: 2008/07/11_21:29:56 info: run_graph: Transition 8:
> (Complete=10, Pending=0, Fired=0, Skipped=0, Incomplete=0)
> crmd[8636]: 2008/07/11_21:29:56 info: do_state_transition: State
> transition S_TRANSITION_ENGINE -> S_POLICY_ENGINE [ input=I_PE_CALC
> cause=C_IPC_MESSAGE origin=route_message ]
> crmd[8636]: 2008/07/11_21:29:56 info: do_state_transition: All 2
> cluster nodes are eligible to run resources.
> pengine[8769]: 2008/07/11_21:29:56 info: determine_online_status:
> Node lb1.awakers.biz is online
> pengine[8769]: 2008/07/11_21:29:56 WARN: unpack_rsc_op: Processing
> failed op nginx_2_monitor_10000 on lb1.awakers.biz: Error
> pengine[8769]: 2008/07/11_21:29:56 info: determine_online_status:
> Node lb2.awakers.biz is online
> pengine[8769]: 2008/07/11_21:29:56 notice: group_print: Resource
> Group: group_1
> pengine[8769]: 2008/07/11_21:29:56 notice: native_print:
> IPaddr_10_0_0_200 (heartbeat::ocf:IPaddr): Started
> lb1.awakers.biz
> pengine[8769]: 2008/07/11_21:29:56 notice: native_print:
> nginx_2 (maxdeputter::ocf:nginx): Started lb1.awakers.biz
> FAILED
> pengine[8769]: 2008/07/11_21:29:56 notice: NoRoleChange: Leave
> resource IPaddr_10_0_0_200 (lb1.awakers.biz)
> pengine[8769]: 2008/07/11_21:29:56 notice: NoRoleChange: Recover
> resource nginx_2 (lb1.awakers.biz)
> pengine[8769]: 2008/07/11_21:29:56 notice: StopRsc:
> lb1.awakers.biz Stop nginx_2
> pengine[8769]: 2008/07/11_21:29:56 notice: StartRsc:
> lb1.awakers.biz Start nginx_2
> pengine[8769]: 2008/07/11_21:29:56 notice: RecurringOp:
> lb1.awakers.biz nginx_2_monitor_10000
> crmd[8636]: 2008/07/11_21:29:56 info: do_state_transition: State
> transition S_POLICY_ENGINE -> S_TRANSITION_ENGINE
> [ input=I_PE_SUCCESS cause=C_IPC_MESSAGE origin=route_message ]
> pengine[8769]: 2008/07/11_21:29:56 info: process_pe_message:
> Transition 9: PEngine Input stored in: /var/lib/heartbeat/pengine/pe-
> input-64.bz2
> tengine[8768]: 2008/07/11_21:29:56 info: unpack_graph: Unpacked
> transition 9: 10 actions in 10 synapses
> tengine[8768]: 2008/07/11_21:29:57 info: te_pseudo_action: Pseudo
> action 13 fired and confirmed
> tengine[8768]: 2008/07/11_21:29:57 info: send_rsc_command:
> Initiating action 3: nginx_2_stop_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:29:58 info: match_graph_event: Action
> nginx_2_stop_0 (3) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:29:58 info: te_pseudo_action: Pseudo
> action 14 fired and confirmed
> tengine[8768]: 2008/07/11_21:29:58 info: te_pseudo_action: Pseudo
> action 7 fired and confirmed
> tengine[8768]: 2008/07/11_21:29:58 info: te_pseudo_action: Pseudo
> action 11 fired and confirmed
> tengine[8768]: 2008/07/11_21:29:58 info: send_rsc_command:
> Initiating action 9: IPaddr_10_0_0_200_start_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:29:59 info: match_graph_event: Action
> IPaddr_10_0_0_200_start_0 (9) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:29:59 info: send_rsc_command:
> Initiating action 1: IPaddr_10_0_0_200_monitor_5000 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:29:59 info: send_rsc_command:
> Initiating action 10: nginx_2_start_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:00 info: match_graph_event: Action
> IPaddr_10_0_0_200_monitor_5000 (1) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:00 info: match_graph_event: Action
> nginx_2_start_0 (10) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:00 info: te_pseudo_action: Pseudo
> action 12 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:00 info: send_rsc_command:
> Initiating action 2: nginx_2_monitor_10000 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:01 info: status_from_rc: Re-mapping
> op status to LRM_OP_ERROR for rc=7
> tengine[8768]: 2008/07/11_21:30:01 WARN: status_from_rc: Action
> monitor on lb1.awakers.biz failed (target: <null> vs. rc: 7): Error
> tengine[8768]: 2008/07/11_21:30:01 WARN: update_failcount: Updating
> failcount for nginx_2 on 197f587d-a76b-456c-9f69-c99275c411db after
> failed monitor: rc=7
> tengine[8768]: 2008/07/11_21:30:01 info: update_abort_priority:
> Abort priority upgraded to 1
> tengine[8768]: 2008/07/11_21:30:01 info: update_abort_priority:
> Abort action 0 superceeded by 2
> tengine[8768]: 2008/07/11_21:30:01 info: match_graph_event: Action
> nginx_2_monitor_10000 (2) confirmed on lb1.awakers.biz (rc=4)
> tengine[8768]: 2008/07/11_21:30:01 info: run_graph: Transition 9:
> (Complete=10, Pending=0, Fired=0, Skipped=0, Incomplete=0)
> crmd[8636]: 2008/07/11_21:30:01 info: do_state_transition: State
> transition S_TRANSITION_ENGINE -> S_POLICY_ENGINE [ input=I_PE_CALC
> cause=C_IPC_MESSAGE origin=route_message ]
> crmd[8636]: 2008/07/11_21:30:01 info: do_state_transition: All 2
> cluster nodes are eligible to run resources.
> pengine[8769]: 2008/07/11_21:30:01 info: determine_online_status:
> Node lb1.awakers.biz is online
> pengine[8769]: 2008/07/11_21:30:01 WARN: unpack_rsc_op: Processing
> failed op nginx_2_monitor_10000 on lb1.awakers.biz: Error
> pengine[8769]: 2008/07/11_21:30:01 info: determine_online_status:
> Node lb2.awakers.biz is online
> pengine[8769]: 2008/07/11_21:30:01 notice: group_print: Resource
> Group: group_1
> pengine[8769]: 2008/07/11_21:30:01 notice: native_print:
> IPaddr_10_0_0_200 (heartbeat::ocf:IPaddr): Started
> lb1.awakers.biz
> pengine[8769]: 2008/07/11_21:30:01 notice: native_print:
> nginx_2 (maxdeputter::ocf:nginx): Started lb1.awakers.biz
> FAILED
> pengine[8769]: 2008/07/11_21:30:01 notice: NoRoleChange: Leave
> resource IPaddr_10_0_0_200 (lb1.awakers.biz)
> pengine[8769]: 2008/07/11_21:30:01 notice: NoRoleChange: Recover
> resource nginx_2 (lb1.awakers.biz)
> pengine[8769]: 2008/07/11_21:30:01 notice: StopRsc:
> lb1.awakers.biz Stop nginx_2
> pengine[8769]: 2008/07/11_21:30:01 notice: StartRsc:
> lb1.awakers.biz Start nginx_2
> pengine[8769]: 2008/07/11_21:30:01 notice: RecurringOp:
> lb1.awakers.biz nginx_2_monitor_10000
> crmd[8636]: 2008/07/11_21:30:01 info: do_state_transition: State
> transition S_POLICY_ENGINE -> S_TRANSITION_ENGINE
> [ input=I_PE_SUCCESS cause=C_IPC_MESSAGE origin=route_message ]
> pengine[8769]: 2008/07/11_21:30:01 info: process_pe_message:
> Transition 10: PEngine Input stored in: /var/lib/heartbeat/pengine/
> pe-input-65.bz2
> tengine[8768]: 2008/07/11_21:30:01 info: unpack_graph: Unpacked
> transition 10: 10 actions in 10 synapses
> tengine[8768]: 2008/07/11_21:30:01 info: te_pseudo_action: Pseudo
> action 13 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:01 info: send_rsc_command:
> Initiating action 3: nginx_2_stop_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:03 info: match_graph_event: Action
> nginx_2_stop_0 (3) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:03 info: te_pseudo_action: Pseudo
> action 14 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:03 info: te_pseudo_action: Pseudo
> action 7 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:03 info: te_pseudo_action: Pseudo
> action 11 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:03 info: send_rsc_command:
> Initiating action 9: IPaddr_10_0_0_200_start_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:05 info: match_graph_event: Action
> IPaddr_10_0_0_200_start_0 (9) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:05 info: send_rsc_command:
> Initiating action 1: IPaddr_10_0_0_200_monitor_5000 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:05 info: send_rsc_command:
> Initiating action 10: nginx_2_start_0 on lb1.awakers.biz
> crmd[8636]: 2008/07/11_21:30:05 info: handle_shutdown_request:
> Creating shutdown request for lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:05 info: extract_event: Aborting on
> shutdown attribute for 197f587d-a76b-456c-9f69-c99275c411db
> tengine[8768]: 2008/07/11_21:30:05 info: update_abort_priority:
> Abort priority upgraded to 1000000
> tengine[8768]: 2008/07/11_21:30:05 info: update_abort_priority:
> Abort action 0 superceeded by 2
> tengine[8768]: 2008/07/11_21:30:06 info: match_graph_event: Action
> nginx_2_start_0 (10) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:06 info: match_graph_event: Action
> IPaddr_10_0_0_200_monitor_5000 (1) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:06 info: run_graph:
> ====================================================
> tengine[8768]: 2008/07/11_21:30:06 notice: run_graph: Transition 10:
> (Complete=8, Pending=0, Fired=0, Skipped=2, Incomplete=0)
> crmd[8636]: 2008/07/11_21:30:06 info: do_state_transition: State
> transition S_TRANSITION_ENGINE -> S_POLICY_ENGINE [ input=I_PE_CALC
> cause=C_IPC_MESSAGE origin=route_message ]
> crmd[8636]: 2008/07/11_21:30:06 info: do_state_transition: All 2
> cluster nodes are eligible to run resources.
> pengine[8769]: 2008/07/11_21:30:06 info: determine_online_status:
> Node lb1.awakers.biz is shutting down
> pengine[8769]: 2008/07/11_21:30:06 info: determine_online_status:
> Node lb2.awakers.biz is online
> pengine[8769]: 2008/07/11_21:30:06 notice: group_print: Resource
> Group: group_1
> pengine[8769]: 2008/07/11_21:30:06 notice: native_print:
> IPaddr_10_0_0_200 (heartbeat::ocf:IPaddr): Started
> lb1.awakers.biz
> pengine[8769]: 2008/07/11_21:30:06 notice: native_print:
> nginx_2 (maxdeputter::ocf:nginx): Started lb1.awakers.biz
> pengine[8769]: 2008/07/11_21:30:06 notice: NoRoleChange: Move
> resource IPaddr_10_0_0_200 (lb1.awakers.biz -> lb2.awakers.biz)
> pengine[8769]: 2008/07/11_21:30:06 notice: StopRsc:
> lb1.awakers.biz Stop IPaddr_10_0_0_200
> pengine[8769]: 2008/07/11_21:30:06 notice: StartRsc:
> lb2.awakers.biz Start IPaddr_10_0_0_200
> pengine[8769]: 2008/07/11_21:30:06 notice: RecurringOp:
> lb2.awakers.biz IPaddr_10_0_0_200_monitor_5000
> pengine[8769]: 2008/07/11_21:30:06 notice: NoRoleChange: Move
> resource nginx_2 (lb1.awakers.biz -> lb2.awakers.biz)
> pengine[8769]: 2008/07/11_21:30:06 notice: StopRsc:
> lb1.awakers.biz Stop nginx_2
> pengine[8769]: 2008/07/11_21:30:06 notice: StartRsc:
> lb2.awakers.biz Start nginx_2
> pengine[8769]: 2008/07/11_21:30:06 notice: RecurringOp:
> lb2.awakers.biz nginx_2_monitor_10000
> pengine[8769]: 2008/07/11_21:30:06 info: stage6: Scheduling Node
> lb1.awakers.biz for shutdown
> crmd[8636]: 2008/07/11_21:30:06 info: do_state_transition: State
> transition S_POLICY_ENGINE -> S_TRANSITION_ENGINE
> [ input=I_PE_SUCCESS cause=C_IPC_MESSAGE origin=route_message ]
> pengine[8769]: 2008/07/11_21:30:06 info: process_pe_message:
> Transition 11: PEngine Input stored in: /var/lib/heartbeat/pengine/
> pe-input-66.bz2
> tengine[8768]: 2008/07/11_21:30:06 info: unpack_graph: Unpacked
> transition 11: 12 actions in 12 synapses
> tengine[8768]: 2008/07/11_21:30:06 info: te_pseudo_action: Pseudo
> action 14 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:06 info: send_rsc_command:
> Initiating action 9: nginx_2_stop_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:07 info: match_graph_event: Action
> nginx_2_stop_0 (9) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:07 info: send_rsc_command:
> Initiating action 6: IPaddr_10_0_0_200_stop_0 on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:08 info: match_graph_event: Action
> IPaddr_10_0_0_200_stop_0 (6) confirmed on lb1.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:08 info: te_pseudo_action: Pseudo
> action 15 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:08 info: te_pseudo_action: Pseudo
> action 5 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:08 info: te_crm_command: Executing
> crm-event (17): do_shutdown on lb1.awakers.biz
> tengine[8768]: 2008/07/11_21:30:08 info: te_pseudo_action: Pseudo
> action 12 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:08 info: send_rsc_command:
> Initiating action 7: IPaddr_10_0_0_200_start_0 on lb2.awakers.biz
> crmd[8636]: 2008/07/11_21:30:08 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_start_0 key=7:11:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> lrmd[8633]: 2008/07/11_21:30:08 info: rsc:IPaddr_10_0_0_200: start
> IPaddr[9030]: 2008/07/11_21:30:09 INFO: Using calculated nic for
> 10.0.0.200: eth0
> IPaddr[9030]: 2008/07/11_21:30:09 INFO: Using calculated netmask
> for 10.0.0.200: 255.255.255.0
> IPaddr[9030]: 2008/07/11_21:30:09 INFO: eval ifconfig eth0:0
> 10.0.0.200 netmask 255.255.255.0 broadcast 10.0.0.255
> crmd[8636]: 2008/07/11_21:30:09 notice: crmd_client_status_callback:
> Status update: Client lb1.awakers.biz/crmd now has status [offline]
> crmd[8636]: 2008/07/11_21:30:09 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_start_0 (call=11, rc=0) complete
> tengine[8768]: 2008/07/11_21:30:09 info: match_graph_event: Action
> IPaddr_10_0_0_200_start_0 (7) confirmed on lb2.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:09 info: send_rsc_command:
> Initiating action 8: IPaddr_10_0_0_200_monitor_5000 on lb2.awakers.biz
> tengine[8768]: 2008/07/11_21:30:09 info: send_rsc_command:
> Initiating action 10: nginx_2_start_0 on lb2.awakers.biz
> crmd[8636]: 2008/07/11_21:30:09 info: do_lrm_rsc_op: Performing
> op=IPaddr_10_0_0_200_monitor_5000 key=8:11:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[8636]: 2008/07/11_21:30:09 info: do_lrm_rsc_op: Performing
> op=nginx_2_start_0 key=10:11:7a57ef5a-9658-4683-b3cf-6c3cd58c5521)
> lrmd[8633]: 2008/07/11_21:30:09 info: rsc:nginx_2: start
> crmd[8636]: 2008/07/11_21:30:10 info: process_lrm_event: LRM
> operation IPaddr_10_0_0_200_monitor_5000 (call=12, rc=0) complete
> tengine[8768]: 2008/07/11_21:30:10 info: match_graph_event: Action
> IPaddr_10_0_0_200_monitor_5000 (8) confirmed on lb2.awakers.biz (rc=0)
> crmd[8636]: 2008/07/11_21:30:10 info: process_lrm_event: LRM
> operation nginx_2_start_0 (call=13, rc=0) complete
> tengine[8768]: 2008/07/11_21:30:10 info: match_graph_event: Action
> nginx_2_start_0 (10) confirmed on lb2.awakers.biz (rc=0)
> tengine[8768]: 2008/07/11_21:30:10 info: te_pseudo_action: Pseudo
> action 13 fired and confirmed
> tengine[8768]: 2008/07/11_21:30:10 info: send_rsc_command:
> Initiating action 11: nginx_2_monitor_10000 on lb2.awakers.biz
> crmd[8636]: 2008/07/11_21:30:10 info: do_lrm_rsc_op: Performing
> op=nginx_2_monitor_10000 key=11:11:7a57ef5a-9658-4683-
> b3cf-6c3cd58c5521)
> crmd[8636]: 2008/07/11_21:30:10 info: process_lrm_event: LRM
> operation nginx_2_monitor_10000 (call=14, rc=0) complete
> tengine[8768]: 2008/07/11_21:30:10 info: match_graph_event: Action
> nginx_2_monitor_10000 (11) confirmed on lb2.awakers.biz (rc=0)
> cib[8632]: 2008/07/11_21:30:10 info: cib_process_shutdown_req:
> Shutdown REQ from lb1.awakers.biz
> cib[8632]: 2008/07/11_21:30:12 info: sync_our_cib: Syncing CIB to
> lb1.awakers.biz
> cib[8632]: 2008/07/11_21:30:12 info: cib_client_status_callback:
> Status update: Client lb1.awakers.biz/cib now has status [leave]
> ccm[8631]: 2008/07/11_21:30:12 info: Break tie for 2 nodes cluster
> crmd[8636]: 2008/07/11_21:30:12 info: mem_handle_event: Got an event
> OC_EV_MS_INVALID from ccm
> crmd[8636]: 2008/07/11_21:30:12 info: mem_handle_event: no mbr_track
> info
> crmd[8636]: 2008/07/11_21:30:12 info: mem_handle_event: Got an event
> OC_EV_MS_NEW_MEMBERSHIP from ccm
> crmd[8636]: 2008/07/11_21:30:12 info: mem_handle_event: instance=5,
> nodes=1, new=0, lost=1, n_idx=0, new_idx=1, old_idx=3
> crmd[8636]: 2008/07/11_21:30:12 info: crmd_ccm_msg_callback: Quorum
> (re)attained after event=NEW MEMBERSHIP (id=5)
> crmd[8636]: 2008/07/11_21:30:12 info: ccm_event_detail: NEW
> MEMBERSHIP: trans=5, nodes=1, new=0, lost=1 n_idx=0, new_idx=1,
> old_idx=3
> cib[8632]: 2008/07/11_21:30:12 info: mem_handle_event: Got an event
> OC_EV_MS_INVALID from ccm
> cib[8632]: 2008/07/11_21:30:12 info: mem_handle_event: no mbr_track
> info
> cib[8632]: 2008/07/11_21:30:12 info: mem_handle_event: Got an event
> OC_EV_MS_NEW_MEMBERSHIP from ccm
> cib[8632]: 2008/07/11_21:30:12 info: mem_handle_event: instance=5,
> nodes=1, new=0, lost=1, n_idx=0, new_idx=1, old_idx=3
> cib[8632]: 2008/07/11_21:30:12 info: cib_ccm_msg_callback: LOST:
> lb1.awakers.biz
> cib[8632]: 2008/07/11_21:30:12 info: cib_ccm_msg_callback: PEER:
> lb2.awakers.biz
> crmd[8636]: 2008/07/11_21:30:12 info: ccm_event_detail:
> CURRENT: lb2.awakers.biz [nodeid=1, born=5]
> tengine[8768]: 2008/07/11_21:30:12 info: run_graph: Transition 11:
> (Complete=12, Pending=0, Fired=0, Skipped=0, Incomplete=0)
> crmd[8636]: 2008/07/11_21:30:12 info: ccm_event_detail:
> LOST: lb1.awakers.biz [nodeid=0, born=4]
> tengine[8768]: 2008/07/11_21:30:12 info: notify_crmd: Transition 11
> status: te_complete - <null>
> crmd[8636]: 2008/07/11_21:30:12 info: do_state_transition: State
> transition S_TRANSITION_ENGINE -> S_IDLE [ input=I_TE_SUCCESS
> cause=C_IPC_MESSAGE origin=route_message ]
> heartbeat[8623]: 2008/07/11_21:30:43 WARN: node lb1.awakers.biz: is
> dead
> heartbeat[8623]: 2008/07/11_21:30:43 info: Link lb1.awakers.biz:eth0
> dead.
> crmd[8636]: 2008/07/11_21:30:43 notice: crmd_ha_status_callback:
> Status update: Node lb1.awakers.biz now has status [dead]
> cib[8632]: 2008/07/11_21:36:41 info: cib_stats: Processed 165
> operations (4848.00us average, 0% utilization) in the last 10min
>
>
>
>
> I think this line might be the problem:
> tengine[8768]: 2008/07/11_21:30:01 WARN: status_from_rc: Action
> monitor on lb1.awakers.biz failed (target: <null> vs. rc: 7): Error
> I googled it and the only answer I found was that the service was
> already running on the local machine. I can assure you that the
> service is not running.
>
> any ideas?
>
> Max
> _______________________________________________
> Linux-HA mailing list
> Linux-HA at lists.linux-ha.org
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
More information about the Linux-HA
mailing list