[Linux-HA] Test of HA

Philip Juels pjuels at rics.bwh.harvard.edu
Mon Oct 17 09:54:00 MDT 2005


Hi all,

I've successfully set up a simple two-node v2 active/passive Apache 
cluster.  In order to test failover of a crashed httpd service, I killed 
the httpd daemon and watched the ha.log to see if HA would recover the 
service.  Well, my "test" did not work...killing the daemon (or 
executing a httpd stop) only resulted in a series of errors in the ha.log:

crmd[28210]: 2005/10/17_11:23:02 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op start on group_1:httpd_2
crmd[28210]: 2005/10/17_11:23:03 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op monitor on group_1:httpd_2
crmd[28210]: 2005/10/17_11:27:03 ERROR: mask(lrm.c:do_lrm_event): LRM 
operation (5) monitor on group_1:httpd_2 ERROR: invalid parameter
crmd[28210]: 2005/10/17_11:27:03 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op stop on group_1:httpd_2
crmd[28210]: 2005/10/17_11:27:03 WARN: mask(lrm.c:do_lrm_event): LRM 
operation (5) monitor on group_1:httpd_2 cancelled
crmd[28210]: 2005/10/17_11:27:03 ERROR: mask(lrm.c:do_lrm_event): LRM 
operation (7) stop on group_1:httpd_2 ERROR: unknown error
crmd[28210]: 2005/10/17_11:27:05 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op stop on group_1:httpd_2
crmd[28210]: 2005/10/17_11:27:05 ERROR: mask(lrm.c:do_lrm_event): LRM 
operation (8) stop on group_1:httpd_2 ERROR: unknown error
crmd[28210]: 2005/10/17_11:27:07 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op stop on group_1:httpd_2

These errors continued until I restarted httpd on the primary node, 
after which heartbeat switched the httpd service over to the secondary node:

crmd[28210]: 2005/10/17_11:28:17 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op stop on group_1:httpd_2
crmd[28210]: 2005/10/17_11:28:17 info: mask(lrm.c:do_lrm_rsc_op): 
Performing op stop on group_1:IPaddr_1
crmd[28210]: 2005/10/17_11:28:17 WARN: mask(lrm.c:do_lrm_event): LRM 
operation (3) monitor on group_1:IPaddr_1 cancelled
IPaddr[30012]:  2005/10/17_11:28:17 INFO: /sbin/route -n del -host 
10.252.1.50
IPaddr[30012]:  2005/10/17_11:28:17 INFO: /sbin/ifconfig eth1:0 down
IPaddr[30012]:  2005/10/17_11:28:17 INFO: IP Address 10.252.1.50 released

I can only assume that I did not properly set up the cib.xml file 
(attached below).  What am I doing wrong?  Also, is it possible to set 
heartbeat to restart a crashed service on the primary node first before 
failing over to the secondary?

Thanks,

PJ

--- cib.xml ---
 <cib admin_epoch="0" have_quorum="true" num_peers="1" 
origin="hpcgg-grd1" last_written="Mon Oct 17 11:29:23 2005" 
dc_uuid="306d4c0a-4d7a-43b0-b2e6-fa5ab74ae435" 
debug_source="finalize_join" ccm_transition="3" generated="true" 
epoch="3" num_updates="223" cib_feature_revision="1">
   <configuration>
     <crm_config>
       <nvpair id="transition_idle_timeout" 
name="transition_idle_timeout" value="120s"/>
       <nvpair id="symmetric_cluster" name="symmetric_cluster" 
value="true"/>
       <nvpair id="no_quorum_policy" name="no_quorum_policy" value="stop"/>
       <nvpair id="suppress_cib_writes" name="suppress_cib_writes" 
value="false"/>
     </crm_config>
     <nodes>
       <node id="306d4c0a-4d7a-43b0-b2e6-fa5ab74ae435" 
uname="hpcgg-grd1" type="member"/>
       <node id="8a9fa544-185d-44c5-ae5a-63dab9df49a3" 
uname="hpcgg-grd2" type="member"/>
     </nodes>
     <resources>
       <group id="group_1">
         <primitive class="ocf" id="IPaddr_1" provider="heartbeat" 
type="IPaddr">
           <operations>
             <op id="1" interval="5s" name="monitor" timeout="5s"/>
           </operations>
           <instance_attributes>
             <attributes>
               <nvpair name="ip" value="10.252.1.50" 
id="b66e8aaa-8d9f-4c0b-897b-0fa3d350fea7"/>
             </attributes>
           </instance_attributes>
         </primitive>
         <primitive class="heartbeat" id="httpd_2" provider="heartbeat" 
type="httpd">
           <operations>
             <op id="facb09b3-8cfe-4f8e-829a-e49f10a9b004" 
interval="120s" name="monitor" timeout="60s"/>
           </operations>
         </primitive>
       </group>
     </resources>
     <constraints>
       <rsc_location id="rsc_location_group_1" rsc="group_1">
         <rule id="prefered_location_group_1" score="100">
           <expression attribute="#uname" operation="eq" 
value="hpcgg-grd1" id="ba557d8d-594f-4d06-b78c-eb20c26384da"/>
         </rule>
       </rsc_location>
     </constraints>
   </configuration>
   <status>
     <node_state uname="hpcgg-grd2" in_ccm="false" join="down" 
origin="ghash_update_cib_node" ha="active" crmd="offline" 
expected="down" id="8a9fa544-185d-44c5-ae5a-63dab9df49a3">
       <lrm>
         <lrm_resources>
           <lrm_resource id="group_1:IPaddr_1" type="IPaddr" class="ocf" 
provider="heartbeat" last_op="stop" rsc_state="stopped" rc_code="0" 
op_status="0">
             <lrm_rsc_op id="group_1:IPaddr_1_start_0" operation="start" 
origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="running" call_id="2" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:IPaddr_1_monitor_5000" 
operation="monitor" origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="running" call_id="3" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:IPaddr_1_stop_0" operation="stop" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="stopped" call_id="9" rc_code="0" op_status="0"/>
           </lrm_resource>
           <lrm_resource id="group_1:httpd_2" type="httpd" 
class="heartbeat" provider="heartbeat" last_op="stop" 
rsc_state="stopped" rc_code="0" op_status="0">
             <lrm_rsc_op id="group_1:httpd_2_start_0" operation="start" 
origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="running" call_id="4" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:httpd_2_monitor_120000" 
operation="monitor" origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="running" call_id="5" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:httpd_2_stop_0" operation="stop" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="stopped" call_id="7" rc_code="0" op_status="0"/>
           </lrm_resource>
         </lrm_resources>
       </lrm>
     </node_state>
     <node_state join="member" uname="hpcgg-grd1" ha="active" 
in_ccm="true" crmd="online" expected="member" origin="do_lrm_query" 
id="306d4c0a-4d7a-43b0-b2e6-fa5ab74ae435">
       <lrm>
         <lrm_resources>
           <lrm_resource id="group_1:IPaddr_1" type="IPaddr" class="ocf" 
provider="heartbeat" last_op="monitor" rsc_state="running" rc_code="0" 
op_status="0">
             <lrm_rsc_op id="group_1:IPaddr_1_start_0" operation="start" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="running" call_id="37" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:IPaddr_1_stop_0" operation="stop" 
origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="stopped" call_id="36" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:IPaddr_1_monitor_5000" 
operation="monitor" op_status="0" call_id="38" rc_code="0" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="running"/>
           </lrm_resource>
           <lrm_resource id="group_1:httpd_2" type="httpd" 
class="heartbeat" provider="heartbeat" last_op="monitor" 
rsc_state="running" rc_code="0" op_status="0">
             <lrm_rsc_op id="group_1:httpd_2_start_0" operation="start" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="running" call_id="39" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:httpd_2_stop_0" operation="stop" 
origin="build_active_RAs" 
transition_key="29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
transition_magic="0:29:19b183db-63a9-4a1c-a155-cb299ffc74ef" 
rsc_state="stopped" call_id="34" rc_code="0" op_status="0"/>
             <lrm_rsc_op id="group_1:httpd_2_monitor_120000" 
operation="monitor" op_status="0" call_id="40" rc_code="0" 
origin="do_update_resource" 
transition_key="0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
transition_magic="0:0:06510941-1c87-4449-b0c6-b0bc13c39ab8" 
rsc_state="running"/>
           </lrm_resource>
         </lrm_resources>
       </lrm>
     </node_state>
   </status>
 </cib>





More information about the Linux-HA mailing list