[Linux-HA] R2 Two-node apache cluster with STONITH

Andrew Beekhof beekhof at gmail.com
Fri Mar 9 03:11:43 MST 2007


On 3/8/07, Bjorn Oglefjorn <sys.mailing at gmail.com> wrote:
> Thanks Andrew,
>
> As I said, I'm new to heartbeat2 and the logs are very difficult to
> understand.  I will double check that my ResourceAgent follows the
> guidelines in the documentation.  Do you see anything obvious in my config?

Can both nodes connect to both STONITH devices?

>
> --BO
>
> keepalive       1
> deadtime        20
> warntime        10
> initdead        120
> ping            XX.XX.XX.1
> deadping        10
> auto_failback   off
> node            ldap-1.domain
> node            ldap-2.domain
> ucast           eth0 XX.XX.XX.101
> ucast           eth0 XX.XX.XX.102
> use_logd        yes
> crm             yes
>
>
>  <cib admin_epoch="0" epoch="0" num_updates="0">
>    <configuration>
>      <crm_config>
>        <cluster_property_set id="cib-bootstrap-options">
>          <attributes>
>            <nvpair id="cib-bootstrap-options-transition_idle_timeout"
> name="transition_idle_timeout" value="5min"/>
>            <nvpair id="cib-bootstrap-options-stonith_enabled"
> name="stonith_enabled" value="true"/>
>            <nvpair id="cib-bootstrap-options-default_resource_stickiness"
> name="default_resource_stickiness" value="INFINITY"/>
>            <nvpair id="cib-bootstrap-options-short_resource_names"
> name="short_resource_names" value="true"/>
>            <nvpair
> id="cib-bootstrap-options-default_resource_failure_stickiness"
> name="default_resource_failure_stickiness" value="-INFINITY"/>
>            <nvpair id="cib-bootstrap-options-stonith_action"
> name="stonith_action" value="reboot"/>
>            <nvpair id="cib-bootstrap-options-remove_after_stop"
> name="remove_after_stop" value="false"/>
>            <nvpair id="cib-bootstrap-options-default_action_timeout"
> name="default_action_timeout" value="5s"/>
>        <nvpair id="cib-bootstrap-options-symmetric_cluster"
> name="symmetric_cluster" value="true"/>
>            <nvpair id="cib-bootstrap-options-no_quorum_policy"
> name="no_quorum_policy" value="stop"/>
>            <nvpair id="cib-bootstrap-options-stop_orphan_resources"
> name="stop_orphan_resources" value="true"/>
>            <nvpair id="cib-bootstrap-options-stop_orphan_actions"
> name="stop_orphan_actions" value="true"/>
>            <nvpair id="cib-bootstrap-options-is_managed_default"
> name="is_managed_default" value="true"/>
>          </attributes>
>        </cluster_property_set>
>      </crm_config>
>      <nodes/>
>      <resources>
>        <group id="test_group">
>          <primitive class="ocf" id="test_IP" provider="heartbeat"
> type="IPaddr">
>            <operations>
>              <op id="test_IP_mon" interval="5s" name="monitor" timeout="20s"
> on_fail="fence"/>
>            </operations>
>            <instance_attributes id="test_IP_inst_attr">
>              <attributes>
>                <nvpair id="test_IP_attr_0" name="ip"
> value="192.168.168.168"/>
>                <nvpair id="test_IP_attr_1" name="netmask" value="25"/>
>              </attributes>
>            </instance_attributes>
>          </primitive>
>          <primitive class="lsb" id="httpd" provider="advance" type="httpd">
>            <operations>
>              <op id="httpd_status" interval="5s" name="status" timeout="20s"
> on_fail="fence"/>
>            </operations>
>          </primitive>
>        </group>
>        <clone id="test-1_drac">
>          <primitive id="test-1_drac_DoFencing" class="stonith"
> type="external/drac4" provider="heartbeat">
>            <operations>
>              <op id="test-1_drac_DoFencing_reset" name="reset" timeout="20s"
> prereq="nothing"/>
>            </operations>
>            <instance_attributes id="test-1_drac_DoFencing_inst_attr">
>              <attributes>
>                <nvpair id="test-1_drac_DoFencing_attr_0" name="DRAC_ADDR"
> value="test-1.drac.domain"/>
>                <nvpair id="test-1_drac_DoFencing_attr_1" name="DRAC_LOGIN"
> value="root"/>
>                <nvpair id="test-1_drac_DoFencing_attr_2" name="DRAC_PASSWD"
> value="*****"/>
>              </attributes>
>            </instance_attributes>
>          </primitive>
>        </clone>
>        <clone id="test-2_drac">
>          <primitive id="test-2_drac_DoFencing" class="stonith"
> type="external/drac4" provider="heartbeat">
>            <operations>
>              <op id="test-2_drac_DoFencing_reset" name="reset" timeout="20s"
> prereq="nothing"/>
>            </operations>
>            <instance_attributes id="test-2_drac_DoFencing_inst_attr">
>              <attributes>
>                <nvpair id="test-2_drac_DoFencing_attr_0" name="DRAC_ADDR"
> value="test-2.drac.domain"/>
>                <nvpair id="test-2_drac_DoFencing_attr_1" name="DRAC_LOGIN"
> value="root"/>
>                <nvpair id="test-2_drac_DoFencing_attr_2" name="DRAC_PASSWD"
> value="*****"/>
>              </attributes>
>            </instance_attributes>
>          </primitive>
>        </clone>
>      </resources>
>      <constraints>
>        <rsc_location id="test_group_location" rsc="test_group">
>          <rule id="prefered_location_test_group" score="100">
>            <expression attribute="#uname"
> id="prefered_location_test_group_expr_1" operation="eq"
> value="test-1.domain"/>
>          </rule>
>        </rsc_location>
>        <rsc_location id="test-1_drac_location" rsc="test-1_drac">
>          <rule id="prefered_location_test-1_drac" score="100">
>            <expression attribute="#uname"
> id="prefered_location_test-1_drac_expr_1" operation="eq"
> value="test-1.domain"/>
>          </rule>
>        </rsc_location>
>        <rsc_location id="test-2_drac_location" rsc="test-2_drac">
>          <rule id="prefered_location_test-2_drac" score="100">
>            <expression attribute="#uname"
> id="prefered_location_test-2_drac_expr_1" operation="eq"
> value="test-2.domain"/>
>          </rule>
>        </rsc_location>
>      </constraints>
>    </configuration>
>    <status/>
>  </cib>
>
> On 3/8/07, Andrew Beekhof <beekhof at gmail.com> wrote:
> >
> > It's very unlikely to shoot anything if the STONITH agents can't start.
> >
> > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > unpack_rsc_op:unpack.cProcessing failed op
> > (test-1_drac_DoFencing:0_start_0) for test-1_drac_DoFencing:0 on
> > ldap-1.domain
> >
> > Either the RA is broken or your configuration is.
> >
> > On 3/7/07, Bjorn Oglefjorn <sys.mailing at gmail.com> wrote:
> > > On 3/6/07, Alan Robertson <alanr at unix.sh> wrote:
> > > >
> > > > Bjorn Oglefjorn wrote:
> > > > > Hello,
> > > > >
> > > > > I have tried at length to follow the documentation and peruse this
> > > > > mailing list, but as of yet I am unable to have this work properly.
> > > > > Is there anyone who can provide me with some direction here?
> > > > >
> > > > > The STONITH plugin (external/drac4) is a custom one that I have
> > > > > created.  I've tested it with the /usr/sbin/stonith command and it
> > > > > works as outlined in the documentation.
> > > > >
> > > > > Below are my test config files.  Thanks in advance for any help you
> > > > > all can provide.
> > > >
> > > >
> > > > "unable to have this work properly"...
> > > >
> > > > Could you be a little more specific on exactly what your
> > > > problems/symptoms are?
> > > >
> > > >
> > > > --
> > > >     Alan Robertson <alanr at unix.sh>
> > > >
> > > > "Openness is the foundation and preservative of friendship...  Let me
> > > > claim from you at all times your undisguised opinions." - William
> > > > Wilberforce
> > > > _______________________________________________
> > > > Linux-HA mailing list
> > > > Linux-HA at lists.linux-ha.org
> > > > http://lists.linux-ha.org/mailman/listinfo/linux-ha
> > > > See also: http://linux-ha.org/ReportingProblems
> > > >
> > >
> > > Hello Alan,
> > >
> > > Thanks for the quick reply.  Specifically, I can tell you that my
> > > cluster does seem to start up properly; however, when I simulate node
> > > failure, STONITH does not succeed.  The STONITH process is attempted
> > > over and over again, but is never actually performed (i.e. the failed
> > > node is never shot in the head).
> > >
> > > I'm sorry to say that I'm not really sure what to make of the log
> > > entries, as they seem very obtuse.  What I can tell you is that these
> > > lines are from the host which should have been fenced, which I assume
> > > is wrong.  Here are some excerpts:
> > >
> > > Mar  7 09:30:19 ldap-2 stonithd: [5698]: info: client tengine [pid:
> > 5711]
> > > want a STONITH operation RESET to node ldap-2.domain.
> > > Mar  7 09:30:19 ldap-2 tengine: [5711]: info:
> > > te_fence_node:actions.cExecuting reboot fencing operation (21) on
> > > ldap-2.domain (timeout=2500)
> > > Mar  7 09:30:19 ldap-2 stonithd: [5698]: info: Broadcasting the message
> > > succeeded: require others to stonith node ldap-2.domain.
> > > Mar  7 09:30:19 ldap-2 tengine: [5711]: info:
> > > te_pseudo_action:actions.cPseudo action 12 confirmed
> > > Mar  7 09:30:19 ldap-2 tengine: [5711]: info:
> > > te_pseudo_action:actions.cPseudo action 9 confirmed
> > > Mar  7 09:30:22 ldap-2 stonithd: [5698]: info: Failed to STONITH the
> > node
> > > ldap-2.domain: optype=1, op_result=2
> > > Mar  7 09:30:22 ldap-2 tengine: [5711]: info: tengine_stonith_callback:
> > > callbacks.c call=-173, optype=1, node_name=ldap-2.domain, result=2, nod
> > > e_list=, action=21;175:d1784142-1161-4f9b-8865-731e40b59e13
> > > Mar  7 09:30:22 ldap-2 tengine: [5711]: ERROR: tengine_stonith_callback:
> > > callbacks.c Stonith of ldap-2.domain failed (2)... aborting transition.
> > >
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-1_drac_DoFencing:0_start_0) for
> > > test-1_drac_DoFencing:0 on l
> > > dap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-1_drac_DoFencing:0 on
> > > ldap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-1_drac_DoFencing:1_start_0) for
> > > test-1_drac_DoFencing:1 on l
> > > dap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-1_drac_DoFencing:1 on
> > > ldap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-2_drac_DoFencing:0_start_0) for
> > > test-2_drac_DoFencing:0 on l
> > > dap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-2_drac_DoFencing:0 on
> > > ldap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-2_drac_DoFencing:1_start_0) for
> > > test-2_drac_DoFencing:1 on l
> > > dap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-2_drac_DoFencing:1 on
> > > ldap-1.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info: determine_online_status:
> > > unpack.c Node ldap-2.domain is online
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-1_drac_DoFencing:0_start_0) for
> > > test-1_drac_DoFencing:0 on l
> > > dap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-1_drac_DoFencing:0 on
> > > ldap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op
> > > (test-2_drac_DoFencing:0_start_0) for
> > > test-2_drac_DoFencing:0 on l
> > > dap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-2_drac_DoFencing:0 on
> > > ldap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op (test_IP_monitor_5000) for
> > > test_IP on
> > > ldap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cHandling failed start for
> > > test-2_drac_DoFencing:0 on
> > > ldap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > unpack_rsc_op:unpack.cProcessing failed op (test_IP_monitor_5000) for
> > > test_IP on
> > > ldap-2.domain
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info: Resource Group: test_group
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:     test_IP
> > > (heartbeat::ocf:IPaddr):        Started ldap-2.domain FAILED
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:     httpd (lsb:httpd):
> > > Stopped
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info: Clone Set: test-1_drac
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:
> > > test-1_drac_DoFencing:0       (stonith:external/drac4):       Stopped
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:
> > > test-1_drac_DoFencing:1       (stonith:external/drac4):       Stopped
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info: Clone Set: test-2_drac
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:
> > > test-2_drac_DoFencing:0       (stonith:external/drac4):       Stopped
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info:
> > > test-2_drac_DoFencing:1       (stonith:external/drac4):       Stopped
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > text2task:common.cUnsupported action: status
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: NoRoleChange:native.cMove
> > > resource test_IP    (ldap-2.domain -> ldap-1.domain)
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: Recurring:native.c
> > > ldap-1.domain     test_IP_monitor_5000
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: StartRsc:native.c
> > > ldap-1.domain  Start httpd
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > text2task:common.cUnsupported action: status
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: Recurring:native.c
> > > ldap-1.domain     httpd_status_5000
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN: stage6:allocate.cScheduling
> > > Node ldap-2.domain for STONITH
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN: native_stop_constraints:
> > > native.c Stop of failed resource test_IP is implict after ldap-2.domain
> > > is fenced
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: info: native_stop_constraints:
> > > native.c Re-creating actions for test_group
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: NoRoleChange:native.cMove
> > > resource test_IP    (ldap-2.domain -> ldap-1.domain)
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: Recurring:native.c
> > > ldap-1.domain     test_IP_monitor_5000
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: StartRsc:native.c
> > > ldap-1.domain  Start httpd
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > text2task:common.cUnsupported action: status
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: Recurring:native.c
> > > ldap-1.domain     httpd_status_5000
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > text2task:common.cUnsupported action: status
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > text2task:common.cUnsupported action: status
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: notice: stage8:allocate.cCreated
> > > transition graph 176.
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: WARN:
> > > process_pe_message:pengine.cNo value specified for cluster preference:
> > > pe-error-series-max
> > > Mar  7 09:30:22 ldap-2 pengine: [5712]: ERROR:
> > > process_pe_message:pengine.cTransition 176: ERRORs found during PE
> > > processing. PEngine Input stored in:
> > > /var
> > > /lib/heartbeat/pengine/pe-error-1408.bz2
> > >
> > > --BO
> > > _______________________________________________
> > > Linux-HA mailing list
> > > Linux-HA at lists.linux-ha.org
> > > http://lists.linux-ha.org/mailman/listinfo/linux-ha
> > > See also: http://linux-ha.org/ReportingProblems
> > >
> > _______________________________________________
> > Linux-HA mailing list
> > Linux-HA at lists.linux-ha.org
> > http://lists.linux-ha.org/mailman/listinfo/linux-ha
> > See also: http://linux-ha.org/ReportingProblems
> >
> _______________________________________________
> Linux-HA mailing list
> Linux-HA at lists.linux-ha.org
> http://lists.linux-ha.org/mailman/listinfo/linux-ha
> See also: http://linux-ha.org/ReportingProblems
>


More information about the Linux-HA mailing list