Do not undertake a clustered storage solution lightly. There are good reasons why vendors, such as Netapp and EMC, charge top dollar for their solutions. These vendors have tightly integrated their products to efficiently do what they do well. But, if done right, NFS clustering on Linux works and can be reliable too.
I know myself. This document will contain spelling and grammar mistakes.
Connect a crossover cable between both servers and configure your network interfaces.
Host1
node1:~ # ifconfig -a
eth0 Link encap:Ethernet HWaddr 00:0C:29:E2:23:BC
inet addr:192.168.0.14 Bcast:192.168.221.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fee2:23bc/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:2045217 errors:0 dropped:0 overruns:0 frame:0
TX packets:1860970 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:815526851 (777.7 Mb) TX bytes:333011616 (317.5 Mb)
Base address:0x1070 Memory:f4820000-f4840000
eth1 Link encap:Ethernet HWaddr 00:0C:29:E2:23:C6
inet addr:192.168.254.1 Bcast:192.168.254.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fee2:23c6/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:1927056 errors:0 dropped:0 overruns:0 frame:0
TX packets:1349586 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:1052573296 (1003.8 Mb) TX bytes:1887852059 (1800.3 Mb)
Base address:0x1078 Memory:f4840000-f4860000
Host2
node2:~ # ifconfig -a
eth0 Link encap:Ethernet HWaddr 00:0C:29:C9:A8:0A
inet addr:192.168.0.15 Bcast:192.168.221.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fec9:a80a/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:24052857 errors:0 dropped:0 overruns:0 frame:0
TX packets:24081678 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:2816432080 (2685.9 Mb) TX bytes:2771853252 (2643.4 Mb)
Base address:0x1400 Memory:f4820000-f4840000
eth1 Link encap:Ethernet HWaddr 00:0C:29:C9:A8:14
inet addr:192.168.254.2 Bcast:192.168.254.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fec9:a814/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:6027413 errors:0 dropped:0 overruns:0 frame:0
TX packets:4580113 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:3466109957 (3305.5 Mb) TX bytes:2470685710 (2356.2 Mb)
Base address:0x1440 Memory:f4840000-f4860000
Assign an extra IP address to be the clustered IP and give it a DNS entry.
# host file1 file1.mydomain.tld has address 192.168.0.25
Install these RPMs:
yast -i drbd heartbeat sles-heartbeat_en drbd-kmp-smp
Partition the disks:
fdisk /dev/cciss/c0d4
Create LVM volumes on top of our clustered volumes:
pvcreate /dev/cciss/c0d4p1 vgcreate data /dev/cciss/c0d4p1 lvcreate -L 10M -n nfs data lvcreate -L 1G -n dbs1 data lvcreate -L 10G -n vol1 data
Next we will put DRBD on top of LVM volumes.
Put in place your /etc/drbd.conf configuration:
<code>
resource nfs {
protocol C;
startup {
wfc-timeout 0; # unlimited
degr-wfc-timeout 120; # 2 minutes.
}
disk { on-io-error detach; }
syncer {
rate 100M;
group 0;
}
on ocdcdbs072 {
device /dev/drbd0;
disk /dev/data/nfs;
address 192.168.254.1:7000;
meta-disk /dev/data/drbd[0];
}
on ocdcdbs073 {
device /dev/drbd0;
disk /dev/data/nfs;
address 192.168.254.2:7000;
meta-disk /dev/data/drbd[0];
}
}
resource dbs1 {
protocol C;
startup {
wfc-timeout 0; # unlimited
degr-wfc-timeout 120; # 2 minutes.
}
disk { on-io-error detach; }
syncer {
rate 100M;
group 1;
}
on ocdcdbs072 {
device /dev/drbd1;
disk /dev/data/dbs1;
address 192.168.254.1:7001;
meta-disk /dev/data/drbd[1];
}
on ocdcdbs073 {
device /dev/drbd1;
disk /dev/data/dbs1;
address 192.168.254.2:7001;
meta-disk /dev/data/drbd[1];
}
}
resource vol1 {
protocol C;
startup {
wfc-timeout 0; # unlimited
degr-wfc-timeout 120; # 2 minutes.
}
disk { on-io-error detach; }
syncer {
rate 100M;
group 1;
}
on ocdcdbs072 {
device /dev/drbd2;
disk /dev/data/vol1;
address 192.168.254.1:7002;
meta-disk /dev/data/drbd[2];
}
on ocdcdbs073 {
device /dev/drbd2;
disk /dev/data/vol1;
address 192.168.254.2:7002;
meta-disk /dev/data/drbd[2];
}
} </code>
Start drbd and configure the clustered volume(s):
/etc/init.d/drbd start drbdsetup /dev/drbd0 primary --do-what-I-say drbdsetup /dev/drbd1 primary --do-what-I-say drbdsetup /dev/drbd2 primary --do-what-I-say drbdadm primary all
Format the new volumes:
mkfs -t ext3 /dev/drbd0 mkfs -t xfs /dev/drbd1 mkfs -t xfs /dev/drbd2
Make sure the DRBD is configured to start at system boot:
chkconfig drbd on
On the secondary node, copy the /etc/drbd.conf, configure the drive partitions, then start DRBD and let it sync.
# cat /proc/drbd
version: 0.7.22 (api:79/proto:74)
SVN Revision: 2572 build by lmb@dale, 2006-10-25 18:17:21
0: cs:Connected st:Primary/Secondary ld:Consistent
ns:193 nr:12354 dw:12547 dr:63 al:0 bm:16 lo:0 pe:0 ua:0 ap:0
1: cs:Connected st:Primary/Secondary ld:Consistent
ns:1651645 nr:521980 dw:2173625 dr:1226241 al:302 bm:679 lo:0 pe:0 ua:0 ap:0
2: cs:Connected st:Primary/Secondary ld:Consistent
ns:1551 nr:262923 dw:264475 dr:1888 al:0 bm:342 lo:0 pe:0 ua:0 ap:0
Create the following configuration files:
/etc/ha.d/ha.cf
crm off # turn off 2.x style nodes logfacility local0 # syslog facility use_logd no # another logging service debug 0 ucast eth0 node2 # node(s) to send heartbeats on ucast eth1 node2 # node(s) to send heartbeats on keepalive 1 # time between heartbeats warntime 5 # time before a late warning shows in the logs deadtime 10 # time before the node is pronounced dead initdead 60 # deadtime after a reboot, gives time for the network to come up udpport 694 # udp port for heartbeat broadcast autojoin none # enables nodes to join automatically by communicating with the cluster node node1 node2 # cluster nodes auto_failback on # enables favorite member node watchdog /dev/watchdog # watchdog device to fence the system by shutting it down ping_group ldap1.mydomain.tld
/etc/ha.d/haresources
node1 \
drbddisk::vol1 \
Filesystem::/dev/drbd2::/data/vol1::xfs \
drbddisk::dbs1 \
Filesystem::/dev/drbd1::/var/lib/mysql::xfs \
Filesystem::/dev/drbd1::/data/dbs1::xfs \ # mounted this in two places
drbddisk::nfs \
Filesystem::/dev/drbd0::/var/lib/nfs::ext3 \
nfsboot \
nfsserver \
idmapd \
mysql_repair \
mysql \
Delay::3::0 \
IPaddr3::file1.mydomain.tld/24/eth0/192.168.0.255
/etc/ha.d/authkeys
auth 1 1 sha1 mysecrectpassword
NFS's state data needs to be moved onto a clustered file system so that it follows the cluster.
mv /var/lib/nfs /var/lib/nfs.bak cp -R /var/lib/nfs.bak /var/lib/nfs
mount /dev/drbd0 /var/lib/nfs cp -Rv /var/lib/nfs.bak/* /var/lib/nfs
lockd:
sm-notify seems to load the lockd kernel module. I'm still not sure how this should be integrated with the cluster, as it uses /var/lib/nfs and starts before the cluster. It seems like you would want lockd to follow the NFS server. On sles10 I did run into an issue where locking stopped working. Eventually the issue was resolved, but I was unable to figure out what resolved it. Some unknown action in combination with a server reboot fixed the issue.
chkconfig nfsboot on
For firewalled environments:
MOUNTD_PORT="736"
echo "options lockd nlm_udpport=4002 nlm_tcpport=4002" > /etc/modprobe.d/lockd
There are two ways to do LVM snapshots with clustering, either inside the cluster or outside the cluster.
The configuration detailed above places LVM snapshots outside the DRBD/Heartbeat configuration. Since the DRBD volumes are mountable, you can simply take a snapshot of them. Both cluster nodes will need to take their snapshots independently.
If you dynamically create snapshots, as I did because they are time based, and you wish to make them accessible to others, you cannot use NFS to mount them, as the /etc/exports file would need to be updated every time. This is because NFS exports must align with the mounted volume path. Meaning, you can't NFS mount a higher directory and cd into the subdirectory to access the mounted volume. NFS does not allow that. A way around this is to use Samba, which doesn't care about mounts.
Placing LVM snapshots outside the cluster configuration is less complex and the method I ended up staying with.
This method (placing LVM snapshots inside the cluster configuration) is more complex. Keep in mind you will be moving LVM volumes between two systems. Thus, you will need to un-allocate and allocate the LVM volumes during cluster failover. Heartbeat does not come pre-configured to do this.
Partition if you want. I left the volume unpartitioned.
fdisk /dev/cciss/c0d4
Apply this configuration to /etc/drbd.conf:
resource drbd-c0d4 {
protocol C;
disk { on-io-error detach; }
syncer {
rate 100M;
group 0;
}
on ocdcfil074 {
device /dev/drbd0;
disk /dev/cciss/c0d4;
address 10.0.0.1:7000;
meta-disk internal;
}
on ocdcfil075 {
device /dev/drbd0;
disk /dev/cciss/c0d4;
address 10.0.0.2:7000;
meta-disk internal;
}
}
Start drbd and configure the clustered volume(s):
/etc/init.d/drbd start drbdsetup /dev/drbd0 primary --do-what-I-say drbdadm primary all
Replace the LVM filter in /etc/lvm/lvm.conf with the following filter:
filter = ["a|sd.*|", "a|drbd.*|", "r|.*|"]
Create LVM volumes on top of our clustered volumes:
pvcreate /dev/drbd0 vgcreate vol1 /dev/drbd0 lvcreate -L 10M -n nfs vol1 lvcreate -L 10G -n home vol1 lvcreate -L 5G -n dbs1 vol1
Format the new volumes:
mkfs -t ext3 /dev/vol1/nfs mkfs -t xfs /dev/vol2/dbs1 mkfs -t xfs /dev/vol2/vol1
Make sure the DRBD is configured to start:
chkconfig drbd on
On the secondary node, copy the /etc/drbd.conf, configure the drive partitions, then start DRBD and let it sync.
# cat /proc/drbd
version: 0.7.22 (api:79/proto:74)
SVN Revision: 2572 build by lmb@dale, 2006-10-25 18:17:21
0: cs:SyncTarget st:Secondary/Primary ld:Inconsistent
ns:0 nr:108596 dw:108592 dr:0 al:0 bm:13052 lo:139 pe:2464 ua:139 ap:0
[>...................] sync'ed: 0.2% (104248/104354)M
finish: 2:27:43 speed: 12,004 (12,004) K/sec
1: cs:PausedSyncT st:Secondary/Primary ld:Inconsistent
ns:0 nr:0 dw:0 dr:0 al:0 bm:52478 lo:0 pe:0 ua:0 ap:0
Create the following config files:
/etc/ha.d/haresources:
node1 \
drbddisk::drbd-c0d4 \
drbdlvm::vol1 \
Filesystem::/dev/vol1/os::/data/dbs1::xfs \
Filesystem::/dev/vol1/cd::/data/vol1::xfs \
Filesystem::/dev/vol1/nfs::/data/nfs::ext3 \
drbdsnap \
nfsserver \
Delay::3::0 \
IPaddr3::file1/24/eth0/192.168.0.255 \
apache2 \
vsftpd \
smb
These two scripts are needed to allow heartbeat to safely remove and import the volumes on cluster failover:
Here is a script that can be used to manually start and stop services under the cluster's control. This tool has a lot of value in testing and troubleshooting your haresources file.
ssh <primary-node-name> /etc/init.d/heartbeat restart
By restarting heartbeat on the primary node, services will stop and start on the other node.
cat /proc/drbd
This will place each node in standalone mode:
drbdadm disconnect <volume-name>
This will make node2's replicated block disk accessible on the secondary node:
drbdadm primary <volume-name>
This will make node2's replicated block disk ready to be the secondary node:
drbdadm secondary <volume-name>
This will reconnect the nodes so that replication resumes:
drbdadm connect <volume-name>
ssh file1 cd /data/snap/<volume>/<hour>/my-dir/my-files cp my-files /vol/<volume>/<hour>/my-dir/my-files exit
ssh file1 sudo acp /etc/exports sudo vi /etc/exports sudo /etc/init.d/nfsserver reload
showmount -e file1
This can happen when the exported share or the NFS server is down longer than the client's timeout.
ssh <username>@<servername>
pkill -9 -f /vol
umount -f /vol/<share-name>
/etc/init.d/autofs stop
umount -f /vol
/etc/init.d/autofs start
Note: If umount -f does not work, then umount -l will be needed.
/etc/init.d/nfsserver restart
Here are examples of starting and stopping the clustered apps manually:
# haadm -d start /etc/ha.d/resource.d/drbddisk vol1 start /etc/ha.d/resource.d/Filesystem /dev/drbd2 /data/vol1 xfs start /etc/ha.d/resource.d/drbddisk dbs1 start /etc/ha.d/resource.d/Filesystem /dev/drbd1 /var/lib/mysql xfs start /etc/ha.d/resource.d/Filesystem /dev/drbd1 /data/dbs1 xfs start /etc/ha.d/resource.d/drbddisk nfs start /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data/nfs ext3 start /etc/init.d/nfsserver start /etc/ha.d/resource.d/mysql_repair start /etc/init.d/mysql start /etc/ha.d/resource.d/Delay 3 0 start /etc/ha.d/resource.d/IPaddr3 file1.mydomain.tld/24/eth0/192.168.0.255 start
# haadm -d stop /etc/ha.d/resource.d/IPaddr3 file1.mydomain.tld/24/eth0/192.168.0.255 stop /etc/ha.d/resource.d/Delay 3 0 stop /etc/init.d/mysql stop /etc/ha.d/resource.d/mysql_repair stop /etc/init.d/nfsserver stop /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data/nfs ext3 stop /etc/ha.d/resource.d/drbddisk nfs stop /etc/ha.d/resource.d/Filesystem /dev/drbd1 /data/dbs1 xfs stop /etc/ha.d/resource.d/Filesystem /dev/drbd1 /var/lib/mysql xfs stop /etc/ha.d/resource.d/drbddisk dbs1 stop /etc/ha.d/resource.d/Filesystem /dev/drbd2 /data/vol1 xfs stop /etc/ha.d/resource.d/drbddisk vol1 stop
cd /vol/nfs/volume touch test.lock flock test.lock uname
chkconfig heartbeat off
Document Author: Travis Sidelinger