Explicit heartbeat update method, only check healthy nodes for HB timeout, default node state on creation is 'dead'

This commit is contained in:
Roberto Rosario
2012-08-03 05:40:01 -04:00
parent 5019b1a01f
commit 60b2998b65
4 changed files with 30 additions and 17 deletions

View File

@@ -13,5 +13,5 @@ NODE_STATE_CHOICES = (
) )
DEFAULT_JOB_QUEUE_POLL_INTERVAL = 2 DEFAULT_JOB_QUEUE_POLL_INTERVAL = 2
DEFAULT_NODE_CPU_LOAD = 100 DEFAULT_NODE_CPU_LOAD = 0
DEFAULT_NODE_MEMORY_USAGE = 100 DEFAULT_NODE_MEMORY_USAGE = 0

View File

@@ -13,7 +13,9 @@ from django.utils.translation import ugettext, ugettext_lazy as _
from common.models import Singleton from common.models import Singleton
from .literals import (DEFAULT_NODE_HEARTBEAT_INTERVAL, DEFAULT_NODE_HEARTBEAT_TIMEOUT, from .literals import (DEFAULT_NODE_HEARTBEAT_INTERVAL, DEFAULT_NODE_HEARTBEAT_TIMEOUT,
DEFAULT_DEAD_NODE_REMOVAL_INTERVAL, NODE_STATE_HEALTHY, NODE_STATE_CHOICES, NODE_STATE_DEAD) DEFAULT_DEAD_NODE_REMOVAL_INTERVAL, NODE_STATE_HEALTHY, NODE_STATE_CHOICES, NODE_STATE_DEAD,
DEFAULT_NODE_CPU_LOAD, DEFAULT_NODE_MEMORY_USAGE)
from .signals import node_died
class NodeManager(models.Manager): class NodeManager(models.Manager):
@@ -28,12 +30,12 @@ class NodeManager(models.Manager):
class Node(models.Model): class Node(models.Model):
hostname = models.CharField(max_length=255, verbose_name=_(u'hostname')) hostname = models.CharField(max_length=255, verbose_name=_(u'hostname'))
cpuload = models.FloatField(blank=True, default=0.0, verbose_name=_(u'cpu load')) cpuload = models.FloatField(blank=True, default=DEFAULT_NODE_CPU_LOAD, verbose_name=_(u'cpu load'))
heartbeat = models.DateTimeField(blank=True, default=datetime.datetime.now(), verbose_name=_(u'last heartbeat check')) heartbeat = models.DateTimeField(blank=True, default=datetime.datetime.now(), verbose_name=_(u'last heartbeat check'))
memory_usage = models.FloatField(blank=True, default=0.0, verbose_name=_(u'memory usage')) memory_usage = models.FloatField(blank=True, default=DEFAULT_NODE_MEMORY_USAGE, verbose_name=_(u'memory usage'))
state = models.CharField(max_length=4, state = models.CharField(max_length=4,
choices=NODE_STATE_CHOICES, choices=NODE_STATE_CHOICES,
default=NODE_STATE_HEALTHY, default=NODE_STATE_DEAD,
verbose_name=_(u'state')) verbose_name=_(u'state'))
objects = NodeManager() objects = NodeManager()
@@ -50,17 +52,24 @@ class Node(models.Model):
def refresh(self): def refresh(self):
if self.hostname == platform.node(): if self.hostname == platform.node():
# Make we can only update ourselves # Make sure we can only update ourselves
info = Node.platform_info() info = Node.platform_info()
self.cpuload = info['cpuload'] self.cpuload = info['cpuload']
self.memory_usage = info['memory_usage'] self.memory_usage = info['memory_usage']
def if_healthy(self): def is_healthy(self):
return self.health == NODE_STATE_HEALTHY return self.state == NODE_STATE_HEALTHY
def save(self, *args, **kwargs): def mark_as_dead(self):
self.state=NODE_STATE_DEAD
node_died.send(sender=self)
self.save()
def send_heartbeat(self):
self.refresh()
self.state=NODE_STATE_HEALTHY
self.heartbeat = datetime.datetime.now() self.heartbeat = datetime.datetime.now()
return super(Node, self).save(*args, **kwargs) self.save()
class Meta: class Meta:
verbose_name = _(u'node') verbose_name = _(u'node')
@@ -69,10 +78,11 @@ class Node(models.Model):
class ClusteringConfigManager(models.Manager): class ClusteringConfigManager(models.Manager):
def dead_nodes(self): def dead_nodes(self):
return self.model.objects.filter(heartbeat__lt=datetime.datetime.now() - datetime.timedelta(seconds=self.model.get().node_heartbeat_timeout)) return Node.objects.filter(state=NODE_STATE_HEALTHY).filter(heartbeat__lt=datetime.datetime.now() - datetime.timedelta(seconds=self.model.get().node_heartbeat_timeout))
def check_dead_nodes(self): def check_dead_nodes(self):
self.dead_nodes().update(healty=NODE_STATE_DEAD) for node in self.dead_nodes():
node.mark_as_dead()
def zombiest_node(self): def zombiest_node(self):
try: try:
@@ -86,7 +96,7 @@ class ClusteringConfig(Singleton):
node_heartbeat_timeout = models.PositiveIntegerField(verbose_name=(u'node heartbeat timeout (in seconds)'), help_text=_(u'After this amount of time a node without heartbeat updates is considered dead and removed from the cluster node list.'), default=DEFAULT_NODE_HEARTBEAT_TIMEOUT) node_heartbeat_timeout = models.PositiveIntegerField(verbose_name=(u'node heartbeat timeout (in seconds)'), help_text=_(u'After this amount of time a node without heartbeat updates is considered dead and removed from the cluster node list.'), default=DEFAULT_NODE_HEARTBEAT_TIMEOUT)
dead_node_removal_interval = models.PositiveIntegerField(verbose_name=(u'dead node check and removal interval (in seconds)'), help_text=_(u'Interval of time to check the cluster for unresponsive nodes and remove them from the cluster.'), default=DEFAULT_DEAD_NODE_REMOVAL_INTERVAL) dead_node_removal_interval = models.PositiveIntegerField(verbose_name=(u'dead node check and removal interval (in seconds)'), help_text=_(u'Interval of time to check the cluster for unresponsive nodes and remove them from the cluster.'), default=DEFAULT_DEAD_NODE_REMOVAL_INTERVAL)
objects = ClusteringConfigManager() cluster = ClusteringConfigManager()
def __unicode__(self): def __unicode__(self):
return ugettext('clustering config') return ugettext('clustering config')

View File

@@ -0,0 +1,3 @@
from django.dispatch import Signal
node_died = Signal(providing_args=['node'])

View File

@@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
def node_heartbeat(): def node_heartbeat():
logger.debug('starting') logger.debug('starting')
node = Node.objects.myself() node = Node.objects.myself()
node.save() node.send_heartbeat()
@simple_locking('house_keeping', 10) @simple_locking('house_keeping', 10)
def house_keeping(): def house_keeping():
logger.debug('starting') logger.debug('starting')
ClusteringConfig.objects.check_dead_nodes() ClusteringConfig.cluster.check_dead_nodes()