#!/usr/bin/python3
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

import sys
import os
import subprocess
import re
import threading
import getopt
import syslog
import time
import datetime
import calendar
import signal
import argparse
import logging
from logging.handlers import SysLogHandler

from gi.repository import GLib as glib
from gi.repository import GObject as gobject
import dbus
import dbus.service
import dbus.mainloop
import dbus.mainloop.glib

from . import dbussvc
from . import zfs
from . import smf
import time_slider.linux.timeslidersmf as timeslidersmf
import time_slider.linux.autosnapsmf as autosnapsmf
# import plugin
from time_slider.linux.rbac import RBACprofile
from . import util

import time_slider.linux.timesliderconfig as timesliderconfig

_MINUTE = 60
_HOUR = _MINUTE * 60
_DAY = _HOUR * 24
_WEEK = _DAY * 7

# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0        # Below user specified threshold. Everything is OK
STATUS_WARNING = 1   # Above specified user threshold level
STATUS_CRITICAL = 2  # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}


class SnapshotManager(threading.Thread):

    def __init__(self, bus):
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._stale = True
        self._lastCleanupCheck = 0
        self._zpools = []
        self._poolstatus = {}
        self._destroyedsnaps = []

        self.logger = logging.getLogger('time-slider')

        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError as message:
            self.logger.error("Error determining whether debugging is enabled")
            self.verbose = False

        self._dbus = dbussvc.AutoSnap(bus,
                                      '/org/opensolaris/TimeSlider/autosnap',
                                      self)

        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        self.refresh()

        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)

        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)

        # Init done. Now initialise threading.
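        # The scheduler runs as a daemon thread (setDaemon(True) below), so a
        # pending wait on the condition variable never prevents the process
        # from exiting once the main loop terminates.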
        threading.Thread.__init__(self)
        self.setDaemon(True)

    def run(self):
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            try:
                if (name[1] == "swap" or name[1] == "dump"):
                    util.debug("Auto excluding %s volume" % vol, self.verbose)
                    volume = zfs.Volume(vol)
                    volume.set_auto_snap(False)
            except IndexError:
                pass

        nexttime = None
        waittime = None
        while True:
            try:
                self.refresh()
                # First check and, if necessary, perform any remedial cleanup.
                # This is best done before creating any new snapshots which may
                # otherwise get immediately gobbled up by the remedial cleanup.
                if self._needs_cleanup() == True:
                    self._perform_cleanup()
                    # Check to see if cleanup actually deleted anything before
                    # notifying the user. Avoids the popup appearing continuously
                    if len(self._destroyedsnaps) > 0:
                        self._send_notification()
                        self._send_to_syslog()

                nexttime = self._check_snapshots()
                # Overdue snapshots are already taken automatically
                # inside _check_snapshots() so nexttime should never be
                # < 0. It can be None however, which is fine since it
                # will cause the scheduler thread to sleep indefinitely
                # or until a SIGHUP is caught.
                if nexttime:
                    util.debug("Waiting until " + str(nexttime), self.verbose)
                waittime = None
                if nexttime != None:
                    waittime = nexttime - int(time.time())
                    if (waittime <= 0):
                        # We took too long and missed a snapshot, so break out
                        # and catch up on it the next time through the loop
                        continue
                # waittime could be None if no auto-snap schedules are online
                self._conditionLock.acquire()
                if waittime:
                    util.debug("Waiting %d seconds" % (waittime), self.verbose)
                    self._conditionLock.wait(waittime)
                else: # None. Just wait a while to check for cleanups.
                    util.debug("No auto-snapshot schedules online.", \
                               self.verbose)
                    self._conditionLock.wait(_MINUTE * 15)

            except OSError as message:
                self.logger.error("Caught OSError exception in snapshot" +
                                  " manager thread")
                self.logger.error("Error details:\n" + \
                                  "--------BEGIN ERROR MESSAGE--------\n" + \
                                  str(message) + \
                                  "\n--------END ERROR MESSAGE--------")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Exit this thread
                break
            except RuntimeError as message:
                self.logger.error("Caught RuntimeError exception in snapshot" +
                                  " manager thread")
                self.logger.error("Error details:\n" + \
                                  "--------BEGIN ERROR MESSAGE--------\n" + \
                                  str(message) + \
                                  "\n--------END ERROR MESSAGE--------")
                # Exit this thread
                break

    def _signalled(self, signum, frame):
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                return
            self._stale = True
            self._refreshLock.release()
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()

    def refresh(self):
        """
        Checks if defined snapshot schedules are out of date and
        rebuilds and updates if necessary
        """
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            self._stale = False
        self._refreshLock.release()

    def _configure_svc_props(self):
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError as message:
            self.logger.error("Error determining whether debugging is enabled")
            self.verbose = False

        try:
            cleanup = self._smf.get_remedial_cleanup()
            warn = self._smf.get_cleanup_level("warning")
            util.debug("Warning level value is: %d%%" % warn, self.verbose)
            crit = self._smf.get_cleanup_level("critical")
            util.debug("Critical level value is: %d%%" % crit, self.verbose)
            emer = self._smf.get_cleanup_level("emergency")
            util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError as message:
            self.logger.error("Failed to determine cleanup threshold levels")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              str(message) + \
                              "\n---------END ERROR MESSAGE---------\n")
            self.logger.error("Using factory defaults of 80%, 90% and 95%")
            # Go with defaults
            # FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        else:
            self._remedialCleanup = cleanup
            self._warningLevel = warn
            self._criticalLevel = crit
            self._emergencyLevel = emer

        try:
            self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError as message:
            # Not fatal, just assume we delete them (default configuration)
            self.logger.error("Can't determine whether to keep empty snapshots")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              str(message) + \
                              "\n---------END ERROR MESSAGE---------")
            self.logger.error("Assuming default value: False")
            self._keepEmpties = False

        # Previously, snapshot labels used the ":" character as a
        # separator character for datestamps. Windows filesystems such as
        # CIFS and FAT choke on this character so now we use a user definable
        # separator value, with a default value of "_".
        # We need to check for both the old and new format when looking for
        # snapshots.
        self._separator = self._smf.get_separator()
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)

        # Rebuild pool list
        self._zpools = []
        try:
            for poolname in zfs.list_zpools():
                # Do not try to examine FAULTED pools
                zpool = zfs.ZPool(poolname)
                if zpool.health == "FAULTED":
                    util.debug("Ignoring faulted Zpool: %s\n" \
                               % (zpool.name), \
                               self.verbose)
                else:
                    self._zpools.append(zpool)
                util.debug(str(zpool), self.verbose)
        except RuntimeError as message:
            self.logger.error("Could not list Zpools")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError(message)

    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """
        self._last = {}
        self._next = {}
        self._keep = {}

        try:
            _defaultSchedules = autosnapsmf.get_default_schedules()
            _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError as message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError("Error reading SMF schedule instances\n" + \
                               "Details:\n" + str(message))
        else:
            # Now set it in stone.
            self._defaultSchedules = tuple(_defaultSchedules)
            self._customSchedules = tuple(_customSchedules)

            # Build the combined schedule tuple from default + custom schedules
            _defaultSchedules.extend(_customSchedules)
            self._allSchedules = tuple(_defaultSchedules)
            for schedule,i,p,keep in self._allSchedules:
                self._last[schedule] = 0
                self._next[schedule] = 0
                self._keep[schedule] = keep

    def _update_schedules(self):
        interval = 0
        idx = 1 # Used to index subsets for schedule overlap calculation
        last = None

        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \
                           % (schedule), \
                           self.verbose)
                continue

            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
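            # Note: this assumes list_snapshots() returns (name, creation time)
            # tuples sorted oldest first, so snaps[-1] below is the most recent
            # snapshot taken for this schedule.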
            if self._last[schedule] == 0:
                try:
                    snaps = self._datasets.list_snapshots("%s%s" % \
                                                          (self._prefix,
                                                           schedule))
                except RuntimeError as message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    self.logger.error("Failed to list snapshots during schedule update")
                    # Propagate up to the thread's run() method
                    raise RuntimeError(message)

                if len(snaps) > 0:
                    util.debug("Last %s snapshot was: %s" % \
                               (schedule, snaps[-1][0]), \
                               self.verbose)
                    self._last[schedule] = snaps[-1][1]

            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                try:
                    totalinterval = intervals[interval] * period
                except KeyError:
                    self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                    self.logger.error(schedule + \
                                      " schedule has invalid interval: " + \
                                      "'%s'" % interval)
                    # Propagate up to thread's run() method
                    raise RuntimeError
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
                    idx += 1

            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                               schedule + \
                               " recalculation", \
                               self.verbose)
                    continue
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period is >= 1 calendar year.
                year = snap_tm.tm_year
                year += period // 12
                period = period % 12

                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                if mon == 0:
                    mon = 12
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                    year += 1

                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                # Clamp the day of month if the target month is shorter,
                # eg. Jan 31st + 1 month becomes Feb 28th (or 29th).
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                    mday = dnewmon

                tm = (year, mon, mday, \
                      snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                      0, 0, -1)
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]

            self._next[schedule] = last + totalinterval

    def _next_due(self):
        schedule = None
        earliest = None
        now = int(time.time())

        for s,i,p,k in self._defaultSchedules:
            due = self._next[s]
            if due <= now:
                # Default Schedule - so break out at the first
                # schedule that is overdue. The subordinate schedules
                # will re-adjust afterwards.
                earliest,schedule = due,s
                break
            elif earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s

        for s,i,p,k in self._customSchedules:
            due = self._next[s]
            if earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s

        return earliest,schedule

    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
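        # _next_due() yields a POSIX timestamp (or None when no schedule is
        # online); anything at or before "now" is overdue and is snapshotted
        # immediately in the loop below.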
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = int(time.time())
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due()
            self._refreshLock.release()
            if next != None:
                dt = datetime.datetime.fromtimestamp(next)
                util.debug("Next snapshot is %s due at: %s" % \
                           (schedule, dt.isoformat()), \
                           self.verbose)
        return next

    def _take_snapshots(self, schedule):
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = int(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX,
                 self._separator,
                 schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        try:
            self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError as message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            self.logger.error("Failed to create snapshots for schedule: %s" \
                              % (schedule))
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError(message)
        self._last[schedule] = tm
        self._perform_purge(schedule)
        return label

    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # Per schedule: We want to delete 0 sized snapshots but we need
        # to keep at least one around (the most recent one) for each
        # schedule so that the overlap is maintained from
        # frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together, like a frequent, hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first, then the hourly, would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour, however,
        # and the data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.

        try:
            snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
            # Clone the list because we want to remove items from it
            # while iterating through it.
            remainingsnaps = snaps[:]
        except RuntimeError as message:
            self.logger.error("Failed to list snapshots during snapshot cleanup")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError(message)

        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
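                # snaps is assumed to be in ascending creation order, so
                # pop() drops the newest snapshot, which is always retained
                # to preserve the overlap between schedules.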
                snaps.pop()
            except IndexError:
                pass

            for snapname in snaps:
                try:
                    snapshot = zfs.Snapshot(snapname)
                except Exception as message:
                    self.logger.error(str(message))
                    # Not fatal, just skip to the next snapshot
                    continue

                try:
                    if snapshot.get_used_size() == 0:
                        util.debug("Destroying zero sized: " + snapname, \
                                   self.verbose)
                        try:
                            snapshot.destroy()
                        except RuntimeError as message:
                            self.logger.error("Failed to destroy snapshot: " +
                                              snapname)
                            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                            # Propagate exception so thread can exit
                            raise RuntimeError(message)
                        remainingsnaps.remove(snapname)
                except RuntimeError as message:
                    self.logger.error("Cannot determine used size of: " + \
                                      snapname)
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    # Propagate the exception to the thread's run() method
                    raise RuntimeError(message)

        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # snapshots.
        target = len(remainingsnaps) - self._keep[schedule]
        counter = 0
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                       remainingsnaps[counter],
                       self.verbose)
            try:
                snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception as message:
                self.logger.error(str(message))
                # Not fatal, just skip to the next snapshot
                counter += 1
                continue
            try:
                snapshot.destroy()
            except RuntimeError as message:
                self.logger.error("Failed to destroy snapshot: " +
                                  snapshot.name)
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError(message)
            else:
                counter += 1

    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        try:
            for name in self._datasets.list_auto_snapshot_sets(schedule):
                dataset = zfs.ReadWritableDataset(name)
                self._prune_snapshots(dataset, schedule)
        except RuntimeError as message:
            self.logger.error("Error listing datasets during " + \
                              "removal of expired snapshots")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError(message)

    def _needs_cleanup(self):
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
            return False
        now = int(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            # Indicates that a cleanup is already running.
            return False
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
            pass
        else:
            for zpool in self._zpools:
                try:
                    if zpool.get_capacity() > self._warningLevel:
                        # Before getting into a panic, determine if the pool
                        # is one we actually take snapshots on, by checking
                        # for one of the "auto-snapshot:<schedule>" tags. Not
                        # super fast, but it only happens under exceptional
                        # circumstances of a zpool nearing its capacity.
                        for sched in self._allSchedules:
                            sets = zpool.list_auto_snapshot_sets(sched[0])
                            if len(sets) > 0:
                                util.debug("%s needs a cleanup" \
                                           % zpool.name, \
                                           self.verbose)
                                self._cleanupLock.release()
                                return True
                except RuntimeError as message:
                    self.logger.error("Error checking zpool capacity of: " + \
                                      zpool.name)
                    self._cleanupLock.release()
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    # Propagate up to thread's run() method.
                    raise RuntimeError(message)
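        # Record when this check completed; together with the 15 minute guard
        # above it throttles how often pool capacity is re-examined.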
        self._lastCleanupCheck = int(time.time())
        self._cleanupLock.release()
        return False

    def _perform_cleanup(self):
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
            return
        self._destroyedsnaps = []
        for zpool in self._zpools:
            try:
                self._poolstatus[zpool.name] = 0
                capacity = zpool.get_capacity()
                if capacity > self._warningLevel:
                    self._run_warning_cleanup(zpool)
                    self._poolstatus[zpool.name] = 1
                    capacity = zpool.get_capacity()
                if capacity > self._criticalLevel:
                    self._run_critical_cleanup(zpool)
                    self._poolstatus[zpool.name] = 2
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 3
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from the
            # _run_<level>_cleanup() and _run_cleanup() methods
            # called by _perform_cleanup()
            except RuntimeError as message:
                self.logger.error("Remedial space cleanup failed because " + \
                                  "of failure to determine capacity of: " + \
                                  zpool.name)
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError(message)

            # Bad - there's no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            util.debug("%s pool status after cleanup:" \
                       % zpool.name, \
                       self.verbose)
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
                   self.verbose)
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                self.logger.error("\t%s" % snap)
        self._cleanupLock.release()

    def _run_warning_cleanup(self, zpool):
        util.debug("Performing warning level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)

    def _run_critical_cleanup(self, zpool):
        util.debug("Performing critical level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)

    def _run_emergency_cleanup(self, zpool):
        util.debug("Performing emergency level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        # Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
                break
            else:
                self._run_cleanup(zpool, schedule, self._emergencyLevel)

    def _run_cleanup(self, zpool, schedule, threshold):
        clonedsnaps = []
        snapshots = []
        try:
            clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError as message:
            self.logger.error("Error (non-fatal) listing cloned snapshots" +
                              " while recovering pool capacity")
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              str(message) + \
                              "\n--------END ERROR MESSAGE--------")

        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        try:
            snapshots = [s for s,t in \
                            zpool.list_snapshots("%s%s" \
                                                 % (self._prefix,schedule)) \
                            if s not in clonedsnaps]
            snapshots.reverse()
        except RuntimeError as message:
            self.logger.error("Error listing snapshots" +
                              " while recovering pool capacity")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError(message)

        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                self.logger.info( \
                    "No more %s snapshots left" \
                    % schedule)
                return

            """This is not an exact science. Deleting a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting a 0 sized snapshot is not as pointless as it might
            seem. It also means we have to loop through this, one
            snapshot at a time, and observe the before and after results.
            Perhaps a better way exists...."""

            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # filesystems.
            # On the positive side there should be fewer snapshots and they
            # will mostly be non-zero, so we should get more effectiveness
            # per snapshot deleted since they should be nearly always
            # non zero sized.
            util.debug("Destroying %s" % snapname, self.verbose)
            try:
                snapshot.destroy()
            except RuntimeError as message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                self.logger.error("Warning: Cleanup failed to destroy: %s" % \
                                  (snapshot.name))
                self.logger.error("Details:\n%s" % (str(message)))
            else:
                self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
            time.sleep(3)

    def _send_to_syslog(self):
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            if status == 4:
                self.logger.critical( \
                    "%s exceeded %d%% capacity. " \
                    "All automatic snapshots were destroyed" \
                    % (zpool.name, self._emergencyLevel))
            elif status == 3:
                self.logger.error( \
                    "%s exceeded %d%% capacity. " \
                    "Automatic snapshots over 1 hour old were destroyed" \
                    % (zpool.name, self._emergencyLevel))
            elif status == 2:
                self.logger.critical( \
                    "%s exceeded %d%% capacity. " \
                    "Weekly, hourly and daily automatic snapshots were destroyed" \
                    % (zpool.name, self._criticalLevel))
            elif status == 1:
                self.logger.warning( \
                    "%s exceeded %d%% capacity. " \
                    "Hourly and daily automatic snapshots were destroyed" \
                    % (zpool.name, self._warningLevel))

        if len(self._destroyedsnaps) > 0:
            self.logger.warning( \
                "%d automatic snapshots were destroyed" \
                % len(self._destroyedsnaps))

    def _send_notification(self):
        worstpool = None
        worststatus = 0

        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
                worststatus = status

        #FIXME make the various levels indexable
        if worststatus == 4:
            self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.


def monitor_threads(snapthread):
    logger = logging.getLogger('time-slider')
    if snapthread.is_alive():
        return True
    else:
        logger.error("Snapshot monitor thread exited.")

        if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
            # FIXME - it would be nicer to mark the service as degraded than
            # go into maintenance state for some situations such as a
            # particular snapshot schedule failing.
            # But for now SMF does not implement this feature. But if/when it
            # does it's better to use svcadm to put the service into the
            # correct state since the daemon shouldn't exit when transitioning
            # to a degraded state.
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        else:
            logger.error("Snapshot monitor thread exited abnormally")
            logger.error("Exit code: %d" % (snapthread.exitCode))
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False


def child_sig_handler(signum, frame):
    if signum == signal.SIGUSR1:
        sys.exit(smf.SMF_EXIT_OK)
    elif signum == signal.SIGCHLD:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif signum == signal.SIGALRM:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)


# Default daemon parameters.
# File mode creation mask of the daemon.
UMASK = 0
# Default working directory for the daemon.
WORKDIR = "/"
# Default maximum for the number of available file descriptors.
MAXFD = 1024


def create_daemon():
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    # Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    try:
        pid = os.fork()
    except OSError as e:
        raise Exception("%s [%d]" % (e.strerror, e.errno))

    if (pid == 0):
        # Reset signals that we set to trap in parent
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
        signal.signal(signal.SIGUSR1, signal.SIG_DFL)
        signal.signal(signal.SIGALRM, signal.SIG_DFL)
        os.setsid()
        os.chdir(WORKDIR)
        os.umask(UMASK)
    else:
        # Wait for the child to give the OK or otherwise.
        signal.pause()


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--foreground', action='store_true',
                        help='Do not daemonize', default=False)
    parser.add_argument('--config', '-c', type=str,
                        help='Configuration file',
                        default='/etc/time-slider/timesliderd.conf')
    parser.add_argument('--configdump', action='store_true',
                        help='Dump default values in config file format',
                        default=False)
    args, _ = parser.parse_known_args()

    logger = logging.getLogger('time-slider')
    logger.setLevel(logging.DEBUG)
    if args.foreground:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(message)s'))
    else:
        handler = SysLogHandler(address='/dev/log')
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s',
                              '%b %d %H:%M:%S time-sliderd:'))
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    if args.configdump:
        timesliderconfig.configdump()
        sys.exit(smf.SMF_EXIT_OK)

    timesliderconfig.configfile = args.config

    # Daemonise the service.
    if not args.foreground:
        create_daemon()

    # The user security attributes checked are the following:
    # Note that UID == 0 will match any profile search so
    # no need to check it explicitly.
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):
        gobject.threads_init()
        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()

        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)

        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        snapshot.start()

        gobject.timeout_add(2000, monitor_threads, snapshot)

        mainloop = gobject.MainLoop()
        try:
            mainloop.run()
        except KeyboardInterrupt:
            mainloop.quit()
            sys.exit(smf.SMF_EXIT_OK)
    else:
        logger.error( \
            "%s has insufficient privileges to run time-sliderd!" \
            % rbacp.name)
        sys.exit(smf.SMF_EXIT_ERR_PERM)
    sys.exit(smf.SMF_EXIT_OK)
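
# A minimal entry point sketch, assuming the module is executed directly
# (for example with "python3 -m ..." from the SMF start method). If the
# package instead installs a console-script wrapper that calls main(),
# this guard is simply never triggered.
if __name__ == "__main__":
    main(sys.argv)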