5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
36 from logging.handlers import SysLogHandler
43 import dbus.mainloop.glib
48 import time_slider.linux.timeslidersmf as timeslidersmf
49 import time_slider.linux.autosnapsmf as autosnapsmf
51 from time_slider.linux.rbac import RBACprofile
54 import time_slider.linux.timesliderconfig as timesliderconfig
# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

# Seconds per schedule-interval unit, used for schedule arithmetic in
# _update_schedules(). _WEEK/_DAY/_HOUR/_MINUTE are defined earlier in
# the file (not visible in this excerpt).
intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
73 class SnapshotManager(threading.Thread):
    def __init__(self, bus):
        """Initialise locks, SMF configuration, the D-Bus service and
        signal handling for the snapshot scheduler thread.

        NOTE(review): this excerpt is missing several original source
        lines; the orphaned except clause below belongs to a try: line
        that is not visible here.
        """
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._lastCleanupCheck = 0;
        self._destroyedsnaps = []
        self.logger = logging.getLogger('time-slider')
        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determing whether debugging is enabled")
        self._dbus = dbussvc.AutoSnap(bus,
            '/org/opensolaris/TimeSlider/autosnap',
        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)
        # SMF/svc.startd sends SIGHUP to force
        # a refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)
        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
        # --- body of the scheduler thread's run() loop ---
        # NOTE(review): the enclosing def run(self) line, the loop
        # statement and several try: lines are missing from this
        # excerpt; the orphaned else/except clauses below belong to
        # them.
        # First check and, if necessary, perform any remedial cleanup.
        # This is best done before creating any new snapshots which may
        # otherwise get immediately gobbled up by the remedial cleanup.
        if self._needs_cleanup() == True:
            self._perform_cleanup()
            # Check to see if cleanup actually deleted anything before
            # notifying the user. Avoids the popup appearing continuously
            if len(self._destroyedsnaps) > 0:
                self._send_notification()
                self._send_to_syslog()
        nexttime = self._check_snapshots()
        # Overdue snapshots are already taken automatically
        # inside _check_snapshots() so nexttime should never be
        # < 0. It can be None however, which is fine since it
        # will cause the scheduler thread to sleep indefinitely
        # or until a SIGHUP is caught.
        util.debug("Waiting until " + str (nexttime), self.verbose)
        waittime = nexttime - long(time.time())
        # We took too long and missed a snapshot, so break out
        # and catch up on it the next time through the loop
        # waittime could be None if no auto-snap schedules are online
        self._conditionLock.acquire()
        util.debug("Waiting %d seconds" % (waittime), self.verbose)
        self._conditionLock.wait(waittime)
        else: #None. Just wait a while to check for cleanups.
            util.debug("No auto-snapshot schedules online.", \
            self._conditionLock.wait(_MINUTE * 15)
        except OSError, message:
            self.logger.error("Caught OSError exception in snapshot" +
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
        except RuntimeError,message:
            self.logger.error("Caught RuntimeError exception in snapshot" +
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
    def _signalled(self, signum, frame):
        """Handle SIGHUP from SMF/svc.startd: refresh state and wake the
        scheduler thread waiting on the condition lock."""
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                # NOTE(review): the body of this branch (presumably an
                # early return while a refresh is in progress) is
                # missing from this excerpt.
            self._refreshLock.release()
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()
        # --- body of _refresh() (its def line is not in this excerpt) ---
        """
        Checks if defined snapshot schedules are out
        of date and rebuilds and updates if necessary
        """
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            # NOTE(review): a line clearing self._stale appears to be
            # missing from this excerpt -- confirm against full source.
        self._refreshLock.release()
    def _configure_svc_props(self):
        """Read daemon configuration (verbosity, cleanup levels, empty
        snapshot policy, label separator, zpool list) from the SMF
        service properties.

        NOTE(review): several try: lines are missing from this excerpt;
        the orphaned except clauses below belong to them.
        """
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determing whether debugging is enabled")
        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            self.logger.error("Failed to determine cleanup threshhold levels")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n---------END ERROR MESSAGE---------\n")
            self.logger.error("Using factory defaults of 80%, 90% and 95%")
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        self._remedialCleanup = cleanup
        self._warningLevel = warn
        self._criticalLevel = crit
        self._emergencyLevel = emer
        self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            self.logger.error("Can't determine whether to keep empty snapshots")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n---------END ERROR MESSAGE---------")
            self.logger.error("Assuming default value: False")
            self._keepEmpties = False
        # Previously, the ":" character was used as the separator
        # character for datestamps in snapshot labels. Windows
        # filesystems such as CIFS and FAT choke on this character so
        # now we use a user definable separator value, with a default
        # value of "_".
        # We need to check for both the old and new format when looking for
        # snapshots, hence the [:%s] character class in the prefix regex.
        self._separator = self._smf.get_separator()
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)
        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                util.debug("Ignoring faulted Zpool: %s\n" \
            self._zpools.append(zpool)
            util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            self.logger.error("Could not list Zpools")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message
    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """
        # NOTE(review): the try: line preceding the schedule queries is
        # missing from this excerpt.
        _defaultSchedules = autosnapsmf.get_default_schedules()
        _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                                "Details:\n" + str(message)
        # Now set it in stone.
        self._defaultSchedules = tuple(_defaultSchedules)
        self._customSchedules = tuple(_customSchedules)
        # Build the combined schedule tuple from default + custom schedules
        _defaultSchedules.extend(_customSchedules)
        self._allSchedules = tuple(_defaultSchedules)
        # Reset per-schedule bookkeeping: last/next snapshot timestamps
        # and the number of snapshots to keep.
        for schedule,i,p,keep in self._allSchedules:
            self._last[schedule] = 0
            self._next[schedule] = 0
            self._keep[schedule] = keep
    def _update_schedules(self):
        """Recalculate the next due timestamp for every schedule.

        NOTE(review): this excerpt is missing a number of original
        lines (try: statements, condition lines and continuations of
        debug/error calls); the code below is reproduced as-is.
        """
        idx = 1 # Used to index subsets for schedule overlap calculation
        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \
            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
            if self._last[schedule] == 0:
                snaps = self._datasets.list_snapshots("%s%s" % \
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    self.logger.error("Failed to list snapshots during schedule update")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message
                util.debug("Last %s snapshot was: %s" % \
                           (schedule, snaps[-1][0]), \
                # snaps entries are (name, creation-time) tuples.
                self._last[schedule] = snaps[-1][1]
            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                totalinterval = intervals[interval] * period
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                self.logger.error(schedule + \
                                  " schedule has invalid interval: " + \
                #Propagate up to thread's run() method
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                util.debug("Recalculating %s schedule" % (schedule), \
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period >= than 1 calendar year.
                year = snap_tm.tm_year
                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                # Clamp the day-of-month when the target month is
                # shorter than the source month.
                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                tm =(year, mon, mday, \
                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]
            self._next[schedule] = last + totalinterval
        # --- body of _next_due() (its def line is not in this excerpt) ---
        # Returns (earliest, schedule): the soonest due timestamp across
        # all schedules and the name of the schedule it belongs to.
        # NOTE(review): the lines computing `due` and the surrounding
        # if-conditions are missing from this excerpt.
        now = long(time.time())
        for s,i,p,k in self._defaultSchedules:
            #Default Schedule - so break out at the first
            #schedule that is overdue. The subordinate schedules
            #will re-adjust afterwards.
            earliest,schedule = due,s
        elif earliest != None:
            earliest,schedule = due,s
        else: #FIXME better optimisation with above condition
            earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            earliest,schedule = due,s
        else: #FIXME better optimisation with above condition
            earliest,schedule = due,s
        return earliest,schedule
    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due();
            self._refreshLock.release()
        # NOTE(review): lines between the loop and the debug call below
        # (including, presumably, the final return of `next`) are
        # missing from this excerpt.
        dt = datetime.datetime.fromtimestamp(next)
        util.debug("Next snapshot is %s due at: %s" % \
                   (schedule, dt.isoformat()), \
    def _take_snapshots(self, schedule):
        """Create the auto-snapshot set for `schedule` and purge expired
        snapshots afterwards.

        NOTE(review): the try: line before the create call and the
        trailing return (presumably returning `label`, given the caller
        in _check_snapshots) are missing from this excerpt.
        """
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            self.logger.error("Failed to create snapshots for schedule: %s" \
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm;
        self._perform_purge(schedule)
    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # NOTE(review): several original lines (try: statements,
        # snapshot.destroy() calls and call continuations) are missing
        # from this excerpt.
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that the overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour however
        # and the data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.
        snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
        except RuntimeError,message:
            self.logger.error("Failed to list snapshots during snapshot cleanup")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message
        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
            for snapname in snaps:
                snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    self.logger.error(str(message))
                    # Not fatal, just skip to the next snapshot
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                except RuntimeError,message:
                    self.logger.error("Failed to destroy snapshot: " +
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    # Propagate exception so thread can exit
                    raise RuntimeError,message
                remainingsnaps.remove(snapname)
            except RuntimeError,message:
                self.logger.error("Can not determine used size of: " + \
                self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                #Propagate the exception to the thread's run() method
                raise RuntimeError,message
        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # children too (continuation missing from this excerpt).
        target = len(remainingsnaps) - self._keep[schedule]
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                       remainingsnaps[counter],
            snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                self.logger.error(str(message))
                # Not fatal, just skip to the next snapshot
            except RuntimeError,message:
                self.logger.error("Failed to destroy snapshot: " +
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        # NOTE(review): the try: line preceding the loop below is
        # missing from this excerpt.
        for name in self._datasets.list_auto_snapshot_sets(schedule):
            dataset = zfs.ReadWritableDataset(name)
            self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            self.logger.error("Error listing datasets during " + \
                              "removal of expired snapshots")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message
    def _needs_cleanup(self):
        """Decide whether any monitored zpool requires a remedial
        cleanup (capacity above the warning level).

        NOTE(review): the return statements and some try:/condition
        lines are missing from this excerpt.
        """
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
        for zpool in self._zpools:
            if zpool.get_capacity() > self._warningLevel:
                # Before getting into a panic, determine if the pool
                # is one we actually take snapshots on, by checking
                # for one of the "auto-snapshot:<schedule> tags. Not
                # super fast, but it only happens under exceptional
                # circumstances of a zpool nearing its capacity.
                for sched in self._allSchedules:
                    sets = zpool.list_auto_snapshot_sets(sched[0])
                    util.debug("%s needs a cleanup" \
                    self._cleanupLock.release()
            except RuntimeError, message:
                self.logger.error("Error checking zpool capacity of: " + \
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError,message
        self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
    def _perform_cleanup(self):
        """Run escalating warning/critical/emergency cleanups on each
        pool, recording a per-pool severity in self._poolstatus.

        NOTE(review): try: lines, an early return and parts of the
        debug/error calls are missing from this excerpt.
        """
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
        self._destroyedsnaps = []
        for zpool in self._zpools:
            self._poolstatus[zpool.name] = 0
            capacity = zpool.get_capacity()
            # Escalate through the cleanup levels, re-reading capacity
            # after each pass to see whether the next level is needed.
            if capacity > self._warningLevel:
                self._run_warning_cleanup(zpool)
                self._poolstatus[zpool.name] = 1
                capacity = zpool.get_capacity()
            if capacity > self._criticalLevel:
                self._run_critical_cleanup(zpool)
                self._poolstatus[zpool.name] = 2
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 3
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 4
        # This also catches exceptions thrown from _run_<level>_cleanup()
        # and _run_cleanup() in methods called by _perform_cleanup()
        except RuntimeError,message:
            self.logger.error("Remedial space cleanup failed because " + \
                              "of failure to determinecapacity of: " + \
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            self._cleanupLock.release()
            # Propagate up to thread's run() method.
            raise RuntimeError,message
        # Bad - there's no more snapshots left and nothing
        # left to delete. We don't disable the service since
        # it will permit self recovery and snapshot
        # retention when space becomes available on
        # the pool (hopefully).
        util.debug("%s pool status after cleanup:" \
        util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                self.logger.error("\t%s" % snap)
        self._cleanupLock.release()
    def _run_warning_cleanup(self, zpool):
        """Warning level: destroy daily, then hourly snapshots until the
        pool drops below the warning threshold."""
        # NOTE(review): the continuation of this debug call is missing
        # from this excerpt.
        util.debug("Performing warning level cleanup on %s" % \
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)
    def _run_critical_cleanup(self, zpool):
        """Critical level: destroy weekly, then daily, then hourly
        snapshots until the pool drops below the critical threshold."""
        # NOTE(review): the continuation of this debug call is missing
        # from this excerpt.
        util.debug("Performing critical level cleanup on %s" % \
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)
    def _run_emergency_cleanup(self, zpool):
        """Emergency level: destroy monthly through frequent snapshots,
        then custom-schedule snapshots as a last resort, until the pool
        drops below the emergency threshold."""
        # NOTE(review): the continuation of this debug call and two
        # lines near the custom-schedule loop are missing from this
        # excerpt.
        util.debug("Performing emergency level cleanup on %s" % \
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
            self._run_cleanup(zpool, schedule, self._emergencyLevel)
    def _run_cleanup(self, zpool, schedule, threshold):
        """Destroy snapshots of `schedule` on `zpool`, oldest first,
        until capacity drops below `threshold` or no candidates remain.
        Cloned snapshots are never destroyed.

        NOTE(review): try: lines, the sort of the snapshot list and the
        snapshot.destroy() call are missing from this excerpt.
        """
        clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            self.logger.error("Error (non-fatal) listing cloned snapshots" +
                              " while recovering pool capacity")
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        snapshots = [s for s,t in \
                     zpool.list_snapshots("%s%s" \
                                          % (self._prefix,schedule)) \
                     if not s in clonedsnaps]
        except RuntimeError,message:
            self.logger.error("Error listing snapshots" +
                              " while recovering pool capacity")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message
        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                "No more %s snapshots left" \
            """This is not an exact science. Deleteing a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this, each snapshot set
            at a time and observe the before and after results. Perhaps
            better way exists...."""
            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # parent and child snapshots (continuation missing here).
            # On the positive side there should be fewer snapshots and they
            # will mostly non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            util.debug("Destroying %s" % snapname, self.verbose)
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                self.logger.error("Warning: Cleanup failed to destroy: %s" % \
                self.logger.error("Details:\n%s" % (str(message)))
            self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
    def _send_to_syslog(self):
        """Log a per-pool capacity/cleanup summary via self.logger
        (routed to syslog by the handler installed in main).

        NOTE(review): the if/elif lines comparing `status` against the
        severity levels, and the first message's format string, are
        missing from this excerpt.
        """
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            self.logger.critical( \
                "All automatic snapshots were destroyed" \
                % (zpool.name, self._emergencyLevel))
            "%s exceeded %d%% capacity. " \
            "Automatic snapshots over 1 hour old were destroyed" \
            % (zpool.name, self._emergencyLevel))
            self.logger.critical( \
                "%s exceeded %d%% capacity. " \
                "Weekly, hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._criticalLevel))
            self.logger.warning( \
                "%s exceeded %d%% capacity. " \
                "Hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._warningLevel))
        if len(self._destroyedsnaps) > 0:
            self.logger.warning( \
                "%d automatic snapshots were destroyed" \
                % len(self._destroyedsnaps))
    def _send_notification(self):
        """Emit a D-Bus capacity_exceeded signal for the worst affected
        pool.

        NOTE(review): the initialisation of worstpool/worststatus and
        the first worststatus condition line are missing from this
        excerpt.
        """
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
        #FIXME make the various levels indexible
        self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.
def monitor_threads(snapthread):
    """Watchdog callback (gobject timer): exit the daemon with a fatal
    SMF status if the snapshot thread has died.

    NOTE(review): the body of the is_alive() branch (presumably
    `return True` to keep the timer running) is missing from this
    excerpt.
    """
    logger = logging.getLogger('time-slider')
    if snapthread.is_alive():
    logger.error("Snapshot monitor thread exited.")
    if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
        # FIXME - it would be nicer to mark the service as degraded than
        # go into maintenance state for some situations such as a
        # particular snapshot schedule failing.
        # But for now SMF does not implement this feature. But if/when it
        # does it's better to use svcadm to put the service into the
        # correct state since the daemon shouldn't exit when transitioning
        # to a degraded state.
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        # os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        # os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    logger.error("Snapshot monitor thread exited abnormally")
    logger.error("Exit code: %d" % (snapthread.exitCode))
    #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
    # os.getenv("SMF_FMRI")])
    sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Signal handler installed in the parent while daemonising.

    SIGUSR1 means the child started up fine; SIGCHLD means the child
    died prematurely; SIGALRM means startup timed out. Any other
    signal is ignored.
    """
    if signum == signal.SIGUSR1:
        # Child reported successful startup.
        sys.exit(smf.SMF_EXIT_OK)
    if signum in (signal.SIGCHLD, signal.SIGALRM):
        # Child exited early, or never signalled us in time.
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
# Default daemon parameters.
# File mode creation mask of the daemon.
# Default working directory for the daemon.
# Default maximum for the number of available file descriptors.
# NOTE(review): the constant definitions these comments describe, the
# def line of the daemonize() function below, and its fork/setsid
# logic are missing from this excerpt.
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    raise Exception, "%s [%d]" % (e.strerror, e.errno)
    #Reset signals that we set to trap in parent
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)
    #Wait for the child to give the OK or otherwise.
# --- body of main() (its def line is not in this excerpt) ---
# NOTE(review): several original lines are missing here: try:
# statements, the foreground/daemon branch around the handler setup,
# the `if args.configdump:` test, and the mainloop.run() call.
parser = argparse.ArgumentParser()
parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
parser.add_argument('--configdump', action='store_true', help='Dump default values in config file format', default=False)
args, _ = parser.parse_known_args()
# Dedicated logger; handler levels do the filtering.
logger = logging.getLogger('time-slider')
logger.setLevel(logging.DEBUG)
# Foreground mode logs plain messages to stderr; daemon mode logs to
# syslog via /dev/log with a timestamped format.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(message)s'))
handler = SysLogHandler(address='/dev/log')
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s', '%b %d %H:%M:%S time-sliderd:'))
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
timesliderconfig.configdump()
sys.exit(smf.SMF_EXIT_OK)
timesliderconfig.configfile = args.config
# Daemonise the service.
if not args.foreground:
# The user security attributes checked are the following:
# Note that UID == 0 will match any profile search so
# no need to check it explicitly.
rbacp = RBACprofile()
if rbacp.has_profile("ZFS File System Management"):
    gobject.threads_init()
    # Tell dbus to use the gobject mainloop for async ops
    dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
    dbus.mainloop.glib.threads_init()
    # Register a bus name with the system dbus daemon
    systemBus = dbus.SystemBus()
    name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
    # Create and start the snapshot manager. Takes care of
    # auto snapshotting service and auto cleanup.
    snapshot = SnapshotManager(systemBus)
    gobject.timeout_add(2000, monitor_threads, snapshot)
    mainloop = gobject.MainLoop()
    except KeyboardInterrupt:
        sys.exit(smf.SMF_EXIT_OK)
    "%s has insufficient privileges to run time-sliderd!" \
    sys.exit(smf.SMF_EXIT_ERR_PERM)
sys.exit(smf.SMF_EXIT_OK)