5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
41 import dbus.mainloop.glib
46 import time_slider.linux.timeslidersmf as timeslidersmf
47 import time_slider.linux.autosnapsmf as autosnapsmf
49 from time_slider.linux.rbac import RBACprofile
52 import time_slider.linux.timesliderconfig as timesliderconfig
# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level
# Seconds per schedule-interval unit, keyed by the interval names used in
# the auto-snapshot SMF instances.  _WEEK/_DAY/_HOUR/_MINUTE are defined
# earlier in the file (not visible in this listing) -- presumably the usual
# second counts; verify upstream.
intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
class SnapshotManager(threading.Thread):
    # Daemon worker thread: drives scheduled ZFS auto-snapshots and, when
    # pools fill up, remedial cleanup.
    # NOTE(review): this listing is missing many original source lines;
    # gaps are marked with "[listing gap]" comments below.

    def __init__(self, bus):
        """Initialise locks, SMF/dbus state and signal handling.

        bus -- the D-Bus connection handed to dbussvc.AutoSnap.
        """
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        # [listing gap: original line(s) missing here]
        self._lastCleanupCheck = 0;
        # [listing gap]
        self._destroyedsnaps = []
        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        # [listing gap: a `try:` from the original file is missing here]
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")
        # [listing gap: continuation/close of the dbus constructor call missing]
        self._dbus = dbussvc.AutoSnap(bus,
            '/org/opensolaris/TimeSlider/autosnap',
        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)
        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)
        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)
        # --- interior of run(); its `def run(self):` line is missing from
        # this listing, as are the main service loop and `try:` headers
        # that the dangling `except` clauses below belong to ---
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            # [listing gap]
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
        # [listing gap: the service loop / `try:` opens here in the original]
        # First check and, if necessary, perform any remedial cleanup.
        # This is best done before creating any new snapshots which may
        # otherwise get immediately gobbled up by the remedial cleanup.
        if self._needs_cleanup() == True:
            self._perform_cleanup()
            # Check to see if cleanup actually deleted anything before
            # notifying the user. Avoids the popup appearing continuously
            if len(self._destroyedsnaps) > 0:
                self._send_notification()
                self._send_to_syslog()
        nexttime = self._check_snapshots()
        # Overdue snapshots are already taken automatically
        # inside _check_snapshots() so nexttime should never be
        # < 0. It can be None however, which is fine since it
        # will cause the scheduler thread to sleep indefinitely
        # or until a SIGHUP is caught.
        # [listing gap: `if nexttime != None:` branch header presumably missing]
        util.debug("Waiting until " + str (nexttime), self.verbose)
        waittime = nexttime - long(time.time())
        # We took too long and missed a snapshot, so break out
        # and catch up on it the next time through the loop
        # waittime could be None if no auto-snap schedules are online
        self._conditionLock.acquire()
        util.debug("Waiting %d seconds" % (waittime), self.verbose)
        self._conditionLock.wait(waittime)
        else: #None. Just wait a while to check for cleanups.
            # [listing gap: continuation of this debug call missing]
            util.debug("No auto-snapshot schedules online.", \
            self._conditionLock.wait(_MINUTE * 15)
        except OSError, message:
            # [listing gap: string continuations of these writes missing]
            sys.stderr.write("Caught OSError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
        except RuntimeError,message:
            sys.stderr.write("Caught RuntimeError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
    def _signalled(self, signum, frame):
        """SIGHUP handler: wake the scheduler thread so it refreshes its
        schedules immediately (SMF sends SIGHUP on `svcadm refresh`)."""
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                # [listing gap: a refresh is already running; the original
                # presumably returns early here -- verify upstream]
            self._refreshLock.release()
            # Wake up the run() loop sleeping in _conditionLock.wait().
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()
        # --- interior of refresh(); its `def` line and the docstring's
        # triple-quote delimiters are missing from this listing ---
        Checks if defined snapshot schedules are out
        of date and rebuilds and updates if necessary
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            # [listing gap: presumably clears self._stale here -- verify]
        self._refreshLock.release()
    def _configure_svc_props(self):
        """Read service configuration from SMF (verbosity, remedial-cleanup
        switch, warning/critical/emergency capacity levels, keep-empties,
        label separator) and enumerate healthy zpools into self._zpools.
        Falls back to factory defaults (80/90/95%) if SMF reads fail."""
        # [listing gap: `try:` missing]
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")
        # [listing gap: `try:` for the threshold reads missing]
        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Failed to determine cleanup threshhold levels\n")
            sys.stderr.write("Details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        # [listing gap: `else:` presumably missing before the assignments below]
        self._remedialCleanup = cleanup
        self._warningLevel = warn
        self._criticalLevel = crit
        self._emergencyLevel = emer
        # [listing gap: `try:` missing]
        self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            sys.stderr.write("Can't determine whether to keep empty snapshots\n")
            sys.stderr.write("Details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Assuming default value: False\n")
            self._keepEmpties = False
        # Previously, the ":" character was used as a
        # separator character for datestamps. Windows filesystems such as
        # CIFS and FAT choke on this character so now we use a user definable
        # separator value, with a default value of "_"
        # We need to check for both the old and new format when looking for
        # [listing gap: rest of this comment missing]
        self._separator = self._smf.get_separator()
        # Regex-style prefix matching both old (":") and new separators.
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)
        # [listing gap: self._zpools reset and `try:` presumably missing]
        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                # [listing gap: continuation of this debug call and a
                # `continue`/`else` presumably missing]
                util.debug("Ignoring faulted Zpool: %s\n" \
            self._zpools.append(zpool)
            util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Could not list Zpools\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message
    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        # [listing gap: remainder of docstring, its closing quotes, state
        # resets and the `try:` are missing from this listing]
        _defaultSchedules = autosnapsmf.get_default_schedules()
        _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                "Details:\n" + str(message)
        # Now set it in stone.
        self._defaultSchedules = tuple(_defaultSchedules)
        self._customSchedules = tuple(_customSchedules)
        # Build the combined schedule tuple from default + custom schedules
        _defaultSchedules.extend(_customSchedules)
        self._allSchedules = tuple(_defaultSchedules)
        # Each schedule tuple is (name, interval, period, keep); seed the
        # per-schedule bookkeeping dicts.
        for schedule,i,p,keep in self._allSchedules:
            self._last[schedule] = 0
            self._next[schedule] = 0
            self._keep[schedule] = keep
    def _update_schedules(self):
        """Recompute self._next[schedule] for every schedule, taking account
        of overlap between the ordered default schedules and the variable
        length of calendar months.  (review: listing is incomplete.)"""
        # [listing gap]
        idx = 1 # Used to index subsets for schedule overlap calculation
        # [listing gap]
        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                # [listing gap: continuation of this debug call and a
                # `continue` presumably missing]
                util.debug("Short circuiting %s recalculation" \
            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get it's creation timestamp.
            if self._last[schedule] == 0:
                # [listing gap: `try:` and snapshot-pattern continuation missing]
                snaps = self._datasets.list_snapshots("%s%s" % \
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    sys.stderr.write("Failed to list snapshots during schedule update\n")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message
                # [listing gap: guard for empty `snaps` presumably missing]
                util.debug("Last %s snapshot was: %s" % \
                    (schedule, snaps[-1][0]), \
                # snaps entries are (name, creation-time) pairs.
                self._last[schedule] = snaps[-1][1]
            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                # [listing gap: `try:` missing]
                totalinterval = intervals[interval] * period
                # [listing gap: `except KeyError:` presumably missing]
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                sys.stderr.write(schedule + \
                    " schedule has invalid interval: " + \
                    "'%s\'\n" % interval)
                #Propagate up to thread's run() method
                # [listing gap: the `raise` is missing]
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
                    # [listing gap: idx increment presumably missing]
            else: # interval == "months"
                if self._next[schedule] > last:
                    # [listing gap: continuation and `continue` missing]
                    util.debug("Short circuiting " + \
                util.debug("Recalculating %s schedule" % (schedule), \
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period >= than 1 calender year.
                year = snap_tm.tm_year
                # [listing gap: year/month carry arithmetic missing]
                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                # [listing gap]
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                # [listing gap]
                # Clamp the day-of-month when the target month is shorter
                # than the source month (e.g. Jan 31 + 1 month).
                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                    # [listing gap: mday clamp presumably missing]
                tm =(year, mon, mday, \
                    snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                # [listing gap: remaining struct_time fields missing]
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]
            # [listing gap]
            self._next[schedule] = last + totalinterval
        # --- interior of _next_due(); its `def` line, docstring and the
        # per-schedule `due` computation are missing from this listing.
        # Returns (earliest-due-timestamp-or-None, schedule-name). ---
        now = long(time.time())
        # [listing gap: earliest/schedule initialisation missing]
        for s,i,p,k in self._defaultSchedules:
            # [listing gap: `due` computation and overdue test missing]
                #Default Schedule - so break out at the first
                #schedule that is overdue. The subordinate schedules
                #will re-adjust afterwards.
                earliest,schedule = due,s
                # [listing gap: `break` presumably missing]
            elif earliest != None:
                # [listing gap: `if due < earliest:` presumably missing]
                earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            # [listing gap: `due` computation and comparison missing]
                earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        return earliest,schedule
    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        # [listing gap: docstring closing quotes missing]
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        # Catch up on every overdue schedule before going back to sleep.
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due();
            self._refreshLock.release()
            # [listing gap: `now` refresh presumably missing]
            dt = datetime.datetime.fromtimestamp(next)
            util.debug("Next snapshot is %s due at: %s" % \
                (schedule, dt.isoformat()), \
        # [listing gap: debug continuation and `return next` presumably missing]
    def _take_snapshots(self, schedule):
        """Create the recursive auto-snapshot set for `schedule`, record the
        time taken, then purge expired snapshots for that schedule."""
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
            (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
            datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        # [listing gap: `try:` missing]
        self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            # [listing gap: string continuation missing]
            sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm;
        self._perform_purge(schedule)
        # [listing gap: presumably `return label` -- verify upstream]
    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour and the
        # data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.
        # [listing gap: `try:` missing]
        snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
        except RuntimeError,message:
            sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message
        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
                # [listing gap: pop of newest snapshot presumably missing]
            for snapname in snaps:
                # [listing gap: `try:` missing]
                snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    sys.stderr.write(str(message))
                    # Not fatal, just skip to the next snapshot
                    # [listing gap: `continue` presumably missing]
                # [listing gap: `try:` missing]
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                    # [listing gap: destroy call inside nested `try:` missing]
                    except RuntimeError,message:
                        # [listing gap: string continuation missing]
                        sys.stderr.write("Failed to destroy snapshot: " +
                        self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                        # Propagate exception so thread can exit
                        raise RuntimeError,message
                    remainingsnaps.remove(snapname)
                except RuntimeError,message:
                    sys.stderr.write("Can not determine used size of: " + \
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    #Propagate the exception to the thread run() method
                    raise RuntimeError,message
        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # [listing gap: rest of comment and `counter` initialisation missing]
        target = len(remainingsnaps) - self._keep[schedule]
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                remainingsnaps[counter],
            # [listing gap: `try:` missing]
            snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                sys.stderr.write(str(message))
                # Not fatal, just skip to the next snapshot
                # [listing gap: `continue`, destroy call and counter increment
                # presumably missing]
            except RuntimeError,message:
                sys.stderr.write("Failed to destroy snapshot: " +
                    snapshot.name + "\n")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        # [listing gap: `try:` missing]
        for name in self._datasets.list_auto_snapshot_sets(schedule):
            dataset = zfs.ReadWritableDataset(name)
            self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            sys.stderr.write("Error listing datasets during " + \
                "removal of expired snapshots\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message
    def _needs_cleanup(self):
        """Decide whether remedial cleanup should run: only when enabled,
        not already running, rate-limited to one check per 15 minutes, and
        some snapshotted zpool exceeds the warning capacity level."""
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
            # [listing gap: `return False` presumably missing]
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
            # [listing gap: `return False` presumably missing]
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
            # [listing gap: release + `return False` presumably missing]
        for zpool in self._zpools:
            # [listing gap: `try:` missing]
            if zpool.get_capacity() > self._warningLevel:
                # Before getting into a panic, determine if the pool
                # is one we actually take snapshots on, by checking
                # for one of the "auto-snapshot:<schedule> tags. Not
                # super fast, but it only happens under exceptional
                # circumstances of a zpool nearing it's capacity.
                for sched in self._allSchedules:
                    sets = zpool.list_auto_snapshot_sets(sched[0])
                    # [listing gap: `if len(sets) > 0:` and debug continuation
                    # presumably missing]
                    util.debug("%s needs a cleanup" \
                    self._cleanupLock.release()
                    # [listing gap: `return True` presumably missing]
            except RuntimeError, message:
                # [listing gap: string continuation missing]
                sys.stderr.write("Error checking zpool capacity of: " + \
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError,message
        self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
        # [listing gap: `return False` presumably missing]
    def _perform_cleanup(self):
        """Escalating per-pool cleanup: warning -> critical -> emergency,
        re-reading capacity after each pass.  Records a severity status
        (0-4) per pool in self._poolstatus and all destroyed snapshot
        names in self._destroyedsnaps."""
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
            # [listing gap: `return` presumably missing]
        self._destroyedsnaps = []
        for zpool in self._zpools:
            # [listing gap: `try:` missing]
            self._poolstatus[zpool.name] = 0
            capacity = zpool.get_capacity()
            if capacity > self._warningLevel:
                self._run_warning_cleanup(zpool)
                self._poolstatus[zpool.name] = 1
                capacity = zpool.get_capacity()
            if capacity > self._criticalLevel:
                self._run_critical_cleanup(zpool)
                self._poolstatus[zpool.name] = 2
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 3
                capacity = zpool.get_capacity()
            # Second emergency pass; status 4 means even a full emergency
            # cleanup could not get below the emergency level.
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from _run_<level>_cleanup()
            # and _run_cleanup() in methods called by _perform_cleanup()
            except RuntimeError,message:
                # NOTE(review): missing space in "determinecapacity" in this
                # runtime message -- fix upstream, not in a doc-only pass.
                sys.stderr.write("Remedial space cleanup failed because " + \
                    "of failure to determinecapacity of: " + \
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError,message
            # Bad - there's no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            # [listing gap: debug continuation missing]
            util.debug("%s pool status after cleanup:" \
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
            % len(self._destroyedsnaps), \
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                sys.stderr.write("\t%s\n" % snap)
        self._cleanupLock.release()
    def _run_warning_cleanup(self, zpool):
        """Warning-level cleanup: destroy expired daily, then hourly,
        snapshots until the pool drops below the warning threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing warning level cleanup on %s" % \
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)
    def _run_critical_cleanup(self, zpool):
        """Critical-level cleanup: destroy weekly, then daily, then hourly
        snapshots until the pool drops below the critical threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing critical level cleanup on %s" % \
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)
    def _run_emergency_cleanup(self, zpool):
        """Emergency-level cleanup: destroy monthly, weekly, daily, hourly,
        frequent and finally custom-schedule snapshots until the pool drops
        below the emergency threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing emergency level cleanup on %s" % \
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snaphots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
                # [listing gap: `break` presumably missing]
            self._run_cleanup(zpool, schedule, self._emergencyLevel)
    def _run_cleanup(self, zpool, schedule, threshold):
        """Destroy non-cloned auto-snapshots of `schedule` on `zpool`,
        oldest first, until capacity falls to `threshold` or no candidate
        snapshots remain.  Destroyed names accumulate in
        self._destroyedsnaps."""
        # [listing gap: `try:` missing]
        clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
                " while recovering pool capacity\n")
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        # [listing gap: `try:` missing]
        snapshots = [s for s,t in \
            zpool.list_snapshots("%s%s" \
                % (self._prefix,schedule)) \
            if not s in clonedsnaps]
        # [listing gap: reverse of the list presumably missing]
        except RuntimeError,message:
            sys.stderr.write("Error listing snapshots" +
                " while recovering pool capacity\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message
        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                syslog.syslog(syslog.LOG_NOTICE,
                    "No more %s snapshots left" \
                # [listing gap: format continuation and `return` presumably
                # missing]
            """This is not an exact science. Deleteing a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this, each snapshot set
            at a time and observe the before and after results. Perhaps
            better way exists...."""
            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # [listing gap: rest of this comment missing]
            # On the positive side there should be fewer snapshots and they
            # will mostly non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            # [listing gap]
            util.debug("Destroying %s" % snapname, self.verbose)
            # [listing gap: `try:` and the destroy call missing]
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                # [listing gap: format continuation missing]
                sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
                sys.stderr.write("Details:\n%s\n" % (str(message)))
            # [listing gap: `else:` presumably missing]
            self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
            # [listing gap: a sleep call presumably missing]
    def _send_to_syslog(self):
        """Log one message per pool describing the severity of the cleanup
        performed, plus a summary count of destroyed snapshots.
        NOTE(review): the `if status == N:` dispatch lines are missing from
        this listing; the severity branches below are their bodies."""
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # [listing gap: presumably `if status == 4:`]
            syslog.syslog(syslog.LOG_EMERG,
                "%s is over %d%% capacity. " \
                "All automatic snapshots were destroyed" \
                % (zpool.name, self._emergencyLevel))
            # [listing gap: presumably `elif status == 3:`]
            syslog.syslog(syslog.LOG_ALERT,
                "%s exceeded %d%% capacity. " \
                "Automatic snapshots over 1 hour old were destroyed" \
                % (zpool.name, self._emergencyLevel))
            # [listing gap: presumably `elif status == 2:`]
            syslog.syslog(syslog.LOG_CRIT,
                "%s exceeded %d%% capacity. " \
                "Weekly, hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._criticalLevel))
            # [listing gap: presumably `elif status == 1:`]
            syslog.syslog(syslog.LOG_WARNING,
                "%s exceeded %d%% capacity. " \
                "Hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._warningLevel))
        if len(self._destroyedsnaps) > 0:
            syslog.syslog(syslog.LOG_NOTICE,
                "%d automatic snapshots were destroyed" \
                % len(self._destroyedsnaps))
    def _send_notification(self):
        """Emit a D-Bus capacity_exceeded signal for the pool with the worst
        post-cleanup status, so the GUI can notify the user."""
        # [listing gap: worstpool/worststatus initialisation missing]
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
                # [listing gap: worststatus update presumably missing]
        #FIXME make the various levels indexible
        # [listing gap: presumably `if worststatus == 4:`]
        self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.
def monitor_threads(snapthread):
    """gobject timeout callback: keep running while the snapshot thread is
    alive; when it has died, exit the daemon with an SMF error status so
    svc.startd restarts or faults the service."""
    if snapthread.is_alive():
        # [listing gap: `return True` presumably missing -- a True return
        # keeps the gobject timeout installed]
    # [listing gap: `else:` presumably missing]
    sys.stderr.write("Snapshot monitor thread exited.\n")
    if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
        # FIXME - it would be nicer to mark the service as degraded than
        # go into maintenance state for some situations such as a
        # particular snapshot schedule failing.
        # But for now SMF does not implement this feature. But if/when it
        # does it's better to use svcadm to put the service into the
        # correct state since the daemon shouldn't exit when transitioning
        # to a degraded state.
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    # [listing gap: `else:` presumably missing]
    sys.stderr.write("Snapshot monitor thread exited abnormally\n")
    sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
    #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
    #    os.getenv("SMF_FMRI")])
    sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Signal handler installed in the parent while waiting for the
    daemonising child: SIGUSR1 means the child came up OK; SIGCHLD or
    SIGALRM mean it died or timed out.  Exits the parent accordingly;
    any other signal is ignored."""
    # Map each expected signal to the parent's exit status.
    exit_codes = {
        signal.SIGUSR1: smf.SMF_EXIT_OK,
        signal.SIGCHLD: smf.SMF_EXIT_ERR_FATAL,
        signal.SIGALRM: smf.SMF_EXIT_ERR_FATAL,
    }
    if signum in exit_codes:
        sys.exit(exit_codes[signum])
# Default daemon parameters.
# File mode creation mask of the daemon.
# [listing gap: the UMASK constant is missing from this listing]
# Default working directory for the daemon.
# [listing gap: the WORKDIR constant is missing]
# Default maximum for the number of available file descriptors.
# [listing gap: the MAXFD constant and the `def daemonize():` line plus the
# docstring's opening quotes are missing]
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    # [listing gap: docstring closing quotes missing]
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    # [listing gap: `try: pid = os.fork() except OSError, e:` presumably
    # missing before this raise]
    raise Exception, "%s [%d]" % (e.strerror, e.errno)
    # [listing gap: child/parent branch missing]
    #Reset signals that we set to trap in parent
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)
    # [listing gap]
    #Wait for the child to give the OK or otherwise.
    # [listing gap]
    # --- interior of main(); its `def` line is missing from this listing ---
    # Parse only the options we know; unknown arguments are tolerated
    # (parse_known_args) so SMF-supplied extras don't abort startup.
    parser = argparse.ArgumentParser()
    parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
    parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
    parser.add_argument('--configdump', action='store_true', help='Dump default values in config file format', default=False)
    args, _ = parser.parse_known_args()
    # [listing gap: `if args.configdump:` presumably missing]
    timesliderconfig.configdump()
    sys.exit(smf.SMF_EXIT_OK)
    timesliderconfig.configfile = args.config
    # Daemonise the service.
    if not args.foreground:
        # [listing gap: the daemonize() call is presumably missing]
    # The user security attributes checked are the following:
    # Note that UID == 0 will match any profile search so
    # no need to check it explicitly.
    syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):
        gobject.threads_init()
        # [listing gap]
        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()
        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        # [listing gap: thread start presumably missing]
        # Poll the worker thread every 2 seconds.
        gobject.timeout_add(2000, monitor_threads, snapshot)
        # [listing gap: `try:` missing]
        mainloop = gobject.MainLoop()
        # [listing gap: mainloop.run() presumably missing]
        except KeyboardInterrupt:
            # [listing gap]
            sys.exit(smf.SMF_EXIT_OK)
    # [listing gap: `else:` presumably missing]
    syslog.syslog(syslog.LOG_ERR,
        "%s has insufficient privileges to run time-sliderd!" \
    # [listing gap: format continuation missing]
    sys.exit(smf.SMF_EXIT_ERR_PERM)
# [listing gap: `if __name__ == '__main__':` and the main() call are
# presumably missing before this final exit]
sys.exit(smf.SMF_EXIT_OK)