5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
41 import dbus.mainloop.glib
46 import time_slider.linux.timeslidersmf as timeslidersmf
49 from time_slider.linux.rbac import RBACprofile
58 # Status codes for actual zpool capacity levels.
59 # These are relative to the SMF property defined
60 # levels for: user, warning and emergency levels
61 STATUS_OK = 0 # Below user specified threshold. Everything was OK
62 STATUS_WARNING = 1 # Above specified user threshold level
63 STATUS_CRITICAL = 2 # Above specified critical threshold level
64 STATUS_EMERGENCY = 3 # Above specified emergency threshold level
# Seconds per schedule interval unit; _WEEK/_DAY/_HOUR/_MINUTE are defined
# in an elided portion of this file. "months" is handled separately because
# month lengths vary (see _update_schedules).
66 intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
69 class SnapshotManager(threading.Thread):
# Scheduler thread for the time-slider service: takes ZFS auto-snapshots
# on SMF-defined schedules and performs escalating remedial cleanup when
# zpools approach capacity, notifying the user over D-Bus and syslog.
# NOTE(review): this listing is an elided excerpt of the original file;
# many lines are missing, so only comments have been added below and the
# surviving code lines are left untouched.

# Initialise locks and SMF configuration, connect the D-Bus service and
# plugin manager, exclude swap/dump volumes, then signal the parent
# process (SIGUSR1) that startup succeeded so it can daemonise.
71 def __init__(self, bus):
72 # Used to wake up the run() method prematurely in the event
73 # of a SIGHUP/SMF refresh
74 self._conditionLock = threading.Condition(threading.RLock())
75 # Used when schedules are being rebuilt or examined.
76 self._refreshLock = threading.Lock()
77 # Indicates that cleanup is in progress when locked
78 self._cleanupLock = threading.Lock()
79 self._datasets = zfs.Datasets()
80 # Indicates that schedules need to be rebuilt from scratch
82 self._lastCleanupCheck = 0;
85 self._destroyedsnaps = []
87 # This is also checked during the refresh() method but we need
88 # to know it sooner for instantiation of the PluginManager
89 self._smf = timeslidersmf.TimeSliderSMF()
91 self.verbose = self._smf.get_verbose()
92 except RuntimeError,message:
93 sys.stderr.write("Error determing whether debugging is enabled\n")
96 self._dbus = dbussvc.AutoSnap(bus,
97 '/org/opensolaris/TimeSlider/autosnap',
100 self._plugin = plugin.PluginManager(self.verbose)
101 self.exitCode = smf.SMF_EXIT_OK
104 # Seems we're up and running OK.
105 # Signal our parent so we can daemonise
106 os.kill(os.getppid(), signal.SIGUSR1)
108 # SMF/svc.startd sends SIGHUP to force a
109 # refresh of the daemon
110 signal.signal(signal.SIGHUP, self._signalled)
112 # Init done. Now initialise threading.
113 threading.Thread.__init__ (self)
117 # Deselect swap and dump volumes so they don't get snapshotted.
118 for vol in self._datasets.list_volumes():
119 name = vol.rsplit("/")
121 if (name[1] == "swap" or name[1] == "dump"):
122 util.debug("Auto excluding %s volume" % vol, self.verbose)
123 volume = zfs.Volume(vol)
124 volume.set_auto_snap(False)

# NOTE(review): the enclosing 'def run(self)' header is elided from this
# listing; the code below is the thread's main scheduling loop: cleanup
# check, snapshot check, then sleep on the condition variable until the
# next snapshot is due or a SIGHUP wakes us.
133 # First check and, if necessary, perform any remedial cleanup.
134 # This is best done before creating any new snapshots which may
135 # otherwise get immediately gobbled up by the remedial cleanup.
136 if self._needs_cleanup() == True:
137 self._perform_cleanup()
138 # Check to see if cleanup actually deleted anything before
139 # notifying the user. Avoids the popup appearing continuously
140 if len(self._destroyedsnaps) > 0:
141 self._send_notification()
142 self._send_to_syslog()
144 nexttime = self._check_snapshots()
145 # Overdue snapshots are already taken automatically
146 # inside _check_snapshots() so nexttime should never be
147 # < 0. It can be None however, which is fine since it
148 # will cause the scheduler thread to sleep indefinitely
149 # or until a SIGHUP is caught.
151 util.debug("Waiting until " + str (nexttime), self.verbose)
154 waittime = nexttime - long(time.time())
156 # We took too long and missed a snapshot, so break out
157 # and catch up on it the next time through the loop
159 # waittime could be None if no auto-snap schedules are online
160 self._conditionLock.acquire()
162 util.debug("Waiting %d seconds" % (waittime), self.verbose)
163 self._conditionLock.wait(waittime)
164 else: #None. Just wait a while to check for cleanups.
165 util.debug("No auto-snapshot schedules online.", \
167 self._conditionLock.wait(_MINUTE * 15)
169 except OSError, message:
170 sys.stderr.write("Caught OSError exception in snapshot" +
172 sys.stderr.write("Error details:\n" + \
173 "--------BEGIN ERROR MESSAGE--------\n" + \
175 "\n--------END ERROR MESSAGE--------\n")
176 self.exitCode = smf.SMF_EXIT_ERR_FATAL
179 except RuntimeError,message:
180 sys.stderr.write("Caught RuntimeError exception in snapshot" +
182 sys.stderr.write("Error details:\n" + \
183 "--------BEGIN ERROR MESSAGE--------\n" + \
185 "\n--------END ERROR MESSAGE--------\n")

# Signal handler: on SIGHUP mark the schedules stale (elided) and wake
# the run() loop via the condition variable so it refreshes immediately.
# The non-blocking acquire skips the refresh if one is already running.
189 def _signalled(self, signum, frame):
190 if signum == signal.SIGHUP:
191 if self._refreshLock.acquire(False) == False:
194 self._refreshLock.release()
195 self._conditionLock.acquire()
196 self._conditionLock.notify()
197 self._conditionLock.release()

# NOTE(review): the 'def _refresh(self)' header for the docstring below
# is elided from this listing.
201 Checks if defined snapshot schedules are out
202 of date and rebuilds and updates if necessary
204 self._refreshLock.acquire()
205 if self._stale == True:
206 self._configure_svc_props()
207 self._rebuild_schedules()
208 self._update_schedules()
209 self._plugin.refresh()
211 self._refreshLock.release()

# Read service configuration from SMF: verbosity, remedial-cleanup flag,
# the three capacity thresholds (falling back to factory defaults of
# 80%/90%/95% on error), keep-empties policy, the label separator, and
# the list of non-FAULTED zpools to monitor.
213 def _configure_svc_props(self):
215 self.verbose = self._smf.get_verbose()
216 except RuntimeError,message:
217 sys.stderr.write("Error determing whether debugging is enabled\n")
221 cleanup = self._smf.get_remedial_cleanup()
222 warn = self._smf.get_cleanup_level("warning")
223 util.debug("Warning level value is: %d%%" % warn, self.verbose)
224 crit = self._smf.get_cleanup_level("critical")
225 util.debug("Critical level value is: %d%%" % crit, self.verbose)
226 emer = self._smf.get_cleanup_level("emergency")
227 util.debug("Emergency level value is: %d%%" % emer, self.verbose)
228 except RuntimeError,message:
229 sys.stderr.write("Failed to determine cleanup threshhold levels\n")
230 sys.stderr.write("Details:\n" + \
231 "--------BEGIN ERROR MESSAGE--------\n" + \
233 "\n---------END ERROR MESSAGE---------\n")
234 sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
236 #FIXME - this would be an appropriate case to mark svc as degraded
237 self._remedialCleanup = True
238 self._warningLevel = 80
239 self._criticalLevel = 90
240 self._emergencyLevel = 95
242 self._remedialCleanup = cleanup
243 self._warningLevel = warn
244 self._criticalLevel = crit
245 self._emergencyLevel = emer
248 self._keepEmpties = self._smf.get_keep_empties()
249 except RuntimeError,message:
250 # Not fatal, just assume we delete them (default configuration)
251 sys.stderr.write("Can't determine whether to keep empty snapshots\n")
252 sys.stderr.write("Details:\n" + \
253 "--------BEGIN ERROR MESSAGE--------\n" + \
255 "\n---------END ERROR MESSAGE---------\n")
256 sys.stderr.write("Assuming default value: False\n")
257 self._keepEmpties = False
259 # Previously, snapshot labels used the ":" character as a
260 # separator character for datestamps. Windows filesystems such as
261 # CIFS and FAT choke on this character so now we use a user definable
262 # separator value, with a default value of "_"
263 # We need to check for both the old and new format when looking for
265 self._separator = self._smf.get_separator()
266 self._prefix = "%s[:%s]" \
267 % (autosnapsmf.SNAPLABELPREFIX, self._separator)
272 for poolname in zfs.list_zpools():
273 # Do not try to examine FAULTED pools
274 zpool = zfs.ZPool(poolname)
275 if zpool.health == "FAULTED":
276 util.debug("Ignoring faulted Zpool: %s\n" \
280 self._zpools.append(zpool)
281 util.debug(str(zpool), self.verbose)
282 except RuntimeError,message:
283 sys.stderr.write("Could not list Zpools\n")
284 self.exitCode = smf.SMF_EXIT_ERR_FATAL
285 # Propagate exception up to thread's run() method
286 raise RuntimeError,message

# Rebuild the immutable default/custom/combined schedule tuples from the
# SMF auto-snapshot instances and reset per-schedule last/next/keep maps.
289 def _rebuild_schedules(self):
291 Builds 2 lists of default and custom auto-snapshot SMF instances
299 _defaultSchedules = autosnapsmf.get_default_schedules()
300 _customSchedules = autosnapsmf.get_custom_schedules()
301 except RuntimeError,message:
302 self.exitCode = smf.SMF_EXIT_ERR_FATAL
303 raise RuntimeError, "Error reading SMF schedule instances\n" + \
304 "Details:\n" + str(message)
306 # Now set it in stone.
307 self._defaultSchedules = tuple(_defaultSchedules)
308 self._customSchedules = tuple(_customSchedules)
310 # Build the combined schedule tuple from default + custom schedules
311 _defaultSchedules.extend(_customSchedules)
312 self._allSchedules = tuple(_defaultSchedules)
313 for schedule,i,p,keep in self._allSchedules:
314 self._last[schedule] = 0
315 self._next[schedule] = 0
316 self._keep[schedule] = keep

# Recompute the next-due timestamp for every schedule, seeding unknown
# last-snapshot times from zfs, honouring overlap between the default
# schedules, and special-casing "months" whose length is non-constant.
318 def _update_schedules(self):
320 idx = 1 # Used to index subsets for schedule overlap calculation
323 for schedule,interval,period,keep in self._allSchedules:
324 # Shortcut if we've already processed this schedule and it's
325 # still up to date. Don't skip the default schedules though
326 # because overlap affects their scheduling
327 if [schedule,interval,period,keep] not in \
328 self._defaultSchedules and \
329 (self._next[schedule] > self._last[schedule]):
330 util.debug("Short circuiting %s recalculation" \
335 # If we don't have an internal timestamp for the given schedule
336 # ask zfs for the last snapshot and get its creation timestamp.
337 if self._last[schedule] == 0:
339 snaps = self._datasets.list_snapshots("%s%s" % \
342 except RuntimeError,message:
343 self.exitCode = smf.SMF_EXIT_ERR_FATAL
344 sys.stderr.write("Failed to list snapshots during schedule update\n")
345 #Propagate up to the thread's run() method
346 raise RuntimeError,message
349 util.debug("Last %s snapshot was: %s" % \
350 (schedule, snaps[-1][0]), \
352 self._last[schedule] = snaps[-1][1]
354 last = self._last[schedule]
355 if interval != "months": # months is non-constant. See below.
356 util.debug("Recalculating %s schedule" % (schedule), \
359 totalinterval = intervals[interval] * period
361 self.exitCode = smf.SMF_EXIT_ERR_CONFIG
362 sys.stderr.write(schedule + \
363 " schedule has invalid interval: " + \
364 "'%s\'\n" % interval)
365 #Propagate up to thread's run() method
367 if [schedule,interval,period,keep] in self._defaultSchedules:
368 # This is one of the default schedules so check for an
369 # overlap with one of the dominant schedules.
370 for s,i,p,k in self._defaultSchedules[:idx]:
371 last = max(last, self._last[s])
374 else: # interval == "months"
375 if self._next[schedule] > last:
376 util.debug("Short circuiting " + \
381 util.debug("Recalculating %s schedule" % (schedule), \
383 snap_tm = time.gmtime(self._last[schedule])
384 # Increment year if period >= than 1 calendar year.
385 year = snap_tm.tm_year
389 mon = (snap_tm.tm_mon + period) % 12
390 # Result of 0 actually means december.
393 # Account for period that spans calendar year boundary.
394 elif snap_tm.tm_mon + period > 12:
397 d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
398 d,dnewmon = calendar.monthrange(year, mon)
399 mday = snap_tm.tm_mday
# Clamp the day-of-month when the target month is shorter than the
# month the last snapshot was taken in.
400 if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
403 tm =(year, mon, mday, \
404 snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
406 newt = calendar.timegm(tm)
407 new_tm = time.gmtime(newt)
408 totalinterval = newt - self._last[schedule]
410 self._next[schedule] = last + totalinterval

# NOTE(review): the 'def _next_due(self)' header is elided from this
# listing; the code below returns the (timestamp, schedule) pair of the
# earliest due snapshot, default schedules taking precedence.
415 now = long(time.time())
417 for s,i,p,k in self._defaultSchedules:
420 #Default Schedule - so break out at the first
421 #schedule that is overdue. The subordinate schedules
422 #will re-adjust afterwards.
423 earliest,schedule = due,s
425 elif earliest != None:
427 earliest,schedule = due,s
428 else: #FIXME better optimisation with above condition
429 earliest,schedule = due,s
430 for s,i,p,k in self._customSchedules:
434 earliest,schedule = due,s
435 else: #FIXME better optimisation with above condition
436 earliest,schedule = due,s
437 return earliest,schedule

# Take any overdue snapshots immediately, re-deriving the schedule after
# each one, and return the timestamp of the next future snapshot (or
# None when no schedule is online).
439 def _check_snapshots(self):
441 Check the schedules and see what the required snapshot is.
442 Take one immediately on the first overdue snapshot required
444 # Make sure a refresh() doesn't mess with the schedule while
445 # we're reading through it.
446 self._refreshLock.acquire()
447 next,schedule = self._next_due()
448 self._refreshLock.release()
449 now = long(time.time())
450 while next != None and next <= now:
451 label = self._take_snapshots(schedule)
452 self._plugin.execute_plugins(schedule, label)
453 self._refreshLock.acquire()
454 self._update_schedules()
455 next,schedule = self._next_due();
456 self._refreshLock.release()
457 dt = datetime.datetime.fromtimestamp(next)
458 util.debug("Next snapshot is %s due at: %s" % \
459 (schedule, dt.isoformat()), \

# Create an auto-snapshot set labelled with the schedule and a datestamp,
# record the snapshot time, then purge expired/empty snapshots for the
# schedule. Returns the label (return statement elided in this listing).
463 def _take_snapshots(self, schedule):
464 # Set the time before taking snapshot to avoid clock skew due
465 # to time taken to complete snapshot.
466 tm = long(time.time())
467 label = "%s%s%s-%s" % \
468 (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
469 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
471 self._datasets.create_auto_snapshot_set(label, tag=schedule)
472 except RuntimeError, message:
473 # Write an error message, set the exit code and pass it up the
474 # stack so the thread can terminate
475 sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
477 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
478 raise RuntimeError,message
479 self._last[schedule] = tm;
480 self._perform_purge(schedule)

483 def _prune_snapshots(self, dataset, schedule):
484 """Cleans out zero sized snapshots, kind of cautiously"""
485 # Per schedule: We want to delete 0 sized
486 # snapshots but we need to keep at least one around (the most
487 # recent one) for each schedule so that that overlap is
488 # maintained from frequent -> hourly -> daily etc.
489 # Start off with the smallest interval schedule first and
490 # move up. This increases the amount of data retained where
491 # several snapshots are taken together like a frequent hourly
492 # and daily snapshot taken at 12:00am. If 3 snapshots are all
493 # identical and reference the same identical data they will all
494 # be initially reported as zero for used size. Deleting the
495 # daily first then the hourly would make the data referenced
496 # by all 3 snapshots unique to the frequent scheduled snapshot.
497 # This snapshot would probably be purged within an hour and the
498 # data referenced by it would be gone for good.
499 # Doing it the other way however ensures that the data should
500 # remain accessible to the user for at least a week as long as
501 # the pool doesn't run low on available space before that.
504 snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
505 # Clone the list because we want to remove items from it
506 # while iterating through it.
507 remainingsnaps = snaps[:]
508 except RuntimeError,message:
509 sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
510 self.exitCode = smf.SMF_EXIT_ERR_FATAL
511 raise RuntimeError,message
513 if (self._keepEmpties == False):
514 try: # remove the newest one from the list.
518 for snapname in snaps:
520 snapshot = zfs.Snapshot(snapname)
521 except Exception,message:
522 sys.stderr.write(str(message))
523 # Not fatal, just skip to the next snapshot
527 if snapshot.get_used_size() == 0:
528 util.debug("Destroying zero sized: " + snapname, \
532 except RuntimeError,message:
533 sys.stderr.write("Failed to destroy snapshot: " +
535 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
536 # Propagate exception so thread can exit
537 raise RuntimeError,message
538 remainingsnaps.remove(snapname)
539 except RuntimeError,message:
540 sys.stderr.write("Can not determine used size of: " + \
542 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
543 #Propagate the exception to the thread run() method
544 raise RuntimeError,message
546 # Deleting individual snapshots instead of recursive sets
547 # breaks the recursion chain and leaves child snapshots
548 # dangling so we need to take care of cleaning up the
# Destroy snapshots beyond the per-schedule keep count, oldest first.
550 target = len(remainingsnaps) - self._keep[schedule]
552 while counter < target:
553 util.debug("Destroy expired snapshot: " + \
554 remainingsnaps[counter],
557 snapshot = zfs.Snapshot(remainingsnaps[counter])
558 except Exception,message:
559 sys.stderr.write(str(message))
560 # Not fatal, just skip to the next snapshot
565 except RuntimeError,message:
566 sys.stderr.write("Failed to destroy snapshot: " +
567 snapshot.name + "\n")
568 self.exitCode = smf.SMF_EXIT_ERR_FATAL
569 # Propagate exception so thread can exit
570 raise RuntimeError,message

574 def _perform_purge(self, schedule):
575 """Cautiously cleans out zero sized snapshots"""
576 # We need to avoid accidentally pruning auto snapshots received
577 # from one zpool to another. We ensure this by examining only
578 # snapshots whose parent filesystems and volumes are explicitly
579 # tagged to be snapshotted.
581 for name in self._datasets.list_auto_snapshot_sets(schedule):
582 dataset = zfs.ReadWritableDataset(name)
583 self._prune_snapshots(dataset, schedule)
584 except RuntimeError,message:
585 sys.stderr.write("Error listing datasets during " + \
586 "removal of expired snapshots\n")
587 self.exitCode = smf.SMF_EXIT_ERR_FATAL
588 # Propagate up to thread's run() method
589 raise RuntimeError,message

# Decide whether remedial cleanup should run: disabled by config, skipped
# if one is already running, rate-limited to one check per 15 minutes,
# and triggered only when a snapshotted pool is over the warning level.
591 def _needs_cleanup(self):
592 if self._remedialCleanup == False:
593 # Sys admin has explicitly instructed for remedial cleanups
594 # not to be performed.
596 now = long(time.time())
597 # Don't run checks any less than 15 minutes apart.
598 if self._cleanupLock.acquire(False) == False:
599 #Indicates that a cleanup is already running.
601 # FIXME - Make the cleanup interval equal to the minimum snapshot interval
602 # if custom snapshot schedules are defined and enabled.
603 elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
606 for zpool in self._zpools:
608 if zpool.get_capacity() > self._warningLevel:
609 # Before getting into a panic, determine if the pool
610 # is one we actually take snapshots on, by checking
611 # for one of the "auto-snapshot:<schedule> tags. Not
612 # super fast, but it only happens under exceptional
613 # circumstances of a zpool nearing its capacity.
615 for sched in self._allSchedules:
616 sets = zpool.list_auto_snapshot_sets(sched[0])
618 util.debug("%s needs a cleanup" \
621 self._cleanupLock.release()
623 except RuntimeError, message:
624 sys.stderr.write("Error checking zpool capacity of: " + \
626 self._cleanupLock.release()
627 self.exitCode = smf.SMF_EXIT_ERR_FATAL
628 # Propagate up to thread's run() method.
629 raise RuntimeError,message
630 self._lastCleanupCheck = long(time.time())
631 self._cleanupLock.release()

# Run escalating warning/critical/emergency cleanups on each zpool,
# re-reading capacity after each stage, and record a per-pool status
# code (0-4) for the later syslog/D-Bus notifications.
634 def _perform_cleanup(self):
635 if self._cleanupLock.acquire(False) == False:
636 # Cleanup already running. Skip
638 self._destroyedsnaps = []
639 for zpool in self._zpools:
641 self._poolstatus[zpool.name] = 0
642 capacity = zpool.get_capacity()
643 if capacity > self._warningLevel:
644 self._run_warning_cleanup(zpool)
645 self._poolstatus[zpool.name] = 1
646 capacity = zpool.get_capacity()
647 if capacity > self._criticalLevel:
648 self._run_critical_cleanup(zpool)
649 self._poolstatus[zpool.name] = 2
650 capacity = zpool.get_capacity()
651 if capacity > self._emergencyLevel:
652 self._run_emergency_cleanup(zpool)
653 self._poolstatus[zpool.name] = 3
654 capacity = zpool.get_capacity()
655 if capacity > self._emergencyLevel:
656 self._run_emergency_cleanup(zpool)
657 self._poolstatus[zpool.name] = 4
658 # This also catches exceptions thrown from _run_<level>_cleanup()
659 # and _run_cleanup() in methods called by _perform_cleanup()
660 except RuntimeError,message:
661 sys.stderr.write("Remedial space cleanup failed because " + \
662 "of failure to determinecapacity of: " + \
664 self.exitCode = smf.SMF_EXIT_ERR_FATAL
665 self._cleanupLock.release()
666 # Propagate up to thread's run() method.
667 raise RuntimeError,message
669 # Bad - there's no more snapshots left and nothing
670 # left to delete. We don't disable the service since
671 # it will permit self recovery and snapshot
672 # retention when space becomes available on
673 # the pool (hopefully).
674 util.debug("%s pool status after cleanup:" \
677 util.debug(zpool, self.verbose)
678 util.debug("Cleanup completed. %d snapshots were destroyed" \
679 % len(self._destroyedsnaps), \
681 # Avoid needless list iteration for non-debug mode
682 if self.verbose == True and len(self._destroyedsnaps) > 0:
683 for snap in self._destroyedsnaps:
684 sys.stderr.write("\t%s\n" % snap)
685 self._cleanupLock.release()

# Warning level: destroy expired daily snapshots, then hourly if the
# pool is still over the warning threshold.
687 def _run_warning_cleanup(self, zpool):
688 util.debug("Performing warning level cleanup on %s" % \
691 self._run_cleanup(zpool, "daily", self._warningLevel)
692 if zpool.get_capacity() > self._warningLevel:
693 self._run_cleanup(zpool, "hourly", self._warningLevel)

# Critical level: weekly, then daily, then hourly until under threshold.
695 def _run_critical_cleanup(self, zpool):
696 util.debug("Performing critical level cleanup on %s" % \
699 self._run_cleanup(zpool, "weekly", self._criticalLevel)
700 if zpool.get_capacity() > self._criticalLevel:
701 self._run_cleanup(zpool, "daily", self._criticalLevel)
702 if zpool.get_capacity() > self._criticalLevel:
703 self._run_cleanup(zpool, "hourly", self._criticalLevel)

# Emergency level: every default schedule from monthly down to frequent,
# and finally the custom schedules, until under threshold.
705 def _run_emergency_cleanup(self, zpool):
706 util.debug("Performing emergency level cleanup on %s" % \
709 self._run_cleanup(zpool, "monthly", self._emergencyLevel)
710 if zpool.get_capacity() > self._emergencyLevel:
711 self._run_cleanup(zpool, "weekly", self._emergencyLevel)
712 if zpool.get_capacity() > self._emergencyLevel:
713 self._run_cleanup(zpool, "daily", self._emergencyLevel)
714 if zpool.get_capacity() > self._emergencyLevel:
715 self._run_cleanup(zpool, "hourly", self._emergencyLevel)
716 if zpool.get_capacity() > self._emergencyLevel:
717 self._run_cleanup(zpool, "frequent", self._emergencyLevel)
718 #Finally, as a last resort, delete custom scheduled snapshots
719 for schedule,i,p,k in self._customSchedules:
720 if zpool.get_capacity() < self._emergencyLevel:
723 self._run_cleanup(zpool, schedule, self._emergencyLevel)

# Destroy non-cloned snapshots of the given schedule, oldest first, one
# at a time, until the pool drops below the threshold or none remain.
725 def _run_cleanup(self, zpool, schedule, threshold):
729 clonedsnaps = self._datasets.list_cloned_snapshots()
730 except RuntimeError,message:
731 sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
732 " while recovering pool capacity\n")
733 sys.stderr.write("Error details:\n" + \
734 "--------BEGIN ERROR MESSAGE--------\n" + \
736 "\n--------END ERROR MESSAGE--------\n")
738 # Build a list of snapshots in the given schedule, that are not
739 # cloned, and sort the result in reverse chronological order.
741 snapshots = [s for s,t in \
742 zpool.list_snapshots("%s%s" \
743 % (self._prefix,schedule)) \
744 if not s in clonedsnaps]
746 except RuntimeError,message:
747 sys.stderr.write("Error listing snapshots" +
748 " while recovering pool capacity\n")
749 self.exitCode = smf.SMF_EXIT_ERR_FATAL
750 # Propagate the error up to the thread's run() method.
751 raise RuntimeError,message
753 while zpool.get_capacity() > threshold:
754 if len(snapshots) == 0:
755 syslog.syslog(syslog.LOG_NOTICE,
756 "No more %s snapshots left" \
760 """This is not an exact science. Deleteing a zero sized
761 snapshot can have unpredictable results. For example a
762 pair of snapshots may share exclusive reference to a large
763 amount of data (eg. a large core file). The usage of both
764 snapshots will initially be seen to be 0 by zfs(1). Deleting
765 one of the snapshots will make the data become unique to the
766 single remaining snapshot that references it uniquely. The
767 remaining snapshot's size will then show up as non zero. So
768 deleting 0 sized snapshot is not as pointless as it might seem.
769 It also means we have to loop through this, each snapshot set
770 at a time and observe the before and after results. Perhaps
771 better way exists...."""
773 # Start with the oldest first
774 snapname = snapshots.pop()
775 snapshot = zfs.Snapshot(snapname)
776 # It would be nicer, for performance purposes, to delete sets
777 # of snapshots recursively but this might destroy more data than
778 # absolutely necessary, plus the previous purging of zero sized
779 # snapshots can easily break the recursion chain between
781 # On the positive side there should be fewer snapshots and they
782 # will mostly non-zero so we should get more effectiveness as a
783 # result of deleting snapshots since they should be nearly always
785 util.debug("Destroying %s" % snapname, self.verbose)
788 except RuntimeError,message:
789 # Would be nice to be able to mark service as degraded here
790 # but it's better to try to continue on rather than to give
791 # up altogether (SMF maintenance state)
792 sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
794 sys.stderr.write("Details:\n%s\n" % (str(message)))
796 self._destroyedsnaps.append(snapname)
797 # Give zfs some time to recalculate.

# Log a per-pool message at a severity matching the recorded pool status
# (EMERG/ALERT/CRIT/WARNING), plus a NOTICE with the destroyed count.
800 def _send_to_syslog(self):
801 for zpool in self._zpools:
802 status = self._poolstatus[zpool.name]
804 syslog.syslog(syslog.LOG_EMERG,
805 "%s is over %d%% capacity. " \
806 "All automatic snapshots were destroyed" \
807 % (zpool.name, self._emergencyLevel))
809 syslog.syslog(syslog.LOG_ALERT,
810 "%s exceeded %d%% capacity. " \
811 "Automatic snapshots over 1 hour old were destroyed" \
812 % (zpool.name, self._emergencyLevel))
814 syslog.syslog(syslog.LOG_CRIT,
815 "%s exceeded %d%% capacity. " \
816 "Weekly, hourly and daily automatic snapshots were destroyed" \
817 % (zpool.name, self._criticalLevel))
819 syslog.syslog(syslog.LOG_WARNING,
820 "%s exceeded %d%% capacity. " \
821 "Hourly and daily automatic snapshots were destroyed" \
822 % (zpool.name, self._warningLevel))
824 if len(self._destroyedsnaps) > 0:
825 syslog.syslog(syslog.LOG_NOTICE,
826 "%d automatic snapshots were destroyed" \
827 % len(self._destroyedsnaps))

# Emit a single D-Bus capacity_exceeded signal for the worst-status pool.
829 def _send_notification(self):
833 for zpool in self._zpools:
834 status = self._poolstatus[zpool.name]
835 # >= to ensure that something should always be set.
836 if status >= worststatus:
837 worstpool = zpool.name
840 #FIXME make the various levels indexible
842 self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
843 elif worststatus == 3:
844 self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
845 elif worststatus == 2:
846 self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
847 elif worststatus == 1:
848 self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
849 #elif: 0 everything is fine. Do nothing.
# gobject timeout callback that watches the SnapshotManager thread.
# While the thread is alive it keeps polling (return statement elided in
# this listing); once the thread has exited it maps the thread's exitCode
# onto a process exit status so SMF can react.
852 def monitor_threads(snapthread):
853 if snapthread.is_alive():
856 sys.stderr.write("Snapshot monitor thread exited.\n")
857 if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
858 # FIXME - it would be nicer to mark the service as degraded than
859 # go into maintenance state for some situations such as a
860 # particular snapshot schedule failing.
861 # But for now SMF does not implement this feature. But if/when it
862 # does it's better to use svcadm to put the service into the
863 # correct state since the daemon shouldn't exit when transitioning
864 # to a degraded state.
865 #sys.stderr.write("Placing service into maintenance state\n")
866 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
867 # os.getenv("SMF_FMRI")])
868 # SMF will take care of killing the daemon
869 sys.exit(smf.SMF_EXIT_ERR_FATAL)
871 elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
872 #sys.stderr.write("Placing service into maintenance state\n")
873 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
874 # os.getenv("SMF_FMRI")])
875 # SMF will take care of killing the daemon
876 sys.exit(smf.SMF_EXIT_ERR_FATAL)
879 sys.stderr.write("Snapshot monitor thread exited abnormally\n")
880 sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
881 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
882 # os.getenv("SMF_FMRI")])
883 sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Parent-side signal handler used while waiting on the forked daemon.

    SIGUSR1 from the child means startup succeeded, so the parent exits
    cleanly; SIGCHLD or SIGALRM means the child died or startup timed
    out, so the parent reports a fatal SMF error. Any other signal is
    ignored.
    """
    # Map each expected signal to the exit status the parent should use.
    exit_status = {
        signal.SIGUSR1: smf.SMF_EXIT_OK,
        signal.SIGCHLD: smf.SMF_EXIT_ERR_FATAL,
        signal.SIGALRM: smf.SMF_EXIT_ERR_FATAL,
    }
    if signum in exit_status:
        sys.exit(exit_status[signum])
895 # Default daemon parameters.
896 # File mode creation mask of the daemon.
898 # Default working directory for the daemon.
900 # Default maximum for the number of available file descriptors.
# NOTE(review): the 'def daemonize():' header and the fork/setsid logic
# are elided from this listing; only fragments of the function remain.
905 Detach a process from the controlling terminal and run it in the
906 background as a daemon.
908 #Catch signals that we might receive from child
909 signal.signal(signal.SIGCHLD, child_sig_handler)
910 signal.signal(signal.SIGUSR1, child_sig_handler)
911 signal.signal(signal.SIGALRM, child_sig_handler)
# NOTE(review): presumably inside an 'except OSError, e:' handler for a
# failed fork (the surrounding lines are elided) — confirm against the
# original source.
915 raise Exception, "%s [%d]" % (e.strerror, e.errno)
918 #Reset signals that we set to trap in parent
919 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
920 signal.signal(signal.SIGUSR1, signal.SIG_DFL)
921 signal.signal(signal.SIGALRM, signal.SIG_DFL)
926 #Wait for the child to give the OK or otherwise.
# NOTE(review): script entry point. The surrounding try/except structure
# and several lines are elided from this listing. Parses --foreground,
# optionally daemonises, checks RBAC authorisation, then wires up D-Bus
# on the gobject main loop and starts the SnapshotManager thread.
932 parser = argparse.ArgumentParser()
933 parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
934 args, _ = parser.parse_known_args()
936 # Daemonise the service.
937 if not args.foreground:
940 # The user security attributes checked are the following:
941 # Note that UID == 0 will match any profile search so
942 # no need to check it explicitly.
943 syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
944 rbacp = RBACprofile()
945 if rbacp.has_profile("ZFS File System Management"):
947 gobject.threads_init()
949 # Tell dbus to use the gobject mainloop for async ops
950 dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
951 dbus.mainloop.glib.threads_init()
952 # Register a bus name with the system dbus daemon
953 systemBus = dbus.SystemBus()
954 name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
956 # Create and start the snapshot manager. Takes care of
957 # auto snapshotting service and auto cleanup.
958 snapshot = SnapshotManager(systemBus)
960 gobject.timeout_add(2000, monitor_threads, snapshot)
962 mainloop = gobject.MainLoop()
965 except KeyboardInterrupt:
967 sys.exit(smf.SMF_EXIT_OK)
969 syslog.syslog(syslog.LOG_ERR,
970 "%s has insufficient privileges to run time-sliderd!" \
973 sys.exit(smf.SMF_EXIT_ERR_PERM)
975 sys.exit(smf.SMF_EXIT_OK)