[time-slider.git] usr/share/time-slider/lib/time_slider/timesliderd.py
1 #!/usr/bin/python2
2 #
3 # CDDL HEADER START
4 #
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
8 #
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
13 #
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
19 #
20 # CDDL HEADER END
21 #
22
23 import sys
24 import os
25 import subprocess
26 import re
27 import threading
28 import getopt
29 import syslog
30 import time
31 import datetime
32 import calendar
33 import signal
34
35 import glib
36 import gobject
37 import dbus
38 import dbus.service
39 import dbus.mainloop
40 import dbus.mainloop.glib
41
42 import dbussvc
43 import zfs
44 import smf
45 import timeslidersmf
46 import autosnapsmf
47 import plugin
48 from rbac import RBACprofile
49 import util
50
51 _MINUTE = 60
52 _HOUR = _MINUTE * 60
53 _DAY = _HOUR * 24
54 _WEEK = _DAY * 7
55
56
57 # Status codes for actual zpool capacity levels.
58 # These are relative to the SMF property defined
59 # levels for: user, warning and emergency levels
60 STATUS_OK = 0 # Below user specified threshold. Everything was OK
61 STATUS_WARNING = 1 # Above specified user threshold level
62 STATUS_CRITICAL = 2 # Above specified critical threshold level
63 STATUS_EMERGENCY = 3 # Above specified emergency threshold level
64
65 intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
66
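# Editorial note (illustration, not from the original source): a schedule's
# interval unit and period are converted to seconds with this table, e.g. a
# custom schedule defined with interval "hours" and period 2 comes due every
# intervals["hours"] * 2 == 7200 seconds. The "months" interval is absent here
# because month lengths vary; it is handled separately in
# SnapshotManager._update_schedules().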
67
68 class SnapshotManager(threading.Thread):
69
70     def __init__(self, bus):
71         # Used to wake up the run() method prematurely in the event
72         # of a SIGHUP/SMF refresh
73         self._conditionLock = threading.Condition(threading.RLock())
74         # Used when schedules are being rebuilt or examined.
75         self._refreshLock = threading.Lock()
76         # Indicates that cleanup is in progress when locked
77         self._cleanupLock = threading.Lock()
78         self._datasets = zfs.Datasets()
79         # Indicates that schedules need to be rebuilt from scratch
80         self._stale = True
81         self._lastCleanupCheck = 0
82         self._zpools = []
83         self._poolstatus = {}
84         self._destroyedsnaps = []
85
86         # This is also checked during the refresh() method but we need
87         # to know it sooner for instantiation of the PluginManager
88         self._smf = timeslidersmf.TimeSliderSMF()
89         try:
90             self.verbose = self._smf.get_verbose()
91         except RuntimeError,message:
92             sys.stderr.write("Error determining whether debugging is enabled\n")
93             self.verbose = False
94
95         self._dbus = dbussvc.AutoSnap(bus,
96                                       '/org/opensolaris/TimeSlider/autosnap',
97                                       self)
98
99         self._plugin = plugin.PluginManager(self.verbose)
100         self.exitCode = smf.SMF_EXIT_OK
101         self.refresh()
102
103         # Seems we're up and running OK. 
104         # Signal our parent so we can daemonise
105         os.kill(os.getppid(), signal.SIGUSR1)
106
107         # SMF/svc.startd sends SIGHUP to force a
108         # refresh of the daemon
109         signal.signal(signal.SIGHUP, self._signalled)
110
111         # Init done. Now initialise threading.
112         threading.Thread.__init__ (self)
113         self.setDaemon(True)
114
115     def run(self):
116         # Deselect swap and dump volumes so they don't get snapshotted.
117         for vol in self._datasets.list_volumes():
118             name = vol.rsplit("/")
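            # Editorial note: for a typical swap volume such as "rpool/swap",
            # vol.rsplit("/") yields ['rpool', 'swap'], so name[1] identifies
            # the swap or dump volume checked below.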
119             try:
120                 if (name[1] == "swap" or name[1] == "dump"):
121                     util.debug("Auto excluding %s volume" % vol, self.verbose)
122                     volume = zfs.Volume(vol)
123                     volume.set_auto_snap(False)
124             except IndexError:
125                 pass
126             
127         nexttime = None
128         waittime = None
129         while True:
130             try:
131                 self.refresh()
132                 # First check and, if necessary, perform any remedial cleanup.
133                 # This is best done before creating any new snapshots which may
134                 # otherwise get immediately gobbled up by the remedial cleanup.
135                 if self._needs_cleanup() == True:
136                     self._perform_cleanup()
137                     # Check to see if cleanup actually deleted anything before
138                     # notifying the user. Avoids the popup appearing continuously
139                     if len(self._destroyedsnaps) > 0:
140                         self._send_notification()
141                     self._send_to_syslog()
142
143                 nexttime = self._check_snapshots()
144                 # Overdue snapshots are already taken automatically
145                 # inside _check_snapshots() so nexttime should never be
146                 # < 0. It can be None however, which is fine since it 
147                 # will cause the scheduler thread to sleep indefinitely
148                 # or until a SIGHUP is caught.
149                 if nexttime:
150                     util.debug("Waiting until " + str (nexttime), self.verbose)
151                 waittime = None
152                 if nexttime != None:
153                     waittime = nexttime - long(time.time())
154                     if (waittime <= 0):
155                         # We took too long and missed a snapshot, so break out
156                         # and catch up on it the next time through the loop
157                         continue
158                 # waittime could be None if no auto-snap schedules are online
159                 self._conditionLock.acquire()
160                 if waittime:
161                     util.debug("Waiting %d seconds" % (waittime), self.verbose)
162                     self._conditionLock.wait(waittime)
163                 else: #None. Just wait a while to check for cleanups.
164                     util.debug("No auto-snapshot schedules online.", \
165                                self.verbose)
166                     self._conditionLock.wait(_MINUTE * 15)
167
168             except OSError, message:
169                 sys.stderr.write("Caught OSError exception in snapshot" +
170                                  " manager thread\n")
171                 sys.stderr.write("Error details:\n" + \
172                                  "--------BEGIN ERROR MESSAGE--------\n" + \
173                                  str(message) + \
174                                  "\n--------END ERROR MESSAGE--------\n")
175                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
176                 # Exit this thread
177                 break
178             except RuntimeError,message:
179                 sys.stderr.write("Caught RuntimeError exception in snapshot" +
180                                  " manager thread\n")
181                 sys.stderr.write("Error details:\n" + \
182                                  "--------BEGIN ERROR MESSAGE--------\n" + \
183                                  str(message) + \
184                                  "\n--------END ERROR MESSAGE--------\n")
185                 # Exit this thread
186                 break
187
188     def _signalled(self, signum, frame):
189         if signum == signal.SIGHUP:
190             if self._refreshLock.acquire(False) == False:
191                 return
192             self._stale = True
193             self._refreshLock.release()
194             self._conditionLock.acquire()
195             self._conditionLock.notify()
196             self._conditionLock.release()
197
198     def refresh(self):
199         """
200         Checks if defined snapshot schedules are out
201         of date and rebuilds and updates if necessary
202         """
203         self._refreshLock.acquire()
204         if self._stale == True:
205             self._configure_svc_props()
206             self._rebuild_schedules()
207             self._update_schedules()
208             self._plugin.refresh()
209             self._stale = False
210         self._refreshLock.release()
211
212     def _configure_svc_props(self):
213         try:
214             self.verbose = self._smf.get_verbose()
215         except RuntimeError,message:
216             sys.stderr.write("Error determining whether debugging is enabled\n")
217             self.verbose = False
218
219         try:
220             cleanup = self._smf.get_remedial_cleanup()
221             warn = self._smf.get_cleanup_level("warning")
222             util.debug("Warning level value is:   %d%%" % warn, self.verbose)
223             crit = self._smf.get_cleanup_level("critical")
224             util.debug("Critical level value is:  %d%%" % crit, self.verbose)
225             emer = self._smf.get_cleanup_level("emergency")
226             util.debug("Emergency level value is: %d%%" % emer, self.verbose)
227         except RuntimeError,message:
228             sys.stderr.write("Failed to determine cleanup threshold levels\n")
229             sys.stderr.write("Details:\n" + \
230                              "--------BEGIN ERROR MESSAGE--------\n" + \
231                              str(message) + \
232                              "\n---------END ERROR MESSAGE---------\n")
233             sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
234             #Go with defaults
235             #FIXME - this would be an appropriate case to mark svc as degraded
236             self._remedialCleanup = True
237             self._warningLevel = 80
238             self._criticalLevel = 90
239             self._emergencyLevel = 95
240         else:
241             self._remedialCleanup = cleanup
242             self._warningLevel = warn
243             self._criticalLevel = crit
244             self._emergencyLevel = emer
245
246         try:
247             self._keepEmpties = self._smf.get_keep_empties()
248         except RuntimeError,message:
249             # Not fatal, just assume we delete them (default configuration)
250             sys.stderr.write("Can't determine whether to keep empty snapshots\n")
251             sys.stderr.write("Details:\n" + \
252                              "--------BEGIN ERROR MESSAGE--------\n" + \
253                              str(message) + \
254                              "\n---------END ERROR MESSAGE---------\n")
255             sys.stderr.write("Assuming default value: False\n")
256             self._keepEmpties = False
257
258         # Previously, snapshot labels used the ":" character as a
259         # separator character for datestamps. Windows filesystems such as
260         # CIFS and FAT choke on this character so now we use a user definable
261         # separator value, with a default value of "_"
262         # We need to check for both the old and new format when looking for
263         # snapshots.
264         self._separator = self._smf.get_separator()
265         self._prefix = "%s[:%s]" \
266             % (autosnapsmf.SNAPLABELPREFIX, self._separator)
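        # Illustrative example (assumes autosnapsmf.SNAPLABELPREFIX is
        # "zfs-auto-snap"; check autosnapsmf.py for the real value): with the
        # default "_" separator the pattern becomes "zfs-auto-snap[:_]", which
        # matches both old-style "zfs-auto-snap:daily-..." and new-style
        # "zfs-auto-snap_daily-..." snapshot names when listing snapshots.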
267
268         # Rebuild pool list
269         self._zpools = []
270         try:
271             for poolname in zfs.list_zpools():
272                 # Do not try to examine FAULTED pools
273                 zpool = zfs.ZPool(poolname)
274                 if zpool.health == "FAULTED":
275                     util.debug("Ignoring faulted Zpool: %s\n" \
276                                % (zpool.name), \
277                                self.verbose)
278                 else:
279                     self._zpools.append(zpool)
280                 util.debug(str(zpool), self.verbose)
281         except RuntimeError,message:
282             sys.stderr.write("Could not list Zpools\n")
283             self.exitCode = smf.SMF_EXIT_ERR_FATAL
284             # Propagate exception up to thread's run() method
285             raise RuntimeError,message
286
287
288     def _rebuild_schedules(self):
289         """
290         Builds 2 lists of default and custom auto-snapshot SMF instances
291         """
292
293         self._last = {}
294         self._next = {}
295         self._keep = {}
296
297         try:
298             _defaultSchedules = autosnapsmf.get_default_schedules()
299             _customSchedules = autosnapsmf.get_custom_schedules()
300         except RuntimeError,message:
301             self.exitCode = smf.SMF_EXIT_ERR_FATAL
302             raise RuntimeError, "Error reading SMF schedule instances\n" + \
303                                 "Details:\n" + str(message)
304         else:
305             # Now set it in stone.
306             self._defaultSchedules = tuple(_defaultSchedules)
307             self._customSchedules = tuple(_customSchedules)
308             
309             # Build the combined schedule tuple from default + custom schedules
310             _defaultSchedules.extend(_customSchedules)
311             self._allSchedules = tuple(_defaultSchedules)
312             for schedule,i,p,keep in self._allSchedules:
313                 self._last[schedule] = 0
314                 self._next[schedule] = 0
315                 self._keep[schedule] = keep
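            # Editorial note: each schedule entry is a (name, interval, period,
            # keep) tuple, e.g. a hypothetical ("daily", "days", 1, 31) would
            # mean one snapshot per day with the 31 most recent kept; the real
            # values come from the auto-snapshot SMF instances.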
316
317     def _update_schedules(self):
318         interval = 0
319         idx = 1 # Used to index subsets for schedule overlap calculation
320         last = None
321
322         for schedule,interval,period,keep in self._allSchedules:
323             # Shortcut if we've already processed this schedule and it's 
324             # still up to date. Don't skip the default schedules though
325             # because overlap affects their scheduling
326             if [schedule,interval,period,keep] not in \
327                 self._defaultSchedules and \
328                 (self._next[schedule] > self._last[schedule]):
329                 util.debug("Short circuiting %s recalculation" \
330                            % (schedule), \
331                            self.verbose)
332                 continue
333
334             # If we don't have an internal timestamp for the given schedule
335         # ask zfs for the last snapshot and get its creation timestamp.
336             if self._last[schedule] == 0:
337                 try:
338                     snaps = self._datasets.list_snapshots("%s%s" % \
339                                                          (self._prefix,
340                                                           schedule))
341                 except RuntimeError,message:
342                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
343                     sys.stderr.write("Failed to list snapshots during schedule update\n")
344                     #Propagate up to the thread's run() method
345                     raise RuntimeError,message
346
347                 if len(snaps) > 0:
348                     util.debug("Last %s snapshot was: %s" % \
349                                (schedule, snaps[-1][0]), \
350                                self.verbose)
351                     self._last[schedule] = snaps[-1][1]
352
353             last = self._last[schedule]
354             if interval != "months": # months is non-constant. See below.
355                 util.debug("Recalculating %s schedule" % (schedule), \
356                            self.verbose)
357                 try:
358                     totalinterval = intervals[interval] * period
359                 except KeyError:
360                     self.exitCode = smf.SMF_EXIT_ERR_CONFIG
361                     sys.stderr.write(schedule + \
362                                       " schedule has invalid interval: " + \
363                                       "'%s\'\n" % interval)
364                       #Propagate up to thread's run() method
365                     raise RuntimeError
366                 if [schedule,interval,period,keep] in self._defaultSchedules:
367                     # This is one of the default schedules so check for an
368                     # overlap with one of the dominant schedules.
369                     for s,i,p,k in self._defaultSchedules[:idx]:
370                         last = max(last, self._last[s])
371                     idx += 1
372
373             else: # interval == "months"
374                 if self._next[schedule] > last:
375                     util.debug("Short circuiting " + \
376                                schedule + \
377                                " recalculation", \
378                                self.verbose)
379                     continue
380                 util.debug("Recalculating %s schedule" % (schedule), \
381                            self.verbose)
382                 snap_tm = time.gmtime(self._last[schedule])
383                 # Increment year if period is >= 1 calendar year.
384                 year = snap_tm.tm_year
385                 year += period / 12
386                 period = period % 12
387
388                 mon = (snap_tm.tm_mon + period) % 12
389                 # Result of 0 actually means December.
390                 if mon == 0:
391                     mon = 12
392                 # Account for period that spans calendar year boundary.
393                 elif snap_tm.tm_mon + period > 12:
394                     year += 1
395
396                 d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
397                 d,dnewmon = calendar.monthrange(year, mon)
398                 mday = snap_tm.tm_mday
399                 if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
400                    mday = dnewmon
401                 
402                 tm =(year, mon, mday, \
403                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
404                     0, 0, -1)
405                 newt = calendar.timegm(tm)
406                 new_tm = time.gmtime(newt)
407                 totalinterval = newt - self._last[schedule]
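                # Worked example (editorial illustration): if the last monthly
                # snapshot was taken on 2009-01-31 with a period of 1 month,
                # mon becomes 2 and calendar.monthrange(2009, 2) reports 28
                # days, so mday is clamped from 31 down to 28 and the next
                # snapshot falls due on 2009-02-28 at the same time of day.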
408
409             self._next[schedule] = last + totalinterval
410
411     def _next_due(self):
412         schedule = None
413         earliest = None
414         now = long(time.time())
415         
416         for s,i,p,k in self._defaultSchedules:
417             due = self._next[s]
418             if due <= now:
419                 #Default Schedule - so break out at the first 
420                 #schedule that is overdue. The subordinate schedules
421                 #will re-adjust afterwards.
422                 earliest,schedule = due,s
423                 break
424             elif earliest != None:
425                 if due < earliest:
426                     earliest,schedule = due,s
427             else: #FIXME better optimisation with above condition
428                 earliest,schedule = due,s
429         for s,i,p,k in self._customSchedules:
430             due = self._next[s]
431             if earliest != None:
432                 if due < earliest:
433                     earliest,schedule = due,s
434             else: #FIXME better optimisation with above condition
435                 earliest,schedule = due,s
436         return earliest,schedule
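        # Editorial note: when no auto-snapshot instances are enabled both
        # loops above are skipped and (None, None) is returned, which makes
        # run() fall back to its 15 minute cleanup-check sleep.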
437
438     def _check_snapshots(self):
439         """
440         Check the schedules and see what the required snapshot is.
441         Take one immediately on the first overdue snapshot required
442         """
443         # Make sure a refresh() doesn't mess with the schedule while
444         # we're reading through it.
445         self._refreshLock.acquire()
446         next,schedule = self._next_due()
447         self._refreshLock.release()
448         now = long(time.time())
449         while next != None and next <= now:
450             label = self._take_snapshots(schedule)
451             self._plugin.execute_plugins(schedule, label)
452             self._refreshLock.acquire()
453             self._update_schedules()
454             next,schedule = self._next_due()
455             self._refreshLock.release()
456             dt = datetime.datetime.fromtimestamp(next)
457             util.debug("Next snapshot is %s due at: %s" % \
458                        (schedule, dt.isoformat()), \
459                        self.verbose)
460         return next
461                     
462     def _take_snapshots(self, schedule):
463         # Set the time before taking snapshot to avoid clock skew due
464         # to time taken to complete snapshot.
465         tm = long(time.time())
466         label = "%s%s%s-%s" % \
467                 (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
468                  datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
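        # Illustrative example (assumes SNAPLABELPREFIX is "zfs-auto-snap" and
        # the default "_" separator): a "daily" snapshot taken at 14:35 on
        # 22 March 2009 would be labelled "zfs-auto-snap_daily-2009-03-22-14h35".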
469         try:
470             self._datasets.create_auto_snapshot_set(label, tag=schedule)
471         except RuntimeError, message:
472             # Write an error message, set the exit code and pass it up the
473             # stack so the thread can terminate
474             sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
475                              % (schedule))
476             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
477             raise RuntimeError,message
478         self._last[schedule] = tm
479         self._perform_purge(schedule)
480         return label
481
482     def _prune_snapshots(self, dataset, schedule):
483         """Cleans out zero sized snapshots, kind of cautiously"""
484             # Per schedule: We want to delete 0 sized
485             # snapshots but we need to keep at least one around (the most
486             # recent one) for each schedule so that the overlap is
487             # maintained from frequent -> hourly -> daily etc.
488             # Start off with the smallest interval schedule first and
489             # move up. This increases the amount of data retained where
490             # several snapshots are taken together like a frequent, hourly
491             # and daily snapshot taken at 12:00am. If 3 snapshots are all
492             # identical and reference the same identical data they will all
493             # be initially reported as zero for used size. Deleting the
494             # daily first then the hourly would make the data referenced
495             # by all 3 snapshots unique to the frequent scheduled snapshot.
496             # This snapshot would probably be purged within an hour however and the
497             # data referenced by it would be gone for good.
498             # Doing it the other way however ensures that the data should
499             # remain accessible to the user for at least a week as long as
500             # the pool doesn't run low on available space before that.
501
502         try:
503             snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
504             # Clone the list because we want to remove items from it
505             # while iterating through it.
506             remainingsnaps = snaps[:]
507         except RuntimeError,message:
508             sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
509             self.exitCode = smf.SMF_EXIT_ERR_FATAL
510             raise RuntimeError,message
511
512         if (self._keepEmpties == False):
513             try: # remove the newest one from the list.
514                 snaps.pop()
515             except IndexError:
516                 pass
517             for snapname in snaps:
518                 try:
519                     snapshot = zfs.Snapshot(snapname)
520                 except Exception,message:
521                     sys.stderr.write(str(message))
522                     # Not fatal, just skip to the next snapshot
523                     continue
524
525                 try:
526                     if snapshot.get_used_size() == 0:
527                         util.debug("Destroying zero sized: " + snapname, \
528                                    self.verbose)
529                         try:
530                             snapshot.destroy()
531                         except RuntimeError,message:
532                             sys.stderr.write("Failed to destroy snapshot: " +
533                                              snapname + "\n")
534                             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
535                             # Propagate exception so thread can exit
536                             raise RuntimeError,message
537                         remainingsnaps.remove(snapname)
538                 except RuntimeError,message:
539                     sys.stderr.write("Can not determine used size of: " + \
540                                      snapname + "\n")
541                     self.exitCode = smf.SMF_EXIT_MON_DEGRADE
542                     #Propagate the exception to the thread's run() method
543                     raise RuntimeError,message
544
545         # Deleting individual snapshots instead of recursive sets
546         # breaks the recursion chain and leaves child snapshots
547         # dangling so we need to take care of cleaning up the 
548         # snapshots.
549         target = len(remainingsnaps) - self._keep[schedule]
550         counter = 0
551         while counter < target:
552             util.debug("Destroy expired snapshot: " + \
553                        remainingsnaps[counter], 
554                        self.verbose)
555             try:
556                 snapshot = zfs.Snapshot(remainingsnaps[counter])
557             except Exception,message:
558                     sys.stderr.write(str(message))
559                     # Not fatal, just skip to the next snapshot
560                     counter += 1
561                     continue
562             try:
563                 snapshot.destroy()
564             except RuntimeError,message:
565                 sys.stderr.write("Failed to destroy snapshot: " +
566                                  snapshot.name + "\n")
567                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
568                 # Propagate exception so thread can exit
569                 raise RuntimeError,message
570             else:
571                 counter += 1
572
573     def _perform_purge(self, schedule):
574         """Cautiously cleans out zero sized snapshots"""
575         # We need to avoid accidentally pruning auto snapshots received
576         # from one zpool to another. We ensure this by examining only
577         # snapshots whose parent filesystems and volumes are explicitly
578         # tagged to be snapshotted.
579         try:
580             for name in self._datasets.list_auto_snapshot_sets(schedule):
581                 dataset = zfs.ReadWritableDataset(name)
582                 self._prune_snapshots(dataset, schedule)
583         except RuntimeError,message:
584             sys.stderr.write("Error listing datasets during " + \
585                              "removal of expired snapshots\n")
586             self.exitCode = smf.SMF_EXIT_ERR_FATAL
587             # Propagate up to thread's run() method
588             raise RuntimeError,message
589
590     def _needs_cleanup(self):
591         if self._remedialCleanup == False:
592             # Sys admin has explicitly instructed for remedial cleanups
593             # not to be performed.
594             return False
595         now = long(time.time())
596         # Don't run checks any less than 15 minutes apart.
597         if self._cleanupLock.acquire(False) == False:
598             #Indicates that a cleanup is already running.
599             return False
600         # FIXME - Make the cleanup interval equal to the minimum snapshot interval
601         # if custom snapshot schedules are defined and enabled.
602         elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
603             pass
604         else:
605             for zpool in self._zpools:
606                 try:
607                     if zpool.get_capacity() > self._warningLevel:
608                         # Before getting into a panic, determine if the pool
609                         # is one we actually take snapshots on, by checking
610                         # for one of the "auto-snapshot:<schedule>" tags. Not
611                         # super fast, but it only happens under exceptional
612                         # circumstances of a zpool nearing its capacity.
613
614                         for sched in self._allSchedules:
615                             sets = zpool.list_auto_snapshot_sets(sched[0])
616                             if len(sets) > 0:
617                                 util.debug("%s needs a cleanup" \
618                                            % zpool.name, \
619                                            self.verbose)
620                                 self._cleanupLock.release()
621                                 return True
622                 except RuntimeError, message:
623                     sys.stderr.write("Error checking zpool capacity of: " + \
624                                      zpool.name + "\n")
625                     self._cleanupLock.release()
626                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
627                     # Propagate up to thread's run() method.
628                     raise RuntimeError,message
629             self._lastCleanupCheck = long(time.time())
630         self._cleanupLock.release()
631         return False
632
633     def _perform_cleanup(self):
634         if self._cleanupLock.acquire(False) == False:
635             # Cleanup already running. Skip
636             return
637         self._destroyedsnaps = []
638         for zpool in self._zpools:
639             try:
640                 self._poolstatus[zpool.name] = 0
641                 capacity = zpool.get_capacity()
642                 if capacity > self._warningLevel:
643                     self._run_warning_cleanup(zpool)
644                     self._poolstatus[zpool.name] = 1
645                     capacity = zpool.get_capacity()
646                 if capacity > self._criticalLevel:
647                     self._run_critical_cleanup(zpool)
648                     self._poolstatus[zpool.name] = 2
649                     capacity = zpool.get_capacity()
650                 if capacity > self._emergencyLevel:
651                     self._run_emergency_cleanup(zpool)
652                     self._poolstatus[zpool.name] = 3
653                     capacity = zpool.get_capacity()
654                 if capacity > self._emergencyLevel:
655                     self._run_emergency_cleanup(zpool)
656                     self._poolstatus[zpool.name] = 4
657             # This also catches exceptions thrown from _run_<level>_cleanup()
658             # and _run_cleanup() in methods called by _perform_cleanup()
659             except RuntimeError,message:
660                 sys.stderr.write("Remedial space cleanup failed because " + \
661                                  "of failure to determine capacity of: " + \
662                                  zpool.name + "\n")
663                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
664                 self._cleanupLock.release()
665                 # Propagate up to thread's run() method.
666                 raise RuntimeError,message
667
668             # Bad - there are no more snapshots left and nothing
669             # left to delete. We don't disable the service since
670             # it will permit self recovery and snapshot
671             # retention when space becomes available on
672             # the pool (hopefully).
673             util.debug("%s pool status after cleanup:" \
674                        % zpool.name, \
675                        self.verbose)
676             util.debug(zpool, self.verbose)
677         util.debug("Cleanup completed. %d snapshots were destroyed" \
678                    % len(self._destroyedsnaps), \
679                    self.verbose)
680         # Avoid needless list iteration for non-debug mode
681         if self.verbose == True and len(self._destroyedsnaps) > 0:
682             for snap in self._destroyedsnaps:
683                 sys.stderr.write("\t%s\n" % snap)
684         self._cleanupLock.release()
685
686     def _run_warning_cleanup(self, zpool):
687         util.debug("Performing warning level cleanup on %s" % \
688                    zpool.name, \
689                    self.verbose)
690         self._run_cleanup(zpool, "daily", self._warningLevel)
691         if zpool.get_capacity() > self._warningLevel:
692             self._run_cleanup(zpool, "hourly", self._warningLevel)
693
694     def _run_critical_cleanup(self, zpool):
695         util.debug("Performing critical level cleanup on %s" % \
696                    zpool.name, \
697                    self.verbose)
698         self._run_cleanup(zpool, "weekly", self._criticalLevel)
699         if zpool.get_capacity() > self._criticalLevel:
700             self._run_cleanup(zpool, "daily", self._criticalLevel)
701         if zpool.get_capacity() > self._criticalLevel:
702             self._run_cleanup(zpool, "hourly", self._criticalLevel)
703
704     def _run_emergency_cleanup(self, zpool):
705         util.debug("Performing emergency level cleanup on %s" % \
706                    zpool.name, \
707                    self.verbose)
708         self._run_cleanup(zpool, "monthly", self._emergencyLevel)
709         if zpool.get_capacity() > self._emergencyLevel:
710             self._run_cleanup(zpool, "weekly", self._emergencyLevel)
711         if zpool.get_capacity() > self._emergencyLevel:
712             self._run_cleanup(zpool, "daily", self._emergencyLevel)
713         if zpool.get_capacity() > self._emergencyLevel:
714             self._run_cleanup(zpool, "hourly", self._emergencyLevel)
715         if zpool.get_capacity() > self._emergencyLevel:
716             self._run_cleanup(zpool, "frequent", self._emergencyLevel)
717         #Finally, as a last resort, delete custom scheduled snapshots
718         for schedule,i,p,k in self._customSchedules:
719             if zpool.get_capacity() < self._emergencyLevel:
720                 break
721             else:
722                 self._run_cleanup(zpool, schedule, self._emergencyLevel)
723
724     def _run_cleanup(self, zpool, schedule, threshold):
725         clonedsnaps = []
726         snapshots = []
727         try:
728             clonedsnaps = self._datasets.list_cloned_snapshots()
729         except RuntimeError,message:
730                 sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
731                                  " while recovering pool capacity\n")
732                 sys.stderr.write("Error details:\n" + \
733                                  "--------BEGIN ERROR MESSAGE--------\n" + \
734                                  str(message) + \
735                                  "\n--------END ERROR MESSAGE--------\n")    
736
737         # Build a list of snapshots in the given schedule, that are not
738         # cloned, and sort the result in reverse chronological order.
739         try:
740             snapshots = [s for s,t in \
741                             zpool.list_snapshots("%s%s" \
742                             % (self._prefix,schedule)) \
743                             if not s in clonedsnaps]
744             snapshots.reverse()
745         except RuntimeError,message:
746             sys.stderr.write("Error listing snapshots" +
747                              " while recovering pool capacity\n")
748             self.exitCode = smf.SMF_EXIT_ERR_FATAL
749             # Propagate the error up to the thread's run() method.
750             raise RuntimeError,message
751    
752         while zpool.get_capacity() > threshold:
753             if len(snapshots) == 0:
754                 syslog.syslog(syslog.LOG_NOTICE,
755                               "No more %s snapshots left" \
756                                % schedule)
757                 return
758
759             """This is not an exact science. Deleting a zero sized
760             snapshot can have unpredictable results. For example a
761             pair of snapshots may share exclusive reference to a large
762             amount of data (eg. a large core file). The usage of both
763             snapshots will initially be seen to be 0 by zfs(1). Deleting
764             one of the snapshots will make the data become unique to the
765             single remaining snapshot that references it uniquely. The
766             remaining snapshot's size will then show up as non zero. So
767             deleting a 0 sized snapshot is not as pointless as it might seem.
768             It also means we have to loop through this, each snapshot set
769             at a time and observe the before and after results. Perhaps
770             a better way exists...."""
771
772             # Start with the oldest first
773             snapname = snapshots.pop()
774             snapshot = zfs.Snapshot(snapname)
775             # It would be nicer, for performance purposes, to delete sets
776             # of snapshots recursively but this might destroy more data than
777             # absolutely necessary, plus the previous purging of zero sized
778             # snapshots can easily break the recursion chain between
779             # filesystems.
780             # On the positive side there should be fewer snapshots and they
781             # will mostly be non-zero so we should get more effectiveness as a
782             # result of deleting snapshots since they should be nearly always
783             # non zero sized.
784             util.debug("Destroying %s" % snapname, self.verbose)
785             try:
786                 snapshot.destroy()
787             except RuntimeError,message:
788                 # Would be nice to be able to mark service as degraded here
789                 # but it's better to try to continue on rather than to give
790                 # up altogether (SMF maintenance state)
791                 sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
792                                  (snapshot.name))
793                 sys.stderr.write("Details:\n%s\n" % (str(message)))
794             else:
795                 self._destroyedsnaps.append(snapname)
796             # Give zfs some time to recalculate.
797             time.sleep(3)
798         
799     def _send_to_syslog(self):
800         for zpool in self._zpools:
801             status = self._poolstatus[zpool.name]
802             if status == 4:
803                 syslog.syslog(syslog.LOG_EMERG,
804                               "%s is over %d%% capacity. " \
805                               "All automatic snapshots were destroyed" \
806                                % (zpool.name, self._emergencyLevel))
807             elif status == 3:
808                 syslog.syslog(syslog.LOG_ALERT,
809                               "%s exceeded %d%% capacity. " \
810                               "Automatic snapshots over 1 hour old were destroyed" \
811                                % (zpool.name, self._emergencyLevel))
812             elif status == 2:
813                 syslog.syslog(syslog.LOG_CRIT,
814                               "%s exceeded %d%% capacity. " \
815                               "Weekly, hourly and daily automatic snapshots were destroyed" \
816                                % (zpool.name, self._criticalLevel))                             
817             elif status == 1:
818                 syslog.syslog(syslog.LOG_WARNING,
819                               "%s exceeded %d%% capacity. " \
820                               "Hourly and daily automatic snapshots were destroyed" \
821                                % (zpool.name, self._warningLevel))
822
823         if len(self._destroyedsnaps) > 0:
824             syslog.syslog(syslog.LOG_NOTICE,
825                           "%d automatic snapshots were destroyed" \
826                            % len(self._destroyedsnaps))
827
828     def _send_notification(self):
829         worstpool = None
830         worststatus = 0
831
832         for zpool in self._zpools:
833             status = self._poolstatus[zpool.name]
834             # >= to ensure that something should always be set.
835             if status >= worststatus:
836                 worstpool = zpool.name
837                 worststatus = status
838
839         #FIXME make the various levels indexable
840         if worststatus == 4:
841             self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
842         elif worststatus == 3:
843             self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
844         elif worststatus == 2:
845             self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
846         elif worststatus == 1:
847             self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
848         #elif: 0 everything is fine. Do nothing.
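        # Editorial sketch (not part of the original file): a desktop client
        # could listen for these capacity notifications with dbus-python along
        # these lines; the signal name and argument list are assumptions based
        # on the calls above and should be checked against dbussvc.py.
        #
        #   import dbus, dbus.mainloop.glib, gobject
        #   dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        #   bus = dbus.SystemBus()
        #   def on_capacity_exceeded(pool, severity, threshold):
        #       print "Pool %s exceeded its %d%% threshold" % (pool, threshold)
        #   bus.add_signal_receiver(on_capacity_exceeded,
        #                           signal_name="capacity_exceeded",
        #                           path="/org/opensolaris/TimeSlider/autosnap")
        #   gobject.MainLoop().run()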
849
850
851 def monitor_threads(snapthread):
852     if snapthread.is_alive():
853         return True
854     else:
855         sys.stderr.write("Snapshot monitor thread exited.\n")
856         if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
857             # FIXME - it would be nicer to mark the service as degraded than
858             # go into maintenance state for some situations such as a
859             # particular snapshot schedule failing.
860             # But for now SMF does not implement this feature. But if/when it
861             # does it's better to use svcadm to put the service into the
862             # correct state since the daemon shouldn't exit when transitioning
863             # to a degraded state.
864             #sys.stderr.write("Placing service into maintenance state\n")
865             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
866             #                 os.getenv("SMF_FMRI")])
867             # SMF will take care of killing the daemon
868             sys.exit(smf.SMF_EXIT_ERR_FATAL)
869             return False
870         elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
871             #sys.stderr.write("Placing service into maintenance state\n")
872             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
873             #                 os.getenv("SMF_FMRI")])
874             # SMF will take care of killing the daemon
875             sys.exit(smf.SMF_EXIT_ERR_FATAL)
876             return False
877         else:
878             sys.stderr.write("Snapshot monitor thread exited abnormally\n")
879             sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
880             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
881             #                 os.getenv("SMF_FMRI")])
882             sys.exit(smf.SMF_EXIT_ERR_FATAL)
883             return False
884
885
886 def child_sig_handler(signum, frame):
887     if signum == signal.SIGUSR1:
888         sys.exit(smf.SMF_EXIT_OK)
889     elif signum == signal.SIGCHLD:
890         sys.exit(smf.SMF_EXIT_ERR_FATAL)
891     elif signum == signal.SIGALRM:
892         sys.exit(smf.SMF_EXIT_ERR_FATAL)
893
894 # Default daemon parameters.
895 # File mode creation mask of the daemon.
896 UMASK = 0
897 # Default working directory for the daemon.
898 WORKDIR = "/"
899 # Default maximum for the number of available file descriptors.
900 MAXFD = 1024
901
902 def create_daemon():
903     """
904     Detach a process from the controlling terminal and run it in the
905     background as a daemon.
906     """
907     #Catch signals that we might receive from child
908     signal.signal(signal.SIGCHLD, child_sig_handler)
909     signal.signal(signal.SIGUSR1, child_sig_handler)
910     signal.signal(signal.SIGALRM, child_sig_handler)
911     try:
912         pid = os.fork()
913     except OSError, e:
914         raise Exception, "%s [%d]" % (e.strerror, e.errno)
915
916     if (pid == 0):
917         #Reset signals that we set to trap in parent
918         signal.signal(signal.SIGCHLD, signal.SIG_DFL)
919         signal.signal(signal.SIGUSR1, signal.SIG_DFL)
920         signal.signal(signal.SIGALRM, signal.SIG_DFL)
921         os.setsid()
922         os.chdir(WORKDIR)
923         os.umask(UMASK)
924     else:
925         #Wait for the child to give the OK or otherwise.
926         signal.pause()
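# Editorial note on the daemonisation handshake (not in the original file):
# the parent blocks in signal.pause() above while the forked child finishes
# SnapshotManager.__init__(), which signals back with
# os.kill(os.getppid(), signal.SIGUSR1). child_sig_handler() then exits the
# parent with SMF_EXIT_OK so svc.startd sees a successful start method; a
# SIGCHLD (child died early) or SIGALRM instead makes the parent report
# SMF_EXIT_ERR_FATAL.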
927
928
929 def main(argv):
930
931     # Check SMF invocation environment
932     if os.getenv("SMF_FMRI") == None or os.getenv("SMF_METHOD") != "start":
933         sys.stderr.write("Command line invocation of %s unsupported.\n" \
934                          % (sys.argv[0]))
935         sys.stderr.write("This command is intended for smf(5) invocation only.\n")
936         sys.exit(smf.SMF_EXIT_ERR_NOSMF)
937
938     # Daemonise the service.
939     create_daemon()
940
941     # The user security attributes checked are the following:
942     # Note that UID == 0 will match any profile search so
943     # no need to check it explicitly.
944     syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
945     rbacp = RBACprofile()
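    # Editorial note (assumption, verify against your RBAC configuration): the
    # "ZFS File System Management" rights profile is normally granted to a role
    # or user via user_attr(4); UID 0 matches any profile search, so root
    # always passes this check.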
946     if rbacp.has_profile("ZFS File System Management"):
947
948         gobject.threads_init()
949
950         # Tell dbus to use the gobject mainloop for async ops
951         dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
952         dbus.mainloop.glib.threads_init()
953         # Register a bus name with the system dbus daemon
954         systemBus = dbus.SystemBus()
955         name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
956
957         # Create and start the snapshot manager. Takes care of
958         # auto snapshotting service and auto cleanup.
959         snapshot = SnapshotManager(systemBus)
960         snapshot.start()
961         gobject.timeout_add(2000, monitor_threads, snapshot)
962
963         mainloop = gobject.MainLoop()
964         try:
965             mainloop.run()
966         except KeyboardInterrupt:
967             mainloop.quit()
968             sys.exit(smf.SMF_EXIT_OK)
969     else:
970         syslog.syslog(syslog.LOG_ERR,
971                "%s has insufficient privileges to run time-sliderd!" \
972                % rbacp.name)
973         syslog.closelog()    
974         sys.exit(smf.SMF_EXIT_ERR_PERM)
975     syslog.closelog()
976     sys.exit(smf.SMF_EXIT_OK)
977