5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
41 import dbus.mainloop.glib
46 import time_slider.linux.timeslidersmf as timeslidersmf
49 from time_slider.linux.rbac import RBACprofile
58 # Status codes for actual zpool capacity levels.
59 # These are relative to the SMF property defined
60 # levels for: user, warning and emergency levels
61 STATUS_OK = 0 # Below user specified threshold. Everything was OK
62 STATUS_WARNING = 1 # Above specified user threshold level
63 STATUS_CRITICAL = 2 # Above specified critical threshold level
64 STATUS_EMERGENCY = 3 # Above specified emergency threshold level
# Seconds per schedule interval unit; _WEEK/_DAY/_HOUR/_MINUTE are defined
# in an elided portion of this file. "months" is handled separately because
# month lengths vary (see _update_schedules).
66 intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
69 class SnapshotManager(threading.Thread):
# Scheduler thread for the time-slider service: takes ZFS auto-snapshots
# on SMF-defined schedules and performs escalating remedial cleanup when
# zpools approach capacity, notifying the user over D-Bus and syslog.
# NOTE(review): this listing is an elided excerpt of the original file;
# many lines are missing, so only comments have been added below and the
# surviving code lines are left untouched.

# Initialise locks and SMF configuration, connect the D-Bus service and
# plugin manager, exclude swap/dump volumes, then signal the parent
# process (SIGUSR1) that startup succeeded so it can daemonise.
71 def __init__(self, bus):
72 # Used to wake up the run() method prematurely in the event
73 # of a SIGHUP/SMF refresh
74 self._conditionLock = threading.Condition(threading.RLock())
75 # Used when schedules are being rebuilt or examined.
76 self._refreshLock = threading.Lock()
77 # Indicates that cleanup is in progress when locked
78 self._cleanupLock = threading.Lock()
79 self._datasets = zfs.Datasets()
80 # Indicates that schedules need to be rebuilt from scratch
82 self._lastCleanupCheck = 0;
85 self._destroyedsnaps = []
87 # This is also checked during the refresh() method but we need
88 # to know it sooner for instantiation of the PluginManager
89 self._smf = timeslidersmf.TimeSliderSMF()
91 self.verbose = self._smf.get_verbose()
92 except RuntimeError,message:
93 sys.stderr.write("Error determing whether debugging is enabled\n")
96 self._dbus = dbussvc.AutoSnap(bus,
97 '/org/opensolaris/TimeSlider/autosnap',
100 self._plugin = plugin.PluginManager(self.verbose)
101 self.exitCode = smf.SMF_EXIT_OK
104 # Seems we're up and running OK.
105 # Signal our parent so we can daemonise
106 os.kill(os.getppid(), signal.SIGUSR1)
108 # SMF/svc.startd sends SIGHUP to force a
109 # refresh of the daemon
110 signal.signal(signal.SIGHUP, self._signalled)
112 # Init done. Now initialise threading.
113 threading.Thread.__init__ (self)
117 # Deselect swap and dump volumes so they don't get snapshotted.
118 for vol in self._datasets.list_volumes():
119 name = vol.rsplit("/")
121 if (name[1] == "swap" or name[1] == "dump"):
122 util.debug("Auto excluding %s volume" % vol, self.verbose)
123 volume = zfs.Volume(vol)
124 volume.set_auto_snap(False)

# NOTE(review): the enclosing 'def run(self)' header is elided from this
# listing; the code below is the thread's main scheduling loop: cleanup
# check, snapshot check, then sleep on the condition variable until the
# next snapshot is due or a SIGHUP wakes us.
133 # First check and, if necessary, perform any remedial cleanup.
134 # This is best done before creating any new snapshots which may
135 # otherwise get immediately gobbled up by the remedial cleanup.
136 if self._needs_cleanup() == True:
137 self._perform_cleanup()
138 # Check to see if cleanup actually deleted anything before
139 # notifying the user. Avoids the popup appearing continuously
140 if len(self._destroyedsnaps) > 0:
141 self._send_notification()
142 self._send_to_syslog()
144 nexttime = self._check_snapshots()
145 # Overdue snapshots are already taken automatically
146 # inside _check_snapshots() so nexttime should never be
147 # < 0. It can be None however, which is fine since it
148 # will cause the scheduler thread to sleep indefinitely
149 # or until a SIGHUP is caught.
151 util.debug("Waiting until " + str (nexttime), self.verbose)
154 waittime = nexttime - long(time.time())
156 # We took too long and missed a snapshot, so break out
157 # and catch up on it the next time through the loop
159 # waittime could be None if no auto-snap schedules are online
160 self._conditionLock.acquire()
162 util.debug("Waiting %d seconds" % (waittime), self.verbose)
163 self._conditionLock.wait(waittime)
164 else: #None. Just wait a while to check for cleanups.
165 util.debug("No auto-snapshot schedules online.", \
167 self._conditionLock.wait(_MINUTE * 15)
169 except OSError, message:
170 sys.stderr.write("Caught OSError exception in snapshot" +
172 sys.stderr.write("Error details:\n" + \
173 "--------BEGIN ERROR MESSAGE--------\n" + \
175 "\n--------END ERROR MESSAGE--------\n")
176 self.exitCode = smf.SMF_EXIT_ERR_FATAL
179 except RuntimeError,message:
180 sys.stderr.write("Caught RuntimeError exception in snapshot" +
182 sys.stderr.write("Error details:\n" + \
183 "--------BEGIN ERROR MESSAGE--------\n" + \
185 "\n--------END ERROR MESSAGE--------\n")

# Signal handler: on SIGHUP mark the schedules stale (elided) and wake
# the run() loop via the condition variable so it refreshes immediately.
# The non-blocking acquire skips the refresh if one is already running.
189 def _signalled(self, signum, frame):
190 if signum == signal.SIGHUP:
191 if self._refreshLock.acquire(False) == False:
194 self._refreshLock.release()
195 self._conditionLock.acquire()
196 self._conditionLock.notify()
197 self._conditionLock.release()

# NOTE(review): the 'def _refresh(self)' header for the docstring below
# is elided from this listing.
201 Checks if defined snapshot schedules are out
202 of date and rebuilds and updates if necessary
204 self._refreshLock.acquire()
205 if self._stale == True:
206 self._configure_svc_props()
207 self._rebuild_schedules()
208 self._update_schedules()
209 self._plugin.refresh()
211 self._refreshLock.release()

# Read service configuration from SMF: verbosity, remedial-cleanup flag,
# the three capacity thresholds (falling back to factory defaults of
# 80%/90%/95% on error), keep-empties policy, the label separator, and
# the list of non-FAULTED zpools to monitor.
213 def _configure_svc_props(self):
215 self.verbose = self._smf.get_verbose()
216 except RuntimeError,message:
217 sys.stderr.write("Error determing whether debugging is enabled\n")
221 cleanup = self._smf.get_remedial_cleanup()
222 warn = self._smf.get_cleanup_level("warning")
223 util.debug("Warning level value is: %d%%" % warn, self.verbose)
224 crit = self._smf.get_cleanup_level("critical")
225 util.debug("Critical level value is: %d%%" % crit, self.verbose)
226 emer = self._smf.get_cleanup_level("emergency")
227 util.debug("Emergency level value is: %d%%" % emer, self.verbose)
228 except RuntimeError,message:
229 sys.stderr.write("Failed to determine cleanup threshhold levels\n")
230 sys.stderr.write("Details:\n" + \
231 "--------BEGIN ERROR MESSAGE--------\n" + \
233 "\n---------END ERROR MESSAGE---------\n")
234 sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
236 #FIXME - this would be an appropriate case to mark svc as degraded
237 self._remedialCleanup = True
238 self._warningLevel = 80
239 self._criticalLevel = 90
240 self._emergencyLevel = 95
242 self._remedialCleanup = cleanup
243 self._warningLevel = warn
244 self._criticalLevel = crit
245 self._emergencyLevel = emer
248 self._keepEmpties = self._smf.get_keep_empties()
249 except RuntimeError,message:
250 # Not fatal, just assume we delete them (default configuration)
251 sys.stderr.write("Can't determine whether to keep empty snapshots\n")
252 sys.stderr.write("Details:\n" + \
253 "--------BEGIN ERROR MESSAGE--------\n" + \
255 "\n---------END ERROR MESSAGE---------\n")
256 sys.stderr.write("Assuming default value: False\n")
257 self._keepEmpties = False
259 # Previously, snapshot labels used the ":" character as a
260 # separator character for datestamps. Windows filesystems such as
261 # CIFS and FAT choke on this character so now we use a user definable
262 # separator value, with a default value of "_"
263 # We need to check for both the old and new format when looking for
265 self._separator = self._smf.get_separator()
266 self._prefix = "%s[:%s]" \
267 % (autosnapsmf.SNAPLABELPREFIX, self._separator)
272 for poolname in zfs.list_zpools():
273 # Do not try to examine FAULTED pools
274 zpool = zfs.ZPool(poolname)
275 if zpool.health == "FAULTED":
276 util.debug("Ignoring faulted Zpool: %s\n" \
280 self._zpools.append(zpool)
281 util.debug(str(zpool), self.verbose)
282 except RuntimeError,message:
283 sys.stderr.write("Could not list Zpools\n")
284 self.exitCode = smf.SMF_EXIT_ERR_FATAL
285 # Propagate exception up to thread's run() method
286 raise RuntimeError,message

# Rebuild the immutable default/custom/combined schedule tuples from the
# SMF auto-snapshot instances and reset per-schedule last/next/keep maps.
289 def _rebuild_schedules(self):
291 Builds 2 lists of default and custom auto-snapshot SMF instances
299 _defaultSchedules = autosnapsmf.get_default_schedules()
300 _customSchedules = autosnapsmf.get_custom_schedules()
301 except RuntimeError,message:
302 self.exitCode = smf.SMF_EXIT_ERR_FATAL
303 raise RuntimeError, "Error reading SMF schedule instances\n" + \
304 "Details:\n" + str(message)
306 # Now set it in stone.
307 self._defaultSchedules = tuple(_defaultSchedules)
308 self._customSchedules = tuple(_customSchedules)
310 # Build the combined schedule tuple from default + custom schedules
311 _defaultSchedules.extend(_customSchedules)
312 self._allSchedules = tuple(_defaultSchedules)
313 for schedule,i,p,keep in self._allSchedules:
314 self._last[schedule] = 0
315 self._next[schedule] = 0
316 self._keep[schedule] = keep

# Recompute the next-due timestamp for every schedule, seeding unknown
# last-snapshot times from zfs, honouring overlap between the default
# schedules, and special-casing "months" whose length is non-constant.
318 def _update_schedules(self):
320 idx = 1 # Used to index subsets for schedule overlap calculation
323 for schedule,interval,period,keep in self._allSchedules:
324 # Shortcut if we've already processed this schedule and it's
325 # still up to date. Don't skip the default schedules though
326 # because overlap affects their scheduling
327 if [schedule,interval,period,keep] not in \
328 self._defaultSchedules and \
329 (self._next[schedule] > self._last[schedule]):
330 util.debug("Short circuiting %s recalculation" \
335 # If we don't have an internal timestamp for the given schedule
336 # ask zfs for the last snapshot and get its creation timestamp.
337 if self._last[schedule] == 0:
339 snaps = self._datasets.list_snapshots("%s%s" % \
342 except RuntimeError,message:
343 self.exitCode = smf.SMF_EXIT_ERR_FATAL
344 sys.stderr.write("Failed to list snapshots during schedule update\n")
345 #Propagate up to the thread's run() method
346 raise RuntimeError,message
349 util.debug("Last %s snapshot was: %s" % \
350 (schedule, snaps[-1][0]), \
352 self._last[schedule] = snaps[-1][1]
354 last = self._last[schedule]
355 if interval != "months": # months is non-constant. See below.
356 util.debug("Recalculating %s schedule" % (schedule), \
359 totalinterval = intervals[interval] * period
361 self.exitCode = smf.SMF_EXIT_ERR_CONFIG
362 sys.stderr.write(schedule + \
363 " schedule has invalid interval: " + \
364 "'%s\'\n" % interval)
365 #Propagate up to thread's run() method
367 if [schedule,interval,period,keep] in self._defaultSchedules:
368 # This is one of the default schedules so check for an
369 # overlap with one of the dominant schedules.
370 for s,i,p,k in self._defaultSchedules[:idx]:
371 last = max(last, self._last[s])
374 else: # interval == "months"
375 if self._next[schedule] > last:
376 util.debug("Short circuiting " + \
381 util.debug("Recalculating %s schedule" % (schedule), \
383 snap_tm = time.gmtime(self._last[schedule])
384 # Increment year if period >= than 1 calendar year.
385 year = snap_tm.tm_year
389 mon = (snap_tm.tm_mon + period) % 12
390 # Result of 0 actually means december.
393 # Account for period that spans calendar year boundary.
394 elif snap_tm.tm_mon + period > 12:
397 d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
398 d,dnewmon = calendar.monthrange(year, mon)
399 mday = snap_tm.tm_mday
# Clamp the day-of-month when the target month is shorter than the
# month the last snapshot was taken in.
400 if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
403 tm =(year, mon, mday, \
404 snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
406 newt = calendar.timegm(tm)
407 new_tm = time.gmtime(newt)
408 totalinterval = newt - self._last[schedule]
410 self._next[schedule] = last + totalinterval

# NOTE(review): the 'def _next_due(self)' header is elided from this
# listing; the code below returns the (timestamp, schedule) pair of the
# earliest due snapshot, default schedules taking precedence.
415 now = long(time.time())
417 for s,i,p,k in self._defaultSchedules:
420 #Default Schedule - so break out at the first
421 #schedule that is overdue. The subordinate schedules
422 #will re-adjust afterwards.
423 earliest,schedule = due,s
425 elif earliest != None:
427 earliest,schedule = due,s
428 else: #FIXME better optimisation with above condition
429 earliest,schedule = due,s
430 for s,i,p,k in self._customSchedules:
434 earliest,schedule = due,s
435 else: #FIXME better optimisation with above condition
436 earliest,schedule = due,s
437 return earliest,schedule

# Take any overdue snapshots immediately, re-deriving the schedule after
# each one, and return the timestamp of the next future snapshot (or
# None when no schedule is online).
439 def _check_snapshots(self):
441 Check the schedules and see what the required snapshot is.
442 Take one immediately on the first overdue snapshot required
444 # Make sure a refresh() doesn't mess with the schedule while
445 # we're reading through it.
446 self._refreshLock.acquire()
447 next,schedule = self._next_due()
448 self._refreshLock.release()
449 now = long(time.time())
450 while next != None and next <= now:
451 label = self._take_snapshots(schedule)
452 self._plugin.execute_plugins(schedule, label)
453 self._refreshLock.acquire()
454 self._update_schedules()
455 next,schedule = self._next_due();
456 self._refreshLock.release()
457 dt = datetime.datetime.fromtimestamp(next)
458 util.debug("Next snapshot is %s due at: %s" % \
459 (schedule, dt.isoformat()), \

# Create an auto-snapshot set labelled with the schedule and a datestamp,
# record the snapshot time, then purge expired/empty snapshots for the
# schedule. Returns the label (return statement elided in this listing).
463 def _take_snapshots(self, schedule):
464 # Set the time before taking snapshot to avoid clock skew due
465 # to time taken to complete snapshot.
466 tm = long(time.time())
467 label = "%s%s%s-%s" % \
468 (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
469 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
471 self._datasets.create_auto_snapshot_set(label, tag=schedule)
472 except RuntimeError, message:
473 # Write an error message, set the exit code and pass it up the
474 # stack so the thread can terminate
475 sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
477 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
478 raise RuntimeError,message
479 self._last[schedule] = tm;
480 self._perform_purge(schedule)

483 def _prune_snapshots(self, dataset, schedule):
484 """Cleans out zero sized snapshots, kind of cautiously"""
485 # Per schedule: We want to delete 0 sized
486 # snapshots but we need to keep at least one around (the most
487 # recent one) for each schedule so that that overlap is
488 # maintained from frequent -> hourly -> daily etc.
489 # Start off with the smallest interval schedule first and
490 # move up. This increases the amount of data retained where
491 # several snapshots are taken together like a frequent hourly
492 # and daily snapshot taken at 12:00am. If 3 snapshots are all
493 # identical and reference the same identical data they will all
494 # be initially reported as zero for used size. Deleting the
495 # daily first then the hourly would make the data referenced
496 # by all 3 snapshots unique to the frequent scheduled snapshot.
497 # This snapshot would probably be purged within an hour and the
498 # data referenced by it would be gone for good.
499 # Doing it the other way however ensures that the data should
500 # remain accessible to the user for at least a week as long as
501 # the pool doesn't run low on available space before that.
504 snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
505 # Clone the list because we want to remove items from it
506 # while iterating through it.
507 remainingsnaps = snaps[:]
508 except RuntimeError,message:
509 sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
510 self.exitCode = smf.SMF_EXIT_ERR_FATAL
511 raise RuntimeError,message
513 if (self._keepEmpties == False):
514 try: # remove the newest one from the list.
518 for snapname in snaps:
520 snapshot = zfs.Snapshot(snapname)
521 except Exception,message:
522 sys.stderr.write(str(message))
523 # Not fatal, just skip to the next snapshot
527 if snapshot.get_used_size() == 0:
528 util.debug("Destroying zero sized: " + snapname, \
532 except RuntimeError,message:
533 sys.stderr.write("Failed to destroy snapshot: " +
535 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
536 # Propagate exception so thread can exit
537 raise RuntimeError,message
538 remainingsnaps.remove(snapname)
539 except RuntimeError,message:
540 sys.stderr.write("Can not determine used size of: " + \
542 self.exitCode = smf.SMF_EXIT_MON_DEGRADE
543 #Propagate the exception to the thread run() method
544 raise RuntimeError,message
546 # Deleting individual snapshots instead of recursive sets
547 # breaks the recursion chain and leaves child snapshots
548 # dangling so we need to take care of cleaning up the
# Destroy snapshots beyond the per-schedule keep count, oldest first.
550 target = len(remainingsnaps) - self._keep[schedule]
552 while counter < target:
553 util.debug("Destroy expired snapshot: " + \
554 remainingsnaps[counter],
557 snapshot = zfs.Snapshot(remainingsnaps[counter])
558 except Exception,message:
559 sys.stderr.write(str(message))
560 # Not fatal, just skip to the next snapshot
565 except RuntimeError,message:
566 sys.stderr.write("Failed to destroy snapshot: " +
567 snapshot.name + "\n")
568 self.exitCode = smf.SMF_EXIT_ERR_FATAL
569 # Propagate exception so thread can exit
570 raise RuntimeError,message

574 def _perform_purge(self, schedule):
575 """Cautiously cleans out zero sized snapshots"""
576 # We need to avoid accidentally pruning auto snapshots received
577 # from one zpool to another. We ensure this by examining only
578 # snapshots whose parent filesystems and volumes are explicitly
579 # tagged to be snapshotted.
581 for name in self._datasets.list_auto_snapshot_sets(schedule):
582 dataset = zfs.ReadWritableDataset(name)
583 self._prune_snapshots(dataset, schedule)
584 except RuntimeError,message:
585 sys.stderr.write("Error listing datasets during " + \
586 "removal of expired snapshots\n")
587 self.exitCode = smf.SMF_EXIT_ERR_FATAL
588 # Propagate up to thread's run() method
589 raise RuntimeError,message

# Decide whether remedial cleanup should run: disabled by config, skipped
# if one is already running, rate-limited to one check per 15 minutes,
# and triggered only when a snapshotted pool is over the warning level.
591 def _needs_cleanup(self):
592 if self._remedialCleanup == False:
593 # Sys admin has explicitly instructed for remedial cleanups
594 # not to be performed.
596 now = long(time.time())
597 # Don't run checks any less than 15 minutes apart.
598 if self._cleanupLock.acquire(False) == False:
599 #Indicates that a cleanup is already running.
601 # FIXME - Make the cleanup interval equal to the minimum snapshot interval
602 # if custom snapshot schedules are defined and enabled.
603 elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
606 for zpool in self._zpools:
608 if zpool.get_capacity() > self._warningLevel:
609 # Before getting into a panic, determine if the pool
610 # is one we actually take snapshots on, by checking
611 # for one of the "auto-snapshot:<schedule> tags. Not
612 # super fast, but it only happens under exceptional
613 # circumstances of a zpool nearing its capacity.
615 for sched in self._allSchedules:
616 sets = zpool.list_auto_snapshot_sets(sched[0])
618 util.debug("%s needs a cleanup" \
621 self._cleanupLock.release()
623 except RuntimeError, message:
624 sys.stderr.write("Error checking zpool capacity of: " + \
626 self._cleanupLock.release()
627 self.exitCode = smf.SMF_EXIT_ERR_FATAL
628 # Propagate up to thread's run() method.
629 raise RuntimeError,message
630 self._lastCleanupCheck = long(time.time())
631 self._cleanupLock.release()

# Run escalating warning/critical/emergency cleanups on each zpool,
# re-reading capacity after each stage, and record a per-pool status
# code (0-4) for the later syslog/D-Bus notifications.
634 def _perform_cleanup(self):
635 if self._cleanupLock.acquire(False) == False:
636 # Cleanup already running. Skip
638 self._destroyedsnaps = []
639 for zpool in self._zpools:
641 self._poolstatus[zpool.name] = 0
642 capacity = zpool.get_capacity()
643 if capacity > self._warningLevel:
644 self._run_warning_cleanup(zpool)
645 self._poolstatus[zpool.name] = 1
646 capacity = zpool.get_capacity()
647 if capacity > self._criticalLevel:
648 self._run_critical_cleanup(zpool)
649 self._poolstatus[zpool.name] = 2
650 capacity = zpool.get_capacity()
651 if capacity > self._emergencyLevel:
652 self._run_emergency_cleanup(zpool)
653 self._poolstatus[zpool.name] = 3
654 capacity = zpool.get_capacity()
655 if capacity > self._emergencyLevel:
656 self._run_emergency_cleanup(zpool)
657 self._poolstatus[zpool.name] = 4
658 # This also catches exceptions thrown from _run_<level>_cleanup()
659 # and _run_cleanup() in methods called by _perform_cleanup()
660 except RuntimeError,message:
661 sys.stderr.write("Remedial space cleanup failed because " + \
662 "of failure to determinecapacity of: " + \
664 self.exitCode = smf.SMF_EXIT_ERR_FATAL
665 self._cleanupLock.release()
666 # Propagate up to thread's run() method.
667 raise RuntimeError,message
669 # Bad - there's no more snapshots left and nothing
670 # left to delete. We don't disable the service since
671 # it will permit self recovery and snapshot
672 # retention when space becomes available on
673 # the pool (hopefully).
674 util.debug("%s pool status after cleanup:" \
677 util.debug(zpool, self.verbose)
678 util.debug("Cleanup completed. %d snapshots were destroyed" \
679 % len(self._destroyedsnaps), \
681 # Avoid needless list iteration for non-debug mode
682 if self.verbose == True and len(self._destroyedsnaps) > 0:
683 for snap in self._destroyedsnaps:
684 sys.stderr.write("\t%s\n" % snap)
685 self._cleanupLock.release()

# Warning level: destroy expired daily snapshots, then hourly if the
# pool is still over the warning threshold.
687 def _run_warning_cleanup(self, zpool):
688 util.debug("Performing warning level cleanup on %s" % \
691 self._run_cleanup(zpool, "daily", self._warningLevel)
692 if zpool.get_capacity() > self._warningLevel:
693 self._run_cleanup(zpool, "hourly", self._warningLevel)

# Critical level: weekly, then daily, then hourly until under threshold.
695 def _run_critical_cleanup(self, zpool):
696 util.debug("Performing critical level cleanup on %s" % \
699 self._run_cleanup(zpool, "weekly", self._criticalLevel)
700 if zpool.get_capacity() > self._criticalLevel:
701 self._run_cleanup(zpool, "daily", self._criticalLevel)
702 if zpool.get_capacity() > self._criticalLevel:
703 self._run_cleanup(zpool, "hourly", self._criticalLevel)

# Emergency level: every default schedule from monthly down to frequent,
# and finally the custom schedules, until under threshold.
705 def _run_emergency_cleanup(self, zpool):
706 util.debug("Performing emergency level cleanup on %s" % \
709 self._run_cleanup(zpool, "monthly", self._emergencyLevel)
710 if zpool.get_capacity() > self._emergencyLevel:
711 self._run_cleanup(zpool, "weekly", self._emergencyLevel)
712 if zpool.get_capacity() > self._emergencyLevel:
713 self._run_cleanup(zpool, "daily", self._emergencyLevel)
714 if zpool.get_capacity() > self._emergencyLevel:
715 self._run_cleanup(zpool, "hourly", self._emergencyLevel)
716 if zpool.get_capacity() > self._emergencyLevel:
717 self._run_cleanup(zpool, "frequent", self._emergencyLevel)
718 #Finally, as a last resort, delete custom scheduled snapshots
719 for schedule,i,p,k in self._customSchedules:
720 if zpool.get_capacity() < self._emergencyLevel:
723 self._run_cleanup(zpool, schedule, self._emergencyLevel)

# Destroy non-cloned snapshots of the given schedule, oldest first, one
# at a time, until the pool drops below the threshold or none remain.
725 def _run_cleanup(self, zpool, schedule, threshold):
729 clonedsnaps = self._datasets.list_cloned_snapshots()
730 except RuntimeError,message:
731 sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
732 " while recovering pool capacity\n")
733 sys.stderr.write("Error details:\n" + \
734 "--------BEGIN ERROR MESSAGE--------\n" + \
736 "\n--------END ERROR MESSAGE--------\n")
738 # Build a list of snapshots in the given schedule, that are not
739 # cloned, and sort the result in reverse chronological order.
741 snapshots = [s for s,t in \
742 zpool.list_snapshots("%s%s" \
743 % (self._prefix,schedule)) \
744 if not s in clonedsnaps]
746 except RuntimeError,message:
747 sys.stderr.write("Error listing snapshots" +
748 " while recovering pool capacity\n")
749 self.exitCode = smf.SMF_EXIT_ERR_FATAL
750 # Propagate the error up to the thread's run() method.
751 raise RuntimeError,message
753 while zpool.get_capacity() > threshold:
754 if len(snapshots) == 0:
755 syslog.syslog(syslog.LOG_NOTICE,
756 "No more %s snapshots left" \
760 """This is not an exact science. Deleteing a zero sized
761 snapshot can have unpredictable results. For example a
762 pair of snapshots may share exclusive reference to a large
763 amount of data (eg. a large core file). The usage of both
764 snapshots will initially be seen to be 0 by zfs(1). Deleting
765 one of the snapshots will make the data become unique to the
766 single remaining snapshot that references it uniquely. The
767 remaining snapshot's size will then show up as non zero. So
768 deleting 0 sized snapshot is not as pointless as it might seem.
769 It also means we have to loop through this, each snapshot set
770 at a time and observe the before and after results. Perhaps
771 better way exists...."""
773 # Start with the oldest first
774 snapname = snapshots.pop()
775 snapshot = zfs.Snapshot(snapname)
776 # It would be nicer, for performance purposes, to delete sets
777 # of snapshots recursively but this might destroy more data than
778 # absolutely necessary, plus the previous purging of zero sized
779 # snapshots can easily break the recursion chain between
781 # On the positive side there should be fewer snapshots and they
782 # will mostly non-zero so we should get more effectiveness as a
783 # result of deleting snapshots since they should be nearly always
785 util.debug("Destroying %s" % snapname, self.verbose)
788 except RuntimeError,message:
789 # Would be nice to be able to mark service as degraded here
790 # but it's better to try to continue on rather than to give
791 # up altogether (SMF maintenance state)
792 sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
794 sys.stderr.write("Details:\n%s\n" % (str(message)))
796 self._destroyedsnaps.append(snapname)
797 # Give zfs some time to recalculate.

# Log a per-pool message at a severity matching the recorded pool status
# (EMERG/ALERT/CRIT/WARNING), plus a NOTICE with the destroyed count.
800 def _send_to_syslog(self):
801 for zpool in self._zpools:
802 status = self._poolstatus[zpool.name]
804 syslog.syslog(syslog.LOG_EMERG,
805 "%s is over %d%% capacity. " \
806 "All automatic snapshots were destroyed" \
807 % (zpool.name, self._emergencyLevel))
809 syslog.syslog(syslog.LOG_ALERT,
810 "%s exceeded %d%% capacity. " \
811 "Automatic snapshots over 1 hour old were destroyed" \
812 % (zpool.name, self._emergencyLevel))
814 syslog.syslog(syslog.LOG_CRIT,
815 "%s exceeded %d%% capacity. " \
816 "Weekly, hourly and daily automatic snapshots were destroyed" \
817 % (zpool.name, self._criticalLevel))
819 syslog.syslog(syslog.LOG_WARNING,
820 "%s exceeded %d%% capacity. " \
821 "Hourly and daily automatic snapshots were destroyed" \
822 % (zpool.name, self._warningLevel))
824 if len(self._destroyedsnaps) > 0:
825 syslog.syslog(syslog.LOG_NOTICE,
826 "%d automatic snapshots were destroyed" \
827 % len(self._destroyedsnaps))

# Emit a single D-Bus capacity_exceeded signal for the worst-status pool.
829 def _send_notification(self):
833 for zpool in self._zpools:
834 status = self._poolstatus[zpool.name]
835 # >= to ensure that something should always be set.
836 if status >= worststatus:
837 worstpool = zpool.name
840 #FIXME make the various levels indexible
842 self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
843 elif worststatus == 3:
844 self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
845 elif worststatus == 2:
846 self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
847 elif worststatus == 1:
848 self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
849 #elif: 0 everything is fine. Do nothing.
# gobject timeout callback that watches the SnapshotManager thread.
# While the thread is alive it keeps polling (return statement elided in
# this listing); once the thread has exited it maps the thread's exitCode
# onto a process exit status so SMF can react.
852 def monitor_threads(snapthread):
853 if snapthread.is_alive():
856 sys.stderr.write("Snapshot monitor thread exited.\n")
857 if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
858 # FIXME - it would be nicer to mark the service as degraded than
859 # go into maintenance state for some situations such as a
860 # particular snapshot schedule failing.
861 # But for now SMF does not implement this feature. But if/when it
862 # does it's better to use svcadm to put the service into the
863 # correct state since the daemon shouldn't exit when transitioning
864 # to a degraded state.
865 #sys.stderr.write("Placing service into maintenance state\n")
866 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
867 # os.getenv("SMF_FMRI")])
868 # SMF will take care of killing the daemon
869 sys.exit(smf.SMF_EXIT_ERR_FATAL)
871 elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
872 #sys.stderr.write("Placing service into maintenance state\n")
873 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
874 # os.getenv("SMF_FMRI")])
875 # SMF will take care of killing the daemon
876 sys.exit(smf.SMF_EXIT_ERR_FATAL)
879 sys.stderr.write("Snapshot monitor thread exited abnormally\n")
880 sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
881 #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
882 # os.getenv("SMF_FMRI")])
883 sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Parent-side signal handler used while waiting on the forked daemon.

    SIGUSR1 from the child means startup succeeded, so the parent exits
    cleanly; SIGCHLD or SIGALRM means the child died or startup timed
    out, so the parent reports a fatal SMF error. Any other signal is
    ignored.
    """
    # Map each expected signal to the exit status the parent should use.
    exit_status = {
        signal.SIGUSR1: smf.SMF_EXIT_OK,
        signal.SIGCHLD: smf.SMF_EXIT_ERR_FATAL,
        signal.SIGALRM: smf.SMF_EXIT_ERR_FATAL,
    }
    if signum in exit_status:
        sys.exit(exit_status[signum])
895 # Default daemon parameters.
896 # File mode creation mask of the daemon.
898 # Default working directory for the daemon.
900 # Default maximum for the number of available file descriptors.
# NOTE(review): the 'def daemonize():' header and the fork/setsid logic
# are elided from this listing; only fragments of the function remain.
905 Detach a process from the controlling terminal and run it in the
906 background as a daemon.
908 #Catch signals that we might receive from child
909 signal.signal(signal.SIGCHLD, child_sig_handler)
910 signal.signal(signal.SIGUSR1, child_sig_handler)
911 signal.signal(signal.SIGALRM, child_sig_handler)
# NOTE(review): presumably inside an 'except OSError, e:' handler for a
# failed fork (the surrounding lines are elided) — confirm against the
# original source.
915 raise Exception, "%s [%d]" % (e.strerror, e.errno)
918 #Reset signals that we set to trap in parent
919 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
920 signal.signal(signal.SIGUSR1, signal.SIG_DFL)
921 signal.signal(signal.SIGALRM, signal.SIG_DFL)
926 #Wait for the child to give the OK or otherwise.
# NOTE(review): script entry point. The surrounding try/except structure
# and several lines are elided from this listing. Parses --foreground,
# optionally daemonises, checks RBAC authorisation, then wires up D-Bus
# on the gobject main loop and starts the SnapshotManager thread.
932 parser = argparse.ArgumentParser()
933 parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
934 args, _ = parser.parse_known_args()
936 # Daemonise the service.
937 if not args.foreground:
940 # The user security attributes checked are the following:
941 # Note that UID == 0 will match any profile search so
942 # no need to check it explicitly.
943 syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
944 rbacp = RBACprofile()
945 if rbacp.has_profile("ZFS File System Management"):
947 gobject.threads_init()
949 # Tell dbus to use the gobject mainloop for async ops
950 dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
951 dbus.mainloop.glib.threads_init()
952 # Register a bus name with the system dbus daemon
953 systemBus = dbus.SystemBus()
954 name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
956 # Create and start the snapshot manager. Takes care of
957 # auto snapshotting service and auto cleanup.
958 snapshot = SnapshotManager(systemBus)
960 gobject.timeout_add(2000, monitor_threads, snapshot)
962 mainloop = gobject.MainLoop()
965 except KeyboardInterrupt:
967 sys.exit(smf.SMF_EXIT_OK)
969 syslog.syslog(syslog.LOG_ERR,
970 "%s has insufficient privileges to run time-sliderd!" \
973 sys.exit(smf.SMF_EXIT_ERR_PERM)
975 sys.exit(smf.SMF_EXIT_OK)