5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
40 import dbus.mainloop.glib
48 from rbac import RBACprofile
# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

# Maps schedule interval unit names to their length in seconds.
# NOTE(review): _WEEK/_DAY/_HOUR/_MINUTE are defined on lines elided
# from this excerpt — presumably seconds-per-unit constants; verify.
intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
class SnapshotManager(threading.Thread):
    """Daemon worker thread for the time-slider SMF service.

    Schedules and takes automatic ZFS snapshots and performs remedial
    cleanup when pool capacity thresholds are exceeded.

    NOTE(review): several original source lines are elided from this
    excerpt; apparent syntax gaps below (e.g. "except" with no visible
    "try:") are extraction artifacts, not author errors.
    """

    def __init__(self, bus):
        # bus: D-Bus connection used for user notification signals.
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._lastCleanupCheck = 0;
        self._destroyedsnaps = []
        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        self.verbose = self._smf.get_verbose()
        # Non-fatal failure: report and carry on.
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")
        # D-Bus service object through which notifications are emitted.
        self._dbus = dbussvc.AutoSnap(bus,
                                      '/org/opensolaris/TimeSlider/autosnap',
        self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)
        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)
        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)

        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
            # NOTE(review): body of the run() scheduler loop; the
            # enclosing "def run(self):", loop header and several
            # interior lines are elided from this excerpt.
            # First check and, if necessary, perform any remedial cleanup.
            # This is best done before creating any new snapshots which may
            # otherwise get immediately gobbled up by the remedial cleanup.
            if self._needs_cleanup() == True:
                self._perform_cleanup()
                # Check to see if cleanup actually deleted anything before
                # notifying the user. Avoids the popup appearing continuously
                if len(self._destroyedsnaps) > 0:
                    self._send_notification()
                    self._send_to_syslog()

            nexttime = self._check_snapshots()
            # Overdue snapshots are already taken automatically
            # inside _check_snapshots() so nexttime should never be
            # < 0. It can be None however, which is fine since it
            # will cause the scheduler thread to sleep indefinitely
            # or until a SIGHUP is caught.
            util.debug("Waiting until " + str (nexttime), self.verbose)
            waittime = nexttime - long(time.time())
            # We took too long and missed a snapshot, so break out
            # and catch up on it the next time through the loop
            # waittime could be None if no auto-snap schedules are online
            self._conditionLock.acquire()
            util.debug("Waiting %d seconds" % (waittime), self.verbose)
            self._conditionLock.wait(waittime)
            else: #None. Just wait a while to check for cleanups.
                util.debug("No auto-snapshot schedules online.", \
                self._conditionLock.wait(_MINUTE * 15)
        except OSError, message:
            sys.stderr.write("Caught OSError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             "\n--------END ERROR MESSAGE--------\n")
            # Fatal: record the exit status for monitor_threads().
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
        except RuntimeError,message:
            sys.stderr.write("Caught RuntimeError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             "\n--------END ERROR MESSAGE--------\n")
    def _signalled(self, signum, frame):
        # SIGHUP handler: wakes the sleeping scheduler thread so it
        # re-reads the service configuration promptly.
        # NOTE(review): the branch taken when the refresh lock is already
        # held (rebuild in progress) is elided from this excerpt.
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
            self._refreshLock.release()
            self._conditionLock.acquire()
            # Interrupt the run() method's conditional wait.
            self._conditionLock.notify()
            self._conditionLock.release()
        # NOTE(review): the enclosing "def refresh(self):" line and its
        # docstring delimiters are elided from this excerpt.
        # Checks if defined snapshot schedules are out
        # of date and rebuilds and updates if necessary
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            self._plugin.refresh()
        # NOTE(review): _stale is presumably reset on an elided line.
        self._refreshLock.release()
    def _configure_svc_props(self):
        # Reads service tunables from SMF: verbosity, remedial cleanup
        # enablement and its threshold levels, the keep-empties policy,
        # the snapshot label separator, and the set of non-faulted
        # zpools to manage.
        # NOTE(review): several "try:"/"else:" framing lines are elided
        # from this excerpt.
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")

        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Failed to determine cleanup threshhold levels\n")
            sys.stderr.write("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
            # Success path (elided "else:" framing): adopt SMF values.
            self._remedialCleanup = cleanup
            self._warningLevel = warn
            self._criticalLevel = crit
            self._emergencyLevel = emer

        self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            sys.stderr.write("Can't determine whether to keep empty snapshots\n")
            sys.stderr.write("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Assuming default value: False\n")
            self._keepEmpties = False

        # Previously, the ":" character was used as a
        # separator character for datestamps in snapshot labels.
        # Windows filesystems such as
        # CIFS and FAT choke on this character so now we use a user definable
        # separator value, with a default value of "_"
        # We need to check for both the old and new format when looking for
        self._separator = self._smf.get_separator()
        # Regex-style prefix matching both old (":") and new separators.
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)

        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                util.debug("Ignoring faulted Zpool: %s\n" \
            self._zpools.append(zpool)
            util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Could not list Zpools\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message
    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """
        # NOTE(review): some framing lines ("try:", dict resets) are
        # elided from this excerpt.
        _defaultSchedules = autosnapsmf.get_default_schedules()
        _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                                "Details:\n" + str(message)

        # Now set it in stone.
        self._defaultSchedules = tuple(_defaultSchedules)
        self._customSchedules = tuple(_customSchedules)

        # Build the combined schedule tuple from default + custom schedules
        _defaultSchedules.extend(_customSchedules)
        self._allSchedules = tuple(_defaultSchedules)
        # Reset per-schedule bookkeeping: last/next snapshot timestamps
        # and the retention count for each schedule.
        for schedule,i,p,keep in self._allSchedules:
            self._last[schedule] = 0
            self._next[schedule] = 0
            self._keep[schedule] = keep
    def _update_schedules(self):
        # Recomputes self._next[schedule] for every schedule from the
        # most recent snapshot time (internal state, or queried from
        # zfs) plus the schedule's interval. Default schedules are
        # aligned with their dominant (less frequent) schedules so
        # overlapping snapshots fire together.
        # NOTE(review): many framing lines ("try:", "else:", call
        # continuations, month/year adjustments) are elided from this
        # excerpt.
        idx = 1 # Used to index subsets for schedule overlap calculation
        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
               self._defaultSchedules and \
               (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \

            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
            if self._last[schedule] == 0:
                snaps = self._datasets.list_snapshots("%s%s" % \
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    sys.stderr.write("Failed to list snapshots during schedule update\n")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message
                util.debug("Last %s snapshot was: %s" % \
                           (schedule, snaps[-1][0]), \
                # snaps entries appear to be (name, creation-time) pairs.
                self._last[schedule] = snaps[-1][1]

            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                totalinterval = intervals[interval] * period
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                sys.stderr.write(schedule + \
                                 " schedule has invalid interval: " + \
                                 "'%s\'\n" % interval)
                #Propagate up to thread's run() method
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])

            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                util.debug("Recalculating %s schedule" % (schedule), \
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period >= than 1 calendar year.
                year = snap_tm.tm_year
                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                # Clamp the day-of-month to the length of the new month.
                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                tm =(year, mon, mday, \
                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]

            self._next[schedule] = last + totalinterval
        # NOTE(review): body of _next_due(); the enclosing def line and
        # the "due"/"earliest" comparison lines are elided from this
        # excerpt. Returns (earliest, schedule): the soonest due time
        # across all schedules and the schedule that owns it.
        now = long(time.time())
        for s,i,p,k in self._defaultSchedules:
                #Default Schedule - so break out at the first
                #schedule that is overdue. The subordinate schedules
                #will re-adjust afterwards.
                earliest,schedule = due,s
            elif earliest != None:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        return earliest,schedule
    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        # Keep snapshotting until nothing is overdue any more.
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due();
            self._refreshLock.release()
        dt = datetime.datetime.fromtimestamp(next)
        util.debug("Next snapshot is %s due at: %s" % \
                   (schedule, dt.isoformat()), \
        # NOTE(review): the return of "next" is on an elided line.
    def _take_snapshots(self, schedule):
        # Takes a recursive auto-snapshot set for the given schedule,
        # records the time taken, purges expired snapshots, and
        # (on an elided line) returns the label used.
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm;
        self._perform_purge(schedule)
    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that that overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would shift make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour however and
        # the data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.
        # NOTE(review): several framing lines ("try:", destroy() calls,
        # counter initialisation/increment) are elided from this excerpt.

        snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
        except RuntimeError,message:
            sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message

        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
            for snapname in snaps:
                snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    sys.stderr.write(str(message))
                    # Not fatal, just skip to the next snapshot
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                    except RuntimeError,message:
                        sys.stderr.write("Failed to destroy snapshot: " +
                        self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                        # Propagate exception so thread can exit
                        raise RuntimeError,message
                    remainingsnaps.remove(snapname)
                except RuntimeError,message:
                    sys.stderr.write("Can not determine used size of: " + \
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    #Propagate the exception to the thread run() method
                    raise RuntimeError,message

        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # Destroy the oldest snapshots beyond the retention count.
        target = len(remainingsnaps) - self._keep[schedule]
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                       remainingsnaps[counter],
            snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                sys.stderr.write(str(message))
                # Not fatal, just skip to the next snapshot
            except RuntimeError,message:
                sys.stderr.write("Failed to destroy snapshot: " +
                                 snapshot.name + "\n")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        # NOTE(review): the "try:" framing line is elided from this
        # excerpt.
        for name in self._datasets.list_auto_snapshot_sets(schedule):
            dataset = zfs.ReadWritableDataset(name)
            self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            sys.stderr.write("Error listing datasets during " + \
                             "removal of expired snapshots\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message
    def _needs_cleanup(self):
        # Returns whether any managed zpool has exceeded the warning
        # capacity level and actually carries auto-snapshot sets worth
        # purging. Checks are throttled to at most one per 15 minutes.
        # NOTE(review): the "return True/False" lines for the early-out
        # and match branches are elided from this excerpt.
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
        for zpool in self._zpools:
                if zpool.get_capacity() > self._warningLevel:
                    # Before getting into a panic, determine if the pool
                    # is one we actually take snapshots on, by checking
                    # for one of the "auto-snapshot:<schedule> tags. Not
                    # super fast, but it only happens under exceptional
                    # circumstances of a zpool nearing its capacity.
                    for sched in self._allSchedules:
                        sets = zpool.list_auto_snapshot_sets(sched[0])
                        util.debug("%s needs a cleanup" \
                        self._cleanupLock.release()
            except RuntimeError, message:
                sys.stderr.write("Error checking zpool capacity of: " + \
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError,message
        self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
    def _perform_cleanup(self):
        # Escalates remedial cleanup per pool through the warning ->
        # critical -> emergency levels until capacity drops below the
        # configured thresholds, recording a 0-4 status code for each
        # pool in self._poolstatus (4 = still over emergency level
        # after a final emergency pass).
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
        self._destroyedsnaps = []
        for zpool in self._zpools:
                self._poolstatus[zpool.name] = 0
                capacity = zpool.get_capacity()
                if capacity > self._warningLevel:
                    self._run_warning_cleanup(zpool)
                    self._poolstatus[zpool.name] = 1
                    capacity = zpool.get_capacity()
                if capacity > self._criticalLevel:
                    self._run_critical_cleanup(zpool)
                    self._poolstatus[zpool.name] = 2
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 3
                    capacity = zpool.get_capacity()
                # Final retry at emergency level before giving up.
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from _run_<level>_cleanup()
            # and _run_cleanup() in methods called by _perform_cleanup()
            except RuntimeError,message:
                sys.stderr.write("Remedial space cleanup failed because " + \
                                 "of failure to determinecapacity of: " + \
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError,message

            # Bad - there's no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            util.debug("%s pool status after cleanup:" \
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                sys.stderr.write("\t%s\n" % snap)
        self._cleanupLock.release()
    def _run_warning_cleanup(self, zpool):
        # Warning level: purge daily, then (if still over the warning
        # threshold) hourly snapshots on the given pool.
        util.debug("Performing warning level cleanup on %s" % \
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)
    def _run_critical_cleanup(self, zpool):
        # Critical level: purge weekly, then daily, then hourly
        # snapshots, re-checking capacity between each pass.
        util.debug("Performing critical level cleanup on %s" % \
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)
    def _run_emergency_cleanup(self, zpool):
        # Emergency level: purge monthly through frequent snapshots in
        # order of decreasing age, re-checking capacity between passes,
        # and finally fall back to custom-schedule snapshots.
        util.debug("Performing emergency level cleanup on %s" % \
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
            self._run_cleanup(zpool, schedule, self._emergencyLevel)
    def _run_cleanup(self, zpool, schedule, threshold):
        # Destroys snapshots of the given schedule on the given pool,
        # oldest first, until capacity falls to or below the threshold
        # or no snapshots remain. Cloned snapshots are never destroyed.
        # NOTE(review): "try:" framing, destroy() calls and sleep lines
        # are elided from this excerpt.
        clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
                             " while recovering pool capacity\n")
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             "\n--------END ERROR MESSAGE--------\n")

        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        snapshots = [s for s,t in \
                     zpool.list_snapshots("%s%s" \
                                          % (self._prefix,schedule)) \
                     if not s in clonedsnaps]
        except RuntimeError,message:
            sys.stderr.write("Error listing snapshots" +
                             " while recovering pool capacity\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message

        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                syslog.syslog(syslog.LOG_NOTICE,
                              "No more %s snapshots left" \
            """This is not an exact science. Deleteing a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this, each snapshot set
            at a time and observe the before and after results. Perhaps
            better way exists...."""

            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # On the positive side there should be fewer snapshots and they
            # will mostly non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            util.debug("Destroying %s" % snapname, self.verbose)
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
                sys.stderr.write("Details:\n%s\n" % (str(message)))
            # Record what we destroyed for later notification/syslog.
            self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
    def _send_to_syslog(self):
        # Logs one message per pool at a syslog severity matching the
        # pool's recorded cleanup status, plus a summary count of all
        # destroyed snapshots.
        # NOTE(review): the "if status == N:" condition lines selecting
        # each severity branch are elided from this excerpt.
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
                syslog.syslog(syslog.LOG_EMERG,
                              "%s is over %d%% capacity. " \
                              "All automatic snapshots were destroyed" \
                              % (zpool.name, self._emergencyLevel))
                syslog.syslog(syslog.LOG_ALERT,
                              "%s exceeded %d%% capacity. " \
                              "Automatic snapshots over 1 hour old were destroyed" \
                              % (zpool.name, self._emergencyLevel))
                syslog.syslog(syslog.LOG_CRIT,
                              "%s exceeded %d%% capacity. " \
                              "Weekly, hourly and daily automatic snapshots were destroyed" \
                              % (zpool.name, self._criticalLevel))
                syslog.syslog(syslog.LOG_WARNING,
                              "%s exceeded %d%% capacity. " \
                              "Hourly and daily automatic snapshots were destroyed" \
                              % (zpool.name, self._warningLevel))

        if len(self._destroyedsnaps) > 0:
            syslog.syslog(syslog.LOG_NOTICE,
                          "%d automatic snapshots were destroyed" \
                          % len(self._destroyedsnaps))
    def _send_notification(self):
        # Emits a D-Bus capacity_exceeded signal for the single worst
        # status pool so the GUI applet can notify the user.
        # NOTE(review): initialisation of worststatus/worstpool and the
        # leading "if worststatus == 4:" line are elided from this
        # excerpt.
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name

        #FIXME make the various levels indexible
            self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.
def monitor_threads(snapthread):
    # gobject timeout callback: watches the SnapshotManager thread and
    # terminates the daemon with an appropriate SMF status once the
    # thread dies. The "return True" that keeps the timer armed while
    # the thread is alive is on a line elided from this excerpt.
    if snapthread.is_alive():
    sys.stderr.write("Snapshot monitor thread exited.\n")
    if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
        # FIXME - it would be nicer to mark the service as degraded than
        # go into maintenance state for some situations such as a
        # particular snapshot schedule failing.
        # But for now SMF does not implement this feature. But if/when it
        # does it's better to use svcadm to put the service into the
        # correct state since the daemon shouldn't exit when transitioning
        # to a degraded state.
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    # Unknown exit code: treat as fatal.
    sys.stderr.write("Snapshot monitor thread exited abnormally\n")
    sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
    #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
    #    os.getenv("SMF_FMRI")])
    sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Parent-side signal handler used while daemonising.

    SIGUSR1 from the child indicates successful startup, so exit with
    SMF success; SIGCHLD (child died) or SIGALRM (startup timed out)
    means failure, so exit with the fatal SMF error code.
    """
    if signum == signal.SIGUSR1:
        sys.exit(smf.SMF_EXIT_OK)
    if signum in (signal.SIGCHLD, signal.SIGALRM):
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
# Default daemon parameters.
# File mode creation mask of the daemon.
# Default working directory for the daemon.
# Default maximum for the number of available file descriptors.
# NOTE(review): the constant definitions these comments describe, the
# "def daemonize():" line and the fork/setsid/chdir logic are elided
# from this excerpt.
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    # NOTE(review): raised from an elided OSError handler around fork();
    # "e" is presumably the caught OSError — verify against full source.
    raise Exception, "%s [%d]" % (e.strerror, e.errno)

    #Reset signals that we set to trap in parent
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)

    #Wait for the child to give the OK or otherwise.
    # NOTE(review): body of the daemon's main entry point; its def line,
    # the daemonize() call, mainloop.run() and "else:" framing lines are
    # elided from this excerpt.
    # Check SMF invocation environment
    if os.getenv("SMF_FMRI") == None or os.getenv("SMF_METHOD") != "start":
        sys.stderr.write("Command line invocation of %s unsupported.\n" \
        sys.stderr.write("This command is intended for smf(5) invocation only.\n")
        sys.exit(smf.SMF_EXIT_ERR_NOSMF)

    # Daemonise the service.
    # The user security attributes checked are the following:
    # Note that UID == 0 will match any profile search so
    # no need to check it explicitly.
    syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):
        gobject.threads_init()
        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()
        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)

        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        # Poll the worker thread every 2 seconds for liveness.
        gobject.timeout_add(2000, monitor_threads, snapshot)
        mainloop = gobject.MainLoop()
        except KeyboardInterrupt:
            sys.exit(smf.SMF_EXIT_OK)
        # Insufficient RBAC privileges branch ("else:" elided).
        syslog.syslog(syslog.LOG_ERR,
                      "%s has insufficient privileges to run time-sliderd!" \
        sys.exit(smf.SMF_EXIT_ERR_PERM)
    sys.exit(smf.SMF_EXIT_OK)