5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
36 from logging.handlers import SysLogHandler
43 import dbus.mainloop.glib
48 import time_slider.linux.timeslidersmf as timeslidersmf
49 import time_slider.linux.autosnapsmf as autosnapsmf
51 from time_slider.linux.rbac import RBACprofile
54 import time_slider.linux.timesliderconfig as timesliderconfig
# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

# Seconds per schedule-interval unit, used for schedule arithmetic in
# _update_schedules(). _WEEK/_DAY/_HOUR/_MINUTE are defined earlier in
# the file (not visible in this excerpt).
intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
73 class SnapshotManager(threading.Thread):
    def __init__(self, bus):
        """Initialise locks, SMF configuration, the D-Bus service and
        signal handling for the snapshot scheduler thread.

        NOTE(review): this excerpt is missing several original source
        lines; the orphaned except clause below belongs to a try: line
        that is not visible here.
        """
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._lastCleanupCheck = 0;
        self._destroyedsnaps = []
        self.logger = logging.getLogger('time-slider')
        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determing whether debugging is enabled")
        self._dbus = dbussvc.AutoSnap(bus,
            '/org/opensolaris/TimeSlider/autosnap',
        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)
        # SMF/svc.startd sends SIGHUP to force
        # a refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)
        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
        # --- body of the scheduler thread's run() loop ---
        # NOTE(review): the enclosing def run(self) line, the loop
        # statement and several try: lines are missing from this
        # excerpt; the orphaned else/except clauses below belong to
        # them.
        # First check and, if necessary, perform any remedial cleanup.
        # This is best done before creating any new snapshots which may
        # otherwise get immediately gobbled up by the remedial cleanup.
        if self._needs_cleanup() == True:
            self._perform_cleanup()
            # Check to see if cleanup actually deleted anything before
            # notifying the user. Avoids the popup appearing continuously
            if len(self._destroyedsnaps) > 0:
                self._send_notification()
                self._send_to_syslog()
        nexttime = self._check_snapshots()
        # Overdue snapshots are already taken automatically
        # inside _check_snapshots() so nexttime should never be
        # < 0. It can be None however, which is fine since it
        # will cause the scheduler thread to sleep indefinitely
        # or until a SIGHUP is caught.
        util.debug("Waiting until " + str (nexttime), self.verbose)
        waittime = nexttime - long(time.time())
        # We took too long and missed a snapshot, so break out
        # and catch up on it the next time through the loop
        # waittime could be None if no auto-snap schedules are online
        self._conditionLock.acquire()
        util.debug("Waiting %d seconds" % (waittime), self.verbose)
        self._conditionLock.wait(waittime)
        else: #None. Just wait a while to check for cleanups.
            util.debug("No auto-snapshot schedules online.", \
            self._conditionLock.wait(_MINUTE * 15)
        except OSError, message:
            self.logger.error("Caught OSError exception in snapshot" +
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
        except RuntimeError,message:
            self.logger.error("Caught RuntimeError exception in snapshot" +
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
    def _signalled(self, signum, frame):
        """Handle SIGHUP from SMF/svc.startd: refresh state and wake the
        scheduler thread waiting on the condition lock."""
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                # NOTE(review): the body of this branch (presumably an
                # early return while a refresh is in progress) is
                # missing from this excerpt.
            self._refreshLock.release()
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()
        # --- body of _refresh() (its def line is not in this excerpt) ---
        """
        Checks if defined snapshot schedules are out
        of date and rebuilds and updates if necessary
        """
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            # NOTE(review): a line clearing self._stale appears to be
            # missing from this excerpt -- confirm against full source.
        self._refreshLock.release()
    def _configure_svc_props(self):
        """Read daemon configuration (verbosity, cleanup levels, empty
        snapshot policy, label separator, zpool list) from the SMF
        service properties.

        NOTE(review): several try: lines are missing from this excerpt;
        the orphaned except clauses below belong to them.
        """
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determing whether debugging is enabled")
        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            self.logger.error("Failed to determine cleanup threshhold levels")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n---------END ERROR MESSAGE---------\n")
            self.logger.error("Using factory defaults of 80%, 90% and 95%")
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        self._remedialCleanup = cleanup
        self._warningLevel = warn
        self._criticalLevel = crit
        self._emergencyLevel = emer
        self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            self.logger.error("Can't determine whether to keep empty snapshots")
            self.logger.error("Details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n---------END ERROR MESSAGE---------")
            self.logger.error("Assuming default value: False")
            self._keepEmpties = False
        # Previously, the ":" character was used as the separator
        # character for datestamps in snapshot labels. Windows
        # filesystems such as CIFS and FAT choke on this character so
        # now we use a user definable separator value, with a default
        # value of "_".
        # We need to check for both the old and new format when looking for
        # snapshots, hence the [:%s] character class in the prefix regex.
        self._separator = self._smf.get_separator()
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)
        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                util.debug("Ignoring faulted Zpool: %s\n" \
            self._zpools.append(zpool)
            util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            self.logger.error("Could not list Zpools")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message
    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """
        # NOTE(review): the try: line preceding the schedule queries is
        # missing from this excerpt.
        _defaultSchedules = autosnapsmf.get_default_schedules()
        _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                                "Details:\n" + str(message)
        # Now set it in stone.
        self._defaultSchedules = tuple(_defaultSchedules)
        self._customSchedules = tuple(_customSchedules)
        # Build the combined schedule tuple from default + custom schedules
        _defaultSchedules.extend(_customSchedules)
        self._allSchedules = tuple(_defaultSchedules)
        # Reset per-schedule bookkeeping: last/next snapshot timestamps
        # and the number of snapshots to keep.
        for schedule,i,p,keep in self._allSchedules:
            self._last[schedule] = 0
            self._next[schedule] = 0
            self._keep[schedule] = keep
    def _update_schedules(self):
        """Recalculate the next due timestamp for every schedule.

        NOTE(review): this excerpt is missing a number of original
        lines (try: statements, condition lines and continuations of
        debug/error calls); the code below is reproduced as-is.
        """
        idx = 1 # Used to index subsets for schedule overlap calculation
        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \
            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
            if self._last[schedule] == 0:
                snaps = self._datasets.list_snapshots("%s%s" % \
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    self.logger.error("Failed to list snapshots during schedule update")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message
                util.debug("Last %s snapshot was: %s" % \
                           (schedule, snaps[-1][0]), \
                # snaps entries are (name, creation-time) tuples.
                self._last[schedule] = snaps[-1][1]
            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                totalinterval = intervals[interval] * period
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                self.logger.error(schedule + \
                                  " schedule has invalid interval: " + \
                #Propagate up to thread's run() method
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                util.debug("Recalculating %s schedule" % (schedule), \
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period >= than 1 calendar year.
                year = snap_tm.tm_year
                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                # Clamp the day-of-month when the target month is
                # shorter than the source month.
                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                tm =(year, mon, mday, \
                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]
            self._next[schedule] = last + totalinterval
        # --- body of _next_due() (its def line is not in this excerpt) ---
        # Returns (earliest, schedule): the soonest due timestamp across
        # all schedules and the name of the schedule it belongs to.
        # NOTE(review): the lines computing `due` and the surrounding
        # if-conditions are missing from this excerpt.
        now = long(time.time())
        for s,i,p,k in self._defaultSchedules:
            #Default Schedule - so break out at the first
            #schedule that is overdue. The subordinate schedules
            #will re-adjust afterwards.
            earliest,schedule = due,s
        elif earliest != None:
            earliest,schedule = due,s
        else: #FIXME better optimisation with above condition
            earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            earliest,schedule = due,s
        else: #FIXME better optimisation with above condition
            earliest,schedule = due,s
        return earliest,schedule
    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due();
            self._refreshLock.release()
        # NOTE(review): lines between the loop and the debug call below
        # (including, presumably, the final return of `next`) are
        # missing from this excerpt.
        dt = datetime.datetime.fromtimestamp(next)
        util.debug("Next snapshot is %s due at: %s" % \
                   (schedule, dt.isoformat()), \
    def _take_snapshots(self, schedule):
        """Create the auto-snapshot set for `schedule` and purge expired
        snapshots afterwards.

        NOTE(review): the try: line before the create call and the
        trailing return (presumably returning `label`, given the caller
        in _check_snapshots) are missing from this excerpt.
        """
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            self.logger.error("Failed to create snapshots for schedule: %s" \
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm;
        self._perform_purge(schedule)
    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # NOTE(review): several original lines (try: statements,
        # snapshot.destroy() calls and call continuations) are missing
        # from this excerpt.
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that the overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour however
        # and the data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.
        snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
        except RuntimeError,message:
            self.logger.error("Failed to list snapshots during snapshot cleanup")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message
        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
            for snapname in snaps:
                snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    self.logger.error(str(message))
                    # Not fatal, just skip to the next snapshot
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                except RuntimeError,message:
                    self.logger.error("Failed to destroy snapshot: " +
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    # Propagate exception so thread can exit
                    raise RuntimeError,message
                remainingsnaps.remove(snapname)
            except RuntimeError,message:
                self.logger.error("Can not determine used size of: " + \
                self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                #Propagate the exception to the thread's run() method
                raise RuntimeError,message
        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # children too (continuation missing from this excerpt).
        target = len(remainingsnaps) - self._keep[schedule]
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                       remainingsnaps[counter],
            snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                self.logger.error(str(message))
                # Not fatal, just skip to the next snapshot
            except RuntimeError,message:
                self.logger.error("Failed to destroy snapshot: " +
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        # NOTE(review): the try: line preceding the loop below is
        # missing from this excerpt.
        for name in self._datasets.list_auto_snapshot_sets(schedule):
            dataset = zfs.ReadWritableDataset(name)
            self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            self.logger.error("Error listing datasets during " + \
                              "removal of expired snapshots")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message
    def _needs_cleanup(self):
        """Decide whether any monitored zpool requires a remedial
        cleanup (capacity above the warning level).

        NOTE(review): the return statements and some try:/condition
        lines are missing from this excerpt.
        """
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
        for zpool in self._zpools:
            if zpool.get_capacity() > self._warningLevel:
                # Before getting into a panic, determine if the pool
                # is one we actually take snapshots on, by checking
                # for one of the "auto-snapshot:<schedule> tags. Not
                # super fast, but it only happens under exceptional
                # circumstances of a zpool nearing its capacity.
                for sched in self._allSchedules:
                    sets = zpool.list_auto_snapshot_sets(sched[0])
                    util.debug("%s needs a cleanup" \
                    self._cleanupLock.release()
            except RuntimeError, message:
                self.logger.error("Error checking zpool capacity of: " + \
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError,message
        self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
    def _perform_cleanup(self):
        """Run escalating warning/critical/emergency cleanups on each
        pool, recording a per-pool severity in self._poolstatus.

        NOTE(review): try: lines, an early return and parts of the
        debug/error calls are missing from this excerpt.
        """
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
        self._destroyedsnaps = []
        for zpool in self._zpools:
            self._poolstatus[zpool.name] = 0
            capacity = zpool.get_capacity()
            # Escalate through the cleanup levels, re-reading capacity
            # after each pass to see whether the next level is needed.
            if capacity > self._warningLevel:
                self._run_warning_cleanup(zpool)
                self._poolstatus[zpool.name] = 1
                capacity = zpool.get_capacity()
            if capacity > self._criticalLevel:
                self._run_critical_cleanup(zpool)
                self._poolstatus[zpool.name] = 2
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 3
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 4
        # This also catches exceptions thrown from _run_<level>_cleanup()
        # and _run_cleanup() in methods called by _perform_cleanup()
        except RuntimeError,message:
            self.logger.error("Remedial space cleanup failed because " + \
                              "of failure to determinecapacity of: " + \
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            self._cleanupLock.release()
            # Propagate up to thread's run() method.
            raise RuntimeError,message
        # Bad - there's no more snapshots left and nothing
        # left to delete. We don't disable the service since
        # it will permit self recovery and snapshot
        # retention when space becomes available on
        # the pool (hopefully).
        util.debug("%s pool status after cleanup:" \
        util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                self.logger.error("\t%s" % snap)
        self._cleanupLock.release()
    def _run_warning_cleanup(self, zpool):
        """Warning level: destroy daily, then hourly snapshots until the
        pool drops below the warning threshold."""
        # NOTE(review): the continuation of this debug call is missing
        # from this excerpt.
        util.debug("Performing warning level cleanup on %s" % \
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)
    def _run_critical_cleanup(self, zpool):
        """Critical level: destroy weekly, then daily, then hourly
        snapshots until the pool drops below the critical threshold."""
        # NOTE(review): the continuation of this debug call is missing
        # from this excerpt.
        util.debug("Performing critical level cleanup on %s" % \
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)
    def _run_emergency_cleanup(self, zpool):
        """Emergency level: destroy monthly through frequent snapshots,
        then custom-schedule snapshots as a last resort, until the pool
        drops below the emergency threshold."""
        # NOTE(review): the continuation of this debug call and two
        # lines near the custom-schedule loop are missing from this
        # excerpt.
        util.debug("Performing emergency level cleanup on %s" % \
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
            self._run_cleanup(zpool, schedule, self._emergencyLevel)
    def _run_cleanup(self, zpool, schedule, threshold):
        """Destroy snapshots of `schedule` on `zpool`, oldest first,
        until capacity drops below `threshold` or no candidates remain.
        Cloned snapshots are never destroyed.

        NOTE(review): try: lines, the sort of the snapshot list and the
        snapshot.destroy() call are missing from this excerpt.
        """
        clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            self.logger.error("Error (non-fatal) listing cloned snapshots" +
                              " while recovering pool capacity")
            self.logger.error("Error details:\n" + \
                              "--------BEGIN ERROR MESSAGE--------\n" + \
                              "\n--------END ERROR MESSAGE--------")
        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        snapshots = [s for s,t in \
                     zpool.list_snapshots("%s%s" \
                                          % (self._prefix,schedule)) \
                     if not s in clonedsnaps]
        except RuntimeError,message:
            self.logger.error("Error listing snapshots" +
                              " while recovering pool capacity")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message
        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                "No more %s snapshots left" \
            """This is not an exact science. Deleteing a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this, each snapshot set
            at a time and observe the before and after results. Perhaps
            better way exists...."""
            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # parent and child snapshots (continuation missing here).
            # On the positive side there should be fewer snapshots and they
            # will mostly non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            util.debug("Destroying %s" % snapname, self.verbose)
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                self.logger.error("Warning: Cleanup failed to destroy: %s" % \
                self.logger.error("Details:\n%s" % (str(message)))
            self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
    def _send_to_syslog(self):
        """Log a per-pool capacity/cleanup summary via self.logger
        (routed to syslog by the handler installed in main).

        NOTE(review): the if/elif lines comparing `status` against the
        severity levels, and the first message's format string, are
        missing from this excerpt.
        """
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            self.logger.critical( \
                "All automatic snapshots were destroyed" \
                % (zpool.name, self._emergencyLevel))
            "%s exceeded %d%% capacity. " \
            "Automatic snapshots over 1 hour old were destroyed" \
            % (zpool.name, self._emergencyLevel))
            self.logger.critical( \
                "%s exceeded %d%% capacity. " \
                "Weekly, hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._criticalLevel))
            self.logger.warning( \
                "%s exceeded %d%% capacity. " \
                "Hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._warningLevel))
        if len(self._destroyedsnaps) > 0:
            self.logger.warning( \
                "%d automatic snapshots were destroyed" \
                % len(self._destroyedsnaps))
    def _send_notification(self):
        """Emit a D-Bus capacity_exceeded signal for the worst affected
        pool.

        NOTE(review): the initialisation of worstpool/worststatus and
        the first worststatus condition line are missing from this
        excerpt.
        """
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
        #FIXME make the various levels indexible
        self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.
def monitor_threads(snapthread):
    """Watchdog callback (gobject timer): exit the daemon with a fatal
    SMF status if the snapshot thread has died.

    NOTE(review): the body of the is_alive() branch (presumably
    `return True` to keep the timer running) is missing from this
    excerpt.
    """
    logger = logging.getLogger('time-slider')
    if snapthread.is_alive():
    logger.error("Snapshot monitor thread exited.")
    if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
        # FIXME - it would be nicer to mark the service as degraded than
        # go into maintenance state for some situations such as a
        # particular snapshot schedule failing.
        # But for now SMF does not implement this feature. But if/when it
        # does it's better to use svcadm to put the service into the
        # correct state since the daemon shouldn't exit when transitioning
        # to a degraded state.
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        # os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        # os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    logger.error("Snapshot monitor thread exited abnormally")
    logger.error("Exit code: %d" % (snapthread.exitCode))
    #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
    # os.getenv("SMF_FMRI")])
    sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Signal handler installed in the parent while daemonising.

    SIGUSR1 means the child started up fine; SIGCHLD means the child
    died prematurely; SIGALRM means startup timed out. Any other
    signal is ignored.
    """
    if signum == signal.SIGUSR1:
        # Child reported successful startup.
        sys.exit(smf.SMF_EXIT_OK)
    if signum in (signal.SIGCHLD, signal.SIGALRM):
        # Child exited early, or never signalled us in time.
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
# Default daemon parameters.
# File mode creation mask of the daemon.
# Default working directory for the daemon.
# Default maximum for the number of available file descriptors.
# NOTE(review): the constant definitions these comments describe, the
# def line of the daemonize() function below, and its fork/setsid
# logic are missing from this excerpt.
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    raise Exception, "%s [%d]" % (e.strerror, e.errno)
    #Reset signals that we set to trap in parent
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)
    #Wait for the child to give the OK or otherwise.
# --- body of main() (its def line is not in this excerpt) ---
# NOTE(review): several original lines are missing here: try:
# statements, the foreground/daemon branch around the handler setup,
# the `if args.configdump:` test, and the mainloop.run() call.
parser = argparse.ArgumentParser()
parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
parser.add_argument('--configdump', action='store_true', help='Dump default values in config file format', default=False)
args, _ = parser.parse_known_args()
# Dedicated logger; handler levels do the filtering.
logger = logging.getLogger('time-slider')
logger.setLevel(logging.DEBUG)
# Foreground mode logs plain messages to stderr; daemon mode logs to
# syslog via /dev/log with a timestamped format.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(message)s'))
handler = SysLogHandler(address='/dev/log')
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s', '%b %d %H:%M:%S time-sliderd:'))
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
timesliderconfig.configdump()
sys.exit(smf.SMF_EXIT_OK)
timesliderconfig.configfile = args.config
# Daemonise the service.
if not args.foreground:
# The user security attributes checked are the following:
# Note that UID == 0 will match any profile search so
# no need to check it explicitly.
rbacp = RBACprofile()
if rbacp.has_profile("ZFS File System Management"):
    gobject.threads_init()
    # Tell dbus to use the gobject mainloop for async ops
    dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
    dbus.mainloop.glib.threads_init()
    # Register a bus name with the system dbus daemon
    systemBus = dbus.SystemBus()
    name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
    # Create and start the snapshot manager. Takes care of
    # auto snapshotting service and auto cleanup.
    snapshot = SnapshotManager(systemBus)
    gobject.timeout_add(2000, monitor_threads, snapshot)
    mainloop = gobject.MainLoop()
    except KeyboardInterrupt:
        sys.exit(smf.SMF_EXIT_OK)
    "%s has insufficient privileges to run time-sliderd!" \
    sys.exit(smf.SMF_EXIT_ERR_PERM)
sys.exit(smf.SMF_EXIT_OK)