5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
41 import dbus.mainloop.glib
46 import time_slider.linux.timeslidersmf as timeslidersmf
47 import time_slider.linux.autosnapsmf as autosnapsmf
49 from time_slider.linux.rbac import RBACprofile
52 import time_slider.linux.timesliderconfig as timesliderconfig
# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level
# Seconds per schedule-interval unit, keyed by the interval names used in
# the auto-snapshot SMF instances.  _WEEK/_DAY/_HOUR/_MINUTE are defined
# earlier in the file (not visible in this listing) -- presumably the usual
# second counts; verify upstream.
intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
class SnapshotManager(threading.Thread):
    # Daemon worker thread: drives scheduled ZFS auto-snapshots and, when
    # pools fill up, remedial cleanup.
    # NOTE(review): this listing is missing many original source lines;
    # gaps are marked with "[listing gap]" comments below.

    def __init__(self, bus):
        """Initialise locks, SMF/dbus state and signal handling.

        bus -- the D-Bus connection handed to dbussvc.AutoSnap.
        """
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        # [listing gap: original line(s) missing here]
        self._lastCleanupCheck = 0;
        # [listing gap]
        self._destroyedsnaps = []
        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        # [listing gap: a `try:` from the original file is missing here]
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")
        # [listing gap: continuation/close of the dbus constructor call missing]
        self._dbus = dbussvc.AutoSnap(bus,
            '/org/opensolaris/TimeSlider/autosnap',
        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)
        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)
        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)
        # --- interior of run(); its `def run(self):` line is missing from
        # this listing, as are the main service loop and `try:` headers
        # that the dangling `except` clauses below belong to ---
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            # [listing gap]
            if (name[1] == "swap" or name[1] == "dump"):
                util.debug("Auto excluding %s volume" % vol, self.verbose)
                volume = zfs.Volume(vol)
                volume.set_auto_snap(False)
        # [listing gap: the service loop / `try:` opens here in the original]
        # First check and, if necessary, perform any remedial cleanup.
        # This is best done before creating any new snapshots which may
        # otherwise get immediately gobbled up by the remedial cleanup.
        if self._needs_cleanup() == True:
            self._perform_cleanup()
            # Check to see if cleanup actually deleted anything before
            # notifying the user. Avoids the popup appearing continuously
            if len(self._destroyedsnaps) > 0:
                self._send_notification()
                self._send_to_syslog()
        nexttime = self._check_snapshots()
        # Overdue snapshots are already taken automatically
        # inside _check_snapshots() so nexttime should never be
        # < 0. It can be None however, which is fine since it
        # will cause the scheduler thread to sleep indefinitely
        # or until a SIGHUP is caught.
        # [listing gap: `if nexttime != None:` branch header presumably missing]
        util.debug("Waiting until " + str (nexttime), self.verbose)
        waittime = nexttime - long(time.time())
        # We took too long and missed a snapshot, so break out
        # and catch up on it the next time through the loop
        # waittime could be None if no auto-snap schedules are online
        self._conditionLock.acquire()
        util.debug("Waiting %d seconds" % (waittime), self.verbose)
        self._conditionLock.wait(waittime)
        else: #None. Just wait a while to check for cleanups.
            # [listing gap: continuation of this debug call missing]
            util.debug("No auto-snapshot schedules online.", \
            self._conditionLock.wait(_MINUTE * 15)
        except OSError, message:
            # [listing gap: string continuations of these writes missing]
            sys.stderr.write("Caught OSError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
        except RuntimeError,message:
            sys.stderr.write("Caught RuntimeError exception in snapshot" +
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
    def _signalled(self, signum, frame):
        """SIGHUP handler: wake the scheduler thread so it refreshes its
        schedules immediately (SMF sends SIGHUP on `svcadm refresh`)."""
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                # [listing gap: a refresh is already running; the original
                # presumably returns early here -- verify upstream]
            self._refreshLock.release()
            # Wake up the run() loop sleeping in _conditionLock.wait().
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()
        # --- interior of refresh(); its `def` line and the docstring's
        # triple-quote delimiters are missing from this listing ---
        Checks if defined snapshot schedules are out
        of date and rebuilds and updates if necessary
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            # [listing gap: presumably clears self._stale here -- verify]
        self._refreshLock.release()
    def _configure_svc_props(self):
        """Read service configuration from SMF (verbosity, remedial-cleanup
        switch, warning/critical/emergency capacity levels, keep-empties,
        label separator) and enumerate healthy zpools into self._zpools.
        Falls back to factory defaults (80/90/95%) if SMF reads fail."""
        # [listing gap: `try:` missing]
        self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determing whether debugging is enabled\n")
        # [listing gap: `try:` for the threshold reads missing]
        cleanup = self._smf.get_remedial_cleanup()
        warn = self._smf.get_cleanup_level("warning")
        util.debug("Warning level value is: %d%%" % warn, self.verbose)
        crit = self._smf.get_cleanup_level("critical")
        util.debug("Critical level value is: %d%%" % crit, self.verbose)
        emer = self._smf.get_cleanup_level("emergency")
        util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Failed to determine cleanup threshhold levels\n")
            sys.stderr.write("Details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        # [listing gap: `else:` presumably missing before the assignments below]
        self._remedialCleanup = cleanup
        self._warningLevel = warn
        self._criticalLevel = crit
        self._emergencyLevel = emer
        # [listing gap: `try:` missing]
        self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            sys.stderr.write("Can't determine whether to keep empty snapshots\n")
            sys.stderr.write("Details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Assuming default value: False\n")
            self._keepEmpties = False
        # Previously, the ":" character was used as a
        # separator character for datestamps. Windows filesystems such as
        # CIFS and FAT choke on this character so now we use a user definable
        # separator value, with a default value of "_"
        # We need to check for both the old and new format when looking for
        # [listing gap: rest of this comment missing]
        self._separator = self._smf.get_separator()
        # Regex-style prefix matching both old (":") and new separators.
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)
        # [listing gap: self._zpools reset and `try:` presumably missing]
        for poolname in zfs.list_zpools():
            # Do not try to examine FAULTED pools
            zpool = zfs.ZPool(poolname)
            if zpool.health == "FAULTED":
                # [listing gap: continuation of this debug call and a
                # `continue`/`else` presumably missing]
                util.debug("Ignoring faulted Zpool: %s\n" \
            self._zpools.append(zpool)
            util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Could not list Zpools\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message
    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        # [listing gap: remainder of docstring, its closing quotes, state
        # resets and the `try:` are missing from this listing]
        _defaultSchedules = autosnapsmf.get_default_schedules()
        _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                "Details:\n" + str(message)
        # Now set it in stone.
        self._defaultSchedules = tuple(_defaultSchedules)
        self._customSchedules = tuple(_customSchedules)
        # Build the combined schedule tuple from default + custom schedules
        _defaultSchedules.extend(_customSchedules)
        self._allSchedules = tuple(_defaultSchedules)
        # Each schedule tuple is (name, interval, period, keep); seed the
        # per-schedule bookkeeping dicts.
        for schedule,i,p,keep in self._allSchedules:
            self._last[schedule] = 0
            self._next[schedule] = 0
            self._keep[schedule] = keep
    def _update_schedules(self):
        """Recompute self._next[schedule] for every schedule, taking account
        of overlap between the ordered default schedules and the variable
        length of calendar months.  (review: listing is incomplete.)"""
        # [listing gap]
        idx = 1 # Used to index subsets for schedule overlap calculation
        # [listing gap]
        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                # [listing gap: continuation of this debug call and a
                # `continue` presumably missing]
                util.debug("Short circuiting %s recalculation" \
            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get it's creation timestamp.
            if self._last[schedule] == 0:
                # [listing gap: `try:` and snapshot-pattern continuation missing]
                snaps = self._datasets.list_snapshots("%s%s" % \
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    sys.stderr.write("Failed to list snapshots during schedule update\n")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message
                # [listing gap: guard for empty `snaps` presumably missing]
                util.debug("Last %s snapshot was: %s" % \
                    (schedule, snaps[-1][0]), \
                # snaps entries are (name, creation-time) pairs.
                self._last[schedule] = snaps[-1][1]
            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                # [listing gap: `try:` missing]
                totalinterval = intervals[interval] * period
                # [listing gap: `except KeyError:` presumably missing]
                self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                sys.stderr.write(schedule + \
                    " schedule has invalid interval: " + \
                    "'%s\'\n" % interval)
                #Propagate up to thread's run() method
                # [listing gap: the `raise` is missing]
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
                    # [listing gap: idx increment presumably missing]
            else: # interval == "months"
                if self._next[schedule] > last:
                    # [listing gap: continuation and `continue` missing]
                    util.debug("Short circuiting " + \
                util.debug("Recalculating %s schedule" % (schedule), \
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period >= than 1 calender year.
                year = snap_tm.tm_year
                # [listing gap: year/month carry arithmetic missing]
                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                # [listing gap]
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                # [listing gap]
                # Clamp the day-of-month when the target month is shorter
                # than the source month (e.g. Jan 31 + 1 month).
                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                    # [listing gap: mday clamp presumably missing]
                tm =(year, mon, mday, \
                    snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                # [listing gap: remaining struct_time fields missing]
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]
            # [listing gap]
            self._next[schedule] = last + totalinterval
        # --- interior of _next_due(); its `def` line, docstring and the
        # per-schedule `due` computation are missing from this listing.
        # Returns (earliest-due-timestamp-or-None, schedule-name). ---
        now = long(time.time())
        # [listing gap: earliest/schedule initialisation missing]
        for s,i,p,k in self._defaultSchedules:
            # [listing gap: `due` computation and overdue test missing]
                #Default Schedule - so break out at the first
                #schedule that is overdue. The subordinate schedules
                #will re-adjust afterwards.
                earliest,schedule = due,s
                # [listing gap: `break` presumably missing]
            elif earliest != None:
                # [listing gap: `if due < earliest:` presumably missing]
                earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            # [listing gap: `due` computation and comparison missing]
                earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        return earliest,schedule
    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        # [listing gap: docstring closing quotes missing]
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        # Catch up on every overdue schedule before going back to sleep.
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due();
            self._refreshLock.release()
            # [listing gap: `now` refresh presumably missing]
            dt = datetime.datetime.fromtimestamp(next)
            util.debug("Next snapshot is %s due at: %s" % \
                (schedule, dt.isoformat()), \
        # [listing gap: debug continuation and `return next` presumably missing]
    def _take_snapshots(self, schedule):
        """Create the recursive auto-snapshot set for `schedule`, record the
        time taken, then purge expired snapshots for that schedule."""
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
            (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
            datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        # [listing gap: `try:` missing]
        self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            # [listing gap: string continuation missing]
            sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm;
        self._perform_purge(schedule)
        # [listing gap: presumably `return label` -- verify upstream]
    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour and the
        # data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.
        # [listing gap: `try:` missing]
        snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
        # Clone the list because we want to remove items from it
        # while iterating through it.
        remainingsnaps = snaps[:]
        except RuntimeError,message:
            sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message
        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
                # [listing gap: pop of newest snapshot presumably missing]
            for snapname in snaps:
                # [listing gap: `try:` missing]
                snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    sys.stderr.write(str(message))
                    # Not fatal, just skip to the next snapshot
                    # [listing gap: `continue` presumably missing]
                # [listing gap: `try:` missing]
                if snapshot.get_used_size() == 0:
                    util.debug("Destroying zero sized: " + snapname, \
                    # [listing gap: destroy call inside nested `try:` missing]
                    except RuntimeError,message:
                        # [listing gap: string continuation missing]
                        sys.stderr.write("Failed to destroy snapshot: " +
                        self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                        # Propagate exception so thread can exit
                        raise RuntimeError,message
                    remainingsnaps.remove(snapname)
                except RuntimeError,message:
                    sys.stderr.write("Can not determine used size of: " + \
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    #Propagate the exception to the thread run() method
                    raise RuntimeError,message
        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # [listing gap: rest of comment and `counter` initialisation missing]
        target = len(remainingsnaps) - self._keep[schedule]
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                remainingsnaps[counter],
            # [listing gap: `try:` missing]
            snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                sys.stderr.write(str(message))
                # Not fatal, just skip to the next snapshot
                # [listing gap: `continue`, destroy call and counter increment
                # presumably missing]
            except RuntimeError,message:
                sys.stderr.write("Failed to destroy snapshot: " +
                    snapshot.name + "\n")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        # [listing gap: `try:` missing]
        for name in self._datasets.list_auto_snapshot_sets(schedule):
            dataset = zfs.ReadWritableDataset(name)
            self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            sys.stderr.write("Error listing datasets during " + \
                "removal of expired snapshots\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message
    def _needs_cleanup(self):
        """Decide whether remedial cleanup should run: only when enabled,
        not already running, rate-limited to one check per 15 minutes, and
        some snapshotted zpool exceeds the warning capacity level."""
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
            # [listing gap: `return False` presumably missing]
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
            # [listing gap: `return False` presumably missing]
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
            # [listing gap: release + `return False` presumably missing]
        for zpool in self._zpools:
            # [listing gap: `try:` missing]
            if zpool.get_capacity() > self._warningLevel:
                # Before getting into a panic, determine if the pool
                # is one we actually take snapshots on, by checking
                # for one of the "auto-snapshot:<schedule> tags. Not
                # super fast, but it only happens under exceptional
                # circumstances of a zpool nearing it's capacity.
                for sched in self._allSchedules:
                    sets = zpool.list_auto_snapshot_sets(sched[0])
                    # [listing gap: `if len(sets) > 0:` and debug continuation
                    # presumably missing]
                    util.debug("%s needs a cleanup" \
                    self._cleanupLock.release()
                    # [listing gap: `return True` presumably missing]
            except RuntimeError, message:
                # [listing gap: string continuation missing]
                sys.stderr.write("Error checking zpool capacity of: " + \
                self._cleanupLock.release()
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate up to thread's run() method.
                raise RuntimeError,message
        self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
        # [listing gap: `return False` presumably missing]
    def _perform_cleanup(self):
        """Escalating per-pool cleanup: warning -> critical -> emergency,
        re-reading capacity after each pass.  Records a severity status
        (0-4) per pool in self._poolstatus and all destroyed snapshot
        names in self._destroyedsnaps."""
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
            # [listing gap: `return` presumably missing]
        self._destroyedsnaps = []
        for zpool in self._zpools:
            # [listing gap: `try:` missing]
            self._poolstatus[zpool.name] = 0
            capacity = zpool.get_capacity()
            if capacity > self._warningLevel:
                self._run_warning_cleanup(zpool)
                self._poolstatus[zpool.name] = 1
                capacity = zpool.get_capacity()
            if capacity > self._criticalLevel:
                self._run_critical_cleanup(zpool)
                self._poolstatus[zpool.name] = 2
                capacity = zpool.get_capacity()
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 3
                capacity = zpool.get_capacity()
            # Second emergency pass; status 4 means even a full emergency
            # cleanup could not get below the emergency level.
            if capacity > self._emergencyLevel:
                self._run_emergency_cleanup(zpool)
                self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from _run_<level>_cleanup()
            # and _run_cleanup() in methods called by _perform_cleanup()
            except RuntimeError,message:
                # NOTE(review): missing space in "determinecapacity" in this
                # runtime message -- fix upstream, not in a doc-only pass.
                sys.stderr.write("Remedial space cleanup failed because " + \
                    "of failure to determinecapacity of: " + \
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError,message
            # Bad - there's no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            # [listing gap: debug continuation missing]
            util.debug("%s pool status after cleanup:" \
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
            % len(self._destroyedsnaps), \
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                sys.stderr.write("\t%s\n" % snap)
        self._cleanupLock.release()
    def _run_warning_cleanup(self, zpool):
        """Warning-level cleanup: destroy expired daily, then hourly,
        snapshots until the pool drops below the warning threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing warning level cleanup on %s" % \
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)
    def _run_critical_cleanup(self, zpool):
        """Critical-level cleanup: destroy weekly, then daily, then hourly
        snapshots until the pool drops below the critical threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing critical level cleanup on %s" % \
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)
    def _run_emergency_cleanup(self, zpool):
        """Emergency-level cleanup: destroy monthly, weekly, daily, hourly,
        frequent and finally custom-schedule snapshots until the pool drops
        below the emergency threshold."""
        # [listing gap: continuation of this debug call missing]
        util.debug("Performing emergency level cleanup on %s" % \
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snaphots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
                # [listing gap: `break` presumably missing]
            self._run_cleanup(zpool, schedule, self._emergencyLevel)
    def _run_cleanup(self, zpool, schedule, threshold):
        """Destroy non-cloned auto-snapshots of `schedule` on `zpool`,
        oldest first, until capacity falls to `threshold` or no candidate
        snapshots remain.  Destroyed names accumulate in
        self._destroyedsnaps."""
        # [listing gap: `try:` missing]
        clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
                " while recovering pool capacity\n")
            sys.stderr.write("Error details:\n" + \
                "--------BEGIN ERROR MESSAGE--------\n" + \
                "\n--------END ERROR MESSAGE--------\n")
        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        # [listing gap: `try:` missing]
        snapshots = [s for s,t in \
            zpool.list_snapshots("%s%s" \
                % (self._prefix,schedule)) \
            if not s in clonedsnaps]
        # [listing gap: reverse of the list presumably missing]
        except RuntimeError,message:
            sys.stderr.write("Error listing snapshots" +
                " while recovering pool capacity\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message
        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                syslog.syslog(syslog.LOG_NOTICE,
                    "No more %s snapshots left" \
                # [listing gap: format continuation and `return` presumably
                # missing]
            """This is not an exact science. Deleteing a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it uniquely. The
            remaining snapshot's size will then show up as non zero. So
            deleting 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this, each snapshot set
            at a time and observe the before and after results. Perhaps
            better way exists...."""
            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # [listing gap: rest of this comment missing]
            # On the positive side there should be fewer snapshots and they
            # will mostly non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            # [listing gap]
            util.debug("Destroying %s" % snapname, self.verbose)
            # [listing gap: `try:` and the destroy call missing]
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                # [listing gap: format continuation missing]
                sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
                sys.stderr.write("Details:\n%s\n" % (str(message)))
            # [listing gap: `else:` presumably missing]
            self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
            # [listing gap: a sleep call presumably missing]
    def _send_to_syslog(self):
        """Log one message per pool describing the severity of the cleanup
        performed, plus a summary count of destroyed snapshots.
        NOTE(review): the `if status == N:` dispatch lines are missing from
        this listing; the severity branches below are their bodies."""
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # [listing gap: presumably `if status == 4:`]
            syslog.syslog(syslog.LOG_EMERG,
                "%s is over %d%% capacity. " \
                "All automatic snapshots were destroyed" \
                % (zpool.name, self._emergencyLevel))
            # [listing gap: presumably `elif status == 3:`]
            syslog.syslog(syslog.LOG_ALERT,
                "%s exceeded %d%% capacity. " \
                "Automatic snapshots over 1 hour old were destroyed" \
                % (zpool.name, self._emergencyLevel))
            # [listing gap: presumably `elif status == 2:`]
            syslog.syslog(syslog.LOG_CRIT,
                "%s exceeded %d%% capacity. " \
                "Weekly, hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._criticalLevel))
            # [listing gap: presumably `elif status == 1:`]
            syslog.syslog(syslog.LOG_WARNING,
                "%s exceeded %d%% capacity. " \
                "Hourly and daily automatic snapshots were destroyed" \
                % (zpool.name, self._warningLevel))
        if len(self._destroyedsnaps) > 0:
            syslog.syslog(syslog.LOG_NOTICE,
                "%d automatic snapshots were destroyed" \
                % len(self._destroyedsnaps))
    def _send_notification(self):
        """Emit a D-Bus capacity_exceeded signal for the pool with the worst
        post-cleanup status, so the GUI can notify the user."""
        # [listing gap: worstpool/worststatus initialisation missing]
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
                # [listing gap: worststatus update presumably missing]
        #FIXME make the various levels indexible
        # [listing gap: presumably `if worststatus == 4:`]
        self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.
def monitor_threads(snapthread):
    """gobject timeout callback: keep running while the snapshot thread is
    alive; when it has died, exit the daemon with an SMF error status so
    svc.startd restarts or faults the service."""
    if snapthread.is_alive():
        # [listing gap: `return True` presumably missing -- a True return
        # keeps the gobject timeout installed]
    # [listing gap: `else:` presumably missing]
    sys.stderr.write("Snapshot monitor thread exited.\n")
    if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
        # FIXME - it would be nicer to mark the service as degraded than
        # go into maintenance state for some situations such as a
        # particular snapshot schedule failing.
        # But for now SMF does not implement this feature. But if/when it
        # does it's better to use svcadm to put the service into the
        # correct state since the daemon shouldn't exit when transitioning
        # to a degraded state.
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
        #sys.stderr.write("Placing service into maintenance state\n")
        #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
        #    os.getenv("SMF_FMRI")])
        # SMF will take care of killing the daemon
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    # [listing gap: `else:` presumably missing]
    sys.stderr.write("Snapshot monitor thread exited abnormally\n")
    sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
    #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
    #    os.getenv("SMF_FMRI")])
    sys.exit(smf.SMF_EXIT_ERR_FATAL)
def child_sig_handler(signum, frame):
    """Signal handler installed in the parent while waiting for the
    daemonising child: SIGUSR1 means the child came up OK; SIGCHLD or
    SIGALRM mean it died or timed out.  Exits the parent accordingly;
    any other signal is ignored."""
    # Map each expected signal to the parent's exit status.
    exit_codes = {
        signal.SIGUSR1: smf.SMF_EXIT_OK,
        signal.SIGCHLD: smf.SMF_EXIT_ERR_FATAL,
        signal.SIGALRM: smf.SMF_EXIT_ERR_FATAL,
    }
    if signum in exit_codes:
        sys.exit(exit_codes[signum])
# Default daemon parameters.
# File mode creation mask of the daemon.
# [listing gap: the UMASK constant is missing from this listing]
# Default working directory for the daemon.
# [listing gap: the WORKDIR constant is missing]
# Default maximum for the number of available file descriptors.
# [listing gap: the MAXFD constant and the `def daemonize():` line plus the
# docstring's opening quotes are missing]
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    # [listing gap: docstring closing quotes missing]
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    # [listing gap: `try: pid = os.fork() except OSError, e:` presumably
    # missing before this raise]
    raise Exception, "%s [%d]" % (e.strerror, e.errno)
    # [listing gap: child/parent branch missing]
    #Reset signals that we set to trap in parent
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)
    # [listing gap]
    #Wait for the child to give the OK or otherwise.
    # [listing gap]
    # --- interior of main(); its `def` line is missing from this listing ---
    # Parse only the options we know; unknown arguments are tolerated
    # (parse_known_args) so SMF-supplied extras don't abort startup.
    parser = argparse.ArgumentParser()
    parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
    parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
    parser.add_argument('--configdump', action='store_true', help='Dump default values in config file format', default=False)
    args, _ = parser.parse_known_args()
    # [listing gap: `if args.configdump:` presumably missing]
    timesliderconfig.configdump()
    sys.exit(smf.SMF_EXIT_OK)
    timesliderconfig.configfile = args.config
    # Daemonise the service.
    if not args.foreground:
        # [listing gap: the daemonize() call is presumably missing]
    # The user security attributes checked are the following:
    # Note that UID == 0 will match any profile search so
    # no need to check it explicitly.
    syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):
        gobject.threads_init()
        # [listing gap]
        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()
        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        # [listing gap: thread start presumably missing]
        # Poll the worker thread every 2 seconds.
        gobject.timeout_add(2000, monitor_threads, snapshot)
        # [listing gap: `try:` missing]
        mainloop = gobject.MainLoop()
        # [listing gap: mainloop.run() presumably missing]
        except KeyboardInterrupt:
            # [listing gap]
            sys.exit(smf.SMF_EXIT_OK)
    # [listing gap: `else:` presumably missing]
    syslog.syslog(syslog.LOG_ERR,
        "%s has insufficient privileges to run time-sliderd!" \
    # [listing gap: format continuation missing]
    sys.exit(smf.SMF_EXIT_ERR_PERM)
# [listing gap: `if __name__ == '__main__':` and the main() call are
# presumably missing before this final exit]
sys.exit(smf.SMF_EXIT_OK)