Add Linux-specific autosnapsmf module and dependencies
time-slider.git: usr/share/time-slider/lib/time_slider/timesliderd.py
#!/usr/bin/python2
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

import sys
import os
import subprocess
import re
import threading
import getopt
import syslog
import time
import datetime
import calendar
import signal
import argparse

import glib
import gobject
import dbus
import dbus.service
import dbus.mainloop
import dbus.mainloop.glib

import dbussvc
import zfs
import smf
import time_slider.linux.timeslidersmf as timeslidersmf
import time_slider.linux.autosnapsmf as autosnapsmf
import plugin
from time_slider.linux.rbac import RBACprofile
import util

_MINUTE = 60
_HOUR = _MINUTE * 60
_DAY = _HOUR * 24
_WEEK = _DAY * 7


# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}


class SnapshotManager(threading.Thread):

    def __init__(self, bus):
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._stale = True
        self._lastCleanupCheck = 0
        self._zpools = []
        self._poolstatus = {}
        self._destroyedsnaps = []

        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determining whether debugging is enabled\n")
            self.verbose = False

        self._dbus = dbussvc.AutoSnap(bus,
                                      '/org/opensolaris/TimeSlider/autosnap',
                                      self)

        self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        self.refresh()

        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)

        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)

        # Init done. Now initialise threading.
        threading.Thread.__init__ (self)
        self.setDaemon(True)

    def run(self):
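        """
        Main scheduler loop. Deselects swap/dump volumes, then repeatedly
        refreshes configuration, performs any needed remedial cleanup,
        takes snapshots that are due and sleeps until the next one is due
        (or for 15 minutes when no auto-snapshot schedule is online).
        """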
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            try:
                if (name[1] == "swap" or name[1] == "dump"):
                    util.debug("Auto excluding %s volume" % vol, self.verbose)
                    volume = zfs.Volume(vol)
                    volume.set_auto_snap(False)
            except IndexError:
                pass

        nexttime = None
        waittime = None
        while True:
            try:
                self.refresh()
                # First check and, if necessary, perform any remedial cleanup.
                # This is best done before creating any new snapshots which may
                # otherwise get immediately gobbled up by the remedial cleanup.
                if self._needs_cleanup() == True:
                    self._perform_cleanup()
                    # Check to see if cleanup actually deleted anything before
                    # notifying the user. Avoids the popup appearing continuously
                    if len(self._destroyedsnaps) > 0:
                        self._send_notification()
                    self._send_to_syslog()

                nexttime = self._check_snapshots()
                # Overdue snapshots are already taken automatically
                # inside _check_snapshots() so nexttime should never be
                # < 0. It can be None however, which is fine since it
                # will cause the scheduler thread to sleep indefinitely
                # or until a SIGHUP is caught.
                if nexttime:
                    util.debug("Waiting until " + str (nexttime), self.verbose)
                waittime = None
                if nexttime != None:
                    waittime = nexttime - long(time.time())
                    if (waittime <= 0):
                        # We took too long and missed a snapshot, so break out
                        # and catch up on it the next time through the loop
                        continue
                # waittime could be None if no auto-snap schedules are online
                self._conditionLock.acquire()
                if waittime:
                    util.debug("Waiting %d seconds" % (waittime), self.verbose)
                    self._conditionLock.wait(waittime)
                else: #None. Just wait a while to check for cleanups.
                    util.debug("No auto-snapshot schedules online.", \
                               self.verbose)
                    self._conditionLock.wait(_MINUTE * 15)

            except OSError, message:
                sys.stderr.write("Caught OSError exception in snapshot" +
                                 " manager thread\n")
                sys.stderr.write("Error details:\n" + \
                                 "--------BEGIN ERROR MESSAGE--------\n" + \
                                 str(message) + \
                                 "\n--------END ERROR MESSAGE--------\n")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Exit this thread
                break
            except RuntimeError,message:
                sys.stderr.write("Caught RuntimeError exception in snapshot" +
                                 " manager thread\n")
                sys.stderr.write("Error details:\n" + \
                                 "--------BEGIN ERROR MESSAGE--------\n" + \
                                 str(message) + \
                                 "\n--------END ERROR MESSAGE--------\n")
                # Exit this thread
                break

    def _signalled(self, signum, frame):
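        """
        SIGHUP handler. Marks the schedules as stale and wakes up the
        scheduler loop so that it re-reads the SMF configuration. Skipped
        if a refresh is already in progress.
        """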
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                return
            self._stale = True
            self._refreshLock.release()
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()

    def refresh(self):
        """
        Checks if the defined snapshot schedules are out of date and
        rebuilds and updates them if necessary
        """
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            self._plugin.refresh()
            self._stale = False
        self._refreshLock.release()

    def _configure_svc_props(self):
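        """
        Reads service configuration from SMF: verbosity, remedial cleanup
        setting and threshold levels, whether to keep empty snapshots, and
        the snapshot label separator. Also rebuilds the list of non-faulted
        zpools to monitor.
        """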
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            sys.stderr.write("Error determining whether debugging is enabled\n")
            self.verbose = False

        try:
            cleanup = self._smf.get_remedial_cleanup()
            warn = self._smf.get_cleanup_level("warning")
            util.debug("Warning level value is:   %d%%" % warn, self.verbose)
            crit = self._smf.get_cleanup_level("critical")
            util.debug("Critical level value is:  %d%%" % crit, self.verbose)
            emer = self._smf.get_cleanup_level("emergency")
            util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Failed to determine cleanup threshold levels\n")
            sys.stderr.write("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
            #Go with defaults
            #FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        else:
            self._remedialCleanup = cleanup
            self._warningLevel = warn
            self._criticalLevel = crit
            self._emergencyLevel = emer

        try:
            self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            sys.stderr.write("Can't determine whether to keep empty snapshots\n")
            sys.stderr.write("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n---------END ERROR MESSAGE---------\n")
            sys.stderr.write("Assuming default value: False\n")
            self._keepEmpties = False

        # Previously, the ":" character was used as the separator for
        # datestamps in snapshot labels. Windows filesystems such as
        # CIFS and FAT choke on this character so now we use a user definable
        # separator value, with a default value of "_".
        # We need to check for both the old and new format when looking for
        # snapshots.
        self._separator = self._smf.get_separator()
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)

        # Rebuild pool list
        self._zpools = []
        try:
            for poolname in zfs.list_zpools():
                # Do not try to examine FAULTED pools
                zpool = zfs.ZPool(poolname)
                if zpool.health == "FAULTED":
                    util.debug("Ignoring faulted Zpool: %s\n" \
                               % (zpool.name), \
                               self.verbose)
                else:
                    self._zpools.append(zpool)
                util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            sys.stderr.write("Could not list Zpools\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message

    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """

        self._last = {}
        self._next = {}
        self._keep = {}

        try:
            _defaultSchedules = autosnapsmf.get_default_schedules()
            _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                                "Details:\n" + str(message)
        else:
            # Now set it in stone.
            self._defaultSchedules = tuple(_defaultSchedules)
            self._customSchedules = tuple(_customSchedules)

            # Build the combined schedule tuple from default + custom schedules
            _defaultSchedules.extend(_customSchedules)
            self._allSchedules = tuple(_defaultSchedules)
            for schedule,i,p,keep in self._allSchedules:
                self._last[schedule] = 0
                self._next[schedule] = 0
                self._keep[schedule] = keep

    def _update_schedules(self):
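        """
        Recalculates the next due time for each schedule, based on the
        creation time of its most recent snapshot (queried from zfs when
        no internal timestamp is cached) plus the schedule's interval.
        Default schedules also account for overlap with the schedules
        that dominate them.
        """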
        interval = 0
        idx = 1 # Used to index subsets for schedule overlap calculation
        last = None

        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \
                           % (schedule), \
                           self.verbose)
                continue

            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
            if self._last[schedule] == 0:
                try:
                    snaps = self._datasets.list_snapshots("%s%s" % \
                                                         (self._prefix,
                                                          schedule))
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    sys.stderr.write("Failed to list snapshots during schedule update\n")
                    #Propagate up to the thread's run() method
                    raise RuntimeError,message

                if len(snaps) > 0:
                    util.debug("Last %s snapshot was: %s" % \
                               (schedule, snaps[-1][0]), \
                               self.verbose)
                    self._last[schedule] = snaps[-1][1]

            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                try:
                    totalinterval = intervals[interval] * period
                except KeyError:
                    self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                    sys.stderr.write(schedule + \
                                     " schedule has invalid interval: " + \
                                     "'%s\'\n" % interval)
                    #Propagate up to thread's run() method
                    raise RuntimeError
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
                    idx += 1

            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                               schedule + \
                               " recalculation", \
                               self.verbose)
                    continue
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                snap_tm = time.gmtime(self._last[schedule])
                # Increment year if period is >= 1 calendar year.
                year = snap_tm.tm_year
                year += period / 12
                period = period % 12

                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means december.
                if mon == 0:
                    mon = 12
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                    year += 1

                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                    mday = dnewmon

                tm = (year, mon, mday, \
                    snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                    0, 0, -1)
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]

            self._next[schedule] = last + totalinterval

    def _next_due(self):
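        """
        Returns (earliest, schedule): the timestamp of the next snapshot
        that is due and the schedule it belongs to. Default schedules take
        precedence as soon as one of them is overdue.
        """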
        schedule = None
        earliest = None
        now = long(time.time())

        for s,i,p,k in self._defaultSchedules:
            due = self._next[s]
            if due <= now:
                #Default Schedule - so break out at the first
                #schedule that is overdue. The subordinate schedules
                #will re-adjust afterwards.
                earliest,schedule = due,s
                break
            elif earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            due = self._next[s]
            if earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        return earliest,schedule

    def _check_snapshots(self):
        """
        Check the schedules to see which snapshot is required next and
        immediately take any that are overdue
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due()
            self._refreshLock.release()
            dt = datetime.datetime.fromtimestamp(next)
            util.debug("Next snapshot is %s due at: %s" % \
                       (schedule, dt.isoformat()), \
                       self.verbose)
        return next

    def _take_snapshots(self, schedule):
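        """
        Takes a new snapshot set for the given schedule, records the time
        it was taken, purges expired snapshots for that schedule and
        returns the snapshot label used.
        """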
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        try:
            self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
                             % (schedule))
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm
        self._perform_purge(schedule)
        return label

    def _prune_snapshots(self, dataset, schedule):
484         """Cleans out zero sized snapshots, kind of cautiously"""
485             # Per schedule: We want to delete 0 sized
486             # snapshots but we need to keep at least one around (the most
487             # recent one) for each schedule so that that overlap is 
488             # maintained from frequent -> hourly -> daily etc.
489             # Start off with the smallest interval schedule first and
490             # move up. This increases the amount of data retained where
491             # several snapshots are taken together like a frequent hourly
492             # and daily snapshot taken at 12:00am. If 3 snapshots are all
493             # identical and reference the same identical data they will all
494             # be initially reported as zero for used size. Deleting the
495             # daily first then the hourly would shift make the data referenced
496             # by all 3 snapshots unique to the frequent scheduled snapshot.
497             # This snapshot would probably be purged within an how ever and the
498             # data referenced by it would be gone for good.
499             # Doing it the other way however ensures that the data should
500             # remain accessible to the user for at least a week as long as
501             # the pool doesn't run low on available space before that.
502
503         try:
504             snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
505             # Clone the list because we want to remove items from it
506             # while iterating through it.
507             remainingsnaps = snaps[:]
508         except RuntimeError,message:
509             sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
510             self.exitCode = smf.SMF_EXIT_ERR_FATAL
511             raise RuntimeError,message
512
513         if (self._keepEmpties == False):
514             try: # remove the newest one from the list.
515                 snaps.pop()
516             except IndexError:
517                 pass
518             for snapname in snaps:
519                 try:
520                     snapshot = zfs.Snapshot(snapname)
521                 except Exception,message:
522                     sys.stderr.write(str(message))
523                     # Not fatal, just skip to the next snapshot
524                     continue
525
526                 try:
527                     if snapshot.get_used_size() == 0:
528                         util.debug("Destroying zero sized: " + snapname, \
529                                    self.verbose)
530                         try:
531                             snapshot.destroy()
532                         except RuntimeError,message:
533                             sys.stderr.write("Failed to destroy snapshot: " +
534                                              snapname + "\n")
535                             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
536                             # Propogate exception so thread can exit
537                             raise RuntimeError,message
538                         remainingsnaps.remove(snapname)
539                 except RuntimeError,message:
540                     sys.stderr.write("Can not determine used size of: " + \
541                                      snapname + "\n")
542                     self.exitCode = smf.SMF_EXIT_MON_DEGRADE
543                     #Propogate the exception to the thead run() method
544                     raise RuntimeError,message
545
546         # Deleting individual snapshots instead of recursive sets
547         # breaks the recursion chain and leaves child snapshots
548         # dangling so we need to take care of cleaning up the 
549         # snapshots.
550         target = len(remainingsnaps) - self._keep[schedule]
551         counter = 0
552         while counter < target:
553             util.debug("Destroy expired snapshot: " + \
554                        remainingsnaps[counter], 
555                        self.verbose)
556             try:
557                 snapshot = zfs.Snapshot(remainingsnaps[counter])
558             except Exception,message:
559                     sys.stderr.write(str(message))
560                     # Not fatal, just skip to the next snapshot
561                     counter += 1
562                     continue
563             try:
564                 snapshot.destroy()
565             except RuntimeError,message:
566                 sys.stderr.write("Failed to destroy snapshot: " +
567                                  snapshot.name + "\n")
568                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
569                 # Propogate exception so thread can exit
570                 raise RuntimeError,message
571             else:
572                 counter += 1
573
574     def _perform_purge(self, schedule):
575         """Cautiously cleans out zero sized snapshots"""
576         # We need to avoid accidentally pruning auto snapshots received
577         # from one zpool to another. We ensure this by examining only
578         # snapshots whose parent fileystems and volumes are explicitly
579         # tagged to be snapshotted.
580         try:
581             for name in self._datasets.list_auto_snapshot_sets(schedule):
582                 dataset = zfs.ReadWritableDataset(name)
583                 self._prune_snapshots(dataset, schedule)
584         except RuntimeError,message:
585             sys.stderr.write("Error listing datasets during " + \
586                              "removal of expired snapshots\n")
587             self.exitCode = smf.SMF_EXIT_ERR_FATAL
588             # Propogate up to thread's run() method
589             raise RuntimeError,message
590
591     def _needs_cleanup(self):
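        """
        Returns True if remedial cleanup is enabled and any monitored zpool
        that we actually take snapshots on has exceeded the warning
        capacity threshold. Checks are rate limited to one every 15 minutes.
        """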
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
            return False
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            #Indicates that a cleanup is already running.
            return False
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
            pass
        else:
            for zpool in self._zpools:
                try:
                    if zpool.get_capacity() > self._warningLevel:
                        # Before getting into a panic, determine if the pool
                        # is one we actually take snapshots on, by checking
                        # for one of the "auto-snapshot:<schedule>" tags. Not
                        # super fast, but it only happens under exceptional
                        # circumstances of a zpool nearing its capacity.

                        for sched in self._allSchedules:
                            sets = zpool.list_auto_snapshot_sets(sched[0])
                            if len(sets) > 0:
                                util.debug("%s needs a cleanup" \
                                           % zpool.name, \
                                           self.verbose)
                                self._cleanupLock.release()
                                return True
                except RuntimeError, message:
                    sys.stderr.write("Error checking zpool capacity of: " + \
                                     zpool.name + "\n")
                    self._cleanupLock.release()
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    # Propagate up to thread's run() method.
                    raise RuntimeError,message
            self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
        return False

    def _perform_cleanup(self):
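        """
        Runs progressively more aggressive cleanup passes (warning,
        critical, emergency) on each zpool until its capacity drops back
        below the configured thresholds, recording a status code per pool.
        """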
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
            return
        self._destroyedsnaps = []
        for zpool in self._zpools:
            try:
                self._poolstatus[zpool.name] = 0
                capacity = zpool.get_capacity()
                if capacity > self._warningLevel:
                    self._run_warning_cleanup(zpool)
                    self._poolstatus[zpool.name] = 1
                    capacity = zpool.get_capacity()
                if capacity > self._criticalLevel:
                    self._run_critical_cleanup(zpool)
                    self._poolstatus[zpool.name] = 2
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 3
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from _run_<level>_cleanup()
            # and _run_cleanup() in methods called by _perform_cleanup()
            except RuntimeError,message:
                sys.stderr.write("Remedial space cleanup failed because " + \
                                 "of failure to determine capacity of: " + \
                                 zpool.name + "\n")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError,message

            # Bad - there are no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            util.debug("%s pool status after cleanup:" \
                       % zpool.name, \
                       self.verbose)
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
                   self.verbose)
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                sys.stderr.write("\t%s\n" % snap)
        self._cleanupLock.release()

    def _run_warning_cleanup(self, zpool):
        util.debug("Performing warning level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)

    def _run_critical_cleanup(self, zpool):
        util.debug("Performing critical level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)

    def _run_emergency_cleanup(self, zpool):
        util.debug("Performing emergency level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        #Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
                break
            else:
                self._run_cleanup(zpool, schedule, self._emergencyLevel)

    def _run_cleanup(self, zpool, schedule, threshold):
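        """
        Destroys snapshots of the given schedule on the given zpool, oldest
        first and skipping cloned snapshots, until the pool's capacity
        drops below the given threshold or no snapshots remain.
        """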
        clonedsnaps = []
        snapshots = []
        try:
            clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
                             " while recovering pool capacity\n")
            sys.stderr.write("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n--------END ERROR MESSAGE--------\n")

        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        try:
            snapshots = [s for s,t in \
                            zpool.list_snapshots("%s%s" \
                            % (self._prefix,schedule)) \
                            if not s in clonedsnaps]
            snapshots.reverse()
        except RuntimeError,message:
            sys.stderr.write("Error listing snapshots" +
                             " while recovering pool capacity\n")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message

        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                syslog.syslog(syslog.LOG_NOTICE,
                              "No more %s snapshots left" \
                               % schedule)
                return

            """This is not an exact science. Deleting a zero sized
            snapshot can have unpredictable results. For example a
            pair of snapshots may share exclusive reference to a large
            amount of data (eg. a large core file). The usage of both
            snapshots will initially be seen to be 0 by zfs(1). Deleting
            one of the snapshots will make the data become unique to the
            single remaining snapshot that references it. The remaining
            snapshot's size will then show up as non zero. So deleting
            a 0 sized snapshot is not as pointless as it might seem.
            It also means we have to loop through this one snapshot at
            a time and observe the before and after results. Perhaps a
            better way exists...."""

            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # filesystems.
            # On the positive side there should be fewer snapshots and they
            # will mostly be non-zero sized, so each deletion should be more
            # effective at recovering space.
            util.debug("Destroying %s" % snapname, self.verbose)
            try:
                snapshot.destroy()
            except RuntimeError,message:
                # Would be nice to be able to mark the service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
                                 (snapshot.name))
                sys.stderr.write("Details:\n%s\n" % (str(message)))
            else:
                self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
            time.sleep(3)

    def _send_to_syslog(self):
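        """
        Logs a syslog message for each monitored zpool whose cleanup status
        is non-zero, with a severity matching that status, plus a summary
        of how many snapshots were destroyed in total.
        """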
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            if status == 4:
                syslog.syslog(syslog.LOG_EMERG,
                              "%s is over %d%% capacity. " \
                              "All automatic snapshots were destroyed" \
                               % (zpool.name, self._emergencyLevel))
            elif status == 3:
                syslog.syslog(syslog.LOG_ALERT,
                              "%s exceeded %d%% capacity. " \
                              "Automatic snapshots over 1 hour old were destroyed" \
                               % (zpool.name, self._emergencyLevel))
            elif status == 2:
                syslog.syslog(syslog.LOG_CRIT,
                              "%s exceeded %d%% capacity. " \
                              "Weekly, hourly and daily automatic snapshots were destroyed" \
                               % (zpool.name, self._criticalLevel))
            elif status == 1:
                syslog.syslog(syslog.LOG_WARNING,
                              "%s exceeded %d%% capacity. " \
                              "Hourly and daily automatic snapshots were destroyed" \
                               % (zpool.name, self._warningLevel))

        if len(self._destroyedsnaps) > 0:
            syslog.syslog(syslog.LOG_NOTICE,
                          "%d automatic snapshots were destroyed" \
                           % len(self._destroyedsnaps))

    def _send_notification(self):
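        """
        Reports the worst-affected pool over D-Bus via the AutoSnap
        service's capacity_exceeded method so that interested listeners
        can notify the user.
        """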
        worstpool = None
        worststatus = 0

        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
                worststatus = status

        #FIXME make the various levels indexable
        if worststatus == 4:
            self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        #elif: 0 everything is fine. Do nothing.


def monitor_threads(snapthread):
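    """
    Periodic gobject timeout callback. Returns True while the snapshot
    manager thread is alive; if the thread has exited, the daemon exits
    with SMF_EXIT_ERR_FATAL so that SMF can handle the failure.
    """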
    if snapthread.is_alive():
        return True
    else:
        sys.stderr.write("Snapshot monitor thread exited.\n")
        if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
            # FIXME - it would be nicer to mark the service as degraded than
            # go into maintenance state for some situations such as a
            # particular snapshot schedule failing.
            # But for now SMF does not implement this feature. But if/when it
            # does it's better to use svcadm to put the service into the
            # correct state since the daemon shouldn't exit when transitioning
            # to a degraded state.
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        else:
            sys.stderr.write("Snapshot monitor thread exited abnormally\n")
            sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False


def child_sig_handler(signum, frame):
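    """
    Signal handler used by the parent process while daemonising:
    SIGUSR1 from the child means startup succeeded; SIGCHLD or SIGALRM
    is treated as a startup failure. Exits with the matching SMF status.
    """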
    if signum == signal.SIGUSR1:
        sys.exit(smf.SMF_EXIT_OK)
    elif signum == signal.SIGCHLD:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif signum == signal.SIGALRM:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)

# Default daemon parameters.
# File mode creation mask of the daemon.
UMASK = 0
# Default working directory for the daemon.
WORKDIR = "/"
# Default maximum for the number of available file descriptors.
MAXFD = 1024

def create_daemon():
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    #Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    try:
        pid = os.fork()
    except OSError, e:
        raise Exception, "%s [%d]" % (e.strerror, e.errno)

    if (pid == 0):
        #Reset signals that we set to trap in parent
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
        signal.signal(signal.SIGUSR1, signal.SIG_DFL)
        signal.signal(signal.SIGALRM, signal.SIG_DFL)
        os.setsid()
        os.chdir(WORKDIR)
        os.umask(UMASK)
    else:
        #Wait for the child to give the OK or otherwise.
        signal.pause()


def main(argv):
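    """
    Entry point: optionally daemonises, verifies that the invoking user
    has the "ZFS File System Management" RBAC profile, registers on the
    system D-Bus and runs the snapshot manager under a gobject main loop.
    """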
    parser = argparse.ArgumentParser()
    parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
    args, _ = parser.parse_known_args()

    # Daemonise the service.
    if not args.foreground:
        create_daemon()

    # Check the user's security attributes. Note that UID == 0 will match
    # any profile search so there is no need to check it explicitly.
    syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):

        gobject.threads_init()

        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()
        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)

        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        snapshot.start()
        gobject.timeout_add(2000, monitor_threads, snapshot)

        mainloop = gobject.MainLoop()
        try:
            mainloop.run()
        except KeyboardInterrupt:
            mainloop.quit()
            sys.exit(smf.SMF_EXIT_OK)
    else:
        syslog.syslog(syslog.LOG_ERR,
               "%s has insufficient privileges to run time-sliderd!" \
               % rbacp.name)
        syslog.closelog()
        sys.exit(smf.SMF_EXIT_ERR_PERM)
    syslog.closelog()
    sys.exit(smf.SMF_EXIT_OK)