usr/share/time-slider/lib/time_slider/timesliderd.py
1 #!/usr/bin/python2
2 #
3 # CDDL HEADER START
4 #
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
8 #
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
13 #
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
19 #
20 # CDDL HEADER END
21 #
22
23 import sys
24 import os
25 import subprocess
26 import re
27 import threading
28 import getopt
29 import syslog
30 import time
31 import datetime
32 import calendar
33 import signal
34 import argparse
35
36 import glib
37 import gobject
38 import dbus
39 import dbus.service
40 import dbus.mainloop
41 import dbus.mainloop.glib
42
43 import dbussvc
44 import zfs
45 import smf
46 import time_slider.linux.timeslidersmf as timeslidersmf
47 import time_slider.linux.autosnapsmf as autosnapsmf
48 import plugin
49 from time_slider.linux.rbac import RBACprofile
50 import util
51
52 import time_slider.linux.timesliderconfig as timesliderconfig
53
54 _MINUTE = 60
55 _HOUR = _MINUTE * 60
56 _DAY = _HOUR * 24
57 _WEEK = _DAY * 7
58
59
60 # Status codes for actual zpool capacity levels.
61 # These are relative to the SMF property defined
62 # levels for: user, warning and emergency levels
63 STATUS_OK = 0 # Below user specified threshold. Everything was OK
64 STATUS_WARNING = 1 # Above specified user threshold level
65 STATUS_CRITICAL = 2 # Above specified critical threshold level
66 STATUS_EMERGENCY = 3 # Above specified emergency threshold level
67
68 intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
69
70
71 class SnapshotManager(threading.Thread):
72
73     def __init__(self, bus):
74         # Used to wake up the run() method prematurely in the event
75         # of a SIGHUP/SMF refresh
76         self._conditionLock = threading.Condition(threading.RLock())
77         # Used when schedules are being rebuilt or examined.
78         self._refreshLock = threading.Lock()
79         # Indicates that cleanup is in progress when locked
80         self._cleanupLock = threading.Lock()
81         self._datasets = zfs.Datasets()
82         # Indicates that schedules need to be rebuilt from scratch
83         self._stale = True
84         self._lastCleanupCheck = 0
85         self._zpools = []
86         self._poolstatus = {}
87         self._destroyedsnaps = []
88
89         # This is also checked during the refresh() method but we need
90         # to know it sooner for instantiation of the PluginManager
91         self._smf = timeslidersmf.TimeSliderSMF()
92         try:
93             self.verbose = self._smf.get_verbose()
94         except RuntimeError,message:
95             sys.stderr.write("Error determing whether debugging is enabled\n")
96             self.verbose = False
97
98         self._dbus = dbussvc.AutoSnap(bus,
99                                       '/org/opensolaris/TimeSlider/autosnap',
100                                       self)
101
102         self._plugin = plugin.PluginManager(self.verbose)
103         self.exitCode = smf.SMF_EXIT_OK
104         self.refresh()
105
106         # Seems we're up and running OK. 
107         # Signal our parent so we can daemonise
108         os.kill(os.getppid(), signal.SIGUSR1)
109
110         # SMF/svc.startd sends SIGHUP to force a
111         # refresh of the daemon
112         signal.signal(signal.SIGHUP, self._signalled)
113
114         # Init done. Now initialise threading.
115         threading.Thread.__init__ (self)
116         self.setDaemon(True)
117
118     def run(self):
119         # Deselect swap and dump volumes so they don't get snapshotted.
120         for vol in self._datasets.list_volumes():
121             name = vol.rsplit("/")
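            # e.g. a top-level volume named "rpool/swap" (hypothetical name) splits
            # into ["rpool", "swap"], so name[1] identifies pool-level swap/dump volumes.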
122             try:
123                 if (name[1] == "swap" or name[1] == "dump"):
124                     util.debug("Auto excluding %s volume" % vol, self.verbose)
125                     volume = zfs.Volume(vol)
126                     volume.set_auto_snap(False)
127             except IndexError:
128                 pass
129             
130         nexttime = None
131         waittime = None
132         while True:
133             try:
134                 self.refresh()
135                 # First check and, if necessary, perform any remedial cleanup.
136                 # This is best done before creating any new snapshots which may
137                 # otherwise get immediately gobbled up by the remedial cleanup.
138                 if self._needs_cleanup() == True:
139                     self._perform_cleanup()
140                     # Check to see if cleanup actually deleted anything before
141                     # notifying the user. Avoids the popup appearing continuously
142                     if len(self._destroyedsnaps) > 0:
143                         self._send_notification()
144                     self._send_to_syslog()
145
146                 nexttime = self._check_snapshots()
147                 # Overdue snapshots are already taken automatically
148                 # inside _check_snapshots() so nexttime should never be
149                 # < 0. It can be None however, which is fine since it 
150                 # will cause the scheduler thread to sleep indefinitely
151                 # or until a SIGHUP is caught.
152                 if nexttime:
153                     util.debug("Waiting until " + str (nexttime), self.verbose)
154                 waittime = None
155                 if nexttime != None:
156                     waittime = nexttime - long(time.time())
157                     if (waittime <= 0):
158                         # We took too long and missed a snapshot, so break out
159                         # and catch up on it the next time through the loop
160                         continue
161                 # waittime could be None if no auto-snap schedules are online
162                 self._conditionLock.acquire()
163                 if waittime:
164                     util.debug("Waiting %d seconds" % (waittime), self.verbose)
165                     self._conditionLock.wait(waittime)
166                 else: #None. Just wait a while to check for cleanups.
167                     util.debug("No auto-snapshot schedules online.", \
168                                self.verbose)
169                     self._conditionLock.wait(_MINUTE * 15)
170
171             except OSError, message:
172                 sys.stderr.write("Caught OSError exception in snapshot" +
173                                  " manager thread\n")
174                 sys.stderr.write("Error details:\n" + \
175                                  "--------BEGIN ERROR MESSAGE--------\n" + \
176                                  str(message) + \
177                                  "\n--------END ERROR MESSAGE--------\n")
178                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
179                 # Exit this thread
180                 break
181             except RuntimeError,message:
182                 sys.stderr.write("Caught RuntimeError exception in snapshot" +
183                                  " manager thread\n")
184                 sys.stderr.write("Error details:\n" + \
185                                  "--------BEGIN ERROR MESSAGE--------\n" + \
186                                  str(message) + \
187                                  "\n--------END ERROR MESSAGE--------\n")
188                 # Exit this thread
189                 break
190
191     def _signalled(self, signum, frame):
192         if signum == signal.SIGHUP:
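            # Non-blocking acquire: if a refresh or schedule scan already holds
            # the lock, drop this SIGHUP rather than block inside the signal handler.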
193             if self._refreshLock.acquire(False) == False:
194                 return
195             self._stale = True
196             self._refreshLock.release()
197             self._conditionLock.acquire()
198             self._conditionLock.notify()
199             self._conditionLock.release()
200
201     def refresh(self):
202         """
203         Checks if defined snapshot schedules are out
204         of date and rebuilds and updates if necessary
205         """
206         self._refreshLock.acquire()
207         if self._stale == True:
208             self._configure_svc_props()
209             self._rebuild_schedules()
210             self._update_schedules()
211             self._plugin.refresh()
212             self._stale = False
213         self._refreshLock.release()
214
215     def _configure_svc_props(self):
216         try:
217             self.verbose = self._smf.get_verbose()
218         except RuntimeError,message:
219             sys.stderr.write("Error determing whether debugging is enabled\n")
220             self.verbose = False
221
222         try:
223             cleanup = self._smf.get_remedial_cleanup()
224             warn = self._smf.get_cleanup_level("warning")
225             util.debug("Warning level value is:   %d%%" % warn, self.verbose)
226             crit = self._smf.get_cleanup_level("critical")
227             util.debug("Critical level value is:  %d%%" % crit, self.verbose)
228             emer = self._smf.get_cleanup_level("emergency")
229             util.debug("Emergency level value is: %d%%" % emer, self.verbose)
230         except RuntimeError,message:
231             sys.stderr.write("Failed to determine cleanup threshhold levels\n")
232             sys.stderr.write("Details:\n" + \
233                              "--------BEGIN ERROR MESSAGE--------\n" + \
234                              str(message) + \
235                              "\n---------END ERROR MESSAGE---------\n")
236             sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
237             #Go with defaults
238             #FIXME - this would be an appropriate case to mark svc as degraded
239             self._remedialCleanup = True
240             self._warningLevel = 80
241             self._criticalLevel = 90
242             self._emergencyLevel = 95
243         else:
244             self._remedialCleanup = cleanup
245             self._warningLevel = warn
246             self._criticalLevel = crit
247             self._emergencyLevel = emer
248
249         try:
250             self._keepEmpties = self._smf.get_keep_empties()
251         except RuntimeError,message:
252             # Not fatal, just assume we delete them (default configuration)
253             sys.stderr.write("Can't determine whether to keep empty snapshots\n")
254             sys.stderr.write("Details:\n" + \
255                              "--------BEGIN ERROR MESSAGE--------\n" + \
256                              str(message) + \
257                              "\n---------END ERROR MESSAGE---------\n")
258             sys.stderr.write("Assuming default value: False\n")
259             self._keepEmpties = False
260
261         # Previously, snapshot labels used the ":" character as a
262         # separator for datestamps. Windows filesystems such as
263         # CIFS and FAT choke on this character so now we use a user definable
264         # separator value, with a default value of "_"
265         # We need to check for both the old and new format when looking for
266         # snapshots.
267         self._separator = self._smf.get_separator()
268         self._prefix = "%s[:%s]" \
269             % (autosnapsmf.SNAPLABELPREFIX, self._separator)
270
271         # Rebuild pool list
272         self._zpools = []
273         try:
274             for poolname in zfs.list_zpools():
275                 # Do not try to examine FAULTED pools
276                 zpool = zfs.ZPool(poolname)
277                 if zpool.health == "FAULTED":
278                     util.debug("Ignoring faulted Zpool: %s\n" \
279                                % (zpool.name), \
280                                self.verbose)
281                 else:
282                     self._zpools.append(zpool)
283                 util.debug(str(zpool), self.verbose)
284         except RuntimeError,message:
285             sys.stderr.write("Could not list Zpools\n")
286             self.exitCode = smf.SMF_EXIT_ERR_FATAL
287             # Propagate exception up to thread's run() method
288             raise RuntimeError,message
289
290
291     def _rebuild_schedules(self):
292         """
293         Builds 2 lists of default and custom auto-snapshot SMF instances
294         """
295
296         self._last = {}
297         self._next = {}
298         self._keep = {}
299
300         try:
301             _defaultSchedules = autosnapsmf.get_default_schedules()
302             _customSchedules = autosnapsmf.get_custom_schedules()
303         except RuntimeError,message:
304             self.exitCode = smf.SMF_EXIT_ERR_FATAL
305             raise RuntimeError, "Error reading SMF schedule instances\n" + \
306                                 "Details:\n" + str(message)
307         else:
308             # Now set it in stone.
309             self._defaultSchedules = tuple(_defaultSchedules)
310             self._customSchedules = tuple(_customSchedules)
311             
312             # Build the combined schedule tuple from default + custom schedules
313             _defaultSchedules.extend(_customSchedules)
314             self._allSchedules = tuple(_defaultSchedules)
315             for schedule,i,p,keep in self._allSchedules:
316                 self._last[schedule] = 0
317                 self._next[schedule] = 0
318                 self._keep[schedule] = keep
319
320     def _update_schedules(self):
321         interval = 0
322         idx = 1 # Used to index subsets for schedule overlap calculation
323         last = None
324
325         for schedule,interval,period,keep in self._allSchedules:
326             # Shortcut if we've already processed this schedule and it's 
327             # still up to date. Don't skip the default schedules though
328             # because overlap affects their scheduling
329             if [schedule,interval,period,keep] not in \
330                 self._defaultSchedules and \
331                 (self._next[schedule] > self._last[schedule]):
332                 util.debug("Short circuiting %s recalculation" \
333                            % (schedule), \
334                            self.verbose)
335                 continue
336
337             # If we don't have an internal timestamp for the given schedule,
338             # ask zfs for the last snapshot and get its creation timestamp.
339             if self._last[schedule] == 0:
340                 try:
341                     snaps = self._datasets.list_snapshots("%s%s" % \
342                                                          (self._prefix,
343                                                           schedule))
344                 except RuntimeError,message:
345                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
346                     sys.stderr.write("Failed to list snapshots during schedule update\n")
347                     #Propagate up to the thread's run() method
348                     raise RuntimeError,message
349
350                 if len(snaps) > 0:
351                     util.debug("Last %s snapshot was: %s" % \
352                                (schedule, snaps[-1][0]), \
353                                self.verbose)
354                     self._last[schedule] = snaps[-1][1]
355
356             last = self._last[schedule]
357             if interval != "months": # months is non-constant. See below.
358                 util.debug("Recalculating %s schedule" % (schedule), \
359                            self.verbose)
360                 try:
361                     totalinterval = intervals[interval] * period
362                 except KeyError:
363                     self.exitCode = smf.SMF_EXIT_ERR_CONFIG
364                     sys.stderr.write(schedule + \
365                                       " schedule has invalid interval: " + \
366                                       "'%s\'\n" % interval)
367                     #Propagate up to thread's run() method
368                     raise RuntimeError
369                 if [schedule,interval,period,keep] in self._defaultSchedules:
370                     # This is one of the default schedules so check for an
371                     # overlap with one of the dominant schedules.
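                    # Net effect: each default schedule is keyed off the most
                    # recent snapshot among itself and the schedules iterated
                    # before it, so snapshots that are taken together stay aligned.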
372                     for s,i,p,k in self._defaultSchedules[:idx]:
373                         last = max(last, self._last[s])
374                     idx += 1
375
376             else: # interval == "months"
377                 if self._next[schedule] > last:
378                     util.debug("Short circuiting " + \
379                                schedule + \
380                                " recalculation", \
381                                self.verbose)
382                     continue
383                 util.debug("Recalculating %s schedule" % (schedule), \
384                            self.verbose)
385                 snap_tm = time.gmtime(self._last[schedule])
386                 # Increment year if period is >= 1 calendar year.
387                 year = snap_tm.tm_year
388                 year += period / 12
389                 period = period % 12
390
391                 mon = (snap_tm.tm_mon + period) % 12
392                 # Result of 0 actually means December.
393                 if mon == 0:
394                     mon = 12
395                 # Account for period that spans calendar year boundary.
396                 elif snap_tm.tm_mon + period > 12:
397                     year += 1
398
399                 d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
400                 d,dnewmon = calendar.monthrange(year, mon)
401                 mday = snap_tm.tm_mday
402                 if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
403                    mday = dnewmon
404                 
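                # Worked example: a last snapshot taken on Jan 31 with a one
                # month period lands on Feb 28 (or 29), since mday is clamped
                # to the last day of the shorter target month.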
405                 tm =(year, mon, mday, \
406                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
407                     0, 0, -1)
408                 newt = calendar.timegm(tm)
409                 new_tm = time.gmtime(newt)
410                 totalinterval = newt - self._last[schedule]
411
412             self._next[schedule] = last + totalinterval
413
414     def _next_due(self):
415         schedule = None
416         earliest = None
417         now = long(time.time())
418         
419         for s,i,p,k in self._defaultSchedules:
420             due = self._next[s]
421             if due <= now:
422                 #Default Schedule - so break out at the first 
423                 #schedule that is overdue. The subordinate schedules
424                 #will re-adjust afterwards.
425                 earliest,schedule = due,s
426                 break
427             elif earliest != None:
428                 if due < earliest:
429                     earliest,schedule = due,s
430             else: #FIXME better optimisation with above condition
431                 earliest,schedule = due,s
432         for s,i,p,k in self._customSchedules:
433             due = self._next[s]
434             if earliest != None:
435                 if due < earliest:
436                     earliest,schedule = due,s
437             else: #FIXME better optimisation with above condition
438                 earliest,schedule = due,s
439         return earliest,schedule
440
441     def _check_snapshots(self):
442         """
443         Check the schedules and see what the required snapshot is.
444         Take one immediately on the first overdue snapshot required
445         """
446         # Make sure a refresh() doesn't mess with the schedule while
447         # we're reading through it.
448         self._refreshLock.acquire()
449         next,schedule = self._next_due()
450         self._refreshLock.release()
451         now = long(time.time())
452         while next != None and next <= now:
453             label = self._take_snapshots(schedule)
454             self._plugin.execute_plugins(schedule, label)
455             self._refreshLock.acquire()
456             self._update_schedules()
457             next,schedule = self._next_due()
458             self._refreshLock.release()
459             dt = datetime.datetime.fromtimestamp(next)
460             util.debug("Next snapshot is %s due at: %s" % \
461                        (schedule, dt.isoformat()), \
462                        self.verbose)
463         return next
464                     
465     def _take_snapshots(self, schedule):
466         # Set the time before taking snapshot to avoid clock skew due
467         # to time taken to complete snapshot.
468         tm = long(time.time())
469         label = "%s%s%s-%s" % \
470                 (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
471                  datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
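        # With the default "_" separator this yields labels like
        # "<prefix>_daily-2009-03-14-09h26", where <prefix> is
        # autosnapsmf.SNAPLABELPREFIX.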
472         try:
473             self._datasets.create_auto_snapshot_set(label, tag=schedule)
474         except RuntimeError, message:
475             # Write an error message, set the exit code and pass it up the
476             # stack so the thread can terminate
477             sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
478                              % (schedule))
479             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
480             raise RuntimeError,message
481         self._last[schedule] = tm
482         self._perform_purge(schedule)
483         return label
484
485     def _prune_snapshots(self, dataset, schedule):
486         """Cleans out zero sized snapshots, kind of cautiously"""
487             # Per schedule: We want to delete 0 sized
488             # snapshots but we need to keep at least one around (the most
489             # recent one) for each schedule so that the overlap is
490             # maintained from frequent -> hourly -> daily etc.
491             # Start off with the smallest interval schedule first and
492             # move up. This increases the amount of data retained where
493             # several snapshots are taken together, like a frequent, hourly
494             # and daily snapshot taken at 12:00am. If 3 snapshots are all
495             # identical and reference the same identical data they will all
496             # be initially reported as zero for used size. Deleting the
497             # daily first and then the hourly would make the data referenced
498             # by all 3 snapshots unique to the frequent scheduled snapshot.
499             # This snapshot would probably be purged within an hour however, and the
500             # data referenced by it would be gone for good.
501             # Doing it the other way however ensures that the data should
502             # remain accessible to the user for at least a week as long as
503             # the pool doesn't run low on available space before that.
504
505         try:
506             snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
507             # Clone the list because we want to remove items from it
508             # while iterating through it.
509             remainingsnaps = snaps[:]
510         except RuntimeError,message:
511             sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
512             self.exitCode = smf.SMF_EXIT_ERR_FATAL
513             raise RuntimeError,message
514
515         if (self._keepEmpties == False):
516             try: # remove the newest one from the list.
517                 snaps.pop()
518             except IndexError:
519                 pass
520             for snapname in snaps:
521                 try:
522                     snapshot = zfs.Snapshot(snapname)
523                 except Exception,message:
524                     sys.stderr.write(str(message))
525                     # Not fatal, just skip to the next snapshot
526                     continue
527
528                 try:
529                     if snapshot.get_used_size() == 0:
530                         util.debug("Destroying zero sized: " + snapname, \
531                                    self.verbose)
532                         try:
533                             snapshot.destroy()
534                         except RuntimeError,message:
535                             sys.stderr.write("Failed to destroy snapshot: " +
536                                              snapname + "\n")
537                             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
538                             # Propagate exception so thread can exit
539                             raise RuntimeError,message
540                         remainingsnaps.remove(snapname)
541                 except RuntimeError,message:
542                     sys.stderr.write("Can not determine used size of: " + \
543                                      snapname + "\n")
544                     self.exitCode = smf.SMF_EXIT_MON_DEGRADE
545                     #Propagate the exception to the thread's run() method
546                     raise RuntimeError,message
547
548         # Deleting individual snapshots instead of recursive sets
549         # breaks the recursion chain and leaves child snapshots
550         # dangling so we need to take care of cleaning up the 
551         # snapshots.
552         target = len(remainingsnaps) - self._keep[schedule]
553         counter = 0
554         while counter < target:
555             util.debug("Destroy expired snapshot: " + \
556                        remainingsnaps[counter], 
557                        self.verbose)
558             try:
559                 snapshot = zfs.Snapshot(remainingsnaps[counter])
560             except Exception,message:
561                     sys.stderr.write(str(message))
562                     # Not fatal, just skip to the next snapshot
563                     counter += 1
564                     continue
565             try:
566                 snapshot.destroy()
567             except RuntimeError,message:
568                 sys.stderr.write("Failed to destroy snapshot: " +
569                                  snapshot.name + "\n")
570                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
571                 # Propagate exception so thread can exit
572                 raise RuntimeError,message
573             else:
574                 counter += 1
575
576     def _perform_purge(self, schedule):
577         """Cautiously cleans out zero sized snapshots"""
578         # We need to avoid accidentally pruning auto snapshots received
579         # from one zpool to another. We ensure this by examining only
580         # snapshots whose parent filesystems and volumes are explicitly
581         # tagged to be snapshotted.
582         try:
583             for name in self._datasets.list_auto_snapshot_sets(schedule):
584                 dataset = zfs.ReadWritableDataset(name)
585                 self._prune_snapshots(dataset, schedule)
586         except RuntimeError,message:
587             sys.stderr.write("Error listing datasets during " + \
588                              "removal of expired snapshots\n")
589             self.exitCode = smf.SMF_EXIT_ERR_FATAL
590             # Propagate up to thread's run() method
591             raise RuntimeError,message
592
593     def _needs_cleanup(self):
594         if self._remedialCleanup == False:
595             # Sys admin has explicitly instructed for remedial cleanups
596             # not to be performed.
597             return False
598         now = long(time.time())
599         # Don't run checks any less than 15 minutes apart.
600         if self._cleanupLock.acquire(False) == False:
601             #Indicates that a cleanup is already running.
602             return False
603         # FIXME - Make the cleanup interval equal to the minimum snapshot interval
604         # if custom snapshot schedules are defined and enabled.
605         elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
606             pass
607         else:
608             for zpool in self._zpools:
609                 try:
610                     if zpool.get_capacity() > self._warningLevel:
611                         # Before getting into a panic, determine if the pool
612                         # is one we actually take snapshots on, by checking
613                         # for one of the "auto-snapshot:<schedule> tags. Not
614                         # super fast, but it only happens under exceptional
615                         # circumstances of a zpool nearing it's capacity.
616
617                         for sched in self._allSchedules:
618                             sets = zpool.list_auto_snapshot_sets(sched[0])
619                             if len(sets) > 0:
620                                 util.debug("%s needs a cleanup" \
621                                            % zpool.name, \
622                                            self.verbose)
623                                 self._cleanupLock.release()
624                                 return True
625                 except RuntimeError, message:
626                     sys.stderr.write("Error checking zpool capacity of: " + \
627                                      zpool.name + "\n")
628                     self._cleanupLock.release()
629                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
630                     # Propagate up to thread's run() method.
631                     raise RuntimeError,message
632             self._lastCleanupCheck = long(time.time())
633         self._cleanupLock.release()
634         return False
635
636     def _perform_cleanup(self):
637         if self._cleanupLock.acquire(False) == False:
638             # Cleanup already running. Skip
639             return
640         self._destroyedsnaps = []
641         for zpool in self._zpools:
642             try:
643                 self._poolstatus[zpool.name] = 0
644                 capacity = zpool.get_capacity()
645                 if capacity > self._warningLevel:
646                     self._run_warning_cleanup(zpool)
647                     self._poolstatus[zpool.name] = 1
648                     capacity = zpool.get_capacity()
649                 if capacity > self._criticalLevel:
650                     self._run_critical_cleanup(zpool)
651                     self._poolstatus[zpool.name] = 2
652                     capacity = zpool.get_capacity()
653                 if capacity > self._emergencyLevel:
654                     self._run_emergency_cleanup(zpool)
655                     self._poolstatus[zpool.name] = 3
656                     capacity = zpool.get_capacity()
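                # A second pass at the emergency level: if capacity is still
                # above the emergency threshold after the first emergency
                # cleanup, status 4 records that even this last resort was not
                # enough (reported to syslog as all automatic snapshots destroyed).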
657                 if capacity > self._emergencyLevel:
658                     self._run_emergency_cleanup(zpool)
659                     self._poolstatus[zpool.name] = 4
660             # This also catches exceptions thrown from _run_<level>_cleanup()
661             # and _run_cleanup() in methods called by _perform_cleanup()
662             except RuntimeError,message:
663                 sys.stderr.write("Remedial space cleanup failed because " + \
664                                  "of failure to determinecapacity of: " + \
665                                  zpool.name + "\n")
666                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
667                 self._cleanupLock.release()
668                 # Propagate up to thread's run() method.
669                 raise RuntimeError,message
670
671             # Bad - there are no more snapshots left and nothing
672             # left to delete. We don't disable the service since
673             # it will permit self recovery and snapshot
674             # retention when space becomes available on
675             # the pool (hopefully).
676             util.debug("%s pool status after cleanup:" \
677                        % zpool.name, \
678                        self.verbose)
679             util.debug(zpool, self.verbose)
680         util.debug("Cleanup completed. %d snapshots were destroyed" \
681                    % len(self._destroyedsnaps), \
682                    self.verbose)
683         # Avoid needless list iteration for non-debug mode
684         if self.verbose == True and len(self._destroyedsnaps) > 0:
685             for snap in self._destroyedsnaps:
686                 sys.stderr.write("\t%s\n" % snap)
687         self._cleanupLock.release()
688
689     def _run_warning_cleanup(self, zpool):
690         util.debug("Performing warning level cleanup on %s" % \
691                    zpool.name, \
692                    self.verbose)
693         self._run_cleanup(zpool, "daily", self._warningLevel)
694         if zpool.get_capacity() > self._warningLevel:
695             self._run_cleanup(zpool, "hourly", self._warningLevel)
696
697     def _run_critical_cleanup(self, zpool):
698         util.debug("Performing critical level cleanup on %s" % \
699                    zpool.name, \
700                    self.verbose)
701         self._run_cleanup(zpool, "weekly", self._criticalLevel)
702         if zpool.get_capacity() > self._criticalLevel:
703             self._run_cleanup(zpool, "daily", self._criticalLevel)
704         if zpool.get_capacity() > self._criticalLevel:
705             self._run_cleanup(zpool, "hourly", self._criticalLevel)
706
707     def _run_emergency_cleanup(self, zpool):
708         util.debug("Performing emergency level cleanup on %s" % \
709                    zpool.name, \
710                    self.verbose)
711         self._run_cleanup(zpool, "monthly", self._emergencyLevel)
712         if zpool.get_capacity() > self._emergencyLevel:
713             self._run_cleanup(zpool, "weekly", self._emergencyLevel)
714         if zpool.get_capacity() > self._emergencyLevel:
715             self._run_cleanup(zpool, "daily", self._emergencyLevel)
716         if zpool.get_capacity() > self._emergencyLevel:
717             self._run_cleanup(zpool, "hourly", self._emergencyLevel)
718         if zpool.get_capacity() > self._emergencyLevel:
719             self._run_cleanup(zpool, "frequent", self._emergencyLevel)
720         #Finally, as a last resort, delete custom scheduled snapshots
721         for schedule,i,p,k in self._customSchedules:
722             if zpool.get_capacity() < self._emergencyLevel:
723                 break
724             else:
725                 self._run_cleanup(zpool, schedule, self._emergencyLevel)
726
727     def _run_cleanup(self, zpool, schedule, threshold):
728         clonedsnaps = []
729         snapshots = []
730         try:
731             clonedsnaps = self._datasets.list_cloned_snapshots()
732         except RuntimeError,message:
733                 sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
734                                  " while recovering pool capacity\n")
735                 sys.stderr.write("Error details:\n" + \
736                                  "--------BEGIN ERROR MESSAGE--------\n" + \
737                                  str(message) + \
738                                  "\n--------END ERROR MESSAGE--------\n")    
739
740         # Build a list of snapshots in the given schedule, that are not
741         # cloned, and sort the result in reverse chronological order.
742         try:
743             snapshots = [s for s,t in \
744                             zpool.list_snapshots("%s%s" \
745                             % (self._prefix,schedule)) \
746                             if not s in clonedsnaps]
747             snapshots.reverse()
748         except RuntimeError,message:
749             sys.stderr.write("Error listing snapshots" +
750                              " while recovering pool capacity\n")
751             self.exitCode = smf.SMF_EXIT_ERR_FATAL
752             # Propagate the error up to the thread's run() method.
753             raise RuntimeError,message
754    
755         while zpool.get_capacity() > threshold:
756             if len(snapshots) == 0:
757                 syslog.syslog(syslog.LOG_NOTICE,
758                               "No more %s snapshots left" \
759                                % schedule)
760                 return
761
762             """This is not an exact science. Deleteing a zero sized 
763             snapshot can have unpredictable results. For example a
764             pair of snapshots may share exclusive reference to a large
765             amount of data (eg. a large core file). The usage of both
766             snapshots will initially be seen to be 0 by zfs(1). Deleting
767             one of the snapshots will make the data become unique to the
768             single remaining snapshot. The
769             remaining snapshot's size will then show up as non zero. So
770             deleting a 0 sized snapshot is not as pointless as it might seem.
771             It also means we have to loop through this, each snapshot set
772             at a time and observe the before and after results. Perhaps
773             a better way exists...."""
774
775             # Start with the oldest first
776             snapname = snapshots.pop()
777             snapshot = zfs.Snapshot(snapname)
778             # It would be nicer, for performance purposes, to delete sets
779             # of snapshots recursively but this might destroy more data than
780             # absolutely necessary, plus the previous purging of zero sized
781             # snapshots can easily break the recursion chain between
782             # filesystems.
783             # On the positive side there should be fewer snapshots and they
784             # will mostly be non-zero so we should get more effectiveness as a
785             # result of deleting snapshots since they should be nearly always
786             # non zero sized.
787             util.debug("Destroying %s" % snapname, self.verbose)
788             try:
789                 snapshot.destroy()
790             except RuntimeError,message:
791                 # Would be nice to be able to mark service as degraded here
792                 # but it's better to try to continue on rather than to give
793                 # up altogether (SMF maintenance state)
794                 sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
795                                  (snapshot.name))
796                 sys.stderr.write("Details:\n%s\n" % (str(message)))
797             else:
798                 self._destroyedsnaps.append(snapname)
799             # Give zfs some time to recalculate.
800             time.sleep(3)
801         
802     def _send_to_syslog(self):
803         for zpool in self._zpools:
804             status = self._poolstatus[zpool.name]
805             if status == 4:
806                 syslog.syslog(syslog.LOG_EMERG,
807                               "%s is over %d%% capacity. " \
808                               "All automatic snapshots were destroyed" \
809                                % (zpool.name, self._emergencyLevel))
810             elif status == 3:
811                 syslog.syslog(syslog.LOG_ALERT,
812                               "%s exceeded %d%% capacity. " \
813                               "Automatic snapshots over 1 hour old were destroyed" \
814                                % (zpool.name, self._emergencyLevel))
815             elif status == 2:
816                 syslog.syslog(syslog.LOG_CRIT,
817                               "%s exceeded %d%% capacity. " \
818                               "Weekly, hourly and daily automatic snapshots were destroyed" \
819                                % (zpool.name, self._criticalLevel))                             
820             elif status == 1:
821                 syslog.syslog(syslog.LOG_WARNING,
822                               "%s exceeded %d%% capacity. " \
823                               "Hourly and daily automatic snapshots were destroyed" \
824                                % (zpool.name, self._warningLevel))
825
826         if len(self._destroyedsnaps) > 0:
827             syslog.syslog(syslog.LOG_NOTICE,
828                           "%d automatic snapshots were destroyed" \
829                            % len(self._destroyedsnaps))
830
831     def _send_notification(self):
832         worstpool = None
833         worststatus = 0
834
835         for zpool in self._zpools:
836             status = self._poolstatus[zpool.name]
837             # >= to ensure that something should always be set.
838             if status >= worststatus:
839                 worstpool = zpool.name
840                 worststatus = status
841
842         #FIXME make the various levels indexable
843         if worststatus == 4:
844             self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
845         elif worststatus == 3:
846             self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
847         elif worststatus == 2:
848             self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
849         elif worststatus == 1:
850             self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
851         #elif: 0 everything is fine. Do nothing.
852
853
854 def monitor_threads(snapthread):
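    # Invoked roughly every 2 seconds via gobject.timeout_add() in main().
    # Returning True keeps the periodic check scheduled; a dead thread ends
    # the daemon with an SMF exit status instead.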
855     if snapthread.is_alive():
856         return True
857     else:
858         sys.stderr.write("Snapshot monitor thread exited.\n")
859         if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
860             # FIXME - it would be nicer to mark the service as degraded than
861             # go into maintenance state for some situations such as a
862             # particular snapshot schedule failing.
863             # But for now SMF does not implement this feature. If/when it
864             # does, it's better to use svcadm to put the service into the
865             # correct state since the daemon shouldn't exit when transitioning
866             # to a degraded state.
867             #sys.stderr.write("Placing service into maintenance state\n")
868             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
869             #                 os.getenv("SMF_FMRI")])
870             # SMF will take care of killing the daemon
871             sys.exit(smf.SMF_EXIT_ERR_FATAL)
872             return False
873         elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
874             #sys.stderr.write("Placing service into maintenance state\n")
875             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
876             #                 os.getenv("SMF_FMRI")])
877             # SMF will take care of killing the daemon
878             sys.exit(smf.SMF_EXIT_ERR_FATAL)
879             return False
880         else:
881             sys.stderr.write("Snapshot monitor thread exited abnormally\n")
882             sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
883             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
884             #                 os.getenv("SMF_FMRI")])
885             sys.exit(smf.SMF_EXIT_ERR_FATAL)
886             return False
887
888
889 def child_sig_handler(signum, frame):
890     if signum == signal.SIGUSR1:
891         sys.exit(smf.SMF_EXIT_OK)
892     elif signum == signal.SIGCHLD:
893         sys.exit(smf.SMF_EXIT_ERR_FATAL)
894     elif signum == signal.SIGALRM:
895         sys.exit(smf.SMF_EXIT_ERR_FATAL)
896
897 # Default daemon parameters.
898 # File mode creation mask of the daemon.
899 UMASK = 0
900 # Default working directory for the daemon.
901 WORKDIR = "/"
902 # Default maximum for the number of available file descriptors.
903 MAXFD = 1024
904
905 def create_daemon():
906     """
907     Detach a process from the controlling terminal and run it in the
908     background as a daemon.
909     """
910     #Catch signals that we might receive from child
911     signal.signal(signal.SIGCHLD, child_sig_handler)
912     signal.signal(signal.SIGUSR1, child_sig_handler)
913     signal.signal(signal.SIGALRM, child_sig_handler)
914     try:
915         pid = os.fork()
916     except OSError, e:
917         raise Exception, "%s [%d]" % (e.strerror, e.errno)
918
919     if (pid == 0):
920         #Reset signals that we set to trap in parent
921         signal.signal(signal.SIGCHLD, signal.SIG_DFL)
922         signal.signal(signal.SIGUSR1, signal.SIG_DFL)
923         signal.signal(signal.SIGALRM, signal.SIG_DFL)
924         os.setsid()
925         os.chdir(WORKDIR)
926         os.umask(UMASK)
927     else:
928         #Wait for the child to give the OK or otherwise.
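        # SIGUSR1 from SnapshotManager.__init__() in the child indicates a
        # successful start (exit 0 via child_sig_handler), while SIGCHLD or
        # SIGALRM is treated as a fatal startup failure.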
929         signal.pause()
930
931
932 def main(argv):
933
934     parser = argparse.ArgumentParser()
935     parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
936     parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
937     args, _ = parser.parse_known_args()
938
939     timesliderconfig.configfile = args.config
940
941     # Daemonise the service.
942     if not args.foreground:
943         create_daemon()
944
945     # The user security attribute checked is the RBAC profile below.
946     # Note that UID == 0 will match any profile search so
947     # no need to check it explicitly.
948     syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
949     rbacp = RBACprofile()
950     if rbacp.has_profile("ZFS File System Management"):
951
952         gobject.threads_init()
953
954         # Tell dbus to use the gobject mainloop for async ops
955         dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
956         dbus.mainloop.glib.threads_init()
957         # Register a bus name with the system dbus daemon
958         systemBus = dbus.SystemBus()
959         name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
960
961         # Create and start the snapshot manager. Takes care of
962         # auto snapshotting service and auto cleanup.
963         snapshot = SnapshotManager(systemBus)
964         snapshot.start()
965         gobject.timeout_add(2000, monitor_threads, snapshot)
966
967         mainloop = gobject.MainLoop()
968         try:
969             mainloop.run()
970         except KeyboardInterrupt:
971             mainloop.quit()
972             sys.exit(smf.SMF_EXIT_OK)
973     else:
974         syslog.syslog(syslog.LOG_ERR,
975                "%s has insufficient privileges to run time-sliderd!" \
976                % rbacp.name)
977         syslog.closelog()    
978         sys.exit(smf.SMF_EXIT_ERR_PERM)
979     syslog.closelog()
980     sys.exit(smf.SMF_EXIT_OK)
981