[time-slider.git] usr/share/time-slider/lib/time_slider/timesliderd.py
1 #!/usr/bin/python2
2 #
3 # CDDL HEADER START
4 #
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
8 #
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
13 #
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
19 #
20 # CDDL HEADER END
21 #
22
23 import sys
24 import os
25 import subprocess
26 import re
27 import threading
28 import getopt
29 import syslog
30 import time
31 import datetime
32 import calendar
33 import signal
34
35 import glib
36 import gobject
37 import dbus
38 import dbus.service
39 import dbus.mainloop
40 import dbus.mainloop.glib
41
42 import dbussvc
43 import zfs
44 import smf
45 import timeslidersmf
46 import autosnapsmf
47 import plugin
48 from rbac import RBACprofile
49 import util
50
51 _MINUTE = 60
52 _HOUR = _MINUTE * 60
53 _DAY = _HOUR * 24
54 _WEEK = _DAY * 7
55
56
57 # Status codes for actual zpool capacity levels.
58 # These are relative to the SMF property defined
59 # levels for: user, warning and emergency levels
60 STATUS_OK = 0 # Below user specified threshold. Everything was OK
61 STATUS_WARNING = 1 # Above specified user threshold level
62 STATUS_CRITICAL = 2 # Above specified critical threshold level
63 STATUS_EMERGENCY = 3 # Above specified emergency threshold level
64
65 intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}
66
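# Editorial note (illustration, not from the original source): a schedule's
# interval unit and period are converted to seconds with this table, e.g. a
# custom schedule defined with interval "hours" and period 2 comes due every
# intervals["hours"] * 2 == 7200 seconds. The "months" interval is absent here
# because month lengths vary; it is handled separately in
# SnapshotManager._update_schedules().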
67
68 class SnapshotManager(threading.Thread):
69
70     def __init__(self, bus):
71         # Used to wake up the run() method prematurely in the event
72         # of a SIGHUP/SMF refresh
73         self._conditionLock = threading.Condition(threading.RLock())
74         # Used when schedules are being rebuilt or examined.
75         self._refreshLock = threading.Lock()
76         # Indicates that cleanup is in progress when locked
77         self._cleanupLock = threading.Lock()
78         self._datasets = zfs.Datasets()
79         # Indicates that schedules need to be rebuilt from scratch
80         self._stale = True
81         self._lastCleanupCheck = 0
82         self._zpools = []
83         self._poolstatus = {}
84         self._destroyedsnaps = []
85
86         # This is also checked during the refresh() method but we need
87         # to know it sooner for instantiation of the PluginManager
88         self._smf = timeslidersmf.TimeSliderSMF()
89         try:
90             self.verbose = self._smf.get_verbose()
91         except RuntimeError,message:
92             sys.stderr.write("Error determining whether debugging is enabled\n")
93             self.verbose = False
94
95         self._dbus = dbussvc.AutoSnap(bus,
96                                       '/org/opensolaris/TimeSlider/autosnap',
97                                       self)
98
99         self._plugin = plugin.PluginManager(self.verbose)
100         self.exitCode = smf.SMF_EXIT_OK
101         self.refresh()
102
103         # Seems we're up and running OK. 
104         # Signal our parent so we can daemonise
105         os.kill(os.getppid(), signal.SIGUSR1)
106
107         # SMF/svc.startd sends SIGHUP to force a
108         # refresh of the daemon
109         signal.signal(signal.SIGHUP, self._signalled)
110
111         # Init done. Now initialise threading.
112         threading.Thread.__init__ (self)
113         self.setDaemon(True)
114
115     def run(self):
116         # Deselect swap and dump volumes so they don't get snapshotted.
117         for vol in self._datasets.list_volumes():
118             name = vol.rsplit("/")
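            # Editorial note: for a typical swap volume such as "rpool/swap",
            # vol.rsplit("/") yields ['rpool', 'swap'], so name[1] identifies
            # the swap or dump volume checked below.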
119             try:
120                 if (name[1] == "swap" or name[1] == "dump"):
121                     util.debug("Auto excluding %s volume" % vol, self.verbose)
122                     volume = zfs.Volume(vol)
123                     volume.set_auto_snap(False)
124             except IndexError:
125                 pass
126             
127         nexttime = None
128         waittime = None
129         while True:
130             try:
131                 self.refresh()
132                 # First check and, if necessary, perform any remedial cleanup.
133                 # This is best done before creating any new snapshots which may
134                 # otherwise get immediately gobbled up by the remedial cleanup.
135                 if self._needs_cleanup() == True:
136                     self._perform_cleanup()
137                     # Check to see if cleanup actually deleted anything before
138                     # notifying the user. Avoids the popup appearing continuously
139                     if len(self._destroyedsnaps) > 0:
140                         self._send_notification()
141                     self._send_to_syslog()
142
143                 nexttime = self._check_snapshots()
144                 # Overdue snapshots are already taken automatically
145                 # inside _check_snapshots() so nexttime should never be
146                 # < 0. It can be None however, which is fine since it 
147                 # will cause the scheduler thread to sleep indefinitely
148                 # or until a SIGHUP is caught.
149                 if nexttime:
150                     util.debug("Waiting until " + str (nexttime), self.verbose)
151                 waittime = None
152                 if nexttime != None:
153                     waittime = nexttime - long(time.time())
154                     if (waittime <= 0):
155                         # We took too long and missed a snapshot, so break out
156                         # and catch up on it the next time through the loop
157                         continue
158                 # waittime could be None if no auto-snap schedules are online
159                 self._conditionLock.acquire()
160                 if waittime:
161                     util.debug("Waiting %d seconds" % (waittime), self.verbose)
162                     self._conditionLock.wait(waittime)
163                 else: #None. Just wait a while to check for cleanups.
164                     util.debug("No auto-snapshot schedules online.", \
165                                self.verbose)
166                     self._conditionLock.wait(_MINUTE * 15)
167
168             except OSError, message:
169                 sys.stderr.write("Caught OSError exception in snapshot" +
170                                  " manager thread\n")
171                 sys.stderr.write("Error details:\n" + \
172                                  "--------BEGIN ERROR MESSAGE--------\n" + \
173                                  str(message) + \
174                                  "\n--------END ERROR MESSAGE--------\n")
175                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
176                 # Exit this thread
177                 break
178             except RuntimeError,message:
179                 sys.stderr.write("Caught RuntimeError exception in snapshot" +
180                                  " manager thread\n")
181                 sys.stderr.write("Error details:\n" + \
182                                  "--------BEGIN ERROR MESSAGE--------\n" + \
183                                  str(message) + \
184                                  "\n--------END ERROR MESSAGE--------\n")
185                 # Exit this thread
186                 break
187
188     def _signalled(self, signum, frame):
189         if signum == signal.SIGHUP:
190             if self._refreshLock.acquire(False) == False:
191                 return
192             self._stale = True
193             self._refreshLock.release()
194             self._conditionLock.acquire()
195             self._conditionLock.notify()
196             self._conditionLock.release()
197
198     def refresh(self):
199         """
200         Checks if defined snapshot schedules are out
201         of date and rebuilds and updates if necessary
202         """
203         self._refreshLock.acquire()
204         if self._stale == True:
205             self._configure_svc_props()
206             self._rebuild_schedules()
207             self._update_schedules()
208             self._plugin.refresh()
209             self._stale = False
210         self._refreshLock.release()
211
212     def _configure_svc_props(self):
213         try:
214             self.verbose = self._smf.get_verbose()
215         except RuntimeError,message:
216             sys.stderr.write("Error determining whether debugging is enabled\n")
217             self.verbose = False
218
219         try:
220             cleanup = self._smf.get_remedial_cleanup()
221             warn = self._smf.get_cleanup_level("warning")
222             util.debug("Warning level value is:   %d%%" % warn, self.verbose)
223             crit = self._smf.get_cleanup_level("critical")
224             util.debug("Critical level value is:  %d%%" % crit, self.verbose)
225             emer = self._smf.get_cleanup_level("emergency")
226             util.debug("Emergency level value is: %d%%" % emer, self.verbose)
227         except RuntimeError,message:
228             sys.stderr.write("Failed to determine cleanup threshold levels\n")
229             sys.stderr.write("Details:\n" + \
230                              "--------BEGIN ERROR MESSAGE--------\n" + \
231                              str(message) + \
232                              "\n---------END ERROR MESSAGE---------\n")
233             sys.stderr.write("Using factory defaults of 80%, 90% and 95%\n")
234             #Go with defaults
235             #FIXME - this would be an appropriate case to mark svc as degraded
236             self._remedialCleanup = True
237             self._warningLevel = 80
238             self._criticalLevel = 90
239             self._emergencyLevel = 95
240         else:
241             self._remedialCleanup = cleanup
242             self._warningLevel = warn
243             self._criticalLevel = crit
244             self._emergencyLevel = emer
245
246         try:
247             self._keepEmpties = self._smf.get_keep_empties()
248         except RuntimeError,message:
249             # Not fatal, just assume we delete them (default configuration)
250             sys.stderr.write("Can't determine whether to keep empty snapshots\n")
251             sys.stderr.write("Details:\n" + \
252                              "--------BEGIN ERROR MESSAGE--------\n" + \
253                              str(message) + \
254                              "\n---------END ERROR MESSAGE---------\n")
255             sys.stderr.write("Assuming default value: False\n")
256             self._keepEmpties = False
257
258         # Previously, snapshot labels used the ":" character as a
259         # separator character for datestamps. Windows filesystems such as
260         # CIFS and FAT choke on this character so now we use a user definable
261         # separator value, with a default value of "_"
262         # We need to check for both the old and new format when looking for
263         # snapshots.
264         self._separator = self._smf.get_separator()
265         self._prefix = "%s[:%s]" \
266             % (autosnapsmf.SNAPLABELPREFIX, self._separator)
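        # Illustrative example (assumes autosnapsmf.SNAPLABELPREFIX is
        # "zfs-auto-snap"; check autosnapsmf.py for the real value): with the
        # default "_" separator the pattern becomes "zfs-auto-snap[:_]", which
        # matches both old-style "zfs-auto-snap:daily-..." and new-style
        # "zfs-auto-snap_daily-..." snapshot names when listing snapshots.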
267
268         # Rebuild pool list
269         self._zpools = []
270         try:
271             for poolname in zfs.list_zpools():
272                 # Do not try to examine FAULTED pools
273                 zpool = zfs.ZPool(poolname)
274                 if zpool.health == "FAULTED":
275                     util.debug("Ignoring faulted Zpool: %s\n" \
276                                % (zpool.name), \
277                                self.verbose)
278                 else:
279                     self._zpools.append(zpool)
280                 util.debug(str(zpool), self.verbose)
281         except RuntimeError,message:
282             sys.stderr.write("Could not list Zpools\n")
283             self.exitCode = smf.SMF_EXIT_ERR_FATAL
284             # Propagate exception up to thread's run() method
285             raise RuntimeError,message
286
287
288     def _rebuild_schedules(self):
289         """
290         Builds 2 lists of default and custom auto-snapshot SMF instances
291         """
292
293         self._last = {}
294         self._next = {}
295         self._keep = {}
296
297         try:
298             _defaultSchedules = autosnapsmf.get_default_schedules()
299             _customSchedules = autosnapsmf.get_custom_schedules()
300         except RuntimeError,message:
301             self.exitCode = smf.SMF_EXIT_ERR_FATAL
302             raise RuntimeError, "Error reading SMF schedule instances\n" + \
303                                 "Details:\n" + str(message)
304         else:
305             # Now set it in stone.
306             self._defaultSchedules = tuple(_defaultSchedules)
307             self._customSchedules = tuple(_customSchedules)
308             
309             # Build the combined schedule tuple from default + custom schedules
310             _defaultSchedules.extend(_customSchedules)
311             self._allSchedules = tuple(_defaultSchedules)
312             for schedule,i,p,keep in self._allSchedules:
313                 self._last[schedule] = 0
314                 self._next[schedule] = 0
315                 self._keep[schedule] = keep
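            # Editorial note: each schedule entry is a (name, interval, period,
            # keep) tuple, e.g. a hypothetical ("daily", "days", 1, 31) would
            # mean one snapshot per day with the 31 most recent kept; the real
            # values come from the auto-snapshot SMF instances.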
316
317     def _update_schedules(self):
318         interval = 0
319         idx = 1 # Used to index subsets for schedule overlap calculation
320         last = None
321
322         for schedule,interval,period,keep in self._allSchedules:
323             # Shortcut if we've already processed this schedule and it's 
324             # still up to date. Don't skip the default schedules though
325             # because overlap affects their scheduling
326             if [schedule,interval,period,keep] not in \
327                 self._defaultSchedules and \
328                 (self._next[schedule] > self._last[schedule]):
329                 util.debug("Short circuiting %s recalculation" \
330                            % (schedule), \
331                            self.verbose)
332                 continue
333
334             # If we don't have an internal timestamp for the given schedule
335         # ask zfs for the last snapshot and get its creation timestamp.
336             if self._last[schedule] == 0:
337                 try:
338                     snaps = self._datasets.list_snapshots("%s%s" % \
339                                                          (self._prefix,
340                                                           schedule))
341                 except RuntimeError,message:
342                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
343                     sys.stderr.write("Failed to list snapshots during schedule update\n")
344                     #Propagate up to the thread's run() method
345                     raise RuntimeError,message
346
347                 if len(snaps) > 0:
348                     util.debug("Last %s snapshot was: %s" % \
349                                (schedule, snaps[-1][0]), \
350                                self.verbose)
351                     self._last[schedule] = snaps[-1][1]
352
353             last = self._last[schedule]
354             if interval != "months": # months is non-constant. See below.
355                 util.debug("Recalculating %s schedule" % (schedule), \
356                            self.verbose)
357                 try:
358                     totalinterval = intervals[interval] * period
359                 except KeyError:
360                     self.exitCode = smf.SMF_EXIT_ERR_CONFIG
361                     sys.stderr.write(schedule + \
362                                       " schedule has invalid interval: " + \
363                                       "'%s\'\n" % interval)
364                       #Propagate up to thread's run() method
365                     raise RuntimeError
366                 if [schedule,interval,period,keep] in self._defaultSchedules:
367                     # This is one of the default schedules so check for an
368                     # overlap with one of the dominant schedules.
369                     for s,i,p,k in self._defaultSchedules[:idx]:
370                         last = max(last, self._last[s])
371                     idx += 1
372
373             else: # interval == "months"
374                 if self._next[schedule] > last:
375                     util.debug("Short circuiting " + \
376                                schedule + \
377                                " recalculation", \
378                                self.verbose)
379                     continue
380                 util.debug("Recalculating %s schedule" % (schedule), \
381                            self.verbose)
382                 snap_tm = time.gmtime(self._last[schedule])
383                 # Increment year if period is >= 1 calendar year.
384                 year = snap_tm.tm_year
385                 year += period / 12
386                 period = period % 12
387
388                 mon = (snap_tm.tm_mon + period) % 12
389                 # Result of 0 actually means December.
390                 if mon == 0:
391                     mon = 12
392                 # Account for period that spans calendar year boundary.
393                 elif snap_tm.tm_mon + period > 12:
394                     year += 1
395
396                 d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
397                 d,dnewmon = calendar.monthrange(year, mon)
398                 mday = snap_tm.tm_mday
399                 if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
400                    mday = dnewmon
401                 
402                 tm =(year, mon, mday, \
403                     snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
404                     0, 0, -1)
405                 newt = calendar.timegm(tm)
406                 new_tm = time.gmtime(newt)
407                 totalinterval = newt - self._last[schedule]
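                # Worked example (editorial illustration): if the last monthly
                # snapshot was taken on 2009-01-31 with a period of 1 month,
                # mon becomes 2 and calendar.monthrange(2009, 2) reports 28
                # days, so mday is clamped from 31 down to 28 and the next
                # snapshot falls due on 2009-02-28 at the same time of day.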
408
409             self._next[schedule] = last + totalinterval
410
411     def _next_due(self):
412         schedule = None
413         earliest = None
414         now = long(time.time())
415         
416         for s,i,p,k in self._defaultSchedules:
417             due = self._next[s]
418             if due <= now:
419                 #Default Schedule - so break out at the first 
420                 #schedule that is overdue. The subordinate schedules
421                 #will re-adjust afterwards.
422                 earliest,schedule = due,s
423                 break
424             elif earliest != None:
425                 if due < earliest:
426                     earliest,schedule = due,s
427             else: #FIXME better optimisation with above condition
428                 earliest,schedule = due,s
429         for s,i,p,k in self._customSchedules:
430             due = self._next[s]
431             if earliest != None:
432                 if due < earliest:
433                     earliest,schedule = due,s
434             else: #FIXME better optimisation with above condition
435                 earliest,schedule = due,s
436         return earliest,schedule
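        # Editorial note: when no auto-snapshot instances are enabled both
        # loops above are skipped and (None, None) is returned, which makes
        # run() fall back to its 15 minute cleanup-check sleep.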
437
438     def _check_snapshots(self):
439         """
440         Check the schedules and see what the required snapshot is.
441         Take one immediately on the first overdue snapshot required
442         """
443         # Make sure a refresh() doesn't mess with the schedule while
444         # we're reading through it.
445         self._refreshLock.acquire()
446         next,schedule = self._next_due()
447         self._refreshLock.release()
448         now = long(time.time())
449         while next != None and next <= now:
450             label = self._take_snapshots(schedule)
451             self._plugin.execute_plugins(schedule, label)
452             self._refreshLock.acquire()
453             self._update_schedules()
454             next,schedule = self._next_due()
455             self._refreshLock.release()
456             dt = datetime.datetime.fromtimestamp(next)
457             util.debug("Next snapshot is %s due at: %s" % \
458                        (schedule, dt.isoformat()), \
459                        self.verbose)
460         return next
461                     
462     def _take_snapshots(self, schedule):
463         # Set the time before taking snapshot to avoid clock skew due
464         # to time taken to complete snapshot.
465         tm = long(time.time())
466         label = "%s%s%s-%s" % \
467                 (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
468                  datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
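        # Illustrative example (assumes SNAPLABELPREFIX is "zfs-auto-snap" and
        # the default "_" separator): a "daily" snapshot taken at 14:35 on
        # 22 March 2009 would be labelled "zfs-auto-snap_daily-2009-03-22-14h35".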
469         try:
470             self._datasets.create_auto_snapshot_set(label, tag=schedule)
471         except RuntimeError, message:
472             # Write an error message, set the exit code and pass it up the
473             # stack so the thread can terminate
474             sys.stderr.write("Failed to create snapshots for schedule: %s\n" \
475                              % (schedule))
476             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
477             raise RuntimeError,message
478         self._last[schedule] = tm
479         self._perform_purge(schedule)
480         return label
481
482     def _prune_snapshots(self, dataset, schedule):
483         """Cleans out zero sized snapshots, kind of cautiously"""
484             # Per schedule: We want to delete 0 sized
485             # snapshots but we need to keep at least one around (the most
486             # recent one) for each schedule so that the overlap is
487             # maintained from frequent -> hourly -> daily etc.
488             # Start off with the smallest interval schedule first and
489             # move up. This increases the amount of data retained where
490             # several snapshots are taken together like a frequent, hourly
491             # and daily snapshot taken at 12:00am. If 3 snapshots are all
492             # identical and reference the same identical data they will all
493             # be initially reported as zero for used size. Deleting the
494             # daily first then the hourly would make the data referenced
495             # by all 3 snapshots unique to the frequent scheduled snapshot.
496             # This snapshot would probably be purged within an hour however and the
497             # data referenced by it would be gone for good.
498             # Doing it the other way however ensures that the data should
499             # remain accessible to the user for at least a week as long as
500             # the pool doesn't run low on available space before that.
501
502         try:
503             snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
504             # Clone the list because we want to remove items from it
505             # while iterating through it.
506             remainingsnaps = snaps[:]
507         except RuntimeError,message:
508             sys.stderr.write("Failed to list snapshots during snapshot cleanup\n")
509             self.exitCode = smf.SMF_EXIT_ERR_FATAL
510             raise RuntimeError,message
511
512         if (self._keepEmpties == False):
513             try: # remove the newest one from the list.
514                 snaps.pop()
515             except IndexError:
516                 pass
517             for snapname in snaps:
518                 try:
519                     snapshot = zfs.Snapshot(snapname)
520                 except Exception,message:
521                     sys.stderr.write(str(message))
522                     # Not fatal, just skip to the next snapshot
523                     continue
524
525                 try:
526                     if snapshot.get_used_size() == 0:
527                         util.debug("Destroying zero sized: " + snapname, \
528                                    self.verbose)
529                         try:
530                             snapshot.destroy()
531                         except RuntimeError,message:
532                             sys.stderr.write("Failed to destroy snapshot: " +
533                                              snapname + "\n")
534                             self.exitCode = smf.SMF_EXIT_MON_DEGRADE
535                             # Propagate exception so thread can exit
536                             raise RuntimeError,message
537                         remainingsnaps.remove(snapname)
538                 except RuntimeError,message:
539                     sys.stderr.write("Can not determine used size of: " + \
540                                      snapname + "\n")
541                     self.exitCode = smf.SMF_EXIT_MON_DEGRADE
542                     #Propagate the exception to the thread's run() method
543                     raise RuntimeError,message
544
545         # Deleting individual snapshots instead of recursive sets
546         # breaks the recursion chain and leaves child snapshots
547         # dangling so we need to take care of cleaning up the 
548         # snapshots.
549         target = len(remainingsnaps) - self._keep[schedule]
550         counter = 0
551         while counter < target:
552             util.debug("Destroy expired snapshot: " + \
553                        remainingsnaps[counter], 
554                        self.verbose)
555             try:
556                 snapshot = zfs.Snapshot(remainingsnaps[counter])
557             except Exception,message:
558                     sys.stderr.write(str(message))
559                     # Not fatal, just skip to the next snapshot
560                     counter += 1
561                     continue
562             try:
563                 snapshot.destroy()
564             except RuntimeError,message:
565                 sys.stderr.write("Failed to destroy snapshot: " +
566                                  snapshot.name + "\n")
567                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
568                 # Propagate exception so thread can exit
569                 raise RuntimeError,message
570             else:
571                 counter += 1
572
573     def _perform_purge(self, schedule):
574         """Cautiously cleans out zero sized snapshots"""
575         # We need to avoid accidentally pruning auto snapshots received
576         # from one zpool to another. We ensure this by examining only
577         # snapshots whose parent filesystems and volumes are explicitly
578         # tagged to be snapshotted.
579         try:
580             for name in self._datasets.list_auto_snapshot_sets(schedule):
581                 dataset = zfs.ReadWritableDataset(name)
582                 self._prune_snapshots(dataset, schedule)
583         except RuntimeError,message:
584             sys.stderr.write("Error listing datasets during " + \
585                              "removal of expired snapshots\n")
586             self.exitCode = smf.SMF_EXIT_ERR_FATAL
587             # Propagate up to thread's run() method
588             raise RuntimeError,message
589
590     def _needs_cleanup(self):
591         if self._remedialCleanup == False:
592             # Sys admin has explicitly instructed for remedial cleanups
593             # not to be performed.
594             return False
595         now = long(time.time())
596         # Don't run checks any less than 15 minutes apart.
597         if self._cleanupLock.acquire(False) == False:
598             #Indicates that a cleanup is already running.
599             return False
600         # FIXME - Make the cleanup interval equal to the minimum snapshot interval
601         # if custom snapshot schedules are defined and enabled.
602         elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
603             pass
604         else:
605             for zpool in self._zpools:
606                 try:
607                     if zpool.get_capacity() > self._warningLevel:
608                         # Before getting into a panic, determine if the pool
609                         # is one we actually take snapshots on, by checking
610                         # for one of the "auto-snapshot:<schedule>" tags. Not
611                         # super fast, but it only happens under exceptional
612                         # circumstances of a zpool nearing its capacity.
613
614                         for sched in self._allSchedules:
615                             sets = zpool.list_auto_snapshot_sets(sched[0])
616                             if len(sets) > 0:
617                                 util.debug("%s needs a cleanup" \
618                                            % zpool.name, \
619                                            self.verbose)
620                                 self._cleanupLock.release()
621                                 return True
622                 except RuntimeError, message:
623                     sys.stderr.write("Error checking zpool capacity of: " + \
624                                      zpool.name + "\n")
625                     self._cleanupLock.release()
626                     self.exitCode = smf.SMF_EXIT_ERR_FATAL
627                     # Propagate up to thread's run() method.
628                     raise RuntimeError,message
629             self._lastCleanupCheck = long(time.time())
630         self._cleanupLock.release()
631         return False
632
633     def _perform_cleanup(self):
634         if self._cleanupLock.acquire(False) == False:
635             # Cleanup already running. Skip
636             return
637         self._destroyedsnaps = []
638         for zpool in self._zpools:
639             try:
640                 self._poolstatus[zpool.name] = 0
641                 capacity = zpool.get_capacity()
642                 if capacity > self._warningLevel:
643                     self._run_warning_cleanup(zpool)
644                     self._poolstatus[zpool.name] = 1
645                     capacity = zpool.get_capacity()
646                 if capacity > self._criticalLevel:
647                     self._run_critical_cleanup(zpool)
648                     self._poolstatus[zpool.name] = 2
649                     capacity = zpool.get_capacity()
650                 if capacity > self._emergencyLevel:
651                     self._run_emergency_cleanup(zpool)
652                     self._poolstatus[zpool.name] = 3
653                     capacity = zpool.get_capacity()
654                 if capacity > self._emergencyLevel:
655                     self._run_emergency_cleanup(zpool)
656                     self._poolstatus[zpool.name] = 4
657             # This also catches exceptions thrown from _run_<level>_cleanup()
658             # and _run_cleanup() in methods called by _perform_cleanup()
659             except RuntimeError,message:
660                 sys.stderr.write("Remedial space cleanup failed because " + \
661                                  "of failure to determine capacity of: " + \
662                                  zpool.name + "\n")
663                 self.exitCode = smf.SMF_EXIT_ERR_FATAL
664                 self._cleanupLock.release()
665                 # Propagate up to thread's run() method.
666                 raise RuntimeError,message
667
668             # Bad - there are no more snapshots left and nothing
669             # left to delete. We don't disable the service since
670             # it will permit self recovery and snapshot
671             # retention when space becomes available on
672             # the pool (hopefully).
673             util.debug("%s pool status after cleanup:" \
674                        % zpool.name, \
675                        self.verbose)
676             util.debug(zpool, self.verbose)
677         util.debug("Cleanup completed. %d snapshots were destroyed" \
678                    % len(self._destroyedsnaps), \
679                    self.verbose)
680         # Avoid needless list iteration for non-debug mode
681         if self.verbose == True and len(self._destroyedsnaps) > 0:
682             for snap in self._destroyedsnaps:
683                 sys.stderr.write("\t%s\n" % snap)
684         self._cleanupLock.release()
685
686     def _run_warning_cleanup(self, zpool):
687         util.debug("Performing warning level cleanup on %s" % \
688                    zpool.name, \
689                    self.verbose)
690         self._run_cleanup(zpool, "daily", self._warningLevel)
691         if zpool.get_capacity() > self._warningLevel:
692             self._run_cleanup(zpool, "hourly", self._warningLevel)
693
694     def _run_critical_cleanup(self, zpool):
695         util.debug("Performing critical level cleanup on %s" % \
696                    zpool.name, \
697                    self.verbose)
698         self._run_cleanup(zpool, "weekly", self._criticalLevel)
699         if zpool.get_capacity() > self._criticalLevel:
700             self._run_cleanup(zpool, "daily", self._criticalLevel)
701         if zpool.get_capacity() > self._criticalLevel:
702             self._run_cleanup(zpool, "hourly", self._criticalLevel)
703
704     def _run_emergency_cleanup(self, zpool):
705         util.debug("Performing emergency level cleanup on %s" % \
706                    zpool.name, \
707                    self.verbose)
708         self._run_cleanup(zpool, "monthly", self._emergencyLevel)
709         if zpool.get_capacity() > self._emergencyLevel:
710             self._run_cleanup(zpool, "weekly", self._emergencyLevel)
711         if zpool.get_capacity() > self._emergencyLevel:
712             self._run_cleanup(zpool, "daily", self._emergencyLevel)
713         if zpool.get_capacity() > self._emergencyLevel:
714             self._run_cleanup(zpool, "hourly", self._emergencyLevel)
715         if zpool.get_capacity() > self._emergencyLevel:
716             self._run_cleanup(zpool, "frequent", self._emergencyLevel)
717         #Finally, as a last resort, delete custom scheduled snapshots
718         for schedule,i,p,k in self._customSchedules:
719             if zpool.get_capacity() < self._emergencyLevel:
720                 break
721             else:
722                 self._run_cleanup(zpool, schedule, self._emergencyLevel)
723
724     def _run_cleanup(self, zpool, schedule, threshold):
725         clonedsnaps = []
726         snapshots = []
727         try:
728             clonedsnaps = self._datasets.list_cloned_snapshots()
729         except RuntimeError,message:
730                 sys.stderr.write("Error (non-fatal) listing cloned snapshots" +
731                                  " while recovering pool capacity\n")
732                 sys.stderr.write("Error details:\n" + \
733                                  "--------BEGIN ERROR MESSAGE--------\n" + \
734                                  str(message) + \
735                                  "\n--------END ERROR MESSAGE--------\n")    
736
737         # Build a list of snapshots in the given schedule, that are not
738         # cloned, and sort the result in reverse chronological order.
739         try:
740             snapshots = [s for s,t in \
741                             zpool.list_snapshots("%s%s" \
742                             % (self._prefix,schedule)) \
743                             if not s in clonedsnaps]
744             snapshots.reverse()
745         except RuntimeError,message:
746             sys.stderr.write("Error listing snapshots" +
747                              " while recovering pool capacity\n")
748             self.exitCode = smf.SMF_EXIT_ERR_FATAL
749             # Propagate the error up to the thread's run() method.
750             raise RuntimeError,message
751    
752         while zpool.get_capacity() > threshold:
753             if len(snapshots) == 0:
754                 syslog.syslog(syslog.LOG_NOTICE,
755                               "No more %s snapshots left" \
756                                % schedule)
757                 return
758
759             """This is not an exact science. Deleting a zero sized
760             snapshot can have unpredictable results. For example a
761             pair of snapshots may share exclusive reference to a large
762             amount of data (eg. a large core file). The usage of both
763             snapshots will initially be seen to be 0 by zfs(1). Deleting
764             one of the snapshots will make the data become unique to the
765             single remaining snapshot that references it uniquely. The
766             remaining snapshot's size will then show up as non zero. So
767             deleting a 0 sized snapshot is not as pointless as it might seem.
768             It also means we have to loop through this, each snapshot set
769             at a time and observe the before and after results. Perhaps
770             a better way exists...."""
771
772             # Start with the oldest first
773             snapname = snapshots.pop()
774             snapshot = zfs.Snapshot(snapname)
775             # It would be nicer, for performance purposes, to delete sets
776             # of snapshots recursively but this might destroy more data than
777             # absolutely necessary, plus the previous purging of zero sized
778             # snapshots can easily break the recursion chain between
779             # filesystems.
780             # On the positive side there should be fewer snapshots and they
781             # will mostly be non-zero so we should get more effectiveness as a
782             # result of deleting snapshots since they should be nearly always
783             # non zero sized.
784             util.debug("Destroying %s" % snapname, self.verbose)
785             try:
786                 snapshot.destroy()
787             except RuntimeError,message:
788                 # Would be nice to be able to mark service as degraded here
789                 # but it's better to try to continue on rather than to give
790                 # up altogether (SMF maintenance state)
791                 sys.stderr.write("Warning: Cleanup failed to destroy: %s\n" % \
792                                  (snapshot.name))
793                 sys.stderr.write("Details:\n%s\n" % (str(message)))
794             else:
795                 self._destroyedsnaps.append(snapname)
796             # Give zfs some time to recalculate.
797             time.sleep(3)
798         
799     def _send_to_syslog(self):
800         for zpool in self._zpools:
801             status = self._poolstatus[zpool.name]
802             if status == 4:
803                 syslog.syslog(syslog.LOG_EMERG,
804                               "%s is over %d%% capacity. " \
805                               "All automatic snapshots were destroyed" \
806                                % (zpool.name, self._emergencyLevel))
807             elif status == 3:
808                 syslog.syslog(syslog.LOG_ALERT,
809                               "%s exceeded %d%% capacity. " \
810                               "Automatic snapshots over 1 hour old were destroyed" \
811                                % (zpool.name, self._emergencyLevel))
812             elif status == 2:
813                 syslog.syslog(syslog.LOG_CRIT,
814                               "%s exceeded %d%% capacity. " \
815                               "Weekly, hourly and daily automatic snapshots were destroyed" \
816                                % (zpool.name, self._criticalLevel))                             
817             elif status == 1:
818                 syslog.syslog(syslog.LOG_WARNING,
819                               "%s exceeded %d%% capacity. " \
820                               "Hourly and daily automatic snapshots were destroyed" \
821                                % (zpool.name, self._warningLevel))
822
823         if len(self._destroyedsnaps) > 0:
824             syslog.syslog(syslog.LOG_NOTICE,
825                           "%d automatic snapshots were destroyed" \
826                            % len(self._destroyedsnaps))
827
828     def _send_notification(self):
829         worstpool = None
830         worststatus = 0
831
832         for zpool in self._zpools:
833             status = self._poolstatus[zpool.name]
834             # >= to ensure that something should always be set.
835             if status >= worststatus:
836                 worstpool = zpool.name
837                 worststatus = status
838
839         #FIXME make the various levels indexable
840         if worststatus == 4:
841             self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
842         elif worststatus == 3:
843             self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
844         elif worststatus == 2:
845             self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
846         elif worststatus == 1:
847             self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
848         #elif: 0 everything is fine. Do nothing.
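        # Editorial sketch (not part of the original file): a desktop client
        # could listen for these capacity notifications with dbus-python along
        # these lines; the signal name and argument list are assumptions based
        # on the calls above and should be checked against dbussvc.py.
        #
        #   import dbus, dbus.mainloop.glib, gobject
        #   dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        #   bus = dbus.SystemBus()
        #   def on_capacity_exceeded(pool, severity, threshold):
        #       print "Pool %s exceeded its %d%% threshold" % (pool, threshold)
        #   bus.add_signal_receiver(on_capacity_exceeded,
        #                           signal_name="capacity_exceeded",
        #                           path="/org/opensolaris/TimeSlider/autosnap")
        #   gobject.MainLoop().run()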
849
850
851 def monitor_threads(snapthread):
852     if snapthread.is_alive():
853         return True
854     else:
855         sys.stderr.write("Snapshot monitor thread exited.\n")
856         if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
857             # FIXME - it would be nicer to mark the service as degraded than
858             # go into maintenance state for some situations such as a
859             # particular snapshot schedule failing.
860             # But for now SMF does not implement this feature. But if/when it
861             # does it's better to use svcadm to put the service into the
862             # correct state since the daemon shouldn't exit when transitioning
863             # to a degraded state.
864             #sys.stderr.write("Placing service into maintenance state\n")
865             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
866             #                 os.getenv("SMF_FMRI")])
867             # SMF will take care of killing the daemon
868             sys.exit(smf.SMF_EXIT_ERR_FATAL)
869             return False
870         elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
871             #sys.stderr.write("Placing service into maintenance state\n")
872             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
873             #                 os.getenv("SMF_FMRI")])
874             # SMF will take care of killing the daemon
875             sys.exit(smf.SMF_EXIT_ERR_FATAL)
876             return False
877         else:
878             sys.stderr.write("Snapshot monitor thread exited abnormally\n")
879             sys.stderr.write("Exit code: %d\n" % (snapthread.exitCode))
880             #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
881             #                 os.getenv("SMF_FMRI")])
882             sys.exit(smf.SMF_EXIT_ERR_FATAL)
883             return False
884
885
886 def child_sig_handler(signum, frame):
887     if signum == signal.SIGUSR1:
888         sys.exit(smf.SMF_EXIT_OK)
889     elif signum == signal.SIGCHLD:
890         sys.exit(smf.SMF_EXIT_ERR_FATAL)
891     elif signum == signal.SIGALRM:
892         sys.exit(smf.SMF_EXIT_ERR_FATAL)
893
894 # Default daemon parameters.
895 # File mode creation mask of the daemon.
896 UMASK = 0
897 # Default working directory for the daemon.
898 WORKDIR = "/"
899 # Default maximum for the number of available file descriptors.
900 MAXFD = 1024
901
902 def create_daemon():
903     """
904     Detach a process from the controlling terminal and run it in the
905     background as a daemon.
906     """
907     #Catch signals that we might receive from child
908     signal.signal(signal.SIGCHLD, child_sig_handler)
909     signal.signal(signal.SIGUSR1, child_sig_handler)
910     signal.signal(signal.SIGALRM, child_sig_handler)
911     try:
912         pid = os.fork()
913     except OSError, e:
914         raise Exception, "%s [%d]" % (e.strerror, e.errno)
915
916     if (pid == 0):
917         #Reset signals that we set to trap in parent
918         signal.signal(signal.SIGCHLD, signal.SIG_DFL)
919         signal.signal(signal.SIGUSR1, signal.SIG_DFL)
920         signal.signal(signal.SIGALRM, signal.SIG_DFL)
921         os.setsid()
922         os.chdir(WORKDIR)
923         os.umask(UMASK)
924     else:
925         #Wait for the child to give the OK or otherwise.
926         signal.pause()
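# Editorial note on the daemonisation handshake (not in the original file):
# the parent blocks in signal.pause() above while the forked child finishes
# SnapshotManager.__init__(), which signals back with
# os.kill(os.getppid(), signal.SIGUSR1). child_sig_handler() then exits the
# parent with SMF_EXIT_OK so svc.startd sees a successful start method; a
# SIGCHLD (child died early) or SIGALRM instead makes the parent report
# SMF_EXIT_ERR_FATAL.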
927
928
929 def main(argv):
930
931     # Check SMF invocation environment
932     if os.getenv("SMF_FMRI") == None or os.getenv("SMF_METHOD") != "start":
933         sys.stderr.write("Command line invocation of %s unsupported.\n" \
934                          % (sys.argv[0]))
935         sys.stderr.write("This command is intended for smf(5) invocation only.\n")
936         sys.exit(smf.SMF_EXIT_ERR_NOSMF)
937
938     # Daemonise the service.
939     create_daemon()
940
941     # The user security attributes checked are the following:
942     # Note that UID == 0 will match any profile search so
943     # no need to check it explicitly.
944     syslog.openlog("time-sliderd", 0, syslog.LOG_DAEMON)
945     rbacp = RBACprofile()
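    # Editorial note (assumption, verify against your RBAC configuration): the
    # "ZFS File System Management" rights profile is normally granted to a role
    # or user via user_attr(4); UID 0 matches any profile search, so root
    # always passes this check.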
946     if rbacp.has_profile("ZFS File System Management"):
947
948         gobject.threads_init()
949
950         # Tell dbus to use the gobject mainloop for async ops
951         dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
952         dbus.mainloop.glib.threads_init()
953         # Register a bus name with the system dbus daemon
954         systemBus = dbus.SystemBus()
955         name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)
956
957         # Create and start the snapshot manager. Takes care of
958         # auto snapshotting service and auto cleanup.
959         snapshot = SnapshotManager(systemBus)
960         snapshot.start()
961         gobject.timeout_add(2000, monitor_threads, snapshot)
962
963         mainloop = gobject.MainLoop()
964         try:
965             mainloop.run()
966         except KeyboardInterrupt:
967             mainloop.quit()
968             sys.exit(smf.SMF_EXIT_OK)
969     else:
970         syslog.syslog(syslog.LOG_ERR,
971                "%s has insufficient privileges to run time-sliderd!" \
972                % rbacp.name)
973         syslog.closelog()    
974         sys.exit(smf.SMF_EXIT_ERR_PERM)
975     syslog.closelog()
976     sys.exit(smf.SMF_EXIT_OK)
977