#!/usr/bin/python2
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

import sys
import os
import subprocess
import re
import threading
import getopt
import syslog
import time
import datetime
import calendar
import signal
import argparse
import logging
from logging.handlers import SysLogHandler

import glib
import gobject
import dbus
import dbus.service
import dbus.mainloop
import dbus.mainloop.glib

import dbussvc
import zfs
import smf
import time_slider.linux.timeslidersmf as timeslidersmf
import time_slider.linux.autosnapsmf as autosnapsmf
# import plugin
from time_slider.linux.rbac import RBACprofile
import util

import time_slider.linux.timesliderconfig as timesliderconfig

_MINUTE = 60
_HOUR = _MINUTE * 60
_DAY = _HOUR * 24
_WEEK = _DAY * 7


# Status codes for actual zpool capacity levels.
# These are relative to the SMF property defined
# levels for: user, warning and emergency levels
STATUS_OK = 0 # Below user specified threshold. Everything was OK
STATUS_WARNING = 1 # Above specified user threshold level
STATUS_CRITICAL = 2 # Above specified critical threshold level
STATUS_EMERGENCY = 3 # Above specified emergency threshold level

intervals = {"weeks" : _WEEK, "days" : _DAY, "hours" : _HOUR, "minutes" : _MINUTE}


class SnapshotManager(threading.Thread):

    def __init__(self, bus):
        # Used to wake up the run() method prematurely in the event
        # of a SIGHUP/SMF refresh
        self._conditionLock = threading.Condition(threading.RLock())
        # Used when schedules are being rebuilt or examined.
        self._refreshLock = threading.Lock()
        # Indicates that cleanup is in progress when locked
        self._cleanupLock = threading.Lock()
        self._datasets = zfs.Datasets()
        # Indicates that schedules need to be rebuilt from scratch
        self._stale = True
        self._lastCleanupCheck = 0
        self._zpools = []
        self._poolstatus = {}
        self._destroyedsnaps = []
        self.logger = logging.getLogger('time-slider')

        # This is also checked during the refresh() method but we need
        # to know it sooner for instantiation of the PluginManager
        self._smf = timeslidersmf.TimeSliderSMF()
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determining whether debugging is enabled")
            self.verbose = False

        self._dbus = dbussvc.AutoSnap(bus,
                                      '/org/opensolaris/TimeSlider/autosnap',
                                      self)

        # self._plugin = plugin.PluginManager(self.verbose)
        self.exitCode = smf.SMF_EXIT_OK
        self.refresh()

        # Seems we're up and running OK.
        # Signal our parent so we can daemonise
        os.kill(os.getppid(), signal.SIGUSR1)

        # SMF/svc.startd sends SIGHUP to force a
        # refresh of the daemon
        signal.signal(signal.SIGHUP, self._signalled)

        # Init done. Now initialise threading.
        threading.Thread.__init__(self)
        self.setDaemon(True)

    def run(self):
        # Deselect swap and dump volumes so they don't get snapshotted.
        for vol in self._datasets.list_volumes():
            name = vol.rsplit("/")
            try:
                if (name[1] == "swap" or name[1] == "dump"):
                    util.debug("Auto excluding %s volume" % vol, self.verbose)
                    volume = zfs.Volume(vol)
                    volume.set_auto_snap(False)
            except IndexError:
                pass

        nexttime = None
        waittime = None
        while True:
            try:
                self.refresh()
                # First check and, if necessary, perform any remedial cleanup.
                # This is best done before creating any new snapshots which may
                # otherwise get immediately gobbled up by the remedial cleanup.
                if self._needs_cleanup() == True:
                    self._perform_cleanup()
                    # Check to see if cleanup actually deleted anything before
                    # notifying the user. Avoids the popup appearing continuously
                    if len(self._destroyedsnaps) > 0:
                        self._send_notification()
                    self._send_to_syslog()

                nexttime = self._check_snapshots()
                # Overdue snapshots are already taken automatically
                # inside _check_snapshots() so nexttime should never be
                # < 0. It can be None however, which is fine since it
                # will cause the scheduler thread to sleep indefinitely
                # or until a SIGHUP is caught.
                if nexttime:
                    util.debug("Waiting until " + str(nexttime), self.verbose)
                waittime = None
                if nexttime != None:
                    waittime = nexttime - long(time.time())
                    if (waittime <= 0):
                        # We took too long and missed a snapshot, so break out
                        # and catch up on it the next time through the loop
                        continue
                # waittime could be None if no auto-snap schedules are online
                self._conditionLock.acquire()
                if waittime:
                    util.debug("Waiting %d seconds" % (waittime), self.verbose)
                    self._conditionLock.wait(waittime)
                else: #None. Just wait a while to check for cleanups.
                    util.debug("No auto-snapshot schedules online.", \
                               self.verbose)
                    self._conditionLock.wait(_MINUTE * 15)

            except OSError, message:
                self.logger.error("Caught OSError exception in snapshot" +
                                 " manager thread")
                self.logger.error("Error details:\n" + \
                                 "--------BEGIN ERROR MESSAGE--------\n" + \
                                 str(message) + \
                                 "\n--------END ERROR MESSAGE--------")
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Exit this thread
                break
            except RuntimeError,message:
                self.logger.error("Caught RuntimeError exception in snapshot" +
                                 " manager thread")
                self.logger.error("Error details:\n" + \
                                 "--------BEGIN ERROR MESSAGE--------\n" + \
                                 str(message) + \
                                 "\n--------END ERROR MESSAGE--------")
                # Exit this thread
                break

    def _signalled(self, signum, frame):
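        """
        SIGHUP handler. Marks the schedules as stale and wakes the
        scheduler thread so that refresh() runs on its next pass.
        """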
        if signum == signal.SIGHUP:
            if self._refreshLock.acquire(False) == False:
                return
            self._stale = True
            self._refreshLock.release()
            self._conditionLock.acquire()
            self._conditionLock.notify()
            self._conditionLock.release()

    def refresh(self):
        """
        Checks if defined snapshot schedules are out
        of date and rebuilds and updates if necessary
        """
        self._refreshLock.acquire()
        if self._stale == True:
            self._configure_svc_props()
            self._rebuild_schedules()
            self._update_schedules()
            # self._plugin.refresh()
            self._stale = False
        self._refreshLock.release()

    def _configure_svc_props(self):
        try:
            self.verbose = self._smf.get_verbose()
        except RuntimeError,message:
            self.logger.error("Error determining whether debugging is enabled")
            self.verbose = False

        try:
            cleanup = self._smf.get_remedial_cleanup()
            warn = self._smf.get_cleanup_level("warning")
            util.debug("Warning level value is:   %d%%" % warn, self.verbose)
            crit = self._smf.get_cleanup_level("critical")
            util.debug("Critical level value is:  %d%%" % crit, self.verbose)
            emer = self._smf.get_cleanup_level("emergency")
            util.debug("Emergency level value is: %d%%" % emer, self.verbose)
        except RuntimeError,message:
            self.logger.error("Failed to determine cleanup threshold levels")
            self.logger.error("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n---------END ERROR MESSAGE---------\n")
            self.logger.error("Using factory defaults of 80%, 90% and 95%")
            # Go with defaults
            # FIXME - this would be an appropriate case to mark svc as degraded
            self._remedialCleanup = True
            self._warningLevel = 80
            self._criticalLevel = 90
            self._emergencyLevel = 95
        else:
            self._remedialCleanup = cleanup
            self._warningLevel = warn
            self._criticalLevel = crit
            self._emergencyLevel = emer

        try:
            self._keepEmpties = self._smf.get_keep_empties()
        except RuntimeError,message:
            # Not fatal, just assume we delete them (default configuration)
            self.logger.error("Can't determine whether to keep empty snapshots")
            self.logger.error("Details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n---------END ERROR MESSAGE---------")
            self.logger.error("Assuming default value: False")
            self._keepEmpties = False

        # Previously, the ":" character was used as the separator in
        # snapshot label datestamps. Windows filesystems such as CIFS
        # and FAT choke on this character so we now use a user definable
        # separator value, with a default value of "_".
        # We need to check for both the old and new format when looking for
        # snapshots.
        self._separator = self._smf.get_separator()
        self._prefix = "%s[:%s]" \
            % (autosnapsmf.SNAPLABELPREFIX, self._separator)

        # Rebuild pool list
        self._zpools = []
        try:
            for poolname in zfs.list_zpools():
                # Do not try to examine FAULTED pools
                zpool = zfs.ZPool(poolname)
                if zpool.health == "FAULTED":
                    util.debug("Ignoring faulted Zpool: %s\n" \
                               % (zpool.name), \
                               self.verbose)
                else:
                    self._zpools.append(zpool)
                util.debug(str(zpool), self.verbose)
        except RuntimeError,message:
            self.logger.error("Could not list Zpools")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate exception up to thread's run() method
            raise RuntimeError,message


    def _rebuild_schedules(self):
        """
        Builds 2 lists of default and custom auto-snapshot SMF instances
        """

        self._last = {}
        self._next = {}
        self._keep = {}

        try:
            _defaultSchedules = autosnapsmf.get_default_schedules()
            _customSchedules = autosnapsmf.get_custom_schedules()
        except RuntimeError,message:
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError, "Error reading SMF schedule instances\n" + \
                                "Details:\n" + str(message)
        else:
            # Now set it in stone.
            self._defaultSchedules = tuple(_defaultSchedules)
            self._customSchedules = tuple(_customSchedules)

            # Build the combined schedule tuple from default + custom schedules
            _defaultSchedules.extend(_customSchedules)
            self._allSchedules = tuple(_defaultSchedules)
            for schedule,i,p,keep in self._allSchedules:
                self._last[schedule] = 0
                self._next[schedule] = 0
                self._keep[schedule] = keep

    def _update_schedules(self):
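        """
        Recalculates the next due time for every schedule based on its
        last snapshot time and its configured interval and period.
        """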
        interval = 0
        idx = 1 # Used to index subsets for schedule overlap calculation
        last = None

        for schedule,interval,period,keep in self._allSchedules:
            # Shortcut if we've already processed this schedule and it's
            # still up to date. Don't skip the default schedules though
            # because overlap affects their scheduling
            if [schedule,interval,period,keep] not in \
                self._defaultSchedules and \
                (self._next[schedule] > self._last[schedule]):
                util.debug("Short circuiting %s recalculation" \
                           % (schedule), \
                           self.verbose)
                continue

            # If we don't have an internal timestamp for the given schedule
            # ask zfs for the last snapshot and get its creation timestamp.
            if self._last[schedule] == 0:
                try:
                    snaps = self._datasets.list_snapshots("%s%s" % \
                                                         (self._prefix,
                                                          schedule))
                except RuntimeError,message:
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    self.logger.error("Failed to list snapshots during schedule update")
                    # Propagate up to the thread's run() method
                    raise RuntimeError,message

                if len(snaps) > 0:
                    util.debug("Last %s snapshot was: %s" % \
                               (schedule, snaps[-1][0]), \
                               self.verbose)
                    self._last[schedule] = snaps[-1][1]

            last = self._last[schedule]
            if interval != "months": # months is non-constant. See below.
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                try:
                    totalinterval = intervals[interval] * period
                except KeyError:
                    self.exitCode = smf.SMF_EXIT_ERR_CONFIG
                    self.logger.error(schedule + \
                                      " schedule has invalid interval: " + \
                                      "'%s\'" % interval)
                    # Propagate up to thread's run() method
                    raise RuntimeError
                if [schedule,interval,period,keep] in self._defaultSchedules:
                    # This is one of the default schedules so check for an
                    # overlap with one of the dominant schedules.
                    for s,i,p,k in self._defaultSchedules[:idx]:
                        last = max(last, self._last[s])
                    idx += 1

            else: # interval == "months"
                if self._next[schedule] > last:
                    util.debug("Short circuiting " + \
                               schedule + \
                               " recalculation", \
                               self.verbose)
                    continue
                util.debug("Recalculating %s schedule" % (schedule), \
                           self.verbose)
                snap_tm = time.gmtime(self._last[schedule])
                # Increment the year if the period spans one or more calendar years.
                year = snap_tm.tm_year
                year += period / 12
                period = period % 12

                mon = (snap_tm.tm_mon + period) % 12
                # Result of 0 actually means December.
                if mon == 0:
                    mon = 12
                # Account for period that spans calendar year boundary.
                elif snap_tm.tm_mon + period > 12:
                    year += 1

                d,dlastmon = calendar.monthrange(snap_tm.tm_year, snap_tm.tm_mon)
                d,dnewmon = calendar.monthrange(year, mon)
                mday = snap_tm.tm_mday
                if dlastmon > dnewmon and snap_tm.tm_mday > dnewmon:
                    mday = dnewmon

                tm = (year, mon, mday, \
                    snap_tm.tm_hour, snap_tm.tm_min, snap_tm.tm_sec, \
                    0, 0, -1)
                newt = calendar.timegm(tm)
                new_tm = time.gmtime(newt)
                totalinterval = newt - self._last[schedule]

            self._next[schedule] = last + totalinterval

    def _next_due(self):
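        """
        Returns an (earliest, schedule) pair identifying the next snapshot
        due, preferring the first overdue default schedule. Both values
        are None if no schedules are configured.
        """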
        schedule = None
        earliest = None
        now = long(time.time())

        for s,i,p,k in self._defaultSchedules:
            due = self._next[s]
            if due <= now:
                # Default schedule - so break out at the first
                # schedule that is overdue. The subordinate schedules
                # will re-adjust afterwards.
                earliest,schedule = due,s
                break
            elif earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        for s,i,p,k in self._customSchedules:
            due = self._next[s]
            if earliest != None:
                if due < earliest:
                    earliest,schedule = due,s
            else: #FIXME better optimisation with above condition
                earliest,schedule = due,s
        return earliest,schedule

    def _check_snapshots(self):
        """
        Check the schedules and see what the required snapshot is.
        Take one immediately on the first overdue snapshot required
        """
        # Make sure a refresh() doesn't mess with the schedule while
        # we're reading through it.
        self._refreshLock.acquire()
        next,schedule = self._next_due()
        self._refreshLock.release()
        now = long(time.time())
        while next != None and next <= now:
            label = self._take_snapshots(schedule)
            # self._plugin.execute_plugins(schedule, label)
            self._refreshLock.acquire()
            self._update_schedules()
            next,schedule = self._next_due()
            self._refreshLock.release()
            dt = datetime.datetime.fromtimestamp(next)
            util.debug("Next snapshot is %s due at: %s" % \
                       (schedule, dt.isoformat()), \
                       self.verbose)
        return next

    def _take_snapshots(self, schedule):
        # Set the time before taking snapshot to avoid clock skew due
        # to time taken to complete snapshot.
        tm = long(time.time())
        label = "%s%s%s-%s" % \
                (autosnapsmf.SNAPLABELPREFIX, self._separator, schedule,
                 datetime.datetime.now().strftime("%Y-%m-%d-%Hh%M"))
        try:
            self._datasets.create_auto_snapshot_set(label, tag=schedule)
        except RuntimeError, message:
            # Write an error message, set the exit code and pass it up the
            # stack so the thread can terminate
            self.logger.error("Failed to create snapshots for schedule: %s" \
                             % (schedule))
            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
            raise RuntimeError,message
        self._last[schedule] = tm
        self._perform_purge(schedule)
        return label

    def _prune_snapshots(self, dataset, schedule):
        """Cleans out zero sized snapshots, kind of cautiously"""
        # Per schedule: We want to delete 0 sized
        # snapshots but we need to keep at least one around (the most
        # recent one) for each schedule so that the overlap is
        # maintained from frequent -> hourly -> daily etc.
        # Start off with the smallest interval schedule first and
        # move up. This increases the amount of data retained where
        # several snapshots are taken together like a frequent hourly
        # and daily snapshot taken at 12:00am. If 3 snapshots are all
        # identical and reference the same identical data they will all
        # be initially reported as zero for used size. Deleting the
        # daily first then the hourly would make the data referenced
        # by all 3 snapshots unique to the frequent scheduled snapshot.
        # This snapshot would probably be purged within an hour however
        # and the data referenced by it would be gone for good.
        # Doing it the other way however ensures that the data should
        # remain accessible to the user for at least a week as long as
        # the pool doesn't run low on available space before that.

        try:
            snaps = dataset.list_snapshots("%s%s" % (self._prefix,schedule))
            # Clone the list because we want to remove items from it
            # while iterating through it.
            remainingsnaps = snaps[:]
        except RuntimeError,message:
            self.logger.error("Failed to list snapshots during snapshot cleanup")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            raise RuntimeError,message

        if (self._keepEmpties == False):
            try: # remove the newest one from the list.
                snaps.pop()
            except IndexError:
                pass
            for snapname in snaps:
                try:
                    snapshot = zfs.Snapshot(snapname)
                except Exception,message:
                    self.logger.error(str(message))
                    # Not fatal, just skip to the next snapshot
                    continue

                try:
                    if snapshot.get_used_size() == 0:
                        util.debug("Destroying zero sized: " + snapname, \
                                   self.verbose)
                        try:
                            snapshot.destroy()
                        except RuntimeError,message:
                            self.logger.error("Failed to destroy snapshot: " +
                                             snapname)
                            self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                            # Propagate exception so thread can exit
                            raise RuntimeError,message
                        remainingsnaps.remove(snapname)
                except RuntimeError,message:
                    self.logger.error("Can not determine used size of: " + \
                                     snapname)
                    self.exitCode = smf.SMF_EXIT_MON_DEGRADE
                    # Propagate the exception to the thread's run() method
                    raise RuntimeError,message

        # Deleting individual snapshots instead of recursive sets
        # breaks the recursion chain and leaves child snapshots
        # dangling so we need to take care of cleaning up the
        # snapshots.
        target = len(remainingsnaps) - self._keep[schedule]
        counter = 0
        while counter < target:
            util.debug("Destroy expired snapshot: " + \
                       remainingsnaps[counter],
                       self.verbose)
            try:
                snapshot = zfs.Snapshot(remainingsnaps[counter])
            except Exception,message:
                self.logger.error(str(message))
                # Not fatal, just skip to the next snapshot
                counter += 1
                continue
            try:
                snapshot.destroy()
            except RuntimeError,message:
                self.logger.error("Failed to destroy snapshot: " +
                                 snapshot.name)
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                # Propagate exception so thread can exit
                raise RuntimeError,message
            else:
                counter += 1

    def _perform_purge(self, schedule):
        """Cautiously cleans out zero sized snapshots"""
        # We need to avoid accidentally pruning auto snapshots received
        # from one zpool to another. We ensure this by examining only
        # snapshots whose parent filesystems and volumes are explicitly
        # tagged to be snapshotted.
        try:
            for name in self._datasets.list_auto_snapshot_sets(schedule):
                dataset = zfs.ReadWritableDataset(name)
                self._prune_snapshots(dataset, schedule)
        except RuntimeError,message:
            self.logger.error("Error listing datasets during " + \
                             "removal of expired snapshots")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate up to thread's run() method
            raise RuntimeError,message

    def _needs_cleanup(self):
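        """
        Returns True when a remedial cleanup should run: remedial cleanups
        are enabled, no cleanup is currently in progress and a monitored
        zpool that we snapshot has exceeded the warning capacity level.
        Capacity checks are throttled to one every 15 minutes.
        """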
        if self._remedialCleanup == False:
            # Sys admin has explicitly instructed for remedial cleanups
            # not to be performed.
            return False
        now = long(time.time())
        # Don't run checks any less than 15 minutes apart.
        if self._cleanupLock.acquire(False) == False:
            # Indicates that a cleanup is already running.
            return False
        # FIXME - Make the cleanup interval equal to the minimum snapshot interval
        # if custom snapshot schedules are defined and enabled.
        elif ((now - self._lastCleanupCheck) < (_MINUTE * 15)):
            pass
        else:
            for zpool in self._zpools:
                try:
                    if zpool.get_capacity() > self._warningLevel:
                        # Before getting into a panic, determine if the pool
                        # is one we actually take snapshots on, by checking
                        # for one of the "auto-snapshot:<schedule>" tags. Not
                        # super fast, but it only happens under exceptional
                        # circumstances of a zpool nearing its capacity.

                        for sched in self._allSchedules:
                            sets = zpool.list_auto_snapshot_sets(sched[0])
                            if len(sets) > 0:
                                util.debug("%s needs a cleanup" \
                                           % zpool.name, \
                                           self.verbose)
                                self._cleanupLock.release()
                                return True
                except RuntimeError, message:
                    self.logger.error("Error checking zpool capacity of: " + \
                                     zpool.name)
                    self._cleanupLock.release()
                    self.exitCode = smf.SMF_EXIT_ERR_FATAL
                    # Propagate up to thread's run() method.
                    raise RuntimeError,message
            self._lastCleanupCheck = long(time.time())
        self._cleanupLock.release()
        return False

    def _perform_cleanup(self):
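        """
        Runs warning, critical and emergency level cleanups against each
        monitored zpool as required, recording a per-pool status and the
        list of snapshots destroyed along the way.
        """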
        if self._cleanupLock.acquire(False) == False:
            # Cleanup already running. Skip
            return
        self._destroyedsnaps = []
        for zpool in self._zpools:
            try:
                self._poolstatus[zpool.name] = 0
                capacity = zpool.get_capacity()
                if capacity > self._warningLevel:
                    self._run_warning_cleanup(zpool)
                    self._poolstatus[zpool.name] = 1
                    capacity = zpool.get_capacity()
                if capacity > self._criticalLevel:
                    self._run_critical_cleanup(zpool)
                    self._poolstatus[zpool.name] = 2
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 3
                    capacity = zpool.get_capacity()
                if capacity > self._emergencyLevel:
                    self._run_emergency_cleanup(zpool)
                    self._poolstatus[zpool.name] = 4
            # This also catches exceptions thrown from _run_<level>_cleanup()
            # and _run_cleanup() in methods called by _perform_cleanup()
            except RuntimeError,message:
                self.logger.error("Remedial space cleanup failed because " + \
                                 "of failure to determine capacity of: " + \
                                 zpool.name)
                self.exitCode = smf.SMF_EXIT_ERR_FATAL
                self._cleanupLock.release()
                # Propagate up to thread's run() method.
                raise RuntimeError,message

            # Bad - there are no more snapshots left and nothing
            # left to delete. We don't disable the service since
            # it will permit self recovery and snapshot
            # retention when space becomes available on
            # the pool (hopefully).
            util.debug("%s pool status after cleanup:" \
                       % zpool.name, \
                       self.verbose)
            util.debug(zpool, self.verbose)
        util.debug("Cleanup completed. %d snapshots were destroyed" \
                   % len(self._destroyedsnaps), \
                   self.verbose)
        # Avoid needless list iteration for non-debug mode
        if self.verbose == True and len(self._destroyedsnaps) > 0:
            for snap in self._destroyedsnaps:
                self.logger.error("\t%s" % snap)
        self._cleanupLock.release()

    def _run_warning_cleanup(self, zpool):
        util.debug("Performing warning level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "daily", self._warningLevel)
        if zpool.get_capacity() > self._warningLevel:
            self._run_cleanup(zpool, "hourly", self._warningLevel)

    def _run_critical_cleanup(self, zpool):
        util.debug("Performing critical level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "weekly", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "daily", self._criticalLevel)
        if zpool.get_capacity() > self._criticalLevel:
            self._run_cleanup(zpool, "hourly", self._criticalLevel)

    def _run_emergency_cleanup(self, zpool):
        util.debug("Performing emergency level cleanup on %s" % \
                   zpool.name, \
                   self.verbose)
        self._run_cleanup(zpool, "monthly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "weekly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "daily", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "hourly", self._emergencyLevel)
        if zpool.get_capacity() > self._emergencyLevel:
            self._run_cleanup(zpool, "frequent", self._emergencyLevel)
        # Finally, as a last resort, delete custom scheduled snapshots
        for schedule,i,p,k in self._customSchedules:
            if zpool.get_capacity() < self._emergencyLevel:
                break
            else:
                self._run_cleanup(zpool, schedule, self._emergencyLevel)

    def _run_cleanup(self, zpool, schedule, threshold):
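        """
        Destroys this zpool's snapshots for the given schedule, oldest
        first, until capacity drops below the threshold or no eligible
        (non-cloned) snapshots remain.
        """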
        clonedsnaps = []
        snapshots = []
        try:
            clonedsnaps = self._datasets.list_cloned_snapshots()
        except RuntimeError,message:
            self.logger.error("Error (non-fatal) listing cloned snapshots" +
                             " while recovering pool capacity")
            self.logger.error("Error details:\n" + \
                             "--------BEGIN ERROR MESSAGE--------\n" + \
                             str(message) + \
                             "\n--------END ERROR MESSAGE--------")

        # Build a list of snapshots in the given schedule, that are not
        # cloned, and sort the result in reverse chronological order.
        try:
            snapshots = [s for s,t in \
                            zpool.list_snapshots("%s%s" \
                            % (self._prefix,schedule)) \
                            if not s in clonedsnaps]
            snapshots.reverse()
        except RuntimeError,message:
            self.logger.error("Error listing snapshots" +
                             " while recovering pool capacity")
            self.exitCode = smf.SMF_EXIT_ERR_FATAL
            # Propagate the error up to the thread's run() method.
            raise RuntimeError,message

        while zpool.get_capacity() > threshold:
            if len(snapshots) == 0:
                self.logger.info( \
                              "No more %s snapshots left" \
                               % schedule)
                return

            # This is not an exact science. Deleting a zero sized
            # snapshot can have unpredictable results. For example a
            # pair of snapshots may share exclusive reference to a large
            # amount of data (eg. a large core file). The usage of both
            # snapshots will initially be seen to be 0 by zfs(1). Deleting
            # one of the snapshots will make the data become unique to the
            # single remaining snapshot that references it uniquely. The
            # remaining snapshot's size will then show up as non zero. So
            # deleting a 0 sized snapshot is not as pointless as it might
            # seem. It also means we have to loop through this, one
            # snapshot at a time, and observe the before and after
            # results. Perhaps a better way exists....

            # Start with the oldest first
            snapname = snapshots.pop()
            snapshot = zfs.Snapshot(snapname)
            # It would be nicer, for performance purposes, to delete sets
            # of snapshots recursively but this might destroy more data than
            # absolutely necessary, plus the previous purging of zero sized
            # snapshots can easily break the recursion chain between
            # filesystems.
            # On the positive side there should be fewer snapshots and they
            # will mostly be non-zero so we should get more effectiveness as a
            # result of deleting snapshots since they should be nearly always
            # non zero sized.
            util.debug("Destroying %s" % snapname, self.verbose)
            try:
                snapshot.destroy()
            except RuntimeError,message:
                # Would be nice to be able to mark service as degraded here
                # but it's better to try to continue on rather than to give
                # up altogether (SMF maintenance state)
                self.logger.error("Warning: Cleanup failed to destroy: %s" % \
                                 (snapshot.name))
                self.logger.error("Details:\n%s" % (str(message)))
            else:
                self._destroyedsnaps.append(snapname)
            # Give zfs some time to recalculate.
            time.sleep(3)

    def _send_to_syslog(self):
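        """
        Logs a per-pool summary of the last remedial cleanup along with
        the total number of snapshots destroyed.
        """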
        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            if status == 4:
                self.logger.critical( \
                              "%s exceeded %d%% capacity. " \
                              "All automatic snapshots were destroyed" \
                               % (zpool.name, self._emergencyLevel))
            elif status == 3:
                self.logger.error( \
                              "%s exceeded %d%% capacity. " \
                              "Automatic snapshots over 1 hour old were destroyed" \
                               % (zpool.name, self._emergencyLevel))
            elif status == 2:
                self.logger.critical( \
                              "%s exceeded %d%% capacity. " \
                              "Weekly, hourly and daily automatic snapshots were destroyed" \
                               % (zpool.name, self._criticalLevel))
            elif status == 1:
                self.logger.warning( \
                              "%s exceeded %d%% capacity. " \
                              "Hourly and daily automatic snapshots were destroyed" \
                               % (zpool.name, self._warningLevel))

        if len(self._destroyedsnaps) > 0:
            self.logger.warning( \
                          "%d automatic snapshots were destroyed" \
                           % len(self._destroyedsnaps))

    def _send_notification(self):
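        """
        Emits a D-Bus capacity-exceeded notification for the pool left in
        the worst state by the last cleanup, with the threshold breached.
        """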
        worstpool = None
        worststatus = 0

        for zpool in self._zpools:
            status = self._poolstatus[zpool.name]
            # >= to ensure that something should always be set.
            if status >= worststatus:
                worstpool = zpool.name
                worststatus = status

        # FIXME make the various levels indexable
        if worststatus == 4:
            self._dbus.capacity_exceeded(worstpool, 4, self._emergencyLevel)
        elif worststatus == 3:
            self._dbus.capacity_exceeded(worstpool, 3, self._emergencyLevel)
        elif worststatus == 2:
            self._dbus.capacity_exceeded(worstpool, 2, self._criticalLevel)
        elif worststatus == 1:
            self._dbus.capacity_exceeded(worstpool, 1, self._warningLevel)
        # elif: 0 everything is fine. Do nothing.


def monitor_threads(snapthread):
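    """
    Periodic gobject timeout callback. Returns True while the snapshot
    manager thread is alive; otherwise exits the daemon with a status
    derived from the thread's exit code.
    """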
    logger = logging.getLogger('time-slider')
    if snapthread.is_alive():
        return True
    else:
        logger.error("Snapshot monitor thread exited.")
        if snapthread.exitCode == smf.SMF_EXIT_MON_DEGRADE:
            # FIXME - it would be nicer to mark the service as degraded than
            # go into maintenance state for some situations such as a
            # particular snapshot schedule failing.
            # But for now SMF does not implement this feature. But if/when it
            # does it's better to use svcadm to put the service into the
            # correct state since the daemon shouldn't exit when transitioning
            # to a degraded state.
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        elif snapthread.exitCode == smf.SMF_EXIT_ERR_FATAL:
            #sys.stderr.write("Placing service into maintenance state\n")
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            # SMF will take care of killing the daemon
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False
        else:
            logger.error("Snapshot monitor thread exited abnormally")
            logger.error("Exit code: %d" % (snapthread.exitCode))
            #subprocess.call(["/usr/sbin/svcadm", "mark", "maintenance",
            #                 os.getenv("SMF_FMRI")])
            sys.exit(smf.SMF_EXIT_ERR_FATAL)
            return False


def child_sig_handler(signum, frame):
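    """
    Signal handler installed in the parent while daemonising. SIGUSR1
    from the child means start-up succeeded; SIGCHLD or SIGALRM means
    the child failed to start.
    """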
    if signum == signal.SIGUSR1:
        sys.exit(smf.SMF_EXIT_OK)
    elif signum == signal.SIGCHLD:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)
    elif signum == signal.SIGALRM:
        sys.exit(smf.SMF_EXIT_ERR_FATAL)

# Default daemon parameters.
# File mode creation mask of the daemon.
UMASK = 0
# Default working directory for the daemon.
WORKDIR = "/"
# Default maximum for the number of available file descriptors.
MAXFD = 1024

def create_daemon():
    """
    Detach a process from the controlling terminal and run it in the
    background as a daemon.
    """
    # Catch signals that we might receive from child
    signal.signal(signal.SIGCHLD, child_sig_handler)
    signal.signal(signal.SIGUSR1, child_sig_handler)
    signal.signal(signal.SIGALRM, child_sig_handler)
    try:
        pid = os.fork()
    except OSError, e:
        raise Exception, "%s [%d]" % (e.strerror, e.errno)

    if (pid == 0):
        # Reset signals that we set to trap in parent
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
        signal.signal(signal.SIGUSR1, signal.SIG_DFL)
        signal.signal(signal.SIGALRM, signal.SIG_DFL)
        os.setsid()
        os.chdir(WORKDIR)
        os.umask(UMASK)
    else:
        # Wait for the child to give the OK or otherwise.
        signal.pause()


def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument('--foreground', action='store_true', help='Do not daemonize', default=False)
    parser.add_argument('--config', '-c', type=str, help='Configuration file', default='/etc/time-slider/timesliderd.conf')
    parser.add_argument('--configdump', action='store_true', help='Dump default values in config file format', default=False)
    args, _ = parser.parse_known_args()

    logger = logging.getLogger('time-slider')
    logger.setLevel(logging.DEBUG)
    if args.foreground:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(message)s'))
    else:
        handler = SysLogHandler(address='/dev/log')
        handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s', '%b %d %H:%M:%S time-sliderd:'))
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    if args.configdump:
        timesliderconfig.configdump()
        sys.exit(smf.SMF_EXIT_OK)

    timesliderconfig.configfile = args.config

    # Daemonise the service.
    if not args.foreground:
        create_daemon()

    # Check the user's RBAC security attributes.
    # Note that UID == 0 will match any profile search so
    # no need to check it explicitly.
    rbacp = RBACprofile()
    if rbacp.has_profile("ZFS File System Management"):

        gobject.threads_init()

        # Tell dbus to use the gobject mainloop for async ops
        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
        dbus.mainloop.glib.threads_init()
        # Register a bus name with the system dbus daemon
        systemBus = dbus.SystemBus()
        name = dbus.service.BusName("org.opensolaris.TimeSlider", systemBus)

        # Create and start the snapshot manager. Takes care of
        # auto snapshotting service and auto cleanup.
        snapshot = SnapshotManager(systemBus)
        snapshot.start()
        gobject.timeout_add(2000, monitor_threads, snapshot)

        mainloop = gobject.MainLoop()
        try:
            mainloop.run()
        except KeyboardInterrupt:
            mainloop.quit()
            sys.exit(smf.SMF_EXIT_OK)
    else:
        logger.error( \
               "%s has insufficient privileges to run time-sliderd!" \
               % rbacp.name)
        sys.exit(smf.SMF_EXIT_ERR_PERM)
    sys.exit(smf.SMF_EXIT_OK)
