diff --git a/dedupfs.py b/dedupfs.py index 0d987f2..4307bea 100755 --- a/dedupfs.py +++ b/dedupfs.py @@ -9,7 +9,7 @@ is only stored once. In addition to deduplication the file system also supports transparent -compression using any of the compression methods lzo, zlib and bz2. +compression using any of the compression methods snappy, lzo, lzma, zlib and bz2. These two properties make the file system ideal for backups: I'm currently storing 250 GB worth of backups using only 8 GB of disk space. @@ -24,8 +24,8 @@ # Check the Python version, warn the user if untested. import sys -if sys.version_info[:2] != (2, 6): - msg = "Warning: DedupFS has only been tested on Python 2.6, while you're running Python %d.%d!\n" +if sys.version_info[:2] <= (2, 6): + msg = "Warning: DedupFS has only been tested on Python 2.6 and later, while you're running Python %d.%d!\n" sys.stderr.write(msg % (sys.version_info[0], sys.version_info[1])) # Try to load the required modules from Python's standard library. @@ -40,6 +40,7 @@ import stat import time import traceback + from math import floor except ImportError, e: msg = "Error: Failed to load one of the required Python modules! (%s)\n" sys.stderr.write(msg % str(e)) @@ -53,10 +54,6 @@ "If you're on Ubuntu try running `sudo apt-get install python-fuse'.\n") sys.exit(1) -# Local modules that are mostly useful for debugging. -from my_formats import format_size, format_timespan -from get_memory_usage import get_memory_usage - def main(): # {{{1 """ This function enables using dedupfs.py as a shell script that creates FUSE @@ -155,6 +152,7 @@ def __init__(self, *args, **kw): # {{{2 self.parser.add_option('--no-transactions', dest='use_transactions', action='store_false', default=True, help="don't use transactions when making multiple related changes, this might make the file system faster or slower (?)") self.parser.add_option('--nosync', dest='synchronous', action='store_false', default=True, help="disable SQLite's normal synchronous behavior which guarantees that data is written to disk immediately, because it slows down the file system too much (this means you might lose data when the mount point isn't cleanly unmounted)") self.parser.add_option('--nogc', dest='gc_enabled', action='store_false', default=True, help="disable the periodic garbage collection because it degrades performance (only do this when you've got disk space to waste or you know that nothing will be be deleted from the file system, which means little to no garbage will be produced)") + self.parser.add_option('--testmem', help="determine allocable memory amount") self.parser.add_option('--verify-writes', dest='verify_writes', action='store_true', default=False, help="after writing a new data block to the database, check that the block was written correctly by reading it back again and checking for differences") # Dynamically check for supported hashing algorithms. @@ -167,7 +165,7 @@ def __init__(self, *args, **kw): # {{{2 def noop(s): return s self.compressors = { 'none': (noop, noop) } compression_methods = ['none'] - for modname in 'lzo', 'zlib', 'bz2': + for modname in 'snappy', 'lzma', 'lzo', 'zlib', 'bz2': try: module = __import__(modname) if hasattr(module, 'compress') and hasattr(module, 'decompress'): @@ -1209,6 +1207,55 @@ class StatVFS(__Struct): pass # }}}1 +# my_formats +def format_timespan(seconds): # {{{1 + """ + Format a timespan in seconds as a human-readable string. + """ + result = [] + units = [('day', 60 * 60 * 24), ('hour', 60 * 60), ('minute', 60), ('second', 1)] + for name, size in units: + if seconds >= size: + count = seconds / size + seconds %= size + result.append('%i %s%s' % (count, name, floor(count) != 1 and 's' or '')) + if result == []: + return 'less than a second' + if len(result) == 1: + return result[0] + else: + return ', '.join(result[:-1]) + ' and ' + result[-1] + +def format_size(nbytes): + """ + Format a byte count as a human-readable file size. + """ + return nbytes < 1024 and '%i bytes' % nbytes \ + or nbytes < (1024 ** 2) and __round(nbytes, 1024, 'KB') \ + or nbytes < (1024 ** 3) and __round(nbytes, 1024 ** 2, 'MB') \ + or nbytes < (1024 ** 4) and __round(nbytes, 1024 ** 3, 'GB') \ + or __round(nbytes, 1024 ** 4, 'TB') + +def __round(nbytes, divisor, suffix): + nbytes = float(nbytes) / divisor + if floor(nbytes) == nbytes: + return str(int(nbytes)) + ' ' + suffix + else: + return '%.2f %s' % (nbytes, suffix) + +def get_memory_usage(): + global _proc_status, _units, _handle + try: + for line in _handle: + if line.startswith('VmSize:'): + label, count, unit = line.split() + return int(count) * _units[unit.upper()] + except: + return 0 + finally: + _handle.seek(0) + + if __name__ == '__main__': if '--profile' in sys.argv: @@ -1221,6 +1268,32 @@ class StatVFS(__Struct): pass s.sort_stats('time') s.print_stats(0.1) os.unlink(profile) + elif '--testmem' in sys.argv: + """ + Determines the current memory usage of the current process + by reading the VmSize value from /proc/$pid/status. + It's based on the following entry in the Python cookbook: + http://code.activestate.com/recipes/286222/ + """ + _units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 } + _handle = _handle = open('/proc/%d/status' % os.getpid()) + megabyte = 1024**2 + counter = megabyte + limit = megabyte * 50 + memory = [] + old_memory_usage = get_memory_usage() + assert old_memory_usage > 0 + while counter < limit: + memory.append('a' * counter) + msg = "I've just allocated %s and get_memory_usage() returns %s (%s more, deviation is %s)" + new_memory_usage = get_memory_usage() + difference = new_memory_usage - old_memory_usage + deviation = max(difference, counter) - min(difference, counter) + assert deviation < 1024*100 + print msg % (format_size(counter), format_size(new_memory_usage), format_size(difference), format_size(deviation)) + old_memory_usage = new_memory_usage + counter += megabyte + print "Stopped allocating new strings at %s" % format_size(limit) else: main() diff --git a/get_memory_usage.py b/get_memory_usage.py deleted file mode 100755 index 978e5e6..0000000 --- a/get_memory_usage.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/python - -""" -The function in this Python module determines the current memory usage of the -current process by reading the VmSize value from /proc/$pid/status. It's based -on the following entry in the Python cookbook: -http://code.activestate.com/recipes/286222/ -""" - -import os - -_units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 } -_handle = _handle = open('/proc/%d/status' % os.getpid()) - -def get_memory_usage(): - global _proc_status, _units, _handle - try: - for line in _handle: - if line.startswith('VmSize:'): - label, count, unit = line.split() - return int(count) * _units[unit.upper()] - except: - return 0 - finally: - _handle.seek(0) - -if __name__ == '__main__': - from my_formats import format_size - megabyte = 1024**2 - counter = megabyte - limit = megabyte * 50 - memory = [] - old_memory_usage = get_memory_usage() - assert old_memory_usage > 0 - while counter < limit: - memory.append('a' * counter) - msg = "I've just allocated %s and get_memory_usage() returns %s (%s more, deviation is %s)" - new_memory_usage = get_memory_usage() - difference = new_memory_usage - old_memory_usage - deviation = max(difference, counter) - min(difference, counter) - assert deviation < 1024*100 - print msg % (format_size(counter), format_size(new_memory_usage), format_size(difference), format_size(deviation)) - old_memory_usage = new_memory_usage - counter += megabyte - print "Stopped allocating new strings at %s" % format_size(limit) - -# vim: ts=2 sw=2 et diff --git a/my_formats.py b/my_formats.py deleted file mode 100644 index e7cb25b..0000000 --- a/my_formats.py +++ /dev/null @@ -1,38 +0,0 @@ -from math import floor - -def format_timespan(seconds): # {{{1 - """ - Format a timespan in seconds as a human-readable string. - """ - result = [] - units = [('day', 60 * 60 * 24), ('hour', 60 * 60), ('minute', 60), ('second', 1)] - for name, size in units: - if seconds >= size: - count = seconds / size - seconds %= size - result.append('%i %s%s' % (count, name, floor(count) != 1 and 's' or '')) - if result == []: - return 'less than a second' - if len(result) == 1: - return result[0] - else: - return ', '.join(result[:-1]) + ' and ' + result[-1] - -def format_size(nbytes): - """ - Format a byte count as a human-readable file size. - """ - return nbytes < 1024 and '%i bytes' % nbytes \ - or nbytes < (1024 ** 2) and __round(nbytes, 1024, 'KB') \ - or nbytes < (1024 ** 3) and __round(nbytes, 1024 ** 2, 'MB') \ - or nbytes < (1024 ** 4) and __round(nbytes, 1024 ** 3, 'GB') \ - or __round(nbytes, 1024 ** 4, 'TB') - -def __round(nbytes, divisor, suffix): - nbytes = float(nbytes) / divisor - if floor(nbytes) == nbytes: - return str(int(nbytes)) + ' ' + suffix - else: - return '%.2f %s' % (nbytes, suffix) - -# vim: sw=2 sw=2 et