Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 81 additions & 8 deletions dedupfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
is only stored once.

In addition to deduplication the file system also supports transparent
compression using any of the compression methods lzo, zlib and bz2.
compression using any of the compression methods snappy, lzo, lzma, zlib and bz2.

These two properties make the file system ideal for backups: I'm currently
storing 250 GB worth of backups using only 8 GB of disk space.
Expand All @@ -24,8 +24,8 @@

# Check the Python version, warn the user if untested.
import sys
if sys.version_info[:2] != (2, 6):
msg = "Warning: DedupFS has only been tested on Python 2.6, while you're running Python %d.%d!\n"
if sys.version_info[:2] <= (2, 6):
msg = "Warning: DedupFS has only been tested on Python 2.6 and later, while you're running Python %d.%d!\n"
sys.stderr.write(msg % (sys.version_info[0], sys.version_info[1]))

# Try to load the required modules from Python's standard library.
Expand All @@ -40,6 +40,7 @@
import stat
import time
import traceback
from math import floor
except ImportError, e:
msg = "Error: Failed to load one of the required Python modules! (%s)\n"
sys.stderr.write(msg % str(e))
Expand All @@ -53,10 +54,6 @@
"If you're on Ubuntu try running `sudo apt-get install python-fuse'.\n")
sys.exit(1)

# Local modules that are mostly useful for debugging.
from my_formats import format_size, format_timespan
from get_memory_usage import get_memory_usage

def main(): # {{{1
"""
This function enables using dedupfs.py as a shell script that creates FUSE
Expand Down Expand Up @@ -155,6 +152,7 @@ def __init__(self, *args, **kw): # {{{2
self.parser.add_option('--no-transactions', dest='use_transactions', action='store_false', default=True, help="don't use transactions when making multiple related changes, this might make the file system faster or slower (?)")
self.parser.add_option('--nosync', dest='synchronous', action='store_false', default=True, help="disable SQLite's normal synchronous behavior which guarantees that data is written to disk immediately, because it slows down the file system too much (this means you might lose data when the mount point isn't cleanly unmounted)")
self.parser.add_option('--nogc', dest='gc_enabled', action='store_false', default=True, help="disable the periodic garbage collection because it degrades performance (only do this when you've got disk space to waste or you know that nothing will be be deleted from the file system, which means little to no garbage will be produced)")
self.parser.add_option('--testmem', help="determine allocable memory amount")
self.parser.add_option('--verify-writes', dest='verify_writes', action='store_true', default=False, help="after writing a new data block to the database, check that the block was written correctly by reading it back again and checking for differences")

# Dynamically check for supported hashing algorithms.
Expand All @@ -167,7 +165,7 @@ def __init__(self, *args, **kw): # {{{2
def noop(s): return s
self.compressors = { 'none': (noop, noop) }
compression_methods = ['none']
for modname in 'lzo', 'zlib', 'bz2':
for modname in 'snappy', 'lzma', 'lzo', 'zlib', 'bz2':
try:
module = __import__(modname)
if hasattr(module, 'compress') and hasattr(module, 'decompress'):
Expand Down Expand Up @@ -1209,6 +1207,55 @@ class StatVFS(__Struct): pass

# }}}1

# my_formats
def format_timespan(seconds): # {{{1
  """
  Render a number of seconds as readable English text, for example
  '1 hour, 2 minutes and 5 seconds'. Returns 'less than a second'
  for spans shorter than one second.
  """
  parts = []
  for unit_name, unit_size in (('day', 86400), ('hour', 3600), ('minute', 60), ('second', 1)):
    if seconds < unit_size:
      continue
    amount = seconds / unit_size
    seconds %= unit_size
    # Pluralize every amount except exactly one ('1 day' vs '2 days').
    suffix = '' if floor(amount) == 1 else 's'
    parts.append('%i %s%s' % (amount, unit_name, suffix))
  if not parts:
    return 'less than a second'
  if len(parts) == 1:
    return parts[0]
  return '%s and %s' % (', '.join(parts[:-1]), parts[-1])

def format_size(nbytes):
  """
  Format a byte count as a human-readable file size.

  Uses binary (1024-based) units. Whole multiples are rendered without
  decimals ('1 KB'), fractional ones with two decimals ('1.50 KB').
  Values below 1 KB are reported in plain bytes; everything from 1 TB
  up is reported in TB.
  """
  def _scaled(divisor, suffix):
    # Scale nbytes down by divisor, dropping decimals when the result is whole.
    value = float(nbytes) / divisor
    if floor(value) == value:
      return '%i %s' % (value, suffix)
    return '%.2f %s' % (value, suffix)
  # The original chained `cond and a or b` expressions; that idiom silently
  # misbehaves if a branch ever evaluates falsy, so use explicit conditionals.
  if nbytes < 1024:
    return '%i bytes' % nbytes
  for exponent, suffix in ((1, 'KB'), (2, 'MB'), (3, 'GB')):
    if nbytes < 1024 ** (exponent + 1):
      return _scaled(1024 ** exponent, suffix)
  return _scaled(1024 ** 4, 'TB')

def __round(nbytes, divisor, suffix):
  """
  Scale a byte count down by divisor and append the unit suffix.
  Whole results drop their decimals ('2 KB'); fractional results
  keep two ('1.50 KB').
  """
  value = float(nbytes) / divisor
  if value == floor(value):
    return '%d %s' % (value, suffix)
  return '%.2f %s' % (value, suffix)

def get_memory_usage():
  """
  Return the virtual memory size (VmSize) of the current process in bytes.

  Reads the Linux-specific /proc pseudo-file through the module-level
  ``_handle``, scaling the reported figure with the unit table ``_units``.
  Both globals are created lazily when the --testmem bootstrap in __main__
  has not set them up. Returns 0 when the value cannot be determined
  (no /proc, missing VmSize line, unexpected format).
  """
  global _units, _handle
  if '_units' not in globals():
    _units = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3}
  if '_handle' not in globals():
    # Lazy open fixes a crash in the original: when _handle was undefined the
    # bare except returned 0, but the finally clause then raised NameError.
    try:
      _handle = open('/proc/self/status')
    except (IOError, OSError):
      return 0
  try:
    for line in _handle:
      if line.startswith('VmSize:'):
        # Line looks like "VmSize:   123456 kB".
        label, count, unit = line.split()
        return int(count) * _units[unit.upper()]
    # The original fell off the loop and implicitly returned None when no
    # VmSize line was present; return 0 consistently instead.
    return 0
  except (ValueError, KeyError, IOError, OSError):
    # Narrowed from a bare `except:` so real bugs are no longer swallowed.
    return 0
  finally:
    # Rewind so repeated calls re-read the file from the start.
    _handle.seek(0)


if __name__ == '__main__':

if '--profile' in sys.argv:
Expand All @@ -1221,6 +1268,32 @@ class StatVFS(__Struct): pass
s.sort_stats('time')
s.print_stats(0.1)
os.unlink(profile)
elif '--testmem' in sys.argv:
"""
Determines the current memory usage of the current process
by reading the VmSize value from /proc/$pid/status.
It's based on the following entry in the Python cookbook:
http://code.activestate.com/recipes/286222/
"""
_units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 }
_handle = _handle = open('/proc/%d/status' % os.getpid())
megabyte = 1024**2
counter = megabyte
limit = megabyte * 50
memory = []
old_memory_usage = get_memory_usage()
assert old_memory_usage > 0
while counter < limit:
memory.append('a' * counter)
msg = "I've just allocated %s and get_memory_usage() returns %s (%s more, deviation is %s)"
new_memory_usage = get_memory_usage()
difference = new_memory_usage - old_memory_usage
deviation = max(difference, counter) - min(difference, counter)
assert deviation < 1024*100
print msg % (format_size(counter), format_size(new_memory_usage), format_size(difference), format_size(deviation))
old_memory_usage = new_memory_usage
counter += megabyte
print "Stopped allocating new strings at %s" % format_size(limit)
else:
main()

Expand Down
47 changes: 0 additions & 47 deletions get_memory_usage.py

This file was deleted.

38 changes: 0 additions & 38 deletions my_formats.py

This file was deleted.