From 2e65d166a5211193ab399b9232eb252a4eaf2035 Mon Sep 17 00:00:00 2001
From: "Eugene San (eugenesan)" <eugenesan@gmail.com>
Date: Wed, 15 Oct 2014 09:06:30 +0300
Subject: [PATCH 1/4] Embed my_formats and memory_usage for use as a
 standalone script

---
 dedupfs.py          | 81 ++++++++++++++++++++++++++++++++++++++++++---
 get_memory_usage.py | 47 --------------------------
 my_formats.py       | 38 ---------------------
 3 files changed, 77 insertions(+), 89 deletions(-)
 delete mode 100755 get_memory_usage.py
 delete mode 100644 my_formats.py

diff --git a/dedupfs.py b/dedupfs.py
index 0d987f2..2c4ed39 100755
--- a/dedupfs.py
+++ b/dedupfs.py
@@ -40,6 +40,7 @@
   import stat
   import time
   import traceback
+  from math import floor
 except ImportError, e:
   msg = "Error: Failed to load one of the required Python modules! (%s)\n"
   sys.stderr.write(msg % str(e))
@@ -53,10 +54,6 @@
       "If you're on Ubuntu try running `sudo apt-get install python-fuse'.\n")
   sys.exit(1)
 
-# Local modules that are mostly useful for debugging.
-from my_formats import format_size, format_timespan
-from get_memory_usage import get_memory_usage
-
 def main(): # {{{1
   """
   This function enables using dedupfs.py as a shell script that creates FUSE
@@ -155,6 +152,7 @@ def __init__(self, *args, **kw):  # {{{2
       self.parser.add_option('--no-transactions', dest='use_transactions', action='store_false', default=True, help="don't use transactions when making multiple related changes, this might make the file system faster or slower (?)")
       self.parser.add_option('--nosync', dest='synchronous', action='store_false', default=True, help="disable SQLite's normal synchronous behavior which guarantees that data is written to disk immediately, because it slows down the file system too much (this means you might lose data when the mount point isn't cleanly unmounted)")
       self.parser.add_option('--nogc', dest='gc_enabled', action='store_false', default=True, help="disable the periodic garbage collection because it degrades performance (only do this when you've got disk space to waste or you know that nothing will be be deleted from the file system, which means little to no garbage will be produced)")
+      self.parser.add_option('--testmem', help="determine allocable memory amount")
       self.parser.add_option('--verify-writes', dest='verify_writes', action='store_true', default=False, help="after writing a new data block to the database, check that the block was written correctly by reading it back again and checking for differences")
 
       # Dynamically check for supported hashing algorithms.
@@ -1209,6 +1207,55 @@ class StatVFS(__Struct): pass
 
 # }}}1
 
+# my_formats
+def format_timespan(seconds): # {{{1
+  """
+  Format a timespan in seconds as a human-readable string.
+  """
+  result = []
+  units = [('day', 60 * 60 * 24), ('hour', 60 * 60), ('minute', 60), ('second', 1)]
+  for name, size in units:
+    if seconds >= size:
+      count = seconds / size
+      seconds %= size
+      result.append('%i %s%s' % (count, name, floor(count) != 1 and 's' or ''))
+  if result == []:
+    return 'less than a second'
+  if len(result) == 1:
+    return result[0]
+  else:
+    return ', '.join(result[:-1]) + ' and ' + result[-1]
+
+def format_size(nbytes):
+  """
+  Format a byte count as a human-readable file size.
+  """
+  return nbytes < 1024 and '%i bytes' % nbytes \
+      or nbytes < (1024 ** 2) and __round(nbytes, 1024, 'KB') \
+      or nbytes < (1024 ** 3) and __round(nbytes, 1024 ** 2, 'MB') \
+      or nbytes < (1024 ** 4) and __round(nbytes, 1024 ** 3, 'GB') \
+      or __round(nbytes, 1024 ** 4, 'TB')
+
+def __round(nbytes, divisor, suffix):
+  nbytes = float(nbytes) / divisor
+  if floor(nbytes) == nbytes:
+    return str(int(nbytes)) + ' ' + suffix
+  else:
+    return '%.2f %s' % (nbytes, suffix)
+
+def get_memory_usage():
+  """Return this process's VmSize in bytes (read from /proc), or 0 if unavailable."""
+  units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 }
+  try:
+    with open('/proc/%d/status' % os.getpid()) as handle:
+      for line in handle:
+        if line.startswith('VmSize:'):
+          label, count, unit = line.split()
+          return int(count) * units[unit.upper()]
+  except (IOError, KeyError, ValueError): pass  # best effort: no /proc or odd format
+  return 0
+
+
 if __name__ == '__main__':
 
   if '--profile' in sys.argv:
@@ -1221,6 +1268,32 @@ class StatVFS(__Struct): pass
     s.sort_stats('time')
     s.print_stats(0.1)
     os.unlink(profile)
+  elif '--testmem' in sys.argv:
+    """
+    Determines the current memory usage of the current process
+    by reading the VmSize value from /proc/$pid/status.
+    It's based on the following entry in the Python cookbook:
+    http://code.activestate.com/recipes/286222/
+    """
+    _units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 }
+    _handle = _handle = open('/proc/%d/status' % os.getpid())
+    megabyte = 1024**2
+    counter = megabyte
+    limit = megabyte * 50
+    memory = []
+    old_memory_usage = get_memory_usage()
+    assert old_memory_usage > 0
+    while counter < limit:
+      memory.append('a' * counter)
+      msg = "I've just allocated %s and get_memory_usage() returns %s (%s more, deviation is %s)"
+      new_memory_usage = get_memory_usage()
+      difference = new_memory_usage - old_memory_usage
+      deviation = max(difference, counter) - min(difference, counter)
+      assert deviation < 1024*100
+      print msg % (format_size(counter), format_size(new_memory_usage), format_size(difference), format_size(deviation))
+      old_memory_usage = new_memory_usage
+      counter += megabyte
+    print "Stopped allocating new strings at %s" % format_size(limit)
   else:
     main()
 
diff --git a/get_memory_usage.py b/get_memory_usage.py
deleted file mode 100755
index 978e5e6..0000000
--- a/get_memory_usage.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/python
-
-"""
-The function in this Python module determines the current memory usage of the
-current process by reading the VmSize value from /proc/$pid/status. It's based
-on the following entry in the Python cookbook:
-http://code.activestate.com/recipes/286222/
-"""
-
-import os
-
-_units = { 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3 }
-_handle = _handle = open('/proc/%d/status' % os.getpid())
-
-def get_memory_usage():
-  global _proc_status, _units, _handle
-  try:
-    for line in _handle:
-      if line.startswith('VmSize:'):
-        label, count, unit = line.split()
-        return int(count) * _units[unit.upper()]
-  except:
-    return 0
-  finally:
-    _handle.seek(0)
-
-if __name__ == '__main__':
-  from my_formats import format_size
-  megabyte = 1024**2
-  counter = megabyte
-  limit = megabyte * 50
-  memory = []
-  old_memory_usage = get_memory_usage()
-  assert old_memory_usage > 0
-  while counter < limit:
-    memory.append('a' * counter)
-    msg = "I've just allocated %s and get_memory_usage() returns %s (%s more, deviation is %s)"
-    new_memory_usage = get_memory_usage()
-    difference = new_memory_usage - old_memory_usage
-    deviation = max(difference, counter) - min(difference, counter)
-    assert deviation < 1024*100
-    print msg % (format_size(counter), format_size(new_memory_usage), format_size(difference), format_size(deviation))
-    old_memory_usage = new_memory_usage
-    counter += megabyte
-  print "Stopped allocating new strings at %s" % format_size(limit)
-
-# vim: ts=2 sw=2 et
diff --git a/my_formats.py b/my_formats.py
deleted file mode 100644
index e7cb25b..0000000
--- a/my_formats.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from math import floor
-
-def format_timespan(seconds): # {{{1
-  """
-  Format a timespan in seconds as a human-readable string.
-  """
-  result = []
-  units = [('day', 60 * 60 * 24), ('hour', 60 * 60), ('minute', 60), ('second', 1)]
-  for name, size in units:
-    if seconds >= size:
-      count = seconds / size
-      seconds %= size
-      result.append('%i %s%s' % (count, name, floor(count) != 1 and 's' or ''))
-  if result == []:
-    return 'less than a second'
-  if len(result) == 1:
-    return result[0]
-  else:
-    return ', '.join(result[:-1]) + ' and ' + result[-1]
-
-def format_size(nbytes):
-  """
-  Format a byte count as a human-readable file size.
-  """
-  return nbytes < 1024 and '%i bytes' % nbytes \
-      or nbytes < (1024 ** 2) and __round(nbytes, 1024, 'KB') \
-      or nbytes < (1024 ** 3) and __round(nbytes, 1024 ** 2, 'MB') \
-      or nbytes < (1024 ** 4) and __round(nbytes, 1024 ** 3, 'GB') \
-      or __round(nbytes, 1024 ** 4, 'TB')
-
-def __round(nbytes, divisor, suffix):
-  nbytes = float(nbytes) / divisor
-  if floor(nbytes) == nbytes:
-    return str(int(nbytes)) + ' ' + suffix
-  else:
-    return '%.2f %s' % (nbytes, suffix)
-
-# vim: sw=2 sw=2 et

From 2d14916e97932a5a6088825f27540fd4ed51ee66 Mon Sep 17 00:00:00 2001
From: "Eugene San (eugenesan)" <eugenesan@gmail.com>
Date: Wed, 15 Oct 2014 09:06:55 +0300
Subject: [PATCH 2/4] Add support for Snappy compression

---
 dedupfs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedupfs.py b/dedupfs.py
index 2c4ed39..f63eceb 100755
--- a/dedupfs.py
+++ b/dedupfs.py
@@ -9,7 +9,7 @@
 is only stored once.
 
 In addition to deduplication the file system also supports transparent
-compression using any of the compression methods lzo, zlib and bz2.
+compression using any of the compression methods snappy, lzo, zlib and bz2.
 
 These two properties make the file system ideal for backups: I'm currently
 storing 250 GB worth of backups using only 8 GB of disk space.
@@ -165,7 +165,7 @@ def __init__(self, *args, **kw):  # {{{2
       def noop(s): return s
       self.compressors = { 'none': (noop, noop) }
       compression_methods = ['none']
-      for modname in 'lzo', 'zlib', 'bz2':
+      for modname in 'snappy', 'lzo', 'zlib', 'bz2':
         try:
           module = __import__(modname)
           if hasattr(module, 'compress') and hasattr(module, 'decompress'):

From cab78c33505f18dfc6dc2d3f8aa3236180b5d17a Mon Sep 17 00:00:00 2001
From: "Eugene San (eugenesan)" <eugenesan@gmail.com>
Date: Wed, 15 Oct 2014 09:57:04 +0300
Subject: [PATCH 3/4] Add support for lzma compression

---
 dedupfs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedupfs.py b/dedupfs.py
index f63eceb..2132489 100755
--- a/dedupfs.py
+++ b/dedupfs.py
@@ -9,7 +9,7 @@
 is only stored once.
 
 In addition to deduplication the file system also supports transparent
-compression using any of the compression methods snappy, lzo, zlib and bz2.
+compression using any of the compression methods snappy, lzo, lzma, zlib and bz2.
 
 These two properties make the file system ideal for backups: I'm currently
 storing 250 GB worth of backups using only 8 GB of disk space.
@@ -165,7 +165,7 @@ def __init__(self, *args, **kw):  # {{{2
       def noop(s): return s
       self.compressors = { 'none': (noop, noop) }
       compression_methods = ['none']
-      for modname in 'snappy', 'lzo', 'zlib', 'bz2':
+      for modname in 'snappy', 'lzma', 'lzo', 'zlib', 'bz2':
         try:
           module = __import__(modname)
           if hasattr(module, 'compress') and hasattr(module, 'decompress'):

From 6bbf9855de50d169978353c79e3da88d5de24c52 Mon Sep 17 00:00:00 2001
From: "Eugene San (eugenesan)" <eugenesan@gmail.com>
Date: Wed, 15 Oct 2014 09:57:38 +0300
Subject: [PATCH 4/4] Allow usage with python 2.6 and later

---
 dedupfs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedupfs.py b/dedupfs.py
index 2132489..4307bea 100755
--- a/dedupfs.py
+++ b/dedupfs.py
@@ -24,8 +24,8 @@
 
 # Check the Python version, warn the user if untested.
 import sys
-if sys.version_info[:2] != (2, 6):
-  msg = "Warning: DedupFS has only been tested on Python 2.6, while you're running Python %d.%d!\n"
+if sys.version_info[:2] < (2, 6):
+  msg = "Warning: DedupFS has only been tested on Python 2.6 and later, while you're running Python %d.%d!\n"
   sys.stderr.write(msg % (sys.version_info[0], sys.version_info[1]))
 
 # Try to load the required modules from Python's standard library.