This repository has been archived on 2022-02-10. You can view files and clone it, but cannot push or open issues or pull requests.
git-fat/git-fat

629 lines
26 KiB
Python
Executable File

#!/usr/bin/env python
# -*- mode:python -*-
from __future__ import print_function, with_statement
import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections
# Refuse to run on Python 3.  Indexing version_info works both where it is a
# plain tuple and where it is a named structure with a ``major`` attribute.
if sys.version_info[0] > 2:
    sys.stderr.write('git-fat does not support Python-3 yet. Please use python2.\n')
    sys.exit(1)
# Install a pure-python replacement when the running interpreter's
# subprocess module does not provide check_output.
if not hasattr(subprocess, 'check_output'):
    def backport_check_output(*popenargs, **kwargs):
        r"""Run a command and return everything it wrote to stdout.

        Arguments are forwarded to ``subprocess.Popen`` with stdout captured.
        Raises ``subprocess.CalledProcessError`` (with the captured output in
        its ``output`` attribute) when the command exits with non-zero status.
        """
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if not retcode:
            return output
        # Report the command the same way Popen received it.
        cmd = kwargs.get("args")
        if cmd is None:
            cmd = popenargs[0]
        error = subprocess.CalledProcessError(retcode, cmd)
        error.output = output
        raise error
    subprocess.check_output = backport_check_output
# Number of bytes requested per read() call when streaming data in chunks.
BLOCK_SIZE = 4096
def verbose_stderr(*args, **kwargs):
    """Print the given arguments to standard error (the verbose logger)."""
    result = print(*args, file=sys.stderr, **kwargs)
    return result
def verbose_ignore(*args, **kwargs):
    """Accept any arguments and do nothing (the silent logger)."""
    return None
def mkdir_p(path):
    """Create directory *path* including missing parents.

    An already existing directory is not an error; any other failure
    (including *path* existing as a non-directory) re-raises the OSError.
    """
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:
        already_a_directory = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
def umask():
    """Return the process umask, leaving it unchanged."""
    # os.umask can only be read by setting it, so set a dummy value and
    # immediately put the previous one back.
    current = os.umask(0)
    os.umask(current)
    return current
def readblocks(stream, blocksize=None):
    """Yield successive chunks read from *stream* until it is exhausted.

    stream    -- object with a ``read(n)`` method.
    blocksize -- number of bytes to request per read; defaults to the
                 module-level BLOCK_SIZE when omitted.

    The final chunk may be shorter than *blocksize*; an empty stream yields
    nothing.
    """
    if blocksize is None:
        blocksize = BLOCK_SIZE
    while True:
        data = stream.read(blocksize)
        if not data:
            break
        yield data
def cat_iter(initer, outstream):
    """Write every chunk produced by *initer* to *outstream*, in order."""
    for chunk in initer:
        outstream.write(chunk)
def cat(instream, outstream):
    """Copy *instream* to *outstream* chunk by chunk."""
    blocks = readblocks(instream)
    return cat_iter(blocks, outstream)
def difftreez_reader(input):
    """Incrementally parse ``git diff-tree -z`` output read from *input*.

    The stream is a sequence of NUL-terminated fields that alternate between
    a metadata record and a file name:

        :oldmode newmode oldsha1 newsha1 modflag\\0filename\\0:oldmode ...

    Yields one ``(newsha1, modflag, filename)`` tuple per metadata/file-name
    pair.  Data after the last NUL terminator is discarded.
    """
    fields = []
    pending = ''
    while True:
        chunk = input.read(BLOCK_SIZE)
        if not chunk:
            break
        pending += chunk
        # Peel off every complete NUL-terminated field; the unterminated
        # remainder stays in ``pending`` until more data arrives.
        while '\0' in pending:
            field, _sep, pending = pending.partition('\0')
            fields.append(field)
            if len(fields) < 2:
                continue
            metadata, path = fields
            fields = []
            _oldmode, _newmode, _oldhash, newhash, modflag = metadata.split()
            yield (newhash, modflag, path)
def gitconfig_get(name, file=None):
    """Return the value of git config key *name*, or None when it is unset.

    When *file* is given that config file is consulted first; if the lookup
    there fails, the lookup is repeated without ``--file``.
    """
    args = ['git', 'config', '--get']
    if file is not None:
        args.extend(['--file', file])
    args.append(name)
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    value = proc.communicate()[0].strip()
    if not proc.returncode:
        return value
    if file is None:
        return None
    # Not found in the explicit file: fall back to git's default lookup.
    return gitconfig_get(name)
def gitconfig_set(name, value, file=None):
    """Set git config key *name* to *value*, optionally in config file *file*.

    Raises subprocess.CalledProcessError when ``git config`` fails.
    """
    file_args = [] if file is None else ['--file', file]
    subprocess.check_call(['git', 'config'] + file_args + [name, value])
class GitFat(object):
    """Implementation of the ``git fat`` sub-commands.

    Managed files are stored in the repository as a one-line placeholder
    starting with ``#$# git-fat`` followed by the SHA-1 of the real content
    (and, in the current format, its size).  The real content is kept in
    ``<git-dir>/fat/objects/<sha1>`` and exchanged with the rsync remote
    configured in the ``.gitfat`` file at the top of the work tree.
    """

    # Raised by decode() when a string is not a valid placeholder.
    DecodeError = RuntimeError

    def __init__(self):
        """Locate the repository and select the placeholder format.

        Exits with status 1 when not run inside a git work tree.  Setting
        GIT_FAT_VERBOSE enables progress messages on stderr; setting
        GIT_FAT_VERSION=1 makes new placeholders use the legacy format.
        """
        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
        try:
            self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
        except subprocess.CalledProcessError:
            sys.exit(1)
        self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
        self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
        if os.environ.get('GIT_FAT_VERSION') == '1':
            self.encode = self.encode_v1
        else:
            self.encode = self.encode_v2

        def magiclen(enc):
            # Every placeholder produced by one encoder has the same length,
            # so encoding a dummy digest is enough to measure it.
            return len(enc(hashlib.sha1(b'dummy').hexdigest(), 5))
        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]]  # All prior versions

    def setup(self):
        """Make sure the local object directory exists."""
        mkdir_p(self.objdir)

    def is_init_done(self):
        """Return a truthy value when the fat clean or smudge filter is configured."""
        return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge')

    def assert_init_done(self):
        """Exit with status 1 and a hint unless ``git fat init`` has been run."""
        if not self.is_init_done():
            sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n')
            sys.stderr.write('Run "git fat init" to configure.\n')
            sys.exit(1)

    def get_rsync(self):
        """Return ``(remote, ssh_port, ssh_user, options)`` from the rsync config.

        Values are looked up in ``.gitfat`` first and then in the regular git
        configuration.  Raises RuntimeError when ``rsync.remote`` is not set.
        """
        cfgpath = os.path.join(self.gitroot, '.gitfat')
        remote = gitconfig_get('rsync.remote', file=cfgpath)
        ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
        ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
        options = gitconfig_get('rsync.options', file=cfgpath)
        if remote is None:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote, ssh_port, ssh_user, options

    def get_rsync_command(self, push):
        """Build the rsync argument list for a push (True) or a pull (False).

        The command expects a NUL-separated list of object names on stdin.
        """
        (remote, ssh_port, ssh_user, options) = self.get_rsync()
        if push:
            self.verbose('Pushing to %s' % (remote))
        else:
            self.verbose('Pulling from %s' % (remote))
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
        rshopts = ''
        if ssh_user:
            rshopts += ' -l ' + ssh_user
        if ssh_port:
            rshopts += ' -p ' + ssh_port
        if rshopts:
            cmd.append('--rsh=ssh' + rshopts)
        if options:
            cmd += options.split(' ')
        if push:
            cmd += [self.objdir + '/', remote + '/']
        else:
            cmd += [remote + '/', self.objdir + '/']
        return cmd

    def revparse(self, revname):
        """Return the output of ``git rev-parse revname`` with whitespace stripped."""
        return subprocess.check_output(['git', 'rev-parse', revname]).strip()

    def encode_v1(self, digest, bytes):
        'Produce legacy representation of file to be stored in repository.'
        return '#$# git-fat %s\n' % (digest,)

    def encode_v2(self, digest, bytes):
        'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
        return '#$# git-fat %s %20d\n' % (digest, bytes)

    def decode(self, string, noraise=False):
        """Parse a placeholder and return ``(digest, size)``.

        ``size`` is None for legacy placeholders that carry no size.  When
        *string* is not a well-formed placeholder (wrong prefix, missing
        digest, or non-numeric size) the result is ``(None, None)`` if
        *noraise* is true, otherwise GitFat.DecodeError is raised.
        """
        cookie = '#$# git-fat '
        if string.startswith(cookie):
            parts = string[len(cookie):].split()
            try:
                digest = parts[0]
                size = int(parts[1]) if len(parts) > 1 else None
            except (IndexError, ValueError):
                # Has the cookie but not a usable digest/size: treat it like
                # any other undecodable input below.
                pass
            else:
                return digest, size
        if noraise:
            return None, None
        raise GitFat.DecodeError('Could not decode %s' % (string))

    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        """Check whether the file *fname* is a placeholder.

        Returns ``(digest, size)`` for a placeholder, ``(None, None)`` for a
        file of placeholder length with other content, and ``(False, None)``
        when the file cannot be inspected or has a different length.
        """
        # Fast check
        try:
            stat = os.lstat(fname)
        except OSError:
            return False, None
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        try:
            with open(fname) as stream:
                digest, size = self.decode_stream(stream)
        except IOError:
            return False, None
        if isinstance(digest, str):
            return digest, size
        return None, size

    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be changed to have a more
        useful message than the machine-readable copy that goes into the repository. If the tree
        version decodes successfully, it indicates that the fat data is not currently available in
        this repository.
        '''
        digest, size = self.decode(body, noraise=True)
        return digest

    def filter_clean(self, instream, outstreamclean):
        """Store the content of *instream* in the object cache and emit its placeholder.

        The content is hashed while being written to a temporary file in the
        object directory, which is then renamed to its SHA-1 (or dropped when
        that object already exists); the placeholder is written to
        *outstreamclean*.  If the input is itself a placeholder it is passed
        through to *outstreamclean* verbatim and nothing is cached.
        """
        h = hashlib.new('sha1')
        nbytes = 0
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            # Binary mode: the cached object must be byte-identical to the input.
            with os.fdopen(fd, 'wb') as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                            ishanging = True  # Working tree version is verbatim from repository (not smudged)
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    nbytes += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists %s' % objfile)
                    os.remove(tmpname)
                else:
                    # Set permissions for the new file using the current umask
                    os.chmod(tmpname, int('444', 8) & ~umask())
                    os.rename(tmpname, objfile)
                    self.verbose('git-fat filter-clean: caching to %s' % objfile)
                # From here on the temporary file no longer exists.
                cached = True
                outstreamclean.write(self.encode(digest, nbytes))
        finally:
            if not cached:
                os.remove(tmpname)

    def cmd_filter_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the "smudged" (tree)
        version of the file on stdin and produces the "clean" (repository) version on stdout.
        '''
        self.setup()
        self.filter_clean(sys.stdin, sys.stdout)

    def cmd_filter_smudge(self):
        """Smudge filter: turn a placeholder on stdin into file content on stdout.

        A placeholder whose object is cached is replaced by the object's
        content; if the object is missing the placeholder is re-emitted.
        Input that is not a placeholder is copied through unchanged.
        """
        self.setup()
        result, size = self.decode_stream(sys.stdin)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, result)
            try:
                objstream = open(objfile, 'rb')
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
                sys.stdout.write(self.encode(result, size))  # could leave a better notice about how to recover this file
            else:
                # Only the open is guarded: an error in the middle of the
                # copy must not append a placeholder to partial content.
                with objstream:
                    cat(objstream, sys.stdout)
                self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
        else:  # We have an iterable over the original input.
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, sys.stdout)

    def catalog_objects(self):
        """Return the set of object names present in the local object directory."""
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None, all=False):
        """Return the set of fat digests referenced by placeholders in history.

        rev -- revision whose history is scanned (default: HEAD).
        all -- when true, scan everything reachable from any ref instead.
        """
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        # Revision list gives us object names to inspect with cat-file...
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)

        def cut_sha1hash(input, output):
            for line in input:
                output.write(line.split()[0] + '\n')
            output.close()
        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)

        def filter_gitfat_candidates(input, output):
            for line in input:
                objhash, objtype, size = line.split()
                if objtype == 'blob' and int(size) in self.magiclens:
                    output.write(objhash + '\n')
            output.close()
        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
        p3 = subprocess.Popen(['git', 'cat-file', '--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # Stream data: p1 | cut_thread | p2 | filter_thread | p3
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
        cut_thread.start()
        filter_thread.start()
        # Process metadata + content format provided by `cat-file --batch`
        while True:
            metadata_line = p3.stdout.readline()
            if not metadata_line:
                break  # EOF
            objhash, objtype, size_str = metadata_line.split()
            size, bytes_read = int(size_str), 0
            # We know from filter that item is a candidate git-fat object and
            # is small enough to read into memory and process
            content = ''
            while bytes_read < size:
                data = p3.stdout.read(size - bytes_read)
                if not data:
                    break  # EOF
                content += data
                bytes_read += len(data)
            try:
                fathash = self.decode(content)[0]
                referenced.add(fathash)
            except GitFat.DecodeError:
                pass
            # Consume LF record delimiter in `cat-file --batch` output
            p3.stdout.read(1)
        # Ensure everything is cleaned up
        cut_thread.join()
        filter_thread.join()
        p1.wait()
        p2.wait()
        p3.wait()
        return referenced

    def orphan_files(self, patterns=None):
        'generator for all orphan placeholders in the working tree'
        if not patterns or patterns == ['']:
            patterns = ['.']
        for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00')[:-1]:
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def cmd_status(self, args):
        """Print referenced-but-missing and cached-but-unreferenced objects.

        With ``--all`` in *args*, all refs are scanned and every referenced
        digest is listed as well.
        """
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(obj)
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print(' ' + orph)
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print(' ' + g)

    def is_dirty(self):
        """Return True when ``git diff-index HEAD`` reports differences."""
        # `diff-index --quiet` exits with 0 when there are no differences and
        # non-zero otherwise, so a non-zero status means "dirty".
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0

    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        cmd = self.get_rsync_command(push=True)
        self.verbose('Executing: %s' % ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))
        if p.returncode:
            sys.exit(p.returncode)

    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        self.assert_init_done()
        for digest, fname in self.orphan_files():
            objpath = os.path.join(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print('Restoring %s -> %s' % (digest, fname))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but changing the mtime
                # on the file will invalidate the cache.
                # Here we set the mtime to mtime + 1. This is an improvement
                # over touching the file as it catches the edgecase where a
                # git-checkout happens within the same second as a git fat
                # checkout.
                stat = os.lstat(fname)
                os.utime(fname, (stat.st_atime, stat.st_mtime + 1))
                # This re-smudge is essentially a copy that restores
                # permissions.
                subprocess.check_call(
                    ['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))

    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        # Everything after '--' is a path pattern; only what precedes it is
        # interpreted as options or revisions.
        optargs = args[:args.index('--')] if '--' in args else args
        refargs = dict()
        if '--all' in optargs:
            refargs['all'] = True
        for arg in optargs:
            # Only 40-character non-option arguments are treated as revisions.
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.filter_objects(refargs, self.parse_pull_patterns(args))
        cmd = self.get_rsync_command(push=False)
        self.verbose('Executing: %s' % ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))
        if p.returncode:
            sys.exit(p.returncode)
        self.checkout()

    def parse_pull_patterns(self, args):
        """Return the path patterns following ``--`` in *args*, or ``['']`` if there is no ``--``."""
        if '--' not in args:
            return ['']
        idx = args.index('--')
        return args[idx+1:]  # we don't care about '--'

    def filter_objects(self, refargs, patterns):
        """Return the referenced objects that are missing locally.

        Unless ``refargs['all']`` is set, the result is limited to objects
        whose placeholders are present in the work tree under *patterns*.
        """
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        if refargs.get('all'):  # Currently ignores patterns; can we efficiently do both?
            return files
        orphans_objects = set(digest for digest, fname in self.orphan_files(patterns))
        return files & orphans_objects

    def cmd_checkout(self, args):
        """Restore placeholders whose objects are cached and report the others."""
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        """Delete cached objects that are not referenced from HEAD's history."""
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = os.path.join(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, obj))
            os.remove(fname)

    def cmd_verify(self):
        """Print details of git-fat objects with incorrect data hash"""
        corrupted_objects = []
        for obj in self.catalog_objects():
            fname = os.path.join(self.objdir, obj)
            h = hashlib.new('sha1')
            # Binary mode so the hash covers the exact bytes on disk.
            with open(fname, 'rb') as objstream:
                for block in readblocks(objstream):
                    h.update(block)
            data_hash = h.hexdigest()
            if obj != data_hash:
                corrupted_objects.append((obj, data_hash))
        if corrupted_objects:
            print('Corrupted objects: %d' % len(corrupted_objects))
            for obj, data_hash in corrupted_objects:
                print('%s data hash is %s' % (obj, data_hash))
            sys.exit(1)

    def cmd_init(self):
        """Register the fat clean/smudge filters in the git configuration."""
        self.setup()
        if self.is_init_done():
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')

    def gen_large_blobs(self, revs, threshsize):
        """Yield ``(sha1, size)`` for every blob larger than *threshsize* bytes.

        NOTE: *revs* is accepted but not used; objects reachable from all
        refs are always scanned.
        """
        time0 = time.time()

        def hash_only(input, output):
            """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
            This truncates to one hash per line.
            """
            for line in input:
                output.write(line[:40] + '\n')
            output.close()
        revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
        hashonly.start()
        numblobs = 0
        numlarge = 0
        for line in objcheck.stdout:
            objhash, objtype, size = line.split()
            if objtype != 'blob':
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose('%d of %d blobs are > %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))

    def cmd_find(self, args):
        """Print ``.gitattributes`` lines for paths that ever held a blob larger than ``args[0]`` bytes.

        Each line is followed by a comment giving the largest size seen at
        that path and the number of distinct large sizes; paths are listed
        from largest to smallest.
        """
        maxsize = int(args[0])
        blobsizes = dict(self.gen_large_blobs('--all', maxsize))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(set)
        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(pathsizes.items(), key=lambda ps: max(ps[1]), reverse=True):
            print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
        revlist.wait()
        difftree.wait()

    def cmd_index_filter(self, args):
        """Rewrite index entries of the listed files to their fat placeholders.

        ``args[0]`` names a file containing one path per line.  Each listed
        path found in the index (symbolic links excepted) has its blob run
        through the clean filter and the index entry replaced; the mapping
        from original blob to cleaned blob is cached under
        ``<git-dir>/fat/index-filter``.  With ``--manage-gitattributes`` a
        ``filter=fat -text`` line per listed path is appended to the indexed
        ``.gitattributes``.
        """
        manage_gitattributes = '--manage-gitattributes' in args
        # filter_clean creates its temporary file inside the object directory.
        self.setup()
        with open(args[0]) as listing:
            filelist = set(line.strip() for line in listing)
        filelist.discard('')  # blank lines name no file
        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
        for line in lsfiles.stdout:
            mode, sep, tail = line.partition(' ')
            blobhash, sep, tail = tail.partition(' ')
            stageno, sep, tail = tail.partition('\t')
            filename = tail.strip()
            if filename not in filelist:
                continue
            if mode == "120000":
                # skip symbolic links
                continue
            # This file will contain the hash of the cleaned object
            hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
            try:
                with open(hashfile) as cachedhash:
                    cleanedobj = cachedhash.read().rstrip()
            except IOError:
                catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
                hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)

                def dofilter():
                    self.filter_clean(catfile.stdout, hashobject.stdin)
                    hashobject.stdin.close()
                filterclean = threading.Thread(target=dofilter)
                filterclean.start()
                cleanedobj = hashobject.stdout.read().rstrip()
                catfile.wait()
                hashobject.wait()
                filterclean.join()
                mkdir_p(os.path.dirname(hashfile))
                with open(hashfile, 'w') as cachedhash:
                    cachedhash.write(cleanedobj + '\n')
            updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split()
                gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = '100644', '0'
                gitattributes_lines = []
            # Sorted so the generated .gitattributes does not depend on set order.
            gitattributes_extra = ['%s filter=fat -text' % line.split()[0] for line in sorted(filelist)]
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = hashobject.communicate('\n'.join(gitattributes_lines + gitattributes_extra) + '\n')
            updateindex.stdin.write('%s %s %s\t%s\n' % (mode, stdout.strip(), stageno, '.gitattributes'))
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()
if __name__ == '__main__':
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    # Sub-commands invoked without arguments.
    plain_commands = {
        'filter-clean': fat.cmd_filter_clean,
        'filter-smudge': fat.cmd_filter_smudge,
        'init': fat.cmd_init,
        'gc': fat.cmd_gc,
        'verify': fat.cmd_verify,
    }
    # Sub-commands that receive the remaining command-line arguments.
    argv_commands = {
        'status': fat.cmd_status,
        'push': fat.cmd_push,
        'pull': fat.cmd_pull,
        'checkout': fat.cmd_checkout,
        'find': fat.cmd_find,
        'index-filter': fat.cmd_index_filter,
    }
    if cmd in plain_commands:
        plain_commands[cmd]()
    elif cmd in argv_commands:
        argv_commands[cmd](sys.argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr)