From eb2d41f47610d86e94193ef4cb22789fe9ece6d2 Mon Sep 17 00:00:00 2001 From: Marcel Kapfer Date: Mon, 18 Nov 2019 16:54:30 +0100 Subject: [PATCH] Import Upstream version 0.0~git20180821.e1733b1 --- LICENSE | 22 ++ README.md | 242 +++++++++++++++++ git-fat | 628 ++++++++++++++++++++++++++++++++++++++++++++ test-retroactive.sh | 53 ++++ test.sh | 64 +++++ 5 files changed, 1009 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100755 git-fat create mode 100755 test-retroactive.sh create mode 100755 test.sh diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1a59a75 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012, Jed Brown +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..1db7de8 --- /dev/null +++ b/README.md @@ -0,0 +1,242 @@ +# Introduction +Checking large binary files into a source repository (Git or otherwise) is a bad idea because repository size quickly becomes unreasonable. +Even if the instantaneous working tree stays manageable, preserving repository integrity requires all binary files in the entire project history, which given the typically poor compression of binary diffs, implies that the repository size will become impractically large. +Some people recommend checking binaries into different repositories or even not versioning them at all, but these are not satisfying solutions for most workflows. + +## Features of `git-fat` +* clones of the source repository are small and fast because no binaries are transferred, yet fully functional with complete metadata and incremental retrieval (`git clone --depth` has limited granularity and couples metadata to content) +* `git-fat` supports the same workflow for large binaries and traditionally versioned files, but internally manages the "fat" files separately +* `git-bisect` works properly even when versions of the binary files change over time +* selective control of which large files to pull into the local store +* local fat object stores can be shared between multiple clones, even by different users +* can easily support fat object stores distributed across multiple hosts +* depends only on stock Python and rsync + +## Related projects +* [git-annex](http://git-annex.branchable.com) is a far more comprehensive solution, but with less transparent workflow and with more dependencies. +* [git-media](https://github.com/schacon/git-media) adopts a similar approach to `git-fat`, but with a different synchronization philosophy and with many Ruby dependencies. + +# Installation and configuration +Place `git-fat` in your `PATH`. + +Edit (or create) `.gitattributes` to regard any desired extensions as fat files. 
+ + $ cd path-to-your-repository + $ cat >> .gitattributes + *.png filter=fat -crlf + *.jpg filter=fat -crlf + *.gz filter=fat -crlf + ^D + +Run `git fat init` to activate the extension. Now add and commit as usual. +Matched files will be transparently stored externally, but will appear +complete in the working tree. + +Set a remote store for the fat objects by editing `.gitfat`. + + [rsync] + remote = your.remote-host.org:/share/fat-store + +This file should typically be committed to the repository so that others +will automatically have their remote set. This remote address can use +any protocol supported by rsync. + +Most users will configure it to use remote ssh in a directory with shared +access. To do this, set the `sshuser` and `sshport` variables in `.gitfat` +configuration file. For example, to use rsync with ssh, with the default +port (22) and authenticate with the user "_fat_", your configuration would +look like this: + + [rsync] + remote = your.remote-host.org:/share/fat-store + sshuser = fat + +# A worked example + +Before we start, let's turn on verbose reporting so we can see what's +happening. Without this environment variable, all the output lines +starting with `git-fat` will not be shown. + + $ export GIT_FAT_VERBOSE=1 + +First, we create a repository and configure it for use with `git-fat`. + + $ git init repo + Initialized empty Git repository in /tmp/repo/.git/ + $ cd repo + $ git fat init + $ cat > .gitfat + [rsync] + remote = localhost:/tmp/fat-store + $ mkdir -p /tmp/fat-store # make sure the remote directory exists + $ echo '*.gz filter=fat -crlf' > .gitattributes + $ git add .gitfat .gitattributes + $ git commit -m'Initial repository' + [master (root-commit) eb7facb] Initial repository + 2 files changed, 3 insertions(+) + create mode 100644 .gitattributes + create mode 100644 .gitfat + +Now we add a binary file whose name matches the pattern we set in `.gitattributes`. 
+ + $ curl https://nodeload.github.com/jedbrown/git-fat/tar.gz/master -o master.tar.gz + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed + 100 6449 100 6449 0 0 7741 0 --:--:-- --:--:-- --:--:-- 9786 + $ git add master.tar.gz + git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324 + $ git commit -m'Added master.tar.gz' + [master b85a96f] Added master.tar.gz + git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324 + 1 file changed, 1 insertion(+) + create mode 100644 master.tar.gz + +The patch itself is very simple and does not include the binary. + + $ git show --pretty=oneline HEAD + 918063043a6156172c2ad66478c6edd5c7df0217 Add master.tar.gz + diff --git a/master.tar.gz b/master.tar.gz + new file mode 100644 + index 0000000..12f7d52 + --- /dev/null + +++ b/master.tar.gz + @@ -0,0 +1 @@ + +#$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae + +## Pushing fat files +Now let's push our fat files using the rsync configuration that we set up earlier. + + $ git fat push + Pushing to localhost:/tmp/fat-store + building file list ... + 1 file to consider + + sent 61 bytes received 12 bytes 48.67 bytes/sec + total size is 6449 speedup is 88.34 + +We might normally set a remote now and push the git repository. + +## Cloning and pulling +Now let's look at what happens when we clone. + + $ cd .. + $ git clone repo repo2 + Cloning into 'repo2'... + done. + $ cd repo2 + $ git fat init # don't forget + $ ls -l # file is just a placeholder + total 4 + -rw-r--r-- 1 jed users 53 Nov 25 22:42 master.tar.gz + $ cat master.tar.gz # holds the SHA1 of the file + #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae + +We can always get a summary of what fat objects are missing in our local cache. + + Orphan objects: + 1f218834a137f7b185b498924e7a030008aee2ae + +Now get any objects referenced by our current `HEAD`. 
This command also +accepts the `--all` option to pull full history, or a revision to pull +selected history. + + $ git fat pull + receiving file list ... + 1 file to consider + 1f218834a137f7b185b498924e7a030008aee2ae + 6449 100% 6.15MB/s 0:00:00 (xfer#1, to-check=0/1) + + sent 30 bytes received 6558 bytes 4392.00 bytes/sec + total size is 6449 speedup is 0.98 + Restoring 1f218834a137f7b185b498924e7a030008aee2ae -> master.tar.gz + git-fat filter-smudge: restoring from /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae + +Everything is in place + + $ git status + git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae + # On branch master + nothing to commit, working directory clean + $ ls -l # recovered the full file + total 8 + -rw-r--r-- 1 jed users 6449 Nov 25 17:10 master.tar.gz + +## Summary +* Set the "fat" file types in `.gitattributes`. +* Use normal git commands to interact with the repository without + thinking about what files are fat and non-fat. The fat files will be + treated specially. +* Synchronize fat files with `git fat push` and `git fat pull`. + +## Retroactive import using `git filter-branch` [Experimental] + +Sometimes large objects were added to a repository by accident or for +lack of a better place to put them. _If_ you are willing to rewrite +history, forcing everyone to reclone, you can retroactively manage those +files with `git fat`. Be sure that you understand the consequences of +`git filter-branch` before attempting this. This feature is experimental +and irreversible, so be doubly careful with backups. + +### Step 1: Locate the fat files + +Run `git fat find THRESH_BYTES > fat-files` and inspect `fat-files` in +an editor. 
Lines will be sorted by the maximum object size that has been +at each path, and look like + + something.big filter=fat -text # 8154677 1 + +where the first number after the `#` is the number of bytes and the +second number is the number of modifications that path has seen. You +will normally filter out some of these paths using grep and/or an +editor. When satisfied, remove the ends of the lines (including the `#`) +and append to `.gitattributes`. It's best to `git add .gitattributes` and commit +at this time (likely enrolling some extant files into `git fat`). + +### Step 2: `filter-branch` + +Copy `.gitattributes` to `/tmp/fat-filter-files` and edit to remove +everything after the file name (e.g., `sed 's/ \+filter=fat.*$//'`). +Currently, this may only contain exact paths relative to the root of the +repository. Finally, run + + git filter-branch --index-filter \ + 'git fat index-filter /tmp/fat-filter-files --manage-gitattributes' \ + --tag-name-filter cat -- --all + +(You can remove the `--manage-gitattributes` option if you don't want to +append all the files being enrolled in `git fat` to `.gitattributes`, +however, future users would need to use `.git/info/attributes` to have +the `git fat` filters run.) +When this finishes, inspect to see if everything is in order and follow +the +[Checklist for Shrinking a Repository](http://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html#_checklist_for_shrinking_a_repository) +in the `git filter-branch` man page, typically `git clone +file:///path/to/repo`. Be sure to `git fat push` from the original +repository. + +See the script `test-retroactive.sh` for an example of cleaning. + +## Implementation notes +The actual binary files are stored in `.git/fat/objects`, leaving `.git/objects` nice and small. 
+ + $ du -bs .git/objects + 2212 .git/objects/ + $ ls -l .git/fat/objects # This is where the file actually goes, but that's not important + total 8 + -rw------- 1 jed users 6449 Nov 25 17:01 1f218834a137f7b185b498924e7a030008aee2ae + +If you have multiple clones that access the same filesystem, you can make +`.git/fat/objects` a symlink to a common location, in which case all content +will be available in all repositories without extra copies. You still need to +`git fat push` to make it available to others. + +# Some refinements +* Allow pulling and pushing only select files +* Relate orphan objects to file system +* Put some more useful message in smudged (working tree) version of missing files. +* More friendly configuration for multiple fat remotes +* Make commands safer in presence of a dirty tree. +* Private setting of a different remote. +* Gracefully handle unmanaged files when the filter is called (either + legacy files or files matching the pattern that should some reason not + be treated as fat). diff --git a/git-fat b/git-fat new file mode 100755 index 0000000..135f4e2 --- /dev/null +++ b/git-fat @@ -0,0 +1,628 @@ +#!/usr/bin/env python +# -*- mode:python -*- + +from __future__ import print_function, with_statement + +import sys +import hashlib +import tempfile +import os +import subprocess +import shlex +import shutil +import itertools +import threading +import time +import collections + +if not type(sys.version_info) is tuple and sys.version_info.major > 2: + sys.stderr.write('git-fat does not support Python-3 yet. Please use python2.\n') + sys.exit(1) + +try: + from subprocess import check_output + del check_output +except ImportError: + def backport_check_output(*popenargs, **kwargs): + r"""Run command with arguments and return its output as a byte string. + + Backported from Python 2.7 as it's implemented as pure python on stdlib. 
+ + >>> check_output(['/usr/bin/python', '--version']) + Python 2.6.2 + """ + process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + error = subprocess.CalledProcessError(retcode, cmd) + error.output = output + raise error + return output + subprocess.check_output = backport_check_output + +BLOCK_SIZE = 4096 + +def verbose_stderr(*args, **kwargs): + return print(*args, file=sys.stderr, **kwargs) +def verbose_ignore(*args, **kwargs): + pass + +def mkdir_p(path): + import errno + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: raise + +def umask(): + """Get umask without changing it.""" + old = os.umask(0) + os.umask(old) + return old + +def readblocks(stream): + bytes = 0 + while True: + data = stream.read(BLOCK_SIZE) + bytes += len(data) + if not data: + break + yield data +def cat_iter(initer, outstream): + for block in initer: + outstream.write(block) +def cat(instream, outstream): + return cat_iter(readblocks(instream), outstream) +def difftreez_reader(input): + """Incremental reader for git diff-tree -z output + + :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ... 
+ """ + buffer = [] + partial = '' + while True: + newread = input.read(BLOCK_SIZE) + if not newread: + break + partial += newread + while True: + head, sep, partial = partial.partition('\0') + if not sep: + partial = head + break + buffer.append(head) + if len(buffer) == 2: + oldmode, newmode, oldhash, newhash, modflag = buffer[0].split() + path = buffer[1] + yield (newhash, modflag, path) + buffer = [] +def gitconfig_get(name, file=None): + args = ['git', 'config', '--get'] + if file is not None: + args += ['--file', file] + args.append(name) + p = subprocess.Popen(args, stdout=subprocess.PIPE) + output = p.communicate()[0].strip() + if p.returncode and file is None: + return None + elif p.returncode: + return gitconfig_get(name) + else: + return output +def gitconfig_set(name, value, file=None): + args = ['git', 'config'] + if file is not None: + args += ['--file', file] + args += [name, value] + p = subprocess.check_call(args) + +class GitFat(object): + DecodeError = RuntimeError + def __init__(self): + self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore + try: + self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() + except subprocess.CalledProcessError: + sys.exit(1) + self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip() + self.objdir = os.path.join(self.gitdir, 'fat', 'objects') + if os.environ.get('GIT_FAT_VERSION') == '1': + self.encode = self.encode_v1 + else: + self.encode = self.encode_v2 + def magiclen(enc): + return len(enc(hashlib.sha1('dummy').hexdigest(), 5)) + self.magiclen = magiclen(self.encode) # Current version + self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions + def setup(self): + mkdir_p(self.objdir) + def is_init_done(self): + return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge') + def assert_init_done(self): + if not self.is_init_done(): + sys.stderr.write('fatal: git-fat 
is not yet configured in this repository.\n') + sys.stderr.write('Run "git fat init" to configure.\n') + sys.exit(1) + def get_rsync(self): + cfgpath = os.path.join(self.gitroot,'.gitfat') + remote = gitconfig_get('rsync.remote', file=cfgpath) + ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) + ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) + options = gitconfig_get('rsync.options', file=cfgpath) + if remote is None: + raise RuntimeError('No rsync.remote in %s' % cfgpath) + return remote, ssh_port, ssh_user, options + def get_rsync_command(self,push): + (remote, ssh_port, ssh_user, options) = self.get_rsync() + if push: + self.verbose('Pushing to %s' % (remote)) + else: + self.verbose('Pulling from %s' % (remote)) + + cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-'] + rshopts = '' + if ssh_user: + rshopts += ' -l ' + ssh_user + if ssh_port: + rshopts += ' -p ' + ssh_port + if rshopts: + cmd.append('--rsh=ssh' + rshopts) + if options: + cmd += options.split(' ') + if push: + cmd += [self.objdir + '/', remote + '/'] + else: + cmd += [remote + '/', self.objdir + '/'] + return cmd + def revparse(self, revname): + return subprocess.check_output(['git', 'rev-parse', revname]).strip() + def encode_v1(self, digest, bytes): + 'Produce legacy representation of file to be stored in repository.' + return '#$# git-fat %s\n' % (digest,) + def encode_v2(self, digest, bytes): + 'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.' 
+ return '#$# git-fat %s %20d\n' % (digest, bytes) + def decode(self, string, noraise=False): + cookie = '#$# git-fat ' + if string.startswith(cookie): + parts = string[len(cookie):].split() + digest = parts[0] + bytes = int(parts[1]) if len(parts) > 1 else None + return digest, bytes + elif noraise: + return None, None + else: + raise GitFat.DecodeError('Could not decode %s' % (string)) + def decode_stream(self, stream): + 'Return digest if git-fat cache, otherwise return iterator over entire file contents' + preamble = stream.read(self.magiclen) + try: + return self.decode(preamble) + except GitFat.DecodeError: + # Not sure if this is the right behavior + return itertools.chain([preamble], readblocks(stream)), None + def decode_file(self, fname): + # Fast check + try: + stat = os.lstat(fname) + except OSError: + return False, None + if stat.st_size != self.magiclen: + return False, None + # read file + try: + digest, bytes = self.decode_stream(open(fname)) + except IOError: + return False, None + if isinstance(digest, str): + return digest, bytes + else: + return None, bytes + def decode_clean(self, body): + ''' + Attempt to decode version in working tree. The tree version could be changed to have a more + useful message than the machine-readable copy that goes into the repository. If the tree + version decodes successfully, it indicates that the fat data is not currently available in + this repository. 
+ ''' + digest, bytes = self.decode(body, noraise=True) + return digest + def filter_clean(self, instream, outstreamclean): + h = hashlib.new('sha1') + bytes = 0 + fd, tmpname = tempfile.mkstemp(dir=self.objdir) + try: + ishanging = False + cached = False # changes to True when file is cached + with os.fdopen(fd, 'w') as cache: + outstream = cache + firstblock = True + for block in readblocks(instream): + if firstblock: + if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]): + ishanging = True # Working tree version is verbatim from repository (not smudged) + outstream = outstreamclean + firstblock = False + h.update(block) + bytes += len(block) + outstream.write(block) + outstream.flush() + digest = h.hexdigest() + objfile = os.path.join(self.objdir, digest) + if not ishanging: + if os.path.exists(objfile): + self.verbose('git-fat filter-clean: cache already exists %s' % objfile) + os.remove(tmpname) + else: + # Set permissions for the new file using the current umask + os.chmod(tmpname, int('444', 8) & ~umask()) + os.rename(tmpname, objfile) + self.verbose('git-fat filter-clean: caching to %s' % objfile) + cached = True + outstreamclean.write(self.encode(digest, bytes)) + finally: + if not cached: + os.remove(tmpname) + + def cmd_filter_clean(self): + ''' + The clean filter runs when a file is added to the index. It gets the "smudged" (tree) + version of the file on stdin and produces the "clean" (repository) version on stdout. 
+ ''' + self.setup() + self.filter_clean(sys.stdin, sys.stdout) + + def cmd_filter_smudge(self): + self.setup() + result, bytes = self.decode_stream(sys.stdin) + if isinstance(result, str): # We got a digest + objfile = os.path.join(self.objdir, result) + try: + cat(open(objfile), sys.stdout) + self.verbose('git-fat filter-smudge: restoring from %s' % objfile) + except IOError: # file not found + self.verbose('git-fat filter-smudge: fat object missing %s' % objfile) + sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file + else: # We have an iterable over the original input. + self.verbose('git-fat filter-smudge: not a managed file') + cat_iter(result, sys.stdout) + def catalog_objects(self): + return set(os.listdir(self.objdir)) + def referenced_objects(self, rev=None, all=False): + referenced = set() + if all: + rev = '--all' + elif rev is None: + rev = self.revparse('HEAD') + # Revision list gives us object names to inspect with cat-file... + p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) + def cut_sha1hash(input, output): + for line in input: + output.write(line.split()[0] + '\n') + output.close() + # ...`cat-file --batch-check` filters for git-fat object candidates in bulk... 
+ p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + def filter_gitfat_candidates(input, output): + for line in input: + objhash, objtype, size = line.split() + if objtype == 'blob' and int(size) in self.magiclens: + output.write(objhash + '\n') + output.close() + # ...`cat-file --batch` provides full contents of git-fat candidates in bulk + p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + # Stream data: p1 | cut_thread | p2 | filter_thread | p3 + cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) + filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)) + cut_thread.start() + filter_thread.start() + # Process metadata + content format provided by `cat-file --batch` + while True: + metadata_line = p3.stdout.readline() + if not metadata_line: + break # EOF + objhash, objtype, size_str = metadata_line.split() + size, bytes_read = int(size_str), 0 + # We know from filter that item is a candidate git-fat object and + # is small enough to read into memory and process + content = '' + while bytes_read < size: + data = p3.stdout.read(size - bytes_read) + if not data: + break # EOF + content += data + bytes_read += len(data) + try: + fathash = self.decode(content)[0] + referenced.add(fathash) + except GitFat.DecodeError: + pass + # Consume LF record delimiter in `cat-file --batch` output + bytes_read = 0 + while bytes_read < 1: + data = p3.stdout.read(1) + if not data: + break # EOF + bytes_read += len(data) + # Ensure everything is cleaned up + cut_thread.join() + filter_thread.join() + p1.wait() + p2.wait() + p3.wait() + return referenced + + def orphan_files(self, patterns=[]): + 'generator for all orphan placeholders in the working tree' + if not patterns or patterns == ['']: + patterns = ['.'] + for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00')[:-1]: + digest 
= self.decode_file(fname)[0] + if digest: + yield (digest, fname) + + def cmd_status(self, args): + self.setup() + catalog = self.catalog_objects() + refargs = dict() + if '--all' in args: + refargs['all'] = True + referenced = self.referenced_objects(**refargs) + garbage = catalog - referenced + orphans = referenced - catalog + if '--all' in args: + for obj in referenced: + print(obj) + if orphans: + print('Orphan objects:') + for orph in orphans: + print(' ' + orph) + if garbage: + print('Garbage objects:') + for g in garbage: + print(' ' + g) + def is_dirty(self): + return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) == 0 + def cmd_push(self, args): + 'Push anything that I have stored and referenced' + self.setup() + # Default to push only those objects referenced by current HEAD + # (includes history). Finer-grained pushing would be useful. + pushall = '--all' in args + files = self.referenced_objects(all=pushall) & self.catalog_objects() + cmd = self.get_rsync_command(push=True) + self.verbose('Executing: %s' % ' '.join(cmd)) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) + def checkout(self, show_orphans=False): + 'Update any stale files in the present working tree' + self.assert_init_done() + for digest, fname in self.orphan_files(): + objpath = os.path.join(self.objdir, digest) + if os.access(objpath, os.R_OK): + print('Restoring %s -> %s' % (digest, fname)) + # The output of our smudge filter depends on the existence of + # the file in .git/fat/objects, but git caches the file stat + # from the previous time the file was smudged, therefore it + # won't try to re-smudge. I don't know a git command that + # specifically invalidates that cache, but changing the mtime + # on the file will invalidate the cache. + # Here we set the mtime to mtime + 1. 
This is an improvement + # over touching the file as it catches the edgecase where a + # git-checkout happens within the same second as a git fat + # checkout. + stat = os.lstat(fname) + os.utime(fname, (stat.st_atime, stat.st_mtime + 1)) + # This re-smudge is essentially a copy that restores + # permissions. + subprocess.check_call( + ['git', 'checkout-index', '--index', '--force', fname]) + elif show_orphans: + print('Data unavailable: %s %s' % (digest,fname)) + def cmd_pull(self, args): + 'Pull anything that I have referenced, but not stored' + self.setup() + refargs = dict() + if '--all' in args: + refargs['all'] = True + for arg in args: + if arg.startswith('-') or len(arg) != 40: + continue + rev = self.revparse(arg) + if rev: + refargs['rev'] = rev + files = self.filter_objects(refargs, self.parse_pull_patterns(args)) + cmd = self.get_rsync_command(push=False) + self.verbose('Executing: %s' % ' '.join(cmd)) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) + self.checkout() + + def parse_pull_patterns(self, args): + if '--' not in args: + return [''] + else: + idx = args.index('--') + patterns = args[idx+1:] #we don't care about '--' + return patterns + + def filter_objects(self, refargs, patterns): + files = self.referenced_objects(**refargs) - self.catalog_objects() + if refargs.get('all'): # Currently ignores patterns; can we efficiently do both? 
+ return files + orphans_matched = list(self.orphan_files(patterns)) + orphans_objects = set(map(lambda x: x[0], orphans_matched)) + return files & orphans_objects + + def cmd_checkout(self, args): + self.checkout(show_orphans=True) + + def cmd_gc(self): + garbage = self.catalog_objects() - self.referenced_objects() + print('Unreferenced objects to remove: %d' % len(garbage)) + for obj in garbage: + fname = os.path.join(self.objdir, obj) + print('%10d %s' % (os.stat(fname).st_size, obj)) + os.remove(fname) + + def cmd_verify(self): + """Print details of git-fat objects with incorrect data hash""" + corrupted_objects = [] + for obj in self.catalog_objects(): + fname = os.path.join(self.objdir, obj) + h = hashlib.new('sha1') + for block in readblocks(open(fname)): + h.update(block) + data_hash = h.hexdigest() + if obj != data_hash: + corrupted_objects.append((obj, data_hash)) + if corrupted_objects: + print('Corrupted objects: %d' % len(corrupted_objects)) + for obj, data_hash in corrupted_objects: + print('%s data hash is %s' % (obj, data_hash)) + sys.exit(1) + + def cmd_init(self): + self.setup() + if self.is_init_done(): + print('Git fat already configured, check configuration in .git/config') + else: + gitconfig_set('filter.fat.clean', 'git-fat filter-clean') + gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge') + print('Initialized git fat') + def gen_large_blobs(self, revs, threshsize): + """Build dict of all blobs""" + time0 = time.time() + def hash_only(input, output): + """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags. + This truncates to one hash per line. 
+ """ + for line in input: + output.write(line[:40] + '\n') + output.close() + revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1) + objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1) + hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin)) + hashonly.start() + numblobs = 0; numlarge = 1 + # Build dict with the sizes of all large blobs + for line in objcheck.stdout: + objhash, blob, size = line.split() + if blob != 'blob': + continue + size = int(size) + numblobs += 1 + if size > threshsize: + numlarge += 1 + yield objhash, size + revlist.wait() + objcheck.wait() + hashonly.join() + time1 = time.time() + self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0)) + def cmd_find(self, args): + maxsize = int(args[0]) + blobsizes = dict(self.gen_large_blobs('--all', maxsize)) + time0 = time.time() + # Find all names assumed by large blobs (those in blobsizes) + pathsizes = collections.defaultdict(lambda:set()) + revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1) + difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'], + stdin=revlist.stdout, stdout=subprocess.PIPE) + for newblob, modflag, path in difftreez_reader(difftree.stdout): + bsize = blobsizes.get(newblob) + if bsize: # We care about this blob + pathsizes[path].add(bsize) + time1 = time.time() + self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0)) + maxlen = max(map(len,pathsizes)) if pathsizes else 0 + for path, sizes in sorted(pathsizes.items(), key=lambda ps: max(ps[1]), reverse=True): + print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes))) + revlist.wait() + difftree.wait() + def cmd_index_filter(self, args): + manage_gitattributes = 
'--manage-gitattributes' in args + filelist = set(f.strip() for f in open(args[0]).readlines()) + lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE) + updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE) + for line in lsfiles.stdout: + mode, sep, tail = line.partition(' ') + blobhash, sep, tail = tail.partition(' ') + stageno, sep, tail = tail.partition('\t') + filename = tail.strip() + if filename not in filelist: + continue + if mode == "120000": + # skip symbolic links + continue + # This file will contain the hash of the cleaned object + hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash) + try: + cleanedobj = open(hashfile).read().rstrip() + except IOError: + catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE) + hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + def dofilter(): + self.filter_clean(catfile.stdout, hashobject.stdin) + hashobject.stdin.close() + filterclean = threading.Thread(target=dofilter) + filterclean.start() + cleanedobj = hashobject.stdout.read().rstrip() + catfile.wait() + hashobject.wait() + filterclean.join() + mkdir_p(os.path.dirname(hashfile)) + open(hashfile, 'w').write(cleanedobj + '\n') + updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename)) + if manage_gitattributes: + try: + mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split() + gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines() + except ValueError: # Nothing to unpack, thus no file + mode, stageno = '100644', '0' + gitattributes_lines = [] + gitattributes_extra = ['%s filter=fat -text' % line.split()[0] for line in filelist] + hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + stdout, 
stderr = hashobject.communicate('\n'.join(gitattributes_lines + gitattributes_extra) + '\n') + updateindex.stdin.write('%s %s %s\t%s\n' % (mode, stdout.strip(), stageno, '.gitattributes')) + updateindex.stdin.close() + lsfiles.wait() + updateindex.wait() + + +if __name__ == '__main__': + fat = GitFat() + cmd = sys.argv[1] if len(sys.argv) > 1 else '' # with no argument, cmd is empty and the usage message below is printed + if cmd == 'filter-clean': + fat.cmd_filter_clean() + elif cmd == 'filter-smudge': + fat.cmd_filter_smudge() + elif cmd == 'init': + fat.cmd_init() + elif cmd == 'status': + fat.cmd_status(sys.argv[2:]) + elif cmd == 'push': + fat.cmd_push(sys.argv[2:]) + elif cmd == 'pull': + fat.cmd_pull(sys.argv[2:]) + elif cmd == 'gc': + fat.cmd_gc() + elif cmd == 'verify': + fat.cmd_verify() + elif cmd == 'checkout': + fat.cmd_checkout(sys.argv[2:]) + elif cmd == 'find': + fat.cmd_find(sys.argv[2:]) + elif cmd == 'index-filter': + fat.cmd_index_filter(sys.argv[2:]) + else: + print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr) diff --git a/test-retroactive.sh b/test-retroactive.sh new file mode 100755 index 0000000..51a38ec --- /dev/null +++ b/test-retroactive.sh @@ -0,0 +1,53 @@ +#!/bin/sh -ex + +fullpath() { echo "`pwd`/$1"; } # print $1 prefixed with the current directory + +git init retro +cd retro +cp /usr/share/dict/words words.big +chmod u+w words.big +git add words.big +git commit -m'Add big file without using git-fat' +sort words.big > sorted.big +git add sorted.big +git commit -m'Add sorted file without using git-fat' +cat > .gitattributes < words.big +git commit -am'Truncated words.big and add .gitattributes' +git fat init +cat > .gitattributes < fat-files +git filter-branch --index-filter "git fat index-filter $(fullpath fat-files) --manage-gitattributes" --tag-name-filter cat -- --all # fat-files is passed by absolute path, built with fullpath + +git log --stat +git checkout HEAD^ +rm * +git checkout . 
+ls -al + +# Set up place to push +git checkout master +cat > .gitfat <> .gitfat < .gitattributes +git add .gitattributes .gitfat +git commit -m'Initial fat repository' + +ln -s /oe/dss-oe/dss-add-ons-testing-build/deploy/licenses/common-licenses/GPL-3 c # link target is presumably absent on the test machine; committed below as a broken symlink +git add c +git commit -m'add broken symlink' +echo 'fat content a' > a.fat +git add a.fat +git commit -m'add a.fat' +echo 'fat content b' > b.fat +git add b.fat +git commit -m'add b.fat' +echo 'revise fat content a' > a.fat +git commit -am'revise a.fat' +git fat push + +cd .. +git clone fat-test fat-test2 +cd fat-test2 +# checkout and pull should fail in repo not yet init'ed for git-fat +git fat checkout && true # the AND-list leaves the command's status in $? without tripping errexit, if it is enabled +if [ $? -eq 0 ] +then + echo 'ERROR: "git fat checkout" in uninitialised repo should fail' + exit 1 +fi +git fat pull -- 'a.fa*' && true +if [ $? -eq 0 ] +then + echo 'ERROR: "git fat pull" in uninitialised repo should fail' + exit 1 +fi +git fat init +git fat pull -- 'a.fa*' +cat a.fat +echo 'file which is committed and removed afterwards' > d +git add d +git commit -m'add d with normal content' +rm d +git fat pull + +# Check verify command finds corrupt object +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak +echo "Not the right data" > .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +git fat verify && true +if [ $? -eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8