Lots of documentation plus various cleanups & stylistic changes

James Foucar 15 years ago committed by Elijah Newren
parent 027f76e15c
commit 025fd20606

@ -1,24 +1,38 @@
#!/usr/bin/env python
import commands
import os
import sys
import tempfile
An executable for creating a filtered clone and grafting
commits between the filtered and unfiltered repositories. See USAGE.
import commands, os, sys, tempfile
from optparse import OptionParser
from subprocess import Popen, PIPE
from git_fast_filter import Blob, Reset, FileChanges, Commit
from git_fast_filter import FastExportFilter, FastExportOutput, FastImportInput
from git_fast_filter import get_commit_count, get_total_objects
from git_fast_filter import FastExportFilter, fast_export_output, \
fast_import_input, get_commit_count
def get_syntax_string():
return """Syntax:
collab --help
collab info
Report the path to the original repo, the excludes and includes they
had used, whether there were commits on collab/master that weren't
on master, etc.
collab pull-grafts
Take commits from the original repository and add them
to the collab/<branch> branches of the filtered repository
collab push-grafts
Take commits from this repository and place them in
collab/<branch> branches of the original repository
Create a [filtered] clone of a repository.
REPOSITORY is a path to a repository, OPTIONS is some mix of
@ -27,193 +41,355 @@ Notes:
and REVISION LIMITING is options acceptable to git log to reduce the total
list of revisions, examples of which include
--since="2 years ago"
--since='2 years ago'
master 4.10 4.8 ^4.6
If OPTIONS is not specified, everything is included. If REVISION LIMITING
is not specified, --branches is the default."""
is not specified, --branches is the default.
if len(sys.argv) <= 1 or sys.argv[1] == "--help":
raise SystemExit(get_syntax_string())
subcommand = sys.argv[1]
if subcommand == "-h":
raise SystemExit("help has four letters (and uses two dashes instead of one.")
elif subcommand not in ['info', 'pull-grafts', 'push-grafts', 'clone']:
sys.stderr.write("Unrecognized command: %s\n" % subcommand)
raise SystemExit(get_syntax_string())
Once the clone has completed, you'll need to run 'git merge collab/<branch>'
in order to populate your working tree.
def record_content(git_dir, filename, content):
Takes a string, calculates its hash, and stores the result in $filename.
This will also record the string as a blob in git.
p = Popen(["git", "--git-dir=.", "hash-object", "-w", "--stdin"],
stdin = PIPE, stdout = PIPE, cwd = git_dir)
hash = p.communicate(content)[0]
file = open(filename, 'w')
hash_value = p.communicate(content)[0]
hash_file = open(filename, 'w')
def read_content(git_dir, refname):
    """
    Return the content of the git object that refname resolves to.

    Runs 'git cat-file -p <refname>' with git_dir as the working directory
    (which is why --git-dir=. is passed) and returns whatever the command
    wrote to its standard output.
    """
    cat_file = Popen(["git", "--git-dir=.", "cat-file", "-p", refname],
                     stdout=PIPE, cwd=git_dir)
    output, _unused_stderr = cat_file.communicate()
    return output
class GraftFilter(object):
def __init__(self, source_repo, target_repo, fast_export_args = []):
self.source_repo = source_repo
self.target_repo = target_repo
self.fast_export_args = fast_export_args
if not self.fast_export_args:
self.fast_export_args = ['--branches']
self.sourcemarks = None
self.targetmarks = None
self.excludes = None
self.includes = None
self.collab_git_dir = None
self.show_progress = True
self.object_count = 0
self.commit_count = 0
self.total_commits = get_commit_count(source_repo, self.fast_export_args)
if self.total_commits == 0:
This class implements the functionality that the tool provides.
Some key implementation details:
All of the data that needs to persist from one execution of the tool
to another is recorded as a ref in git under refs/collab/DATA with
DATA being one of:
excludes : The original excludes given to collab in the cloning
includes : The original includes given to collab in the cloning
orig_repo : The handle to the repository collab cloned from
localmap : The commit-map from the POV of the local repository
remotemap : The commit-map from the POV of the remote repository
Important concept: commit-map. These maps map a mark* to a commit id
(hash). The raw commit-ids will not necessarily be meaningful to
both source and target repositories due to the presence of includes,
excludes, and history chopping (since any of these will change the
contents of certain commits). These keys allow us to refer to
correlated commits in both repositories.
*The notion of a mark is an important concept. Fast-export uses a simple
int that is incremented once per exported object to identify the object
without having to use its sha1 hash. This is an ideal way for us to
refer to commits in a portable way (has proper meaning in both source
and target repo).
def __init__(self, source_repo, target_repo, fast_export_args = None):
# The location of the original repo
self._source_repo = source_repo
# The location of the filtered-clone repo
self._target_repo = target_repo
# Extra args that need to be passed along to git
self._fast_export_args = fast_export_args
if not self._fast_export_args:
self._fast_export_args = ['--branches']
# Temporary file used to store source commit-maps in ascii. We use this
# to grab the marks created by fast-exporting the source.
self._sourcemarks = None
# Temporary file used to store target commit-maps in ascii. We use this
# to grab the marks created by fast-importing the target.
self._targetmarks = None
# The path prefixes that the user wants to exclude
self._excludes = None
# The path prefixes that the user wants to include
self._includes = None
# The path to the .git directory of the repository from which collab was
# executed
self._collab_git_dir = None
# Flag that tells us to print text showing the progress of the operation
self._show_progress = True
# Number of objects processed; used only for showing progress
self._object_count = 0
# Number of commits processed; used only for showing progress
self._commit_count = 0
# Total number of commits in source repo; used only for showing progress
self._total_commits = get_commit_count(source_repo,
# If no commits to clone, we're done
if self._total_commits == 0:
sys.stderr.write("There are no commits to clone.\n")
def set_paths(self, excludes = [], includes = ['']):
self.excludes = excludes
self.includes = includes
def set_paths(self, excludes = None, includes = None):
    """
    Record the path prefixes to exclude from and include in the clone.

    Passing None for either argument selects its default: no excludes,
    and the single include prefix '' (the empty string).
    """
    self._excludes = [] if excludes is None else excludes
    self._includes = [''] if includes is None else includes
def _print_progress(self):
if self.show_progress:
Print a quick message describing the progress of the operation.
if self._show_progress:
print "\rRewriting commits... %d/%d (%d objects)" \
% (self.commit_count, self.total_commits, self.object_count),
% (self._commit_count, self._total_commits, self._object_count),
def _do_blob(self, blob):
self.object_count += 1
if self.object_count % 100 == 0:
The callback to be invoked when fast-export encounters a blob. We don't
do anything important here, just maintain and print progress.
self._object_count += 1
if self._object_count % 100 == 0:
def _do_commit(self, commit):
The callback to be invoked when fast-export encounters a commit object.
We have to analyze the commit to find changes in the files we included.
Note that, if all file changes are excluded, then FastExportFilter is
smart enough to skip it altogether.
# list to hold all changes we care about
new_file_changes = []
# Iterate over file_changes associated with this commit
for change in commit.file_changes:
include_it = None
for include in self.includes:
# See if change involved an included file
for include in self._includes:
if change.filename.startswith(include):
include_it = True
for exclude in self.excludes:
# See if change involved an excluded file (overrides included status!).
for exclude in self._excludes:
if change.filename.startswith(exclude):
include_it = False
# If the file matched neither an include nor an exclude, we have an error
if include_it is None:
raise SystemExit("File '%s' is not in the include or exclude list." %
# Add change if it affected included file
if include_it:
# Overwrite commit's file changes so that it only has changes associated
# with included files.
commit.file_changes = new_file_changes
# Rename the affected branch
commit.branch = commit.branch.replace('refs/heads/','refs/remotes/collab/')
self.commit_count += 1
# Maintain and print progress info
self._commit_count += 1
def _get_map_name(self, filename, include_git_dir = True):
Gets a handle to the data containing the map. This method will return
either a raw filename or a handle that git will understand depending
upon the value of include_git_dir.
if include_git_dir:
collabdir = os.path.join(self.collab_git_dir, 'refs', 'collab')
collabdir = os.path.join(self._collab_git_dir, 'refs', 'collab')
collabdir = os.path.join('refs', 'collab')
if (filename == self.sourcemarks and self.source_repo == '.') or \
(filename == self.targetmarks and self.target_repo == '.'):
if ( (filename == self._sourcemarks and self._source_repo == '.') or
(filename == self._targetmarks and self._target_repo == '.') ):
subname = 'localmap'
subname = 'remotemap'
return os.path.join(collabdir, subname)
def _get_maps(self, filename):
    """
    Build the mark -> commit-id map from the contents of a marks file.

    Each line of the file is expected to hold two whitespace-separated
    fields, ':<mark> <commit-id>'. The first character of the first field
    is dropped and the rest is converted to an int to form the key.

    filename: path of the marks file to read.
    Returns a dict mapping each int mark to its commit-id string; an empty
    file yields an empty dict.
    """
    # Read inside a 'with' block so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(filename, 'r') as marks_file:
        lines = marks_file.read().strip().splitlines()

    marks = {}
    for line in lines:
        fields = line.split()
        marks[int(fields[0][1:])] = fields[1]
    return marks
def _setup_files_and_excludes(self):
if self.source_repo != '.' and self.target_repo != '.':
Setup _sourcemarks, _targetmarks, _collab_git_dir, _includes, _excludes,
and _source_repo. If collab has been run on this directory before,
much of this data will come from objects left behind from the previous
# Either the source or the target repo should be "."
if self._source_repo != '.' and self._target_repo != '.':
raise SystemExit("Must be run from collab-created repo location.")
(status, self.collab_git_dir) = \
# Get the location of the .git directory for this repo
(status, self._collab_git_dir) = \
commands.getstatusoutput("git rev-parse --git-dir")
if status != 0:
raise SystemExit(" must be run from a valid git repository")
self.first_time = True
if os.path.isdir(os.path.join(self.collab_git_dir, 'refs', 'collab')):
self.first_time = False
# If .git/refs/collab exists, this is not the first time we've used the
# collab tool on this repository
self._first_time = True
if os.path.isdir(os.path.join(self._collab_git_dir, 'refs', 'collab')):
self._first_time = False
if self.first_time:
if self.excludes is None or self.includes is None:
raise SystemExit("Assertion failed: called set_paths() == True")
if self._first_time:
# Check that excludes, includes have been set
assert self._excludes is not None and self._includes is not None, \
"set_paths() was not called"
# Make sure the current repository is sane
if self.target_repo != '.':
raise SystemExit("Assertion failed: Program written correctly == True")
# Make sure the current repository is sane. The target needs to be
# the cwd. Also, the target repo should not have any git objects.
assert self._target_repo == '.', "Target should be the current directory"
(status, output) = \
"find %s/objects -type f | head -n 1 | wc -l" % self.collab_git_dir)
"find %s/objects -type f | head -n 1 | wc -l"
% self._collab_git_dir)
if output != "0":
raise SystemExit("collab clone must be called from an empty git repo.")
# Create the sourcemarks and targetmarks empty files, get their names
(file, self.sourcemarks) = tempfile.mkstemp()
(file, self.targetmarks) = tempfile.mkstemp()
(file_obj, self._sourcemarks) = tempfile.mkstemp()
(file_obj, self._targetmarks) = tempfile.mkstemp()
# Get the sourcemarks and targetmarks
(file, self.sourcemarks) = tempfile.mkstemp()
mapname = self._get_map_name(self.sourcemarks, include_git_dir=False)
os.write(file, read_content(self.collab_git_dir, mapname))
(file, self.targetmarks) = tempfile.mkstemp()
mapname = self._get_map_name(self.targetmarks, include_git_dir=False)
os.write(file, read_content(self.collab_git_dir, mapname))
# Get the sourcemarks and targetmarks temp files. Write the map contents
# to them.
(file_obj, self._sourcemarks) = tempfile.mkstemp()
mapname = self._get_map_name(self._sourcemarks, include_git_dir=False)
os.write(file_obj, read_content(self._collab_git_dir, mapname))
(file_obj, self._targetmarks) = tempfile.mkstemp()
mapname = self._get_map_name(self._targetmarks, include_git_dir=False)
os.write(file_obj, read_content(self._collab_git_dir, mapname))
# Get the excludes and includes, unless overridden
if self.excludes is None:
self.excludes = \
read_content(self.collab_git_dir, "refs/collab/excludes").split()
if self.includes is None:
self.includes = \
read_content(self.collab_git_dir, "refs/collab/includes").split()
if not self.includes:
self.includes = ['']
if self._excludes is None:
self._excludes = \
read_content(self._collab_git_dir, "refs/collab/excludes").split()
if self._includes is None:
self._includes = \
read_content(self._collab_git_dir, "refs/collab/includes").split()
if not self._includes:
self._includes = ['']
# Get the remote repository if not specified
if self.source_repo is None and self.target_repo is None:
if self._source_repo is None and self._target_repo is None:
raise SystemExit("You are using code written by a moron.")
orig_repo = \
read_content(self.collab_git_dir, "refs/collab/orig_repo").strip()
if self.source_repo is None:
self.source_repo = orig_repo
if self.target_repo is None:
self.target_repo = orig_repo
read_content(self._collab_git_dir, "refs/collab/orig_repo").strip()
if self._source_repo is None:
self._source_repo = orig_repo
if self._target_repo is None:
self._target_repo = orig_repo
def run(self):
# Set members based on data from previous runs
# Setup the source and target processes. The source process will produce
# fast-export output for the source repo, this output will be passed
# through FastExportFilter which will manipulate the output using our
# callbacks, finally, the manipulated output will be given to the
# fast-import process and used to create the target repo.
# (This should update sourcemarks and targetmarks)
source = \
["--export-marks=%s" % self.sourcemarks,
"--import-marks=%s" % self.sourcemarks]
+ self.fast_export_args)
["--export-marks=%s" % self._sourcemarks,
"--import-marks=%s" % self._sourcemarks]
+ self._fast_export_args)
target = \
FastImportInput( self.target_repo,
["--export-marks=%s" % self.targetmarks,
"--import-marks=%s" % self.targetmarks])
fast_import_input( self._target_repo,
["--export-marks=%s" % self._targetmarks,
"--import-marks=%s" % self._targetmarks])
filter = FastExportFilter(blob_callback = lambda b: self._do_blob(b),
commit_callback = lambda c: self._do_commit(c)), target.stdin)
filt = FastExportFilter(blob_callback = lambda b: self._do_blob(b),
commit_callback = lambda c: self._do_commit(c)), target.stdin)
if self.show_progress:
# Show progress
if self._show_progress:
sys.stdout.write("\nWaiting for git fast-import to complete...")
if self.show_progress:
target.wait() # need to wait for fast-import process to finish
if self._show_progress:
# Record the sourcemarks and targetmarks -- 2 steps
@ -221,67 +397,96 @@ class GraftFilter(object):
# Step 1: Make sure the source and target marks have the same mark numbers.
# Not doing this would allow one end of the grafting to reuse a number
# that would then be misconnected on the other side.
sourcemaps = self._get_maps(self.sourcemarks)
targetmaps = self._get_maps(self.targetmarks)
sourcemaps = self._get_maps(self._sourcemarks)
targetmaps = self._get_maps(self._targetmarks)
for key in sourcemaps.keys():
if key not in targetmaps:
del sourcemaps[key]
for key in targetmaps.keys():
if key not in sourcemaps:
del targetmaps[key]
# Step 2: Record the data
for set in [(sourcemaps, self.sourcemarks), (targetmaps, self.targetmarks)]:
mapname = self._get_map_name(set[1])
for set_obj in [(sourcemaps, self._sourcemarks),
(targetmaps, self._targetmarks)]:
# get raw filename for source/target
mapname = self._get_map_name(set_obj[1])
# create refs/collab if it's not there
if not os.path.isdir(os.path.dirname(mapname)):
content = ''.join([":%d %s\n" % (k, v) for k,v in set[0].iteritems()])
record_content(self.collab_git_dir, mapname, content)
if self.target_repo == '.':
# compute string content of commit-map
content = ''.join([":%d %s\n" % (k,v) for k,v in set_obj[0].iteritems()])
# record content in the object database
record_content(self._collab_git_dir, mapname, content)
# Check if we are running from the target
if self._target_repo == '.':
# Record the excludes and includes so they can be reused next time
for set in [(self.excludes, 'excludes'), (self.includes, 'includes')]:
filename = os.path.join(self.collab_git_dir, 'refs', 'collab', set[1])
record_content(self.collab_git_dir, filename, '\n'.join(set[0])+'\n')
for set_obj in [(self._excludes, 'excludes'),
(self._includes, 'includes')]:
filename = os.path.join(self._collab_git_dir, 'refs',
'collab', set_obj[1])
record_content(self._collab_git_dir, filename,
'\n'.join(set_obj[0]) + '\n')
# Record source_repo as the original repository
filename = os.path.join(self.collab_git_dir, 'refs', 'collab', 'orig_repo')
record_content(self.collab_git_dir, filename, self.source_repo+'\n')
def do_info():
def do_pull_grafts():
filter = GraftFilter(None, '.')
def do_push_grafts():
filter = GraftFilter('.', None)
def do_clone():
# Get the arguments
if len(sys.argv) <= 2:
raise SystemExit(get_syntax_string())
repository = sys.argv[2]
if not os.path.isdir(repository):
raise SystemExit("%s does not appear to be a git repository" % repository)
parser = OptionParser(usage=get_syntax_string())
filename = os.path.join(self._collab_git_dir, 'refs',
'collab', 'orig_repo')
record_content(self._collab_git_dir, filename, self._source_repo+'\n')
def _main_func():
parser = OptionParser(usage=USAGE)
parser.add_option("--exclude", action="append", default=[], type="string",
parser.add_option("--include", action="append", default=[], type="string",
(options, args) = parser.parse_args(args=sys.argv[3:])
if not options.includes:
# Run the filtering
filter = GraftFilter(repository, '.', fast_export_args = args)
filter.set_paths(excludes = options.excludes, includes = options.includes)
if subcommand == 'info': do_info()
elif subcommand == 'pull-grafts': do_pull_grafts()
elif subcommand == 'push-grafts': do_push_grafts()
elif subcommand == 'clone': do_clone()
raise SystemExit("Assertion failed; unknown command: '%s'" % subcommand)
(options, args) = parser.parse_args()
if (not args):
raise SystemExit("Missing command\n\n" + USAGE)
if (not options.includes):
subcommand = args[0]
if (subcommand not in ['info', 'pull-grafts', 'push-grafts', 'clone']):
raise SystemExit("Unrecognized command: %s\n\n%s" % (subcommand, USAGE))
if (subcommand == "info"):
elif (subcommand == "pull-grafts"):
graft_filter = GraftFilter(None, '.')
elif (subcommand == "push-grafts"):
graft_filter = GraftFilter('.', None)
elif (subcommand == "clone"):
# Get the arguments
if len(args) < 2:
raise SystemExit("Missing repository\n\n" + USAGE)
repository = args[1]
if (not os.path.isdir(repository)):
raise SystemExit("%s does not appear to be a git repository"
% repository)
# Run the filtering
graft_filter = GraftFilter(repository, '.', fast_export_args = args[2:])
graft_filter.set_paths(excludes = options.excludes,
includes = options.includes)
assert False, "Unhandled command: " + subcommand
if (__name__ == "__main__"):
