git-filter-repo/git-filter-repo

#!/usr/bin/env python

import os
import re
import sha  # bleh...when can I assume python >= 2.5?
import sys
from pyparsing import ParserElement, Literal, Optional, Combine, Word, nums, \
                      Regex, ZeroOrMore, OneOrMore, CharsNotIn, \
                      dblQuotedString, \
                      ParseException, ParseSyntaxException

from pyparsing import Token, ParseResults

# Public names exported by a star-import of this module.
__all__ = [
  "Blob",
  "Reset",
  "FileChanges",
  "Commit",
  "FastExportFilter",
  "FilterGitRepo",
]

class ExactData(Token):
  """Specialized pyparsing subclass for handling data dumps in git-fast-import
     exact data format.

     The format is a header line 'data <count>' terminated by a newline,
     followed by exactly <count> characters of payload.  A successful match
     yields three tokens: the literal 'data', the count (as the string of
     digits found in the header) and the payload."""
  def __init__( self ):
    super(ExactData,self).__init__()

    # Only the header is matched with a regex; the payload is sliced off by
    # length in parseImpl because it may contain arbitrary characters.
    self.pattern = r"data (\d+)\n"
    self.re = re.compile(self.pattern)
    self.reString = self.pattern

    self.name = "ExactData"
    self.errmsg = "Expected " + self.name
    self.mayIndexError = False
    self.mayReturnEmpty = True

  def parseImpl( self, instring, loc, doActions=True ):
    """Match a 'data <count>' header at loc and consume <count> characters.

       Returns (new_loc, tokens) where tokens is
       ParseResults(['data', count, payload]).  Raises this element's parse
       exception if no header is found at loc.  No check is made here that
       <count> characters are actually available; if the input is too short
       the payload is simply shorter than announced."""
    header = self.re.match(instring,loc)
    if not header:
      exc = self.myException
      exc.loc = loc
      exc.pstr = instring
      raise exc

    num = header.group(1)
    start = header.end()
    end = start + int(num)
    data = instring[start:end]
    return end, ParseResults(['data', num, data])

  def __str__( self ):
    """Return the base class' string form, falling back to 'Data:'."""
    try:
      # This used to name a non-existent class (ExactMath); the resulting
      # NameError was silently swallowed, so the base class was never asked.
      return super(ExactData,self).__str__()
    except Exception:
      pass

    if self.strRepr is None:
      self.strRepr = "Data:"

    return self.strRepr

# Highest mark number handed out so far.
newmark = 0
# Maps marks as given by callers to the marks allocated for them here.
mark_dict = {}
def translate_mark(old_mark = None):
  """Return the allocated mark corresponding to old_mark.

     A mark that has been translated before maps to the same number again.
     A falsy old_mark (None, 0, ...) or one not seen before is given the next
     unused number, which is also recorded in mark_dict under old_mark."""
  global newmark

  # Known, truthy mark: reuse the earlier translation
  if old_mark and old_mark in mark_dict:
    return mark_dict[old_mark]

  # Otherwise allocate a fresh number and remember it
  newmark += 1
  mark_dict[old_mark] = newmark
  return newmark

class GitElement(object):
  """Base class for stream elements that can write themselves out.

     Subclasses set self.type to a string naming the kind of element and
     override dump()."""
  def __init__(self):
    self.type = None

  def dump(self, file):
    """Write this element to file; must be overridden by subclasses.

       Raises SystemExit naming the class that failed to override it."""
    # The class name has to be interpolated with '%'; passing it as a second
    # argument to SystemExit would leave the '%s' placeholder unformatted.
    raise SystemExit("Unimplemented function: %s.dump()" %
                     type(self).__name__)

class Blob(GitElement):
  """A 'blob' stream element: a chunk of data identified by a mark."""
  def __init__(self, data, mark = None):
    """Store data and obtain this blob's mark via translate_mark(mark)."""
    GitElement.__init__(self)
    self.type = 'blob'
    self.mark = translate_mark(mark)
    self.data = data

  def dump(self, file):
    """Write the blob command, its mark and its length-prefixed data to
       file, followed by a blank separator line."""
    emit = file.write
    emit('blob\n')
    emit('mark :%d\n' % self.mark)
    payload = self.data
    emit('data %d\n%s' % (len(payload), payload))
    emit('\n')

class Reset(GitElement):
  """A 'reset' stream element: a ref name plus an optional 'from' target."""
  def __init__(self, ref, from_ref = None):
    """Remember the ref being reset and, if given, what it is reset from."""
    GitElement.__init__(self)
    self.type = 'reset'
    self.from_ref = from_ref
    self.ref = ref

  def dump(self, file):
    """Write the reset command to file.

       The 'from' line and the blank line after it are only written when
       from_ref is set."""
    emit = file.write
    emit('reset %s\n' % self.ref)
    if not self.from_ref:
      return
    emit('from %s\n' % self.from_ref)
    emit('\n')

class FileChanges(object):
  """A single file change listed in a commit.

     type is 'M' (file modified; carries a mode and the mark of its content)
     or 'D' (file deleted).  dump() writes nothing for any other type."""
  def __init__(self, type, filename, mode = None, mark = None):
    """Record the change; for 'M' changes the mark is run through
       translate_mark, for everything else self.mark is None."""
    self.type = type
    self.filename = filename
    self.mode = mode
    # Always define the attribute so that code inspecting a change (e.g. a
    # user callback walking a commit's file_changes) can read .mark on a
    # deletion without hitting an AttributeError.
    self.mark = None
    if type == 'M':
      self.mark = translate_mark(mark)

  def dump(self, file):
    """Write this change to file as an 'M' or 'D' line."""
    if self.type == 'M':
      file.write('M %s :%d %s\n' % (self.mode, self.mark, self.filename))
    elif self.type == 'D':
      file.write('D %s\n' % self.filename)
class Commit(GitElement):
  """A 'commit' stream element.

     Holds the branch, author and committer identity (name, email, date
     string), the commit message, a sequence of file changes (objects with a
     dump(file) method) and the marks of this commit and its parents."""
  def __init__(self, branch,
               author_name,    author_email,    author_date,
               committer_name, committer_email, committer_date,
               message,
               file_changes,
               mark = None,
               from_commit = None,
               merge_commits = None):
    """Store the commit's fields.

       mark, from_commit and every entry of merge_commits are passed through
       translate_mark (in that order).  A falsy from_commit is stored as
       None; merge_commits defaults to no additional parents."""
    GitElement.__init__(self)
    self.type = 'commit'
    self.branch = branch
    self.author_name  = author_name
    self.author_email = author_email
    self.author_date  = author_date
    self.committer_name  = committer_name
    self.committer_email = committer_email
    self.committer_date  = committer_date
    self.message = message
    self.file_changes = file_changes
    self.mark = translate_mark(mark)
    if from_commit:
      self.from_commit = translate_mark(from_commit)
    else:
      self.from_commit = None
    # None rather than [] as the default avoids sharing one list object
    # between all calls.
    if merge_commits is None:
      merge_commits = []
    # The loop variable is deliberately not called 'mark': in Python 2 a list
    # comprehension's variable leaks into the enclosing scope and would
    # rebind the 'mark' parameter.
    self.merge_commits = [translate_mark(parent) for parent in merge_commits]

  def dump(self, file):
    """Write the commit to file: header, mark, author, committer, message,
       optional 'from' and 'merge' lines, the file changes, then a blank
       line."""
    file.write('commit %s\n' % self.branch)
    file.write('mark :%d\n' % self.mark)
    file.write('author %s <%s> %s\n' % \
                     (self.author_name, self.author_email, self.author_date))
    file.write('committer %s <%s> %s\n' % \
                     (self.committer_name, self.committer_email,
                      self.committer_date))
    file.write('data %d\n%s' % (len(self.message), self.message))
    if self.from_commit:
      file.write('from :%s\n' % self.from_commit)
    for ref in self.merge_commits:
      file.write('merge :%s\n' % ref)
    for change in self.file_changes:
      change.dump(file)
    file.write('\n')

class FastExportFilter(object):
  """Parses a fast-export style stream and re-emits it, element by element.

     Each blob, reset and commit recognised by the parser is turned into a
     Blob, Reset or Commit object, handed to the matching user callback (and
     to everything_callback, together with a type name) so it can be
     modified, and then dumped to the output stream.  The tag, progress and
     checkpoint callbacks are stored but not invoked anywhere in this
     class."""
  def __init__(self,
               tag_callback = None,   commit_callback = None,
               blob_callback = None,  progress_callback = None,
               reset_callback = None, checkpoint_callback = None,
               everything_callback = None):
    """Build the parser and remember the callbacks.

       Output goes to sys.stdout unless run() is given an output file."""
    # The parse actions created here only look up the callbacks when they
    # fire, so it is fine to assign the callbacks afterwards.
    self._setup_parser()
    self.tag_callback        = tag_callback
    self.blob_callback       = blob_callback
    self.reset_callback      = reset_callback
    self.commit_callback     = commit_callback
    self.progress_callback   = progress_callback
    self.checkpoint_callback = checkpoint_callback
    self.everything_callback = everything_callback

    self.output = sys.stdout

  def _make_blob(self, t):
    """Parse action for blobs; t is ['blob', ':<mark>', 'data', <len>, <data>].

       Raises SystemExit if the announced length and the data disagree."""
    # Create the Blob object from the parser tokens
    mark = int(t[1][1:])
    datalen = int(t[3])
    data = t[4]
    if datalen != len(data):
      # Both values must be passed to '%' as one tuple
      raise SystemExit('%d != len(%s)' % (datalen, data))
    blob = Blob(data, mark)

    # Call any user callback to allow them to modify the blob
    if self.blob_callback:
      self.blob_callback(blob)
    if self.everything_callback:
      self.everything_callback('blob', blob)

    # Now print the resulting blob to stdout
    blob.dump(self.output)

    # We don't need the parser tokens anymore
    return []

  def _make_reset(self, t):
    """Parse action for resets; t is ['reset', <ref>] optionally followed by
       ['from', <ref>]."""
    # Create the Reset object from the parser tokens
    ref = t[1]
    from_ref = None
    if len(t) > 2:
      # NOTE(review): unlike the marks of blobs and commits, this value is
      # passed through verbatim rather than via translate_mark -- confirm
      # that this is intended when it is a mark reference.
      from_ref = t[3]
    reset = Reset(ref, from_ref)

    # Call any user callback to allow them to modify the reset
    if self.reset_callback:
      self.reset_callback(reset)
    if self.everything_callback:
      self.everything_callback('reset', reset)

    # Now print the resulting reset to stdout
    reset.dump(self.output)

    # We don't need the parser tokens anymore
    return []

  def _make_file_changes(self, t):
    """Parse action for a file change line; returns a FileChanges object.

       t is ['M', <mode>, ':<mark>', <path>] or ['D', <path>]."""
    if t[0] == 'M':
      mode = t[1]
      mark = int(t[2][1:])
      filename = t[3]
      return FileChanges(t[0], filename, mode, mark)
    elif t[0] == 'D':
      filename = t[1]
      return FileChanges(t[0], filename)

  def _make_commit(self, t):
    """Parse action for commits.

       t holds, in order: 'commit', the branch, an optional ':<mark>', an
       optional 'author' group and the 'committer' group (keyword, name,
       email, date each), 'data' with length and message, an optional
       'from' pair, any number of 'merge' pairs and finally the FileChanges
       objects produced by _make_file_changes.

       Raises SystemExit if the announced message length and the message
       disagree."""
    #
    # Create the Commit object from the parser tokens...
    #

    # Get the branch
    branch = t[1]
    loc = 2
    tlen = len(t)

    # Get the optional mark
    mark = None
    if t[loc].startswith(':'):
      mark = int(t[loc][1:])
      loc += 1

    # Get the optional author
    author = None
    if t[loc] == 'author':
      author = (t[loc+1], t[loc+2], t[loc+3])
      loc += 4

    # Get the committer, which is always present
    committer_name  = t[loc+1]
    committer_email = t[loc+2]
    committer_date  = t[loc+3]
    loc += 4

    # Without an explicit author, the committer doubles as the author
    if author is None:
      author = (committer_name, committer_email, committer_date)
    author_name, author_email, author_date = author

    # Get the commit message
    messagelen = int(t[loc+1])
    message = t[loc+2] # Skip 'data' and len(message)
    if messagelen != len(message):
      # Both values must be passed to '%' as one tuple
      raise SystemExit("Commit message's length mismatch; %d != len(%s)" % \
                       (messagelen, message))
    loc += 3

    # Get the commit we're supposed to be based on, if other than HEAD
    from_commit = None
    if loc < tlen and t[loc] == 'from':
      from_commit = int(t[loc+1][1:])
      loc += 2

    # Find out if this is a merge commit, and if so what commits other than
    # HEAD are involved
    merge_commits = []
    while loc < tlen and t[loc] == 'merge':
      merge_commits.append( int(t[loc+1][1:]) )
      loc += 2

    # Get file changes
    file_changes = t[loc:]

    # Okay, now we can finally create the Commit object
    commit = Commit(branch,
                    author_name,    author_email,    author_date,
                    committer_name, committer_email, committer_date,
                    message,
                    file_changes,
                    mark,
                    from_commit,
                    merge_commits)

    # Call any user callback to allow them to modify the commit
    if self.commit_callback:
      self.commit_callback(commit)
    if self.everything_callback:
      self.everything_callback('commit', commit)

    # Now print the resulting commit to stdout
    commit.dump(self.output)

    # We don't need the parser tokens anymore
    return []

  def _setup_parser(self):
    """Build the pyparsing grammar and store it in self.stream.

       Note that this changes pyparsing's default whitespace characters for
       every element created afterwards, not only for this grammar."""
    # Basic setup; newlines and spaces are significant in this format, so
    # nothing may be skipped implicitly
    ParserElement.setDefaultWhitespaceChars('')
    number = Word(nums)
    lf = Literal('\n').suppress()
    sp = Literal(' ').suppress()

    # Common constructs -- data, ref startpoints
    exact_data = ExactData() + Optional(lf)
    data = exact_data  # FIXME: Should allow delimited_data too
    from_ref  = Literal('from')  + sp + Regex('.*') + lf
    merge_ref = Literal('merge') + sp + Regex('.*') + lf
    # SP name SP '<' email '>' SP date LF, yielding name, email and date
    person_info = sp + Regex('[^<\n]*(?=[ ])') + sp + \
                  Literal('<').suppress() + Regex('[^<>\n]*') + \
                  Literal('>').suppress() + sp + \
                  Regex('.*') + lf

    # Parsing marks; only the ':<number>' token is kept
    idnum = Combine(Literal(':') + number)
    mark = Literal('mark').suppress() - sp + idnum + lf

    # Parsing blobs
    file_content = data
    blob = Literal('blob') + lf + mark + file_content
    blob.setParseAction(lambda t: self._make_blob(t))

    # Parsing branch resets
    reset = Literal('reset') + sp + Regex('.*') + lf + \
            Optional(from_ref) + Optional(lf)
    reset.setParseAction(lambda t: self._make_reset(t))

    # Parsing file changes
    mode = Literal('100644') | Literal('644') | Literal('100755') | \
           Literal('755') | Literal('120000')
    path_str = CharsNotIn(' \n') | dblQuotedString
    file_obm = Literal('M') - sp + mode + sp + idnum + sp + path_str + lf
    file_del = Literal('D') - sp + path_str + lf
    file_change = file_obm | file_del
    #file_change = file_clr|file_del|file_rnm|file_cpy|file_obm|file_inm
    file_change.setParseAction(lambda t: self._make_file_changes(t))

    # Parsing commits
    author_info = Literal('author') + person_info
    committer_info = Literal('committer') + person_info
    commit_msg = data
    commit = Literal('commit') + sp + Regex('.*') + lf + \
             Optional(mark) +                            \
             Optional(author_info) +                     \
             committer_info +                            \
             commit_msg +                                \
             Optional(from_ref) +                        \
             ZeroOrMore(merge_ref) +                     \
             ZeroOrMore(file_change) +                   \
             Optional(lf)
    commit.setParseAction(lambda t: self._make_commit(t))

    # Tying it all together
    cmd = blob | reset | commit
    self.stream = ZeroOrMore(cmd)
    # Keep tabs in the input as they are instead of expanding them
    self.stream.parseWithTabs()

  def run(self, input_file, output_file):
    """Parse input_file and write the filtered stream.

       If output_file is given it replaces the current output stream and is
       closed when parsing has finished; input_file is always closed.  On a
       parse error the offending line, a caret marking the column and the
       error itself are written to stderr and SystemExit(1) is raised."""
    if output_file:
      self.output = output_file
    try:
      # All output is produced by the parse actions as a side effect
      self.stream.parseFile(input_file)
    except (ParseException, ParseSyntaxException):
      # sys.exc_info() instead of binding the exception in the except
      # clause: that syntax differs between Python versions.
      err = sys.exc_info()[1]
      # Report on stderr; stdout may be the very stream being written
      sys.stderr.write('%s\n' % err.line)
      sys.stderr.write(' '*(err.column-1) + '^\n')
      sys.stderr.write('%s\n' % err)
      # Exit with a failure status rather than the default of success
      raise SystemExit(1)
    input_file.close()
    # Only close what the caller handed in; never the default sys.stdout
    if output_file:
      output_file.close()

class FilterGitRepo(object):
  """Pipes 'git fast-export --all' of one repository through a filter into
     'git fast-import' in another repository.  All work happens in the
     constructor."""
  def __init__(self, source_repo, filter, target_repo):
    """Run the export/filter/import pipeline.

       source_repo -- directory in which 'git fast-export --all' is run
       filter      -- object with a run(input_file, output_file) method
       target_repo -- directory in which 'git fast-import' is run; if it
                      does not exist yet it is created and 'git init' is
                      run in it first"""
    from subprocess import Popen, PIPE

    # Keep the Popen objects (not just their pipes) so the children can be
    # waited for once the filter is done.
    exporter = Popen(["git", "fast-export", "--all"],
                     stdout = PIPE,
                     cwd = source_repo)

    if not os.path.isdir(target_repo):
      os.makedirs(target_repo)
      # The repository has to exist before fast-import is started in it
      Popen(["git", "init"], cwd = target_repo).wait()
    importer = Popen(["git", "fast-import"],
                     stdin = PIPE,
                     cwd = target_repo)

    filter.run(exporter.stdout, importer.stdin)

    # Make sure both pipes are closed even if the filter left them open;
    # fast-import only finishes once it sees end-of-file on its stdin.
    exporter.stdout.close()
    importer.stdin.close()

    # Do not return before the import has completed, and reap both children
    importer.wait()
    exporter.wait()

# Ad-hoc driver; it is not guarded, so it runs whenever this file is executed
# or imported.  It filters the repository in the directory "basic-import"
# into the directory "testing" (both relative to the current directory) using
# a filter with no callbacks installed.  Note that the module-level name
# 'filter' shadows the builtin of the same name.
filter = FastExportFilter()
FilterGitRepo("basic-import", filter, "testing")