mirror of
https://github.com/newren/git-filter-repo.git
synced 2024-11-17 03:26:08 +00:00
Rewrite to not use pyparsing in order to avoid memory madness
pyparsing sucks a whole file into memory at a time and then parses, which is really bad in this case since the output from git-fast-export is huge. I entered disk swapping madness pretty easily. So, now I just do my own manual parsing.
This commit is contained in:
parent
acd197044a
commit
eab6741272
@ -1,58 +1,12 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from subprocess import Popen, PIPE
|
||||
from pyparsing import ParserElement, Literal, Optional, Combine, Word, nums, \
|
||||
Regex, ZeroOrMore, OneOrMore, CharsNotIn, \
|
||||
dblQuotedString, \
|
||||
ParseException, ParseSyntaxException
|
||||
|
||||
from pyparsing import Token, ParseResults
|
||||
from subprocess import Popen, PIPE, call
|
||||
from email.utils import unquote
|
||||
|
||||
# Names exported by ``from <module> import *``.  Every entry must be the exact
# name of a module-level object; a misspelt entry makes the star-import raise
# AttributeError, so "FastExportOutput" is spelt to match the function below.
__all__ = ["Blob", "Reset", "FileChanges", "Commit", "get_total_commits",
           "FastExportFilter", "FastExportOutput", "FastImportInput"]
|
||||
|
||||
class ExactData(Token):
|
||||
"""Specialized pyparsing subclass for handling data dumps in git-fast-import
|
||||
exact data format"""
|
||||
def __init__( self ):
|
||||
super(ExactData,self).__init__()
|
||||
|
||||
self.pattern = r"data (\d+)\n"
|
||||
self.re = re.compile(self.pattern)
|
||||
self.reString = self.pattern
|
||||
|
||||
self.name = "ExactData"
|
||||
self.errmsg = "Expected " + self.name
|
||||
self.mayIndexError = False
|
||||
self.mayReturnEmpty = True
|
||||
|
||||
def parseImpl( self, instring, loc, doActions=True ):
|
||||
result = self.re.match(instring,loc)
|
||||
if not result:
|
||||
exc = self.myException
|
||||
exc.loc = loc
|
||||
exc.pstr = instring
|
||||
raise exc
|
||||
|
||||
num = result.group(1)
|
||||
loc = result.end()+int(num)
|
||||
data = instring[result.end():loc]
|
||||
d = result.groupdict()
|
||||
ret = ParseResults(['data', num, data])
|
||||
return loc,ret
|
||||
|
||||
def __str__( self ):
|
||||
try:
|
||||
return super(ExactMath,self).__str__()
|
||||
except:
|
||||
pass
|
||||
|
||||
if self.strRepr is None:
|
||||
self.strRepr = "Data:"
|
||||
|
||||
return self.strRepr
|
||||
|
||||
class IDs(object):
|
||||
def __init__(self):
|
||||
self.count = 0
|
||||
@ -137,6 +91,8 @@ class FileChanges(GitElement):
|
||||
file.write('M %s :%d %s\n' % (self.mode, self.id, self.filename))
|
||||
elif self.type == 'D':
|
||||
file.write('D %s\n' % self.filename)
|
||||
else:
|
||||
raise SystemExit("Unhandled filechange type: %s" % self.type)
|
||||
|
||||
class Commit(GitElement):
|
||||
def __init__(self, branch,
|
||||
@ -187,7 +143,6 @@ class FastExportFilter(object):
|
||||
blob_callback = None, progress_callback = None,
|
||||
reset_callback = None, checkpoint_callback = None,
|
||||
everything_callback = None):
|
||||
self._setup_parser()
|
||||
self.tag_callback = tag_callback
|
||||
self.blob_callback = blob_callback
|
||||
self.reset_callback = reset_callback
|
||||
@ -196,17 +151,79 @@ class FastExportFilter(object):
|
||||
self.checkpoint_callback = checkpoint_callback
|
||||
self.everything_callback = everything_callback
|
||||
|
||||
self.input = None
|
||||
self.output = sys.stdout
|
||||
self.nextline = ''
|
||||
|
||||
def _make_blob(self, t):
|
||||
# Create the Blob object from the parser tokens
|
||||
id = int(t[1][1:])
|
||||
datalen = int(t[3])
|
||||
data = t[4]
|
||||
if datalen != len(data):
|
||||
raise SystemExit('%d != len(%s)' % datalen, data)
|
||||
def _advance_nextline(self):
|
||||
self.nextline = self.input.readline()
|
||||
|
||||
def _parse_optional_mark(self):
|
||||
mark = None
|
||||
matches = re.match('mark :(\d+)\n$', self.nextline)
|
||||
if matches:
|
||||
mark = int(matches.group(1))
|
||||
self._advance_nextline()
|
||||
return mark
|
||||
|
||||
def _parse_optional_baseref(self, refname):
|
||||
baseref = None
|
||||
matches = re.match('%s :(\d+)\n' % refname, self.nextline)
|
||||
if matches:
|
||||
baseref = ids.translate( int(matches.group(1)) )
|
||||
self._advance_nextline()
|
||||
return baseref
|
||||
|
||||
def _parse_optional_filechange(self):
|
||||
filechange = None
|
||||
if self.nextline.startswith('M '):
|
||||
(mode, idnum, path) = \
|
||||
re.match('M (\d+) :(\d+) (.*)\n$', self.nextline).groups()
|
||||
idnum = int(idnum)
|
||||
if path.startswith('"'):
|
||||
path = unquote(path)
|
||||
filechange = FileChanges('M', path, mode, idnum)
|
||||
self._advance_nextline()
|
||||
elif self.nextline.startswith('D '):
|
||||
path = self.nextline[2:-1]
|
||||
if path.startswith('"'):
|
||||
path = unquote(path)
|
||||
filechange = FileChanges('D', path)
|
||||
self._advance_nextline()
|
||||
return filechange
|
||||
|
||||
def _parse_ref_line(self, refname):
|
||||
matches = re.match('%s (.*)\n$' % refname, self.nextline)
|
||||
if not matches:
|
||||
raise SystemExit("Malformed %s line: '%s'" % (refname, self.nextline))
|
||||
ref = matches.group(1)
|
||||
self._advance_nextline()
|
||||
return ref
|
||||
|
||||
def _parse_user(self, usertype):
|
||||
(name, email, when) = \
|
||||
re.match('%s (.*?) <(.*?)> (.*)\n$' % usertype, self.nextline).groups()
|
||||
self._advance_nextline()
|
||||
return (name, email, when)
|
||||
|
||||
def _parse_data(self):
|
||||
size = int(re.match('data (\d+)\n$', self.nextline).group(1))
|
||||
data = self.input.read(size)
|
||||
self._advance_nextline()
|
||||
return data
|
||||
|
||||
def _parse_blob(self):
|
||||
# Parse the Blob
|
||||
self._advance_nextline()
|
||||
id = self._parse_optional_mark()
|
||||
data = self._parse_data()
|
||||
if self.nextline == '\n':
|
||||
self._advance_nextline()
|
||||
|
||||
# Create the blob
|
||||
blob = Blob(data)
|
||||
ids.record_rename(id, blob.id)
|
||||
if id:
|
||||
ids.record_rename(id, blob.id)
|
||||
|
||||
# Call any user callback to allow them to modify the blob
|
||||
if self.blob_callback:
|
||||
@ -214,19 +231,17 @@ class FastExportFilter(object):
|
||||
if self.everything_callback:
|
||||
self.everything_callback('blob', blob)
|
||||
|
||||
# Now print the resulting blob to stdout
|
||||
# Now print the resulting blob
|
||||
blob.dump(self.output)
|
||||
|
||||
# We don't need the parser tokens anymore
|
||||
return []
|
||||
def _parse_reset(self):
|
||||
# Parse the Reset
|
||||
ref = self._parse_ref_line('reset')
|
||||
from_ref = self._parse_optional_baseref('from')
|
||||
if self.nextline == '\n':
|
||||
self._advance_nextline()
|
||||
|
||||
def _make_reset(self, t):
|
||||
# Create the Reset object from the parser tokens
|
||||
ref = t[1]
|
||||
from_ref = None
|
||||
if len(t) > 2:
|
||||
old_id = int(t[3][1:])
|
||||
from_ref = ids.translate(old_id)
|
||||
# Create the reset
|
||||
reset = Reset(ref, from_ref)
|
||||
|
||||
# Call any user callback to allow them to modify the reset
|
||||
@ -235,88 +250,47 @@ class FastExportFilter(object):
|
||||
if self.everything_callback:
|
||||
self.everything_callback('reset', reset)
|
||||
|
||||
# Now print the resulting reset to stdout
|
||||
# Now print the resulting reset
|
||||
reset.dump(self.output)
|
||||
|
||||
# We don't need the parser tokens anymore
|
||||
return []
|
||||
def _parse_commit(self):
|
||||
# Parse the Commit
|
||||
branch = self._parse_ref_line('commit')
|
||||
id = self._parse_optional_mark()
|
||||
|
||||
def _make_file_changes(self, t):
|
||||
if t[0] == 'M':
|
||||
mode = t[1]
|
||||
old_id = int(t[2][1:])
|
||||
id = ids.translate(old_id)
|
||||
author_name = None
|
||||
if self.nextline.startswith('author'):
|
||||
(author_name, author_email, author_date) = self._parse_user('author')
|
||||
|
||||
filename = t[3]
|
||||
return FileChanges(t[0], filename, mode, id)
|
||||
elif t[0] == 'D':
|
||||
filename = t[1]
|
||||
return FileChanges(t[0], filename)
|
||||
(committer_name, committer_email, committer_date) = \
|
||||
self._parse_user('committer')
|
||||
|
||||
def _make_commit(self, t):
|
||||
#
|
||||
# Create the Commit object from the parser tokens...
|
||||
#
|
||||
if not author_name:
|
||||
(author_name, author_email, author_date) = \
|
||||
(committer_name, committer_email, committer_date)
|
||||
|
||||
# Get the branch
|
||||
branch = t[1]
|
||||
loc = 2
|
||||
tlen = len(t)
|
||||
commit_msg = self._parse_data()
|
||||
|
||||
# Get the optional mark
|
||||
id = None
|
||||
if t[loc].startswith(':'):
|
||||
id = int(t[loc][1:])
|
||||
loc += 1
|
||||
|
||||
# Get the committer; we'll get back to the author in a minute
|
||||
offset = (t[loc] == 'author') and loc+4 or loc
|
||||
committer_name = t[offset+1]
|
||||
committer_email = t[offset+2]
|
||||
committer_date = t[offset+3]
|
||||
|
||||
# Get the optional author
|
||||
if t[loc] == 'author':
|
||||
author_name = t[loc+1]
|
||||
author_email = t[loc+2]
|
||||
author_date = t[loc+3]
|
||||
loc += 8
|
||||
else:
|
||||
author_name = committer_name
|
||||
author_email = committer_email
|
||||
author_date = committer_date
|
||||
loc += 4
|
||||
|
||||
# Get the commit message
|
||||
messagelen = int(t[loc+1])
|
||||
message = t[loc+2] # Skip 'data' and len(message)
|
||||
if messagelen != len(message):
|
||||
raise SystemExit("Commit message's length mismatch; %d != len(%s)" % \
|
||||
messagelen, message)
|
||||
loc += 3
|
||||
|
||||
# Get the commit we're supposed to be based on, if other than HEAD
|
||||
from_commit = None
|
||||
if loc < tlen and t[loc] == 'from':
|
||||
old_id = int(t[loc+1][1:])
|
||||
from_commit = ids.translate(old_id)
|
||||
loc += 2
|
||||
|
||||
# Find out if this is a merge commit, and if so what commits other than
|
||||
# HEAD are involved
|
||||
from_commit = self._parse_optional_baseref('from')
|
||||
merge_commits = []
|
||||
while loc < tlen and t[loc] == 'merge':
|
||||
merge_commits.append(ids.translate( int(t[loc+1][1:]) ))
|
||||
loc += 2
|
||||
|
||||
# Get file changes
|
||||
file_changes = t[loc:]
|
||||
merge_ref = self._parse_optional_baseref('merge')
|
||||
while merge_ref:
|
||||
merge_commits.append(merge_ref)
|
||||
merge_ref = self._parse_optional_baseref('merge')
|
||||
|
||||
file_changes = []
|
||||
file_change = self._parse_optional_filechange()
|
||||
while file_change:
|
||||
file_changes.append(file_change)
|
||||
file_change = self._parse_optional_filechange()
|
||||
if self.nextline == '\n':
|
||||
self._advance_nextline()
|
||||
|
||||
# Okay, now we can finally create the Commit object
|
||||
commit = Commit(branch,
|
||||
author_name, author_email, author_date,
|
||||
committer_name, committer_email, committer_date,
|
||||
message,
|
||||
commit_msg,
|
||||
file_changes,
|
||||
from_commit,
|
||||
merge_commits)
|
||||
@ -332,87 +306,20 @@ class FastExportFilter(object):
|
||||
# Now print the resulting commit to stdout
|
||||
commit.dump(self.output)
|
||||
|
||||
# We don't need the parser tokens anymore
|
||||
return []
|
||||
|
||||
def _setup_parser(self):
|
||||
# Basic setup
|
||||
ParserElement.setDefaultWhitespaceChars('')
|
||||
number = Word(nums)
|
||||
lf = Literal('\n').suppress()
|
||||
sp = Literal(' ').suppress()
|
||||
|
||||
# Common constructs -- data, ref startpoints
|
||||
exact_data = ExactData() + Optional(lf)
|
||||
data = exact_data # FIXME: Should allow delimited_data too
|
||||
from_ref = Literal('from') + sp + Regex('.*') + lf
|
||||
merge_ref = Literal('merge') + sp + Regex('.*') + lf
|
||||
person_info = sp + Regex('[^<\n]*(?=[ ])') + sp + \
|
||||
Literal('<').suppress() + Regex('[^<>\n]*') + \
|
||||
Literal('>').suppress() + sp + \
|
||||
Regex('.*') + lf
|
||||
|
||||
# Parsing marks
|
||||
idnum = Combine(Literal(':') + number)
|
||||
mark = Literal('mark').suppress() - sp + idnum + lf
|
||||
|
||||
# Parsing blobs
|
||||
file_content = data
|
||||
blob = Literal('blob') + lf + mark + file_content
|
||||
blob.setParseAction(lambda t: self._make_blob(t))
|
||||
|
||||
# Parsing branch resets
|
||||
reset = Literal('reset') + sp + Regex('.*') + lf + \
|
||||
Optional(from_ref) + Optional(lf)
|
||||
reset.setParseAction(lambda t: self._make_reset(t))
|
||||
|
||||
# Parsing file changes
|
||||
mode = Literal('100644') | Literal('644') | Literal('100755') | \
|
||||
Literal('755') | Literal('120000')
|
||||
path_str = CharsNotIn(' \n') | dblQuotedString
|
||||
file_obm = Literal('M') - sp + mode + sp + idnum + sp + path_str + lf
|
||||
file_del = Literal('D') - sp + path_str + lf
|
||||
file_change = file_obm | file_del
|
||||
#file_change = file_clr|file_del|file_rnm|file_cpy|file_obm|file_inm
|
||||
file_change.setParseAction(lambda t: self._make_file_changes(t))
|
||||
|
||||
# Parsing commits
|
||||
author_info = Literal('author') + person_info
|
||||
committer_info = Literal('committer') + person_info
|
||||
commit_msg = data
|
||||
commit = Literal('commit') + sp + Regex('.*') + lf + \
|
||||
Optional(mark) + \
|
||||
Optional(author_info) + \
|
||||
committer_info + \
|
||||
commit_msg + \
|
||||
Optional(from_ref) + \
|
||||
ZeroOrMore(merge_ref) + \
|
||||
ZeroOrMore(file_change) + \
|
||||
Optional(lf)
|
||||
commit.setParseAction(lambda t: self._make_commit(t))
|
||||
|
||||
# Tying it all together
|
||||
cmd = blob | reset | commit
|
||||
self.stream = ZeroOrMore(cmd)
|
||||
self.stream.parseWithTabs()
|
||||
|
||||
def run(self, input_file, output_file):
|
||||
self.input = input_file
|
||||
if output_file:
|
||||
self.output = output_file
|
||||
try:
|
||||
results = self.stream.parseFile(input_file)
|
||||
except ParseException, err:
|
||||
print err.line
|
||||
print " "*(err.column-1) + "^"
|
||||
print err
|
||||
raise SystemExit
|
||||
except ParseSyntaxException, err:
|
||||
print err.line
|
||||
print " "*(err.column-1) + "^"
|
||||
print err
|
||||
raise SystemExit
|
||||
input_file.close()
|
||||
output_file.close()
|
||||
self.nextline = input_file.readline()
|
||||
while self.nextline:
|
||||
if self.nextline.startswith('blob'):
|
||||
self._parse_blob()
|
||||
elif self.nextline.startswith('reset'):
|
||||
self._parse_reset()
|
||||
elif self.nextline.startswith('commit'):
|
||||
self._parse_commit()
|
||||
else:
|
||||
raise SystemExit("Could not parse line: '%s'" % self.nextline)
|
||||
|
||||
def FastExportOutput(source_repo, extra_args = []):
|
||||
return Popen(["git", "fast-export", "--all"] + extra_args,
|
||||
@ -422,7 +329,8 @@ def FastExportOutput(source_repo, extra_args = []):
|
||||
def FastImportInput(target_repo, extra_args = []):
|
||||
if not os.path.isdir(target_repo):
|
||||
os.makedirs(target_repo)
|
||||
os.waitpid(Popen(["git", "init"], cwd = target_repo).pid, 0)
|
||||
if call(["git", "init"], cwd = target_repo) != 0:
|
||||
raise SystemExit("git init in %s failed!" % target_repo)
|
||||
return Popen(["git", "fast-import"] + extra_args,
|
||||
stdin = PIPE,
|
||||
stderr = PIPE, # We don't want no stinkin' statistics
|
||||
|
Loading…
Reference in New Issue
Block a user