filter-repo: avoid string->datetime->string round trips

Most filtering operations are not interested in the time that commits
were authored or committed, or when tags were tagged.  As such,
translating the string representation of the date into a datetime object
is wasted effort, and causes us to waste more time later as we have to
translate it back into a string.

Instead, provide string_to_date() and date_to_string() functions so that
callers can perform the translation if wanted, and let the normal case
be fast.

Provides a small but noticeable speedup when just filtering based on
paths; about a 3.5% improvement in execution time for writing the new
history.

Signed-off-by: Elijah Newren <newren@gmail.com>
pull/13/head
Elijah Newren 5 years ago
parent b363a1574f
commit 6dba1f200c

@ -25,7 +25,8 @@ import textwrap
from datetime import tzinfo, timedelta, datetime
__all__ = ["Blob", "Reset", "FileChanges", "Commit", "Tag", "Progress",
"Checkpoint", "FastExportFilter", "FixedTimeZone", "ProgressWriter",
"Checkpoint", "FastExportFilter", "ProgressWriter",
"string_to_date", "date_to_string",
"record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]
@ -36,15 +37,6 @@ def _timedelta_to_seconds(delta):
offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
return round(offset)
def _write_date(file_, date):
  """
  Emit a date onto an already-open file object.

  The text written is the number of seconds since the epoch, a single
  space, and then the name reported by the date's timezone.  No trailing
  newline is written.  date.tzinfo must be set, since its tzname() is
  called.
  """
  zone = date.tzinfo
  # The epoch is built in the date's own timezone so the subtraction below
  # happens between two datetimes carrying the same tzinfo.
  since_epoch = date - datetime.fromtimestamp(0, zone)
  seconds = _timedelta_to_seconds(since_epoch)
  file_.write('%d %s' % (seconds, zone.tzname(0)))
class FixedTimeZone(tzinfo):
"""
Fixed offset in minutes east from UTC.
@ -52,21 +44,7 @@ class FixedTimeZone(tzinfo):
def __init__(self, offset_string):
tzinfo.__init__(self)
try:
sign, hh, mm = re.match(r'^([-+]?)(\d\d)(\d\d)$', offset_string).groups()
except AttributeError:
# TimeZone idiocy; IST is any of four timezones, so someone translated
# it to something that was totally invalid...and it got recorded that
# way. Others have suggested just using an invalid timezone that
# fast-import will not choke on. Let's do that. Note that +051800
# seems to be the only weird timezone found in the wild, by me or some
# other posts google returned on the subject...
if offset_string == '+051800':
sign, hh, mm = '+', '02', '61'
offset_string = offset_string.replace('+051800', '+0261')
else:
raise AttributeError("Could not parse {} as timezone"
.format(offset_string))
sign, hh, mm = re.match(r'^([-+]?)(\d\d)(\d\d)$', offset_string).groups()
factor = -1 if (sign and sign == '-') else 1
self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm)))
self._offset_string = offset_string
@ -80,6 +58,16 @@ class FixedTimeZone(tzinfo):
def dst(self, dt):
return timedelta(0)
def string_to_date(datestring):
  """
  Convert a "<unix-timestamp> <timezone-offset>" string into a datetime.

  The string is split on whitespace into exactly two fields (anything else
  fails the unpacking).  The first field is parsed as an integer count of
  seconds since the epoch; the second is handed to FixedTimeZone to build
  the tzinfo attached to the returned datetime.
  """
  timestamp_field, offset_field = datestring.split()
  seconds = int(timestamp_field)
  zone = FixedTimeZone(offset_field)
  return datetime.fromtimestamp(seconds, zone)
def date_to_string(dateobj):
  """
  Render a datetime as "<seconds-since-epoch> <timezone-name>".

  This is the inverse direction of string_to_date().  dateobj.tzinfo must
  be set, since its tzname() supplies the second field of the result.
  """
  zone = dateobj.tzinfo
  # The epoch is built in the same timezone as dateobj so the difference is
  # taken between two datetimes carrying the same tzinfo.
  since_epoch = dateobj - datetime.fromtimestamp(0, zone)
  seconds = int(_timedelta_to_seconds(since_epoch))
  return '{} {}'.format(seconds, zone.tzname(0))
class PathQuoting:
_unescape = {'a': '\a',
'b': '\b',
@ -601,11 +589,11 @@ class Commit(_GitElementWithId):
file_.write('commit %s\n' % self.branch)
file_.write('mark :%d\n' % self.id)
file_.write('author %s <%s> ' % (self.author_name, self.author_email))
_write_date(file_, self.author_date)
file_.write(self.author_date)
file_.write('\n')
file_.write('committer %s <%s> ' % \
(self.committer_name, self.committer_email))
_write_date(file_, self.committer_date)
file_.write(self.committer_date)
file_.write('\n')
file_.write('data %d\n%s' % (len(self.message), self.message))
if not self.message.endswith("\n"):
@ -689,7 +677,7 @@ class Tag(_GitElement):
file_.write('from {}{}\n'.format(mark, self.from_ref))
if self.tagger_name:
file_.write('tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
_write_date(file_, self.tagger_date)
file_.write(self.tagger_date)
file_.write('\n')
file_.write('data %d\n%s' % (len(self.tag_message), self.tag_message))
file_.write('\n')
@ -980,13 +968,17 @@ class FastExportFilter(object):
re.match('%s (.*?) <(.*?)> (.*)\n$' %
usertype, self._currentline).groups()
# Translate when into a datetime object, with corresponding timezone info
(unix_timestamp, tz_offset) = when.split()
datestamp = datetime.fromtimestamp(int(unix_timestamp),
FixedTimeZone(tz_offset))
# TimeZone idiocy; IST is any of four timezones, so someone translated
# it to something that was totally invalid...and it got recorded that
# way. Others have suggested just using an invalid timezone that
# fast-import will not choke on. Let's do that. Note that +051800
# seems to be the only weird timezone found in the wild, by me or some
# other posts google returned on the subject...
if when.endswith('+051800'):
when = when[0:-7]+'+0261'
self._advance_currentline()
return (name, email, datestamp)
return (name, email, when)
def _parse_data(self):
"""

@ -26,7 +26,9 @@ def change_up_them_commits(commit):
commit.author_email = re.sub("@my.crp", "@my.corp", commit.author_email)
# Fix the committer date (bad timezone conversion in initial import)
commit.committer_date += timedelta(hours=-5)
oldtime = repo_filter.string_to_date(commit.committer_date)
newtime = oldtime + timedelta(hours=-5)
commit.committer_date = repo_filter.date_to_string(newtime)
# Fix the commit message
commit.message = re.sub("Marketing is staffed with pansies", "",

@ -36,9 +36,10 @@ changes = [FileChanges('M', 'world', world.id, mode="100644"),
when = datetime(year=2005, month=4, day=7,
hour=15, minute=16, second=10,
tzinfo=FixedTimeZone("-0700"))
when_string = repo_filter.date_to_string(when)
commit1 = Commit("refs/heads/master",
"A U Thor", "au@thor.email", when,
"Com M. Iter", "comm@iter.email", when,
"A U Thor", "au@thor.email", when_string,
"Com M. Iter", "comm@iter.email", when_string,
"My first commit! Wooot!\n\nLonger description",
changes,
from_commit = None,
@ -53,9 +54,10 @@ world_link.dump(output)
changes = [FileChanges('M', 'world', world.id, mode="100644"),
FileChanges('M', 'planet', world_link.id, mode="120000")]
when += timedelta(days=3, hours=4, minutes=6)
when_string = repo_filter.date_to_string(when)
commit2 = Commit("refs/heads/master",
"A U Thor", "au@thor.email", when,
"Com M. Iter", "comm@iter.email", when,
"A U Thor", "au@thor.email", when_string,
"Com M. Iter", "comm@iter.email", when_string,
"Make a symlink to world called planet, modify world",
changes,
from_commit = commit1.id,
@ -66,10 +68,10 @@ script = Blob("#!/bin/sh\n\necho Hello")
script.dump(output)
changes = [FileChanges('M', 'runme', script.id, mode="100755"),
FileChanges('D', 'bar')]
when = datetime.fromtimestamp(timestamp=1234567890, tz=FixedTimeZone("-0700"))
when_string = "1234567890 -0700"
commit3 = Commit("refs/heads/master",
"A U Thor", "au@thor.email", when,
"Com M. Iter", "comm@iter.email", when,
"A U Thor", "au@thor.email", when_string,
"Com M. Iter", "comm@iter.email", when_string,
"Add runme script, remove bar",
changes,
from_commit = commit2.id,
@ -89,9 +91,10 @@ world.dump(output)
changes = [FileChanges('M', 'world', world.id, mode="100644")]
when = datetime(2006, 8, 17, tzinfo=FixedTimeZone("+0200"))
when_string = repo_filter.date_to_string(when)
commit4 = Commit("refs/heads/devel",
"A U Thor", "au@thor.email", when,
"Com M. Iter", "comm@iter.email", when,
"A U Thor", "au@thor.email", when_string,
"Com M. Iter", "comm@iter.email", when_string,
"Modify world",
changes,
from_commit = commit1.id,
@ -100,7 +103,8 @@ commit4.dump(output)
world = Blob("Hello\nHi\nGoodbye")
world.dump(output)
when = commit3.author_date + timedelta(days=47)
when = repo_filter.string_to_date(commit3.author_date) + timedelta(days=47)
when_string = repo_filter.date_to_string(when)
# git fast-import requires file changes to be listed in terms of differences
# to the first parent. Thus, despite the fact that runme and planet have
# not changed and bar was not modified in the devel side, we have to list them
@ -111,8 +115,8 @@ changes = [FileChanges('M', 'world', world.id, mode="100644"),
FileChanges('M', 'planet', world_link.id, mode="120000")]
commit5 = Commit("refs/heads/devel",
"A U Thor", "au@thor.email", when,
"Com M. Iter", "comm@iter.email", when,
"A U Thor", "au@thor.email", when_string,
"Com M. Iter", "comm@iter.email", when_string,
"Merge branch 'master'\n",
changes,
from_commit = commit4.id,
@ -121,7 +125,7 @@ commit5.dump(output)
mytag = Tag("refs/tags/v1.0", commit5.id,
"His R. Highness", "royalty@my.kingdom", when,
"His R. Highness", "royalty@my.kingdom", when_string,
"I bequeath to my peons this royal software")
mytag.dump(output)
out.finish()

Loading…
Cancel
Save