From 6dba1f200cda3968647de0fa180b9cf95a5feb23 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 4 Feb 2019 18:30:50 -0800 Subject: [PATCH] filter-repo: avoid string->datetime->string round trips Most filtering operations are not interested in the time that commits were authored or committed, or when tags were tagged. As such, translating the string representation of the date into a datetime object is wasted effort, and causes us to waste more time later as we have to translate it back into a string. Instead, provide string_to_date() and date_to_string() functions so that callers can perform the translation if wanted, and let the normal case be fast. Provides a small but noticeable speedup when just filtering based on paths; about a 3.5% improvement in execution time for writing the new history. Signed-off-by: Elijah Newren --- git-filter-repo | 58 ++++++++++++---------------- t/t9391/commit_info.py | 4 +- t/t9391/create_fast_export_output.py | 30 +++++++------- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 628e702..de1f914 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -25,7 +25,8 @@ import textwrap from datetime import tzinfo, timedelta, datetime __all__ = ["Blob", "Reset", "FileChanges", "Commit", "Tag", "Progress", - "Checkpoint", "FastExportFilter", "FixedTimeZone", "ProgressWriter", + "Checkpoint", "FastExportFilter", "ProgressWriter", + "string_to_date", "date_to_string", "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] @@ -36,15 +37,6 @@ def _timedelta_to_seconds(delta): offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 return round(offset) -def _write_date(file_, date): - """ - Writes a date to a file. The file should already be open. The date is - written as seconds-since-epoch followed by the name of the timezone. 
- """ - epoch = datetime.fromtimestamp(0, date.tzinfo) - file_.write('%d %s' % (_timedelta_to_seconds(date - epoch), - date.tzinfo.tzname(0))) - class FixedTimeZone(tzinfo): """ Fixed offset in minutes east from UTC. @@ -52,21 +44,7 @@ class FixedTimeZone(tzinfo): def __init__(self, offset_string): tzinfo.__init__(self) - try: - sign, hh, mm = re.match(r'^([-+]?)(\d\d)(\d\d)$', offset_string).groups() - except AttributeError: - # TimeZone idiocy; IST is any of four timezones, so someone translated - # it to something that was totally invalid...and it got recorded that - # way. Others have suggested just using an invalid timezone that - # fast-import will not choke on. Let's do that. Note that +051800 - # seems to be the only weird timezone found in the wild, by me or some - # other posts google returned on the subject... - if offset_string == '+051800': - sign, hh, mm = '+', '02', '61' - offset_string = offset_string.replace('+051800', '+0261') - else: - raise AttributeError("Could not parse {} as timezone" - .format(offset_string)) + sign, hh, mm = re.match(r'^([-+]?)(\d\d)(\d\d)$', offset_string).groups() factor = -1 if (sign and sign == '-') else 1 self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) self._offset_string = offset_string @@ -80,6 +58,16 @@ class FixedTimeZone(tzinfo): def dst(self, dt): return timedelta(0) +def string_to_date(datestring): + (unix_timestamp, tz_offset) = datestring.split() + return datetime.fromtimestamp(int(unix_timestamp), + FixedTimeZone(tz_offset)) + +def date_to_string(dateobj): + epoch = datetime.fromtimestamp(0, dateobj.tzinfo) + return('{} {}'.format(int(_timedelta_to_seconds(dateobj - epoch)), + dateobj.tzinfo.tzname(0))) + class PathQuoting: _unescape = {'a': '\a', 'b': '\b', @@ -601,11 +589,11 @@ class Commit(_GitElementWithId): file_.write('commit %s\n' % self.branch) file_.write('mark :%d\n' % self.id) file_.write('author %s <%s> ' % (self.author_name, self.author_email)) - _write_date(file_, 
self.author_date) + file_.write(self.author_date) file_.write('\n') file_.write('committer %s <%s> ' % \ (self.committer_name, self.committer_email)) - _write_date(file_, self.committer_date) + file_.write(self.committer_date) file_.write('\n') file_.write('data %d\n%s' % (len(self.message), self.message)) if not self.message.endswith("\n"): @@ -689,7 +677,7 @@ class Tag(_GitElement): file_.write('from {}{}\n'.format(mark, self.from_ref)) if self.tagger_name: file_.write('tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) - _write_date(file_, self.tagger_date) + file_.write(self.tagger_date) file_.write('\n') file_.write('data %d\n%s' % (len(self.tag_message), self.tag_message)) file_.write('\n') @@ -980,13 +968,17 @@ class FastExportFilter(object): re.match('%s (.*?) <(.*?)> (.*)\n$' % usertype, self._currentline).groups() - # Translate when into a datetime object, with corresponding timezone info - (unix_timestamp, tz_offset) = when.split() - datestamp = datetime.fromtimestamp(int(unix_timestamp), - FixedTimeZone(tz_offset)) + # TimeZone idiocy; IST is any of four timezones, so someone translated + # it to something that was totally invalid...and it got recorded that + # way. Others have suggested just using an invalid timezone that + # fast-import will not choke on. Let's do that. Note that +051800 + # seems to be the only weird timezone found in the wild, by me or some + # other posts google returned on the subject... 
+ if when.endswith('+051800'): + when = when[0:-7]+'+0261' self._advance_currentline() - return (name, email, datestamp) + return (name, email, when) def _parse_data(self): """ diff --git a/t/t9391/commit_info.py b/t/t9391/commit_info.py index 673f18e..71a5dfe 100755 --- a/t/t9391/commit_info.py +++ b/t/t9391/commit_info.py @@ -26,7 +26,9 @@ def change_up_them_commits(commit): commit.author_email = re.sub("@my.crp", "@my.corp", commit.author_email) # Fix the committer date (bad timezone conversion in initial import) - commit.committer_date += timedelta(hours=-5) + oldtime = repo_filter.string_to_date(commit.committer_date) + newtime = oldtime + timedelta(hours=-5) + commit.committer_date = repo_filter.date_to_string(newtime) # Fix the commit message commit.message = re.sub("Marketing is staffed with pansies", "", diff --git a/t/t9391/create_fast_export_output.py b/t/t9391/create_fast_export_output.py index fe055bc..1b99db2 100755 --- a/t/t9391/create_fast_export_output.py +++ b/t/t9391/create_fast_export_output.py @@ -36,9 +36,10 @@ changes = [FileChanges('M', 'world', world.id, mode="100644"), when = datetime(year=2005, month=4, day=7, hour=15, minute=16, second=10, tzinfo=FixedTimeZone("-0700")) +when_string = repo_filter.date_to_string(when) commit1 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when, - "Com M. Iter", "comm@iter.email", when, + "A U Thor", "au@thor.email", when_string, + "Com M. Iter", "comm@iter.email", when_string, "My first commit! Wooot!\n\nLonger description", changes, from_commit = None, @@ -53,9 +54,10 @@ world_link.dump(output) changes = [FileChanges('M', 'world', world.id, mode="100644"), FileChanges('M', 'planet', world_link.id, mode="120000")] when += timedelta(days=3, hours=4, minutes=6) +when_string = repo_filter.date_to_string(when) commit2 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when, - "Com M. Iter", "comm@iter.email", when, + "A U Thor", "au@thor.email", when_string, + "Com M. 
Iter", "comm@iter.email", when_string, "Make a symlink to world called planet, modify world", changes, from_commit = commit1.id, @@ -66,10 +68,10 @@ script = Blob("#!/bin/sh\n\necho Hello") script.dump(output) changes = [FileChanges('M', 'runme', script.id, mode="100755"), FileChanges('D', 'bar')] -when = datetime.fromtimestamp(timestamp=1234567890, tz=FixedTimeZone("-0700")) +when_string = "1234567890 -0700" commit3 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when, - "Com M. Iter", "comm@iter.email", when, + "A U Thor", "au@thor.email", when_string, + "Com M. Iter", "comm@iter.email", when_string, "Add runme script, remove bar", changes, from_commit = commit2.id, @@ -89,9 +91,10 @@ world.dump(output) changes = [FileChanges('M', 'world', world.id, mode="100644")] when = datetime(2006, 8, 17, tzinfo=FixedTimeZone("+0200")) +when_string = repo_filter.date_to_string(when) commit4 = Commit("refs/heads/devel", - "A U Thor", "au@thor.email", when, - "Com M. Iter", "comm@iter.email", when, + "A U Thor", "au@thor.email", when_string, + "Com M. Iter", "comm@iter.email", when_string, "Modify world", changes, from_commit = commit1.id, @@ -100,7 +103,8 @@ commit4.dump(output) world = Blob("Hello\nHi\nGoodbye") world.dump(output) -when = commit3.author_date + timedelta(days=47) +when = repo_filter.string_to_date(commit3.author_date) + timedelta(days=47) +when_string = repo_filter.date_to_string(when) # git fast-import requires file changes to be listed in terms of differences # to the first parent. Thus, despite the fact that runme and planet have # not changed and bar was not modified in the devel side, we have to list them @@ -111,8 +115,8 @@ changes = [FileChanges('M', 'world', world.id, mode="100644"), FileChanges('M', 'planet', world_link.id, mode="120000")] commit5 = Commit("refs/heads/devel", - "A U Thor", "au@thor.email", when, - "Com M. Iter", "comm@iter.email", when, + "A U Thor", "au@thor.email", when_string, + "Com M. 
Iter", "comm@iter.email", when_string, "Merge branch 'master'\n", changes, from_commit = commit4.id, @@ -121,7 +125,7 @@ commit5.dump(output) mytag = Tag("refs/tags/v1.0", commit5.id, - "His R. Highness", "royalty@my.kingdom", when, + "His R. Highness", "royalty@my.kingdom", when_string, "I bequeath to my peons this royal software") mytag.dump(output) out.finish()