filter-repo: add text removal (or replacement) via file of expressions

Make it easy for users to search and replace text throughout the
repository history.  Instead of inventing some new syntax, reuse the
same syntax used by BFG repo filter's --replace-text option, namely,
a file with one expression per line of the form

    [regex:|glob:|literal:]$MATCH_EXPR[==>$REPLACEMENT_EXPR]

Where "$MATCH_EXPR" is by default considered to be literal text, but
could be a regex or a glob if the appropriate prefix is used.  Also,
$REPLACEMENT_EXPR defaults to '***REMOVED***' if not specified.  If
you want a literal '==>' to be part of your $MATCH_EXPR, then you
must also manually specify a replacement expression instead of taking
the default.  Some examples:

    sup3rs3kr3t
    (replaces 'sup3rs3kr3t' with '***REMOVED***')

    HeWhoShallNotBeNamed==>Voldemort
    (replaces 'HeWhoShallNotBeNamed' with 'Voldemort')

    very==>
    (replaces 'very' with the empty string)

    regex:(\d{2})/(\d{2})/(\d{4})==>\2/\1/\3
    (replaces '05/17/2012' with '17/05/2012', and vice-versa)

    The regex syntax is the one accepted by
    re.sub(<pattern>, <repl>, <string>), documented at
    https://docs.python.org/2/library/re.html
    The <string> comes from file contents of the repo, and you specify
    the <pattern> and <repl>.

    glob:Copy*t==>Cartel
    (replaces 'Copyright' or 'Copyleft' or 'Copy my st' with 'Cartel')

Signed-off-by: Elijah Newren <newren@gmail.com>
pull/13/head
Elijah Newren 5 years ago
parent 4ee915e4dd
commit 73e91edecc

@ -1824,6 +1824,16 @@ class FilteringOptions(object):
action='store_const', const='.mailmap',
help='''Same as: '--mailmap .mailmap' ''')
contents = parser.add_argument_group(title='Content editing filters')
contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
help='''A file with expressions that, if found, will
be replaced. By default, each expression is
treated as literal text, but 'regex:' and 'glob:'
prefixes are supported. You can end the line
with "==>" and some replacement text to choose
a replacement choice other than the default of
"***REMOVED***". ''')
location = parser.add_argument_group(title='Location to filter from/to')
location.add_argument('--source',
help='''Git repository to read from''')
@ -1881,6 +1891,38 @@ class FilteringOptions(object):
if not has_filter:
args.inclusive = False
@staticmethod
def get_replace_text(filename):
replace_literals = []
replace_regexes = []
with open(filename) as f:
for line in f:
line = line.rstrip('\r\n')
# Determine the replacement
replacement = '***REMOVED***'
if '==>' in line:
line, replacement = line.rsplit('==>', 1)
# See if we need to match via regex
regex = None
if line.startswith('regex:'):
regex = line[6:]
elif line.startswith('glob:'):
regex = fnmatch.translate(line[5:])
if regex.endswith(r'\Z(?ms)'):
regex = regex[0:-7]
if regex:
replace_regexes.append((re.compile(regex), replacement))
else:
# Otherwise, find the literal we need to replace
if line.startswith('literal:'):
line = line[8:]
if not line:
continue
replace_literals.append((line, replacement))
return {'literals': replace_literals, 'regexes': replace_regexes}
@staticmethod
def default_options():
    """Return the options obtained by parsing an empty argument list.

    error_on_empty is passed to parse_args as False.
    """
    no_arguments = []
    return FilteringOptions.parse_args(no_arguments, error_on_empty=False)
@ -1898,6 +1940,8 @@ class FilteringOptions(object):
FilteringOptions.sanity_check_args(args)
if args.mailmap:
args.mailmap = MailmapInfo(args.mailmap)
if args.replace_text:
args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
return args
class RepoAnalyze(object):
@ -2505,6 +2549,14 @@ class RepoFilter(object):
if rev != refs[origin_ref]:
abort('{} does not match {}'.format(refname, origin_ref))
@staticmethod
def tweak_blob(args, blob):
if args.replace_text:
for literal, replacement in args.replace_text['literals']:
blob.data = blob.data.replace(literal, replacement)
for regex, replacement in args.replace_text['regexes']:
blob.data = regex.sub(replacement, blob.data)
@staticmethod
def tweak_commit(args, commit):
def filename_matches(path_expression, pathname):
@ -2630,6 +2682,7 @@ class RepoFilter(object):
else:
skip_blobs = (self._blob_callback is None and
self._everything_callback is None and
self._args.replace_text is None and
self._args.source is None and
self._args.target is None)
extra_flags = ['--no-data'] if skip_blobs else []
@ -2686,6 +2739,9 @@ class RepoFilter(object):
if self._input:
# Set up the callbacks
def combined_blob_callback(b):
RepoFilter.tweak_blob(self._args, b)
self._blob_callback and self._blob_callback(b)
def actual_commit_callback(c):
RepoFilter.tweak_commit(self._args, c)
self._commit_callback and self._commit_callback(c)
@ -2695,10 +2751,13 @@ class RepoFilter(object):
def actual_reset_callback(r):
RepoFilter.handle_reset(self._args, r)
self._reset_callback and self._reset_callback(r)
actual_blob_callback = self._blob_callback
if self._args.replace_text:
actual_blob_callback = combined_blob_callback
# Create and run the filter
fef = FastExportFilter(self._args.source or '.',
blob_callback = self._blob_callback,
blob_callback = actual_blob_callback,
commit_callback = actual_commit_callback,
tag_callback = actual_tag_callback,
reset_callback = actual_reset_callback,

@ -34,5 +34,6 @@ filter_testcase basic basic-filename --path filename
filter_testcase basic basic-twenty --path twenty
filter_testcase basic basic-ten --path ten
filter_testcase basic basic-mailmap --mailmap ../t9390/sample-mailmap
filter_testcase basic basic-replace --replace-text ../t9390/sample-replace
test_done

@ -0,0 +1,78 @@
feature done
blob
mark :1
data 8
initial
reset refs/heads/B
commit refs/heads/B
mark :2
author Little O. Me <me@little.net> 1535228562 -0700
committer Little O. Me <me@little.net> 1535228562 -0700
data 8
Initial
M 100644 :1 filename
M 100644 :1 ten
M 100644 :1 twenty
blob
mark :3
data 28
twenty-modified-by-gremlins
commit refs/heads/B
mark :4
author Little 'ol Me <me@laptop.(none)> 1535229544 -0700
committer Little 'ol Me <me@laptop.(none)> 1535229544 -0700
data 11
add twenty
from :2
M 100644 :3 twenty
blob
mark :5
data 25
ten-modified-by-gremlins
commit refs/heads/A
mark :6
author Little O. Me <me@machine52.little.net> 1535229523 -0700
committer Little O. Me <me@machine52.little.net> 1535229523 -0700
data 8
add ten
from :2
M 100644 :5 ten
commit refs/heads/master
mark :7
author Lit.e Me <me@fire.com> 1535229559 -0700
committer Lit.e Me <me@fire.com> 1535229580 -0700
data 24
Merge branch 'A' into B
from :4
merge :6
M 100644 :5 ten
blob
mark :8
data 6
final
commit refs/heads/master
mark :9
author Little Me <me@bigcompany.com> 1535229601 -0700
committer Little Me <me@bigcompany.com> 1535229601 -0700
data 9
whatever
from :7
M 100644 :8 filename
M 100644 :8 ten
M 100644 :8 twenty
tag v1.0
from :9
tagger Little John <second@merry.men> 1535229618 -0700
data 5
v1.0
done

@ -0,0 +1 @@
mod==>modified-by-gremlins
Loading…
Cancel
Save