54863127a7
https://github.com/newren/git-filter-repo/releases/download/v2.38.0/git-filter-repo-2.38.0.tar.xz
4005 lines
162 KiB
Python
Executable file
#!/usr/bin/env python3

"""
git-filter-repo filters git repositories, similar to git filter-branch, BFG
repo cleaner, and others. The basic idea is that it works by running
   git fast-export <options> | filter | git fast-import <options>
where this program not only launches the whole pipeline but also serves as
the 'filter' in the middle. It does a few additional things on top as well
in order to make it into a well-rounded filtering tool.

git-filter-repo can also be used as a library for more involved filtering
operations; however:
  ***** API BACKWARD COMPATIBILITY CAVEAT *****
  Programs using git-filter-repo as a library can reach pretty far into its
  internals, but I am not prepared to guarantee backward compatibility of
  all APIs. I suspect changes will be rare, but I reserve the right to
  change any API. Since it is assumed that repository filtering is
  something one would do very rarely, and in particular that it's a
  one-shot operation, this should not be a problem in practice for anyone.
  However, if you want to re-use a program you have written that uses
  git-filter-repo as a library (or makes use of one of its --*-callback
  arguments), you should either make sure you are using the same version of
  git and git-filter-repo, or make sure to re-test it.

  If there are particular pieces of the API you are concerned about, and
  there is not already a testcase for it in t9391-lib-usage.sh or
  t9392-python-callback.sh, please contribute a testcase. That will not
  prevent me from changing the API, but it will allow you to look at the
  history of a testcase to see whether and how the API changed.
  ***** END API BACKWARD COMPATIBILITY CAVEAT *****
"""

import argparse
import collections
import fnmatch
import gettext
import io
import os
import platform
import re
import shutil
import subprocess
import sys
import time
import textwrap

from datetime import tzinfo, timedelta, datetime

__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
           "Checkpoint", "FastExportParser", "ProgressWriter",
           "string_to_date", "date_to_string",
           "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]

deleted_hash = b'0'*40
write_marks = True
date_format_permissive = True

def gettext_poison(msg):
  if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover
    return "# GETTEXT POISON #"
  return gettext.gettext(msg)

_ = gettext_poison

def setup_gettext():
  TEXTDOMAIN="git-filter-repo"
  podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@"
  if not os.path.isdir(podir): # pragma: no cover
    podir = None  # Python has its own fallback; use that

  ## This looks like the most straightforward translation of the relevant
  ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm:
  #import locale
  #locale.setlocale(locale.LC_MESSAGES, "");
  #locale.setlocale(locale.LC_TIME, "");
  #locale.textdomain(TEXTDOMAIN);
  #locale.bindtextdomain(TEXTDOMAIN, podir);
  ## but the python docs suggest using the gettext module (which doesn't
  ## have setlocale()) instead, so:
  gettext.textdomain(TEXTDOMAIN);
  gettext.bindtextdomain(TEXTDOMAIN, podir);

def _timedelta_to_seconds(delta):
  """
  Converts timedelta to seconds
  """
  offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
  return round(offset)

class FixedTimeZone(tzinfo):
  """
  Fixed offset in minutes east from UTC.
  """

  tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$')

  def __init__(self, offset_string):
    tzinfo.__init__(self)
    sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups()
    factor = -1 if (sign and sign == b'-') else 1
    self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm)))
    self._offset_string = offset_string

  def utcoffset(self, dt):
    return self._offset

  def tzname(self, dt):
    return self._offset_string

  def dst(self, dt):
    return timedelta(0)

def string_to_date(datestring):
  (unix_timestamp, tz_offset) = datestring.split()
  return datetime.fromtimestamp(int(unix_timestamp),
                                FixedTimeZone(tz_offset))

def date_to_string(dateobj):
  epoch = datetime.fromtimestamp(0, dateobj.tzinfo)
  return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)),
                     dateobj.tzinfo.tzname(0)))
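
# Example (illustrative, not part of the upstream file): these helpers operate
# on git's raw "<unix-timestamp> <tz-offset>" byte strings, e.g.
#
#   when = string_to_date(b'1234567890 -0700')   # aware datetime, fixed offset
#   date_to_string(when)                          # -> b'1234567890 -0700'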

def decode(bytestr):
  'Try to convert bytestr to utf-8 for outputting as an error message.'
  return bytestr.decode('utf-8', 'backslashreplace')

def glob_to_regex(glob_bytestr):
  'Translate glob_bytestr into a regex on bytestrings'

  # fnmatch.translate is idiotic and won't accept bytestrings
  if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover
    raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr))

  # Create regex operating on string
  regex = fnmatch.translate(decode(glob_bytestr))

  # FIXME: This is an ugly hack...
  # fnmatch.translate tries to do multi-line matching and wants the glob to
  # match up to the end of the input, which isn't relevant for us, so we
  # have to modify the regex. fnmatch.translate has used different regex
  # constructs to achieve this with different python versions, so we have
  # to check for each of them and then fix it up. It would be much better
  # if fnmatch.translate could just take some flags to allow us to specify
  # what we want rather than employing this hackery, but since it
  # doesn't...
  if regex.endswith(r'\Z(?ms)'): # pragma: no cover
    regex = regex[0:-7]
  elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover
    regex = regex[4:-3]

  # Finally, convert back to regex operating on bytestr
  return regex.encode()
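
# Example (illustrative, not part of the upstream file): the anchoring that
# fnmatch.translate() adds is stripped, so the returned bytes regex can be
# used for prefix-style matching; the exact regex text varies by Python
# version.
#
#   pattern = glob_to_regex(b'src/*.c')      # roughly b'src/.*\\.c'
#   bool(re.match(pattern, b'src/main.c'))   # True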

class PathQuoting:
  _unescape = {b'a': b'\a',
               b'b': b'\b',
               b'f': b'\f',
               b'n': b'\n',
               b'r': b'\r',
               b't': b'\t',
               b'v': b'\v',
               b'"': b'"',
               b'\\':b'\\'}
  _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})')
  _escape = [bytes([x]) for x in range(127)]+[
             b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)]
  _reverse = dict(map(reversed, _unescape.items()))
  for x in _reverse:
    _escape[ord(x)] = b'\\'+_reverse[x]
  _special_chars = [len(x) > 1 for x in _escape]

  @staticmethod
  def unescape_sequence(orig):
    seq = orig.group(1)
    return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)])

  @staticmethod
  def dequote(quoted_string):
    if quoted_string.startswith(b'"'):
      assert quoted_string.endswith(b'"')
      return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence,
                                          quoted_string[1:-1])
    return quoted_string

  @staticmethod
  def enquote(unquoted_string):
    # Option 1: Quoting when fast-export would:
    #   pqsc = PathQuoting._special_chars
    #   if any(pqsc[x] for x in set(unquoted_string)):
    # Option 2, perf hack: do minimal amount of quoting required by fast-import
    if unquoted_string.startswith(b'"') or b'\n' in unquoted_string:
      pqe = PathQuoting._escape
      return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"'
    return unquoted_string
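
# Example (illustrative, not part of the upstream file): round-tripping a path
# containing a newline through the quoting rules used by fast-export/import:
#
#   PathQuoting.enquote(b'hello\nworld')      # -> b'"hello\\nworld"'
#   PathQuoting.dequote(b'"hello\\nworld"')   # -> b'hello\nworld'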

class AncestryGraph(object):
  """
  A class that maintains a directed acyclic graph of commits for the purpose
  of determining if one commit is the ancestor of another.
  """

  def __init__(self):
    self.cur_value = 0

    # A mapping from the external identifiers given to us to the simple integers
    # we use in self.graph
    self.value = {}

    # A tuple of (depth, list-of-ancestors). Values and keys in this graph are
    # all integers from the self.value dict. The depth of a commit is one more
    # than the max depth of any of its ancestors.
    self.graph = {}

    # Cached results from previous calls to is_ancestor().
    self._cached_is_ancestor = {}

  def record_external_commits(self, external_commits):
    """
    Record in graph that each commit in external_commits exists, and is
    treated as a root commit with no parents.
    """
    for c in external_commits:
      if c not in self.value:
        self.cur_value += 1
        self.value[c] = self.cur_value
        self.graph[self.cur_value] = (1, [])

  def add_commit_and_parents(self, commit, parents):
    """
    Record in graph that commit has the given parents. parents _MUST_ have
    been first recorded. commit _MUST_ not have been recorded yet.
    """
    assert all(p in self.value for p in parents)
    assert commit not in self.value

    # Get values for commit and parents
    self.cur_value += 1
    self.value[commit] = self.cur_value
    graph_parents = [self.value[x] for x in parents]

    # Determine depth for commit, then insert the info into the graph
    depth = 1
    if parents:
      depth += max(self.graph[p][0] for p in graph_parents)
    self.graph[self.cur_value] = (depth, graph_parents)

  def is_ancestor(self, possible_ancestor, check):
    """
    Return whether possible_ancestor is an ancestor of check
    """
    a, b = self.value[possible_ancestor], self.value[check]
    original_pair = (a,b)
    a_depth = self.graph[a][0]
    ancestors = [b]
    visited = set()
    while ancestors:
      ancestor = ancestors.pop()
      prev_pair = (a, ancestor)
      if prev_pair in self._cached_is_ancestor:
        if not self._cached_is_ancestor[prev_pair]:
          continue
        self._cached_is_ancestor[original_pair] = True
        return True
      if ancestor in visited:
        continue
      visited.add(ancestor)
      depth, more_ancestors = self.graph[ancestor]
      if ancestor == a:
        self._cached_is_ancestor[original_pair] = True
        return True
      elif depth <= a_depth:
        continue
      ancestors.extend(more_ancestors)
    self._cached_is_ancestor[original_pair] = False
    return False
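
# Example (illustrative, not part of the upstream file): keys can be any
# hashable commit identifiers (marks, hashes, ...):
#
#   graph = AncestryGraph()
#   graph.add_commit_and_parents('A', [])
#   graph.add_commit_and_parents('B', ['A'])
#   graph.add_commit_and_parents('C', ['B'])
#   graph.is_ancestor('A', 'C')   # True
#   graph.is_ancestor('C', 'A')   # False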

class MailmapInfo(object):
  def __init__(self, filename):
    self.changes = {}
    self._parse_file(filename)

  def _parse_file(self, filename):
    name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*')
    comment_re = re.compile(br'\s*#.*')
    if not os.access(filename, os.R_OK):
      raise SystemExit(_("Cannot read %s") % decode(filename))
    with open(filename, 'br') as f:
      count = 0
      for line in f:
        count += 1
        err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line)
        # Remove comments
        line = comment_re.sub(b'', line)
        # Remove leading and trailing whitespace
        line = line.strip()
        if not line:
          continue

        m = name_and_email_re.match(line)
        if not m:
          raise SystemExit(err)
        proper_name, proper_email = m.groups()
        if len(line) == m.end():
          self.changes[(None, proper_email)] = (proper_name, proper_email)
          continue
        rest = line[m.end():]
        m = name_and_email_re.match(rest)
        if m:
          commit_name, commit_email = m.groups()
          if len(rest) != m.end():
            raise SystemExit(err)
        else:
          commit_name, commit_email = rest, None
        self.changes[(commit_name, commit_email)] = (proper_name, proper_email)

  def translate(self, name, email):
    ''' Given a name and email, return the expected new name and email from the
        mailmap if there is a translation rule for it, otherwise just return
        the given name and email.'''
    for old, new in self.changes.items():
      old_name, old_email = old
      new_name, new_email = new
      if (old_email is None or email.lower() == old_email.lower()) and (
          name == old_name or not old_name):
        return (new_name or name, new_email or email)
    return (name, email)
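
# Example (illustrative, not part of the upstream file): given a .mailmap file
# containing the single rule
#
#   Jane Doe <jane@example.com> <jdoe@old-host.example>
#
# MailmapInfo(b'.mailmap').translate(b'J Doe', b'jdoe@old-host.example') would
# return (b'Jane Doe', b'jane@example.com'); identities with no matching rule
# are returned unchanged.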

class ProgressWriter(object):
  def __init__(self):
    self._last_progress_update = time.time()
    self._last_message = None

  def show(self, msg):
    self._last_message = msg
    now = time.time()
    if now - self._last_progress_update > .1:
      self._last_progress_update = now
      sys.stdout.write("\r{}".format(msg))
      sys.stdout.flush()

  def finish(self):
    self._last_progress_update = 0
    if self._last_message:
      self.show(self._last_message)
    sys.stdout.write("\n")

class _IDs(object):
  """
  A class that maintains the 'name domain' of all the 'marks' (short int
  id for a blob/commit git object). The reason this mechanism is necessary
  is because the text of fast-export may refer to an object using a different
  mark than the mark that was assigned to that object using IDS.new(). This
  class allows you to translate the fast-export marks (old) to the marks
  assigned from IDS.new() (new).

  Note that there are two reasons why the marks may differ: (1) The
  user manually creates Blob or Commit objects (for insertion into the
  stream) (2) We're reading the data from two different repositories
  and trying to combine the data (git fast-export will number ids from
  1...n, and having two 1's, two 2's, two 3's, causes issues).
  """

  def __init__(self):
    """
    Init
    """
    # The id for the next created blob/commit object
    self._next_id = 1

    # A map of old-ids to new-ids (1:1 map)
    self._translation = {}

    # A map of new-ids to every old-id that points to the new-id (1:N map)
    self._reverse_translation = {}

  def has_renames(self):
    """
    Return whether there have been ids remapped to new values
    """
    return bool(self._translation)

  def new(self):
    """
    Should be called whenever a new blob or commit object is created. The
    returned value should be used as the id/mark for that object.
    """
    rv = self._next_id
    self._next_id += 1
    return rv

  def record_rename(self, old_id, new_id, handle_transitivity = False):
    """
    Record that old_id is being renamed to new_id.
    """
    if old_id != new_id:
      # old_id -> new_id
      self._translation[old_id] = new_id

      # Transitivity will be needed if new commits are being inserted mid-way
      # through a branch.
      if handle_transitivity:
        # Anything that points to old_id should point to new_id
        if old_id in self._reverse_translation:
          for id_ in self._reverse_translation[old_id]:
            self._translation[id_] = new_id

      # Record that new_id is pointed to by old_id
      if new_id not in self._reverse_translation:
        self._reverse_translation[new_id] = []
      self._reverse_translation[new_id].append(old_id)

  def translate(self, old_id):
    """
    If old_id has been mapped to an alternate id, return the alternate id.
    """
    if old_id in self._translation:
      return self._translation[old_id]
    else:
      return old_id

  def __str__(self):
    """
    Convert IDs to string; used for debugging
    """
    rv = "Current count: %d\nTranslation:\n" % self._next_id
    for k in sorted(self._translation):
      rv += "  %d -> %s\n" % (k, self._translation[k])

    rv += "Reverse translation:\n"
    for k in sorted(self._reverse_translation):
      rv += "  " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"

    return rv
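
# Example (illustrative, not part of the upstream file): _IDS (instantiated
# below) hands out marks for objects we create and maps fast-export's marks
# onto them:
#
#   my_mark = _IDS.new()               # e.g. 7
#   _IDS.record_rename(3, my_mark)     # fast-export's :3 becomes our :7
#   _IDS.translate(3)                  # -> 7
#   _IDS.translate(4)                  # -> 4 (no rename recorded)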

class _GitElement(object):
  """
  The base class for all git elements that we create.
  """

  def __init__(self):
    # A string that describes what type of Git element this is
    self.type = None

    # A flag telling us if this Git element has been dumped
    # (i.e. printed) or skipped. Typically elements that have been
    # dumped or skipped will not be dumped again.
    self.dumped = 0

  def dump(self, file_):
    """
    This version should never be called. Derived classes need to
    override! We should note that subclasses should implement this
    method such that the output would match the format produced by
    fast-export.
    """
    raise SystemExit(_("Unimplemented function: %s") % type(self).__name__
                     +".dump()") # pragma: no cover

  def __bytes__(self):
    """
    Convert GitElement to bytestring; used for debugging
    """
    old_dumped = self.dumped
    writeme = io.BytesIO()
    self.dump(writeme)
    output_lines = writeme.getvalue().splitlines()
    writeme.close()
    self.dumped = old_dumped
    return b"%s:\n  %s" % (type(self).__name__.encode(),
                           b"\n  ".join(output_lines))

  def skip(self, new_id=None):
    """
    Ensures this element will not be written to output
    """
    self.dumped = 2

class _GitElementWithId(_GitElement):
  """
  The base class for Git elements that have IDs (commits and blobs)
  """

  def __init__(self):
    _GitElement.__init__(self)

    # The mark (short, portable id) for this element
    self.id = _IDS.new()

    # The previous mark for this element
    self.old_id = None

  def skip(self, new_id=None):
    """
    This element will no longer be automatically written to output. When a
    commit gets skipped, its ID will need to be translated to that of its
    parent.
    """
    self.dumped = 2

    _IDS.record_rename(self.old_id or self.id, new_id)

class Blob(_GitElementWithId):
  """
  This class defines our representation of git blob elements (i.e. our
  way of representing file contents).
  """

  def __init__(self, data, original_id = None):
    _GitElementWithId.__init__(self)

    # Denote that this is a blob
    self.type = 'blob'

    # Record original id
    self.original_id = original_id

    # Stores the blob's data
    assert(type(data) == bytes)
    self.data = data

  def dump(self, file_):
    """
    Write this blob element to a file.
    """
    self.dumped = 1
    HASH_TO_ID[self.original_id] = self.id
    ID_TO_HASH[self.id] = self.original_id

    file_.write(b'blob\n')
    file_.write(b'mark :%d\n' % self.id)
    file_.write(b'data %d\n%s' % (len(self.data), self.data))
    file_.write(b'\n')


class Reset(_GitElement):
  """
  This class defines our representation of git reset elements. A reset
  event is the creation (or recreation) of a named branch, optionally
  starting from a specific revision.
  """

  def __init__(self, ref, from_ref = None):
    _GitElement.__init__(self)

    # Denote that this is a reset
    self.type = 'reset'

    # The name of the branch being (re)created
    self.ref = ref

    # Some reference to the branch/commit we are resetting from
    self.from_ref = from_ref

  def dump(self, file_):
    """
    Write this reset element to a file
    """
    self.dumped = 1

    file_.write(b'reset %s\n' % self.ref)
    if self.from_ref:
      if isinstance(self.from_ref, int):
        file_.write(b'from :%d\n' % self.from_ref)
      else:
        file_.write(b'from %s\n' % self.from_ref)
    file_.write(b'\n')

class FileChange(_GitElement):
  """
  This class defines our representation of file change elements. File change
  elements are components within a Commit element.
  """

  def __init__(self, type_, filename = None, id_ = None, mode = None):
    _GitElement.__init__(self)

    # Denote the type of file-change (b'M' for modify, b'D' for delete, etc)
    # We could
    #   assert(type(type_) == bytes)
    # here but I don't just due to worries about performance overhead...
    self.type = type_

    # Record the name of the file being changed
    self.filename = filename

    # Record the mode (mode describes type of file entry (non-executable,
    # executable, or symlink)).
    self.mode = mode

    # blob_id is the id (mark) of the affected blob
    self.blob_id = id_

    if type_ == b'DELETEALL':
      assert filename is None and id_ is None and mode is None
      self.filename = b'' # Just so PathQuoting.enquote doesn't die
    else:
      assert filename is not None

    if type_ == b'M':
      assert id_ is not None and mode is not None
    elif type_ == b'D':
      assert id_ is None and mode is None
    elif type_ == b'R':  # pragma: no cover (now avoid fast-export renames)
      assert mode is None
      if id_ is None:
        raise SystemExit(_("new name needed for rename of %s") % filename)
      self.filename = (self.filename, id_)
      self.blob_id = None

  def dump(self, file_):
    """
    Write this file-change element to a file
    """
    skipped_blob = (self.type == b'M' and self.blob_id is None)
    if skipped_blob: return
    self.dumped = 1

    quoted_filename = PathQuoting.enquote(self.filename)
    if self.type == b'M' and isinstance(self.blob_id, int):
      file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename))
    elif self.type == b'M':
      file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename))
    elif self.type == b'D':
      file_.write(b'D %s\n' % quoted_filename)
    elif self.type == b'DELETEALL':
      file_.write(b'deleteall\n')
    else:
      raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover

class Commit(_GitElementWithId):
  """
  This class defines our representation of commit elements. Commit elements
  contain all the information associated with a commit.
  """

  def __init__(self, branch,
               author_name, author_email, author_date,
               committer_name, committer_email, committer_date,
               message,
               file_changes,
               parents,
               original_id = None,
               encoding = None, # encoding for message; None implies UTF-8
               **kwargs):
    _GitElementWithId.__init__(self)
    self.old_id = self.id

    # Denote that this is a commit element
    self.type = 'commit'

    # Record the affected branch
    self.branch = branch

    # Record original id
    self.original_id = original_id

    # Record author's name
    self.author_name = author_name

    # Record author's email
    self.author_email = author_email

    # Record date of authoring
    self.author_date = author_date

    # Record committer's name
    self.committer_name = committer_name

    # Record committer's email
    self.committer_email = committer_email

    # Record date the commit was made
    self.committer_date = committer_date

    # Record commit message and its encoding
    self.encoding = encoding
    self.message = message

    # List of file-changes associated with this commit. Note that file-changes
    # are also represented as git elements
    self.file_changes = file_changes

    self.parents = parents

  def dump(self, file_):
    """
    Write this commit element to a file.
    """
    self.dumped = 1
    HASH_TO_ID[self.original_id] = self.id
    ID_TO_HASH[self.id] = self.original_id

    # Make output to fast-import slightly easier for humans to read if the
    # message has no trailing newline of its own; cosmetic, but a nice touch...
    extra_newline = b'\n'
    if self.message.endswith(b'\n') or not (self.parents or self.file_changes):
      extra_newline = b''

    if not self.parents:
      file_.write(b'reset %s\n' % self.branch)
    file_.write((b'commit %s\n'
                 b'mark :%d\n'
                 b'author %s <%s> %s\n'
                 b'committer %s <%s> %s\n'
                ) % (
                  self.branch, self.id,
                  self.author_name, self.author_email, self.author_date,
                  self.committer_name, self.committer_email, self.committer_date
                ))
    if self.encoding:
      file_.write(b'encoding %s\n' % self.encoding)
    file_.write(b'data %d\n%s%s' %
                (len(self.message), self.message, extra_newline))
    for i, parent in enumerate(self.parents):
      file_.write(b'from ' if i==0 else b'merge ')
      if isinstance(parent, int):
        file_.write(b':%d\n' % parent)
      else:
        file_.write(b'%s\n' % parent)
    for change in self.file_changes:
      change.dump(file_)
    if not self.parents and not self.file_changes:
      # Workaround a bug in pre-git-2.22 versions of fast-import with
      # the get-mark directive.
      file_.write(b'\n')
    file_.write(b'\n')

  def first_parent(self):
    """
    Return first parent commit
    """
    if self.parents:
      return self.parents[0]
    return None

  def skip(self, new_id=None):
    _SKIPPED_COMMITS.add(self.old_id or self.id)
    _GitElementWithId.skip(self, new_id)
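
# Example (illustrative, not part of the upstream file): dumping a Blob and a
# parentless Commit that references it yields a fast-import stream roughly
# like this (mark numbers depend on prior _IDS.new() calls):
#
#   blob
#   mark :1
#   data 6
#   hello
#
#   reset refs/heads/main
#   commit refs/heads/main
#   mark :2
#   author Jane Doe <jane@example.com> 1234567890 +0000
#   committer Jane Doe <jane@example.com> 1234567890 +0000
#   data 9
#   Add file
#   M 100644 :1 greeting.txt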

class Tag(_GitElementWithId):
  """
  This class defines our representation of annotated tag elements.
  """

  def __init__(self, ref, from_ref,
               tagger_name, tagger_email, tagger_date, tag_msg,
               original_id = None):
    _GitElementWithId.__init__(self)
    self.old_id = self.id

    # Denote that this is a tag element
    self.type = 'tag'

    # Store the name of the tag
    self.ref = ref

    # Store the entity being tagged (this should be a commit)
    self.from_ref = from_ref

    # Record original id
    self.original_id = original_id

    # Store the name of the tagger
    self.tagger_name = tagger_name

    # Store the email of the tagger
    self.tagger_email = tagger_email

    # Store the date
    self.tagger_date = tagger_date

    # Store the tag message
    self.message = tag_msg

  def dump(self, file_):
    """
    Write this tag element to a file
    """

    self.dumped = 1
    HASH_TO_ID[self.original_id] = self.id
    ID_TO_HASH[self.id] = self.original_id

    file_.write(b'tag %s\n' % self.ref)
    if (write_marks and self.id):
      file_.write(b'mark :%d\n' % self.id)
    markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n'
    file_.write(markfmt % self.from_ref)
    if self.tagger_name:
      file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
      file_.write(self.tagger_date)
      file_.write(b'\n')
    file_.write(b'data %d\n%s' % (len(self.message), self.message))
    file_.write(b'\n')

class Progress(_GitElement):
  """
  This class defines our representation of progress elements. The progress
  element only contains a progress message, which is printed by fast-import
  when it processes the progress output.
  """

  def __init__(self, message):
    _GitElement.__init__(self)

    # Denote that this is a progress element
    self.type = 'progress'

    # Store the progress message
    self.message = message

  def dump(self, file_):
    """
    Write this progress element to a file
    """
    self.dumped = 1

    file_.write(b'progress %s\n' % self.message)
    file_.write(b'\n')

class Checkpoint(_GitElement):
  """
  This class defines our representation of checkpoint elements. These
  elements represent events which force fast-import to close the current
  packfile, start a new one, and to save out all current branch refs, tags
  and marks.
  """

  def __init__(self):
    _GitElement.__init__(self)

    # Denote that this is a checkpoint element
    self.type = 'checkpoint'

  def dump(self, file_):
    """
    Write this checkpoint element to a file
    """
    self.dumped = 1

    file_.write(b'checkpoint\n')
    file_.write(b'\n')

class LiteralCommand(_GitElement):
  """
  This class defines our representation of commands. The literal command
  includes only a single line, and is not processed in any special way.
  """

  def __init__(self, line):
    _GitElement.__init__(self)

    # Denote that this is a literal element
    self.type = 'literal'

    # Store the command
    self.line = line

  def dump(self, file_):
    """
    Write this literal command to a file
    """
    self.dumped = 1

    file_.write(self.line)

class Alias(_GitElement):
  """
  This class defines our representation of fast-import alias elements. An
  alias element is the setting of one mark to the same sha1sum as another,
  usually because the newer mark corresponded to a pruned commit.
  """

  def __init__(self, ref, to_ref):
    _GitElement.__init__(self)
    # Denote that this is an alias
    self.type = 'alias'

    self.ref = ref
    self.to_ref = to_ref

  def dump(self, file_):
    """
    Write this alias element to a file
    """
    self.dumped = 1

    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))

class FastExportParser(object):
  """
  A class for parsing and handling the output from fast-export. This
  class allows the user to register callbacks when various types of
  data are encountered in the fast-export output. The basic idea is that,
  FastExportParser takes fast-export output, creates the various objects
  as it encounters them, the user gets to use/modify these objects via
  callbacks, and finally FastExportParser outputs the modified objects
  in fast-import format (presumably so they can be used to create a new
  repo).
  """

  def __init__(self,
               tag_callback = None, commit_callback = None,
               blob_callback = None, progress_callback = None,
               reset_callback = None, checkpoint_callback = None,
               done_callback = None):
    # Members below simply store callback functions for the various git
    # elements
    self._tag_callback = tag_callback
    self._blob_callback = blob_callback
    self._reset_callback = reset_callback
    self._commit_callback = commit_callback
    self._progress_callback = progress_callback
    self._checkpoint_callback = checkpoint_callback
    self._done_callback = done_callback

    # Keep track of which refs appear from the export, and which make it to
    # the import (pruning of empty commits, renaming of refs, and creating
    # new manual objects and inserting them can cause these to differ).
    self._exported_refs = set()
    self._imported_refs = set()

    # A list of the branches we've seen, plus the last known commit they
    # pointed to. An entry in latest_*commit will be deleted if we get a
    # reset for that branch. These are used because of fast-import's weird
    # decision to allow having an implicit parent via naming the branch
    # instead of requiring branches to be specified via 'from' directives.
    self._latest_commit = {}
    self._latest_orig_commit = {}

    # A handle to the input source for the fast-export data
    self._input = None

    # A handle to the output file for the output we generate (we call dump
    # on many of the git elements we create).
    self._output = None

    # Stores the contents of the current line of input being parsed
    self._currentline = ''

    # Compile some regexes and cache those
    self._mark_re = re.compile(br'mark :(\d+)\n$')
    self._parent_regexes = {}
    parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n')
    for parent_refname in (b'from', b'merge'):
      ans = [re.compile(parent_refname+x) for x in parent_regex_rules]
      self._parent_regexes[parent_refname] = ans
    self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"')
    self._refline_regexes = {}
    for refline_name in (b'reset', b'commit', b'tag', b'progress'):
      self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$')
    self._user_regexes = {}
    for user in (b'author', b'committer', b'tagger'):
      self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$')

  def _advance_currentline(self):
    """
    Grab the next line of input
    """
    self._currentline = self._input.readline()

  def _parse_optional_mark(self):
    """
    If the current line contains a mark, parse it and advance to the
    next line; return None otherwise
    """
    mark = None
    matches = self._mark_re.match(self._currentline)
    if matches:
      mark = int(matches.group(1))
      self._advance_currentline()
    return mark

  def _parse_optional_parent_ref(self, refname):
    """
    If the current line contains a reference to a parent commit, then
    parse it and advance the current line; otherwise return None. Note
    that the name of the reference ('from', 'merge') must match the
    refname arg.
    """
    orig_baseref, baseref = None, None
    rule, altrule = self._parent_regexes[refname]
    matches = rule.match(self._currentline)
    if matches:
      orig_baseref = int(matches.group(1))
      # We translate the parent commit mark to what it needs to be in
      # our mark namespace
      baseref = _IDS.translate(orig_baseref)
      self._advance_currentline()
    else:
      matches = altrule.match(self._currentline)
      if matches:
        orig_baseref = matches.group(1)
        baseref = orig_baseref
        self._advance_currentline()
    return orig_baseref, baseref

  def _parse_optional_filechange(self):
    """
    If the current line contains a file-change object, then parse it
    and advance the current line; otherwise return None. We only care
    about file changes of type b'M' and b'D' (these are the only types
    of file-changes that fast-export will provide).
    """
    filechange = None
    changetype = self._currentline[0:1]
    if changetype == b'M':
      (changetype, mode, idnum, path) = self._currentline.split(None, 3)
      if idnum[0:1] == b':':
        idnum = idnum[1:]
      path = path.rstrip(b'\n')
      # We translate the idnum to our id system
      if len(idnum) != 40:
        idnum = _IDS.translate( int(idnum) )
      if idnum is not None:
        if path.startswith(b'"'):
          path = PathQuoting.dequote(path)
        filechange = FileChange(b'M', path, idnum, mode)
      else:
        filechange = b'skipped'
      self._advance_currentline()
    elif changetype == b'D':
      (changetype, path) = self._currentline.split(None, 1)
      path = path.rstrip(b'\n')
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      filechange = FileChange(b'D', path)
      self._advance_currentline()
    elif changetype == b'R': # pragma: no cover (now avoid fast-export renames)
      rest = self._currentline[2:-1]
      if rest.startswith(b'"'):
        m = self._quoted_string_re.match(rest)
        if not m:
          raise SystemExit(_("Couldn't parse rename source"))
        orig = PathQuoting.dequote(m.group(0))
        new = rest[m.end()+1:]
      else:
        orig, new = rest.split(b' ', 1)
      if new.startswith(b'"'):
        new = PathQuoting.dequote(new)
      filechange = FileChange(b'R', orig, new)
      self._advance_currentline()
    return filechange

  def _parse_original_id(self):
    original_id = self._currentline[len(b'original-oid '):].rstrip()
    self._advance_currentline()
    return original_id

  def _parse_encoding(self):
    encoding = self._currentline[len(b'encoding '):].rstrip()
    self._advance_currentline()
    return encoding

  def _parse_ref_line(self, refname):
    """
    Parses string data (often a branch name) from current-line. The name of
    the string data must match the refname arg. The program will crash if
    current-line does not match, so current-line will always be advanced if
    this method returns.
    """
    matches = self._refline_regexes[refname].match(self._currentline)
    if not matches:
      raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
                       ({'refname': refname, 'line':self._currentline})
                      ) # pragma: no cover
    ref = matches.group(1)
    self._advance_currentline()
    return ref

  def _parse_user(self, usertype):
    """
    Get user name, email, datestamp from current-line. Current-line will
    be advanced.
    """
    user_regex = self._user_regexes[usertype]
    (name, email, when) = user_regex.match(self._currentline).groups()

    self._advance_currentline()
    return (name, email, when)

  def _parse_data(self):
    """
    Reads data from _input. Current-line will be advanced until it is beyond
    the data.
    """
    fields = self._currentline.split()
    assert fields[0] == b'data'
    size = int(fields[1])
    data = self._input.read(size)
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()
    return data

  def _parse_blob(self):
    """
    Parse input data into a Blob object. Once the Blob has been created, it
    will be handed off to the appropriate callbacks. Current-line will be
    advanced until it is beyond this blob's data. The Blob will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Blob
    self._advance_currentline()
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    data = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the blob
    blob = Blob(data, original_id)

    # If fast-export text had a mark for this blob, need to make sure this
    # mark translates to the blob's true id.
    if id_:
      blob.old_id = id_
      _IDS.record_rename(id_, blob.id)

    # Call any user callback to allow them to use/modify the blob
    if self._blob_callback:
      self._blob_callback(blob)

    # Now print the resulting blob
    if not blob.dumped:
      blob.dump(self._output)

  def _parse_reset(self):
    """
    Parse input data into a Reset object. Once the Reset has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the reset data. The Reset will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Reset
    ref = self._parse_ref_line(b'reset')
    self._exported_refs.add(ref)
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
    if self._currentline == b'\n':
      self._advance_currentline()

    # fast-export likes to print extraneous resets that serve no purpose.
    # While we could continue processing such resets, that is a waste of
    # resources. Also, we want to avoid recording that this ref was
    # seen in such cases, since this ref could be rewritten to nothing.
    if not from_ref:
      self._latest_commit.pop(ref, None)
      self._latest_orig_commit.pop(ref, None)
      return

    # Create the reset
    reset = Reset(ref, from_ref)

    # Call any user callback to allow them to modify the reset
    if self._reset_callback:
      self._reset_callback(reset)

    # Update metadata
    self._latest_commit[reset.ref] = reset.from_ref
    self._latest_orig_commit[reset.ref] = reset.from_ref

    # Now print the resulting reset
    if not reset.dumped:
      self._imported_refs.add(reset.ref)
      reset.dump(self._output)

  def _parse_commit(self):
    """
    Parse input data into a Commit object. Once the Commit has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the commit data. The Commit will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback OR the callback has removed all file-changes from the commit).
    """
    # Parse the Commit. This may look involved, but it's pretty simple; it only
    # looks bad because a commit object contains many pieces of data.
    branch = self._parse_ref_line(b'commit')
    self._exported_refs.add(branch)
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    author_name = None
    author_email = None
    if self._currentline.startswith(b'author'):
      (author_name, author_email, author_date) = self._parse_user(b'author')

    (committer_name, committer_email, committer_date) = \
      self._parse_user(b'committer')

    if not author_name and not author_email:
      (author_name, author_email, author_date) = \
        (committer_name, committer_email, committer_date)

    encoding = None
    if self._currentline.startswith(b'encoding '):
      encoding = self._parse_encoding()

    commit_msg = self._parse_data()

    pinfo = [self._parse_optional_parent_ref(b'from')]
    # Due to empty pruning, we can have real 'from' and 'merge' lines that
    # due to commit rewriting map to a parent of None. We need to record
    # 'from' if it's non-None, and we need to parse all 'merge' lines.
    while self._currentline.startswith(b'merge '):
      pinfo.append(self._parse_optional_parent_ref(b'merge'))
    orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)]

    # No parents is oddly represented as [None] instead of [], due to the
    # special 'from' handling. Convert it here to a more canonical form.
    if parents == [None]:
      parents = []
    if orig_parents == [None]:
      orig_parents = []

    # fast-import format is kinda stupid in that it allows implicit parents
    # based on the branch name instead of requiring them to be specified by
    # 'from' directives. The only way to get no parent is by using a reset
    # directive first, which clears the latest_commit_for_this_branch tracking.
    if not orig_parents and self._latest_commit.get(branch):
      parents = [self._latest_commit[branch]]
    if not orig_parents and self._latest_orig_commit.get(branch):
      orig_parents = [self._latest_orig_commit[branch]]

    # Get the list of file changes
    file_changes = []
    file_change = self._parse_optional_filechange()
    had_file_changes = file_change is not None
    while file_change:
      if not (type(file_change) == bytes and file_change == b'skipped'):
        file_changes.append(file_change)
      file_change = self._parse_optional_filechange()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Okay, now we can finally create the Commit object
    commit = Commit(branch,
                    author_name, author_email, author_date,
                    committer_name, committer_email, committer_date,
                    commit_msg, file_changes, parents, original_id, encoding)

    # If fast-export text had a mark for this commit, need to make sure this
    # mark translates to the commit's true id.
    if id_:
      commit.old_id = id_
      _IDS.record_rename(id_, commit.id)

    # Call any user callback to allow them to modify the commit
    aux_info = {'orig_parents': orig_parents,
                'had_file_changes': had_file_changes}
    if self._commit_callback:
      self._commit_callback(commit, aux_info)

    # Now print the resulting commit, or if prunable skip it
    self._latest_orig_commit[branch] = commit.id
    if not (commit.old_id or commit.id) in _SKIPPED_COMMITS:
      self._latest_commit[branch] = commit.id
    if not commit.dumped:
      self._imported_refs.add(commit.branch)
      commit.dump(self._output)

  def _parse_tag(self):
    """
    Parse input data into a Tag object. Once the Tag has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the tag data. The Tag will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Tag
    tag = self._parse_ref_line(b'tag')
    self._exported_refs.add(b'refs/tags/'+tag)
    id_ = self._parse_optional_mark()
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    tagger_name, tagger_email, tagger_date = None, None, None
    if self._currentline.startswith(b'tagger'):
      (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger')
    tag_msg = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the tag
    tag = Tag(tag, from_ref,
              tagger_name, tagger_email, tagger_date, tag_msg,
              original_id)

    # If fast-export text had a mark for this tag, need to make sure this
    # mark translates to the tag's true id.
    if id_:
      tag.old_id = id_
      _IDS.record_rename(id_, tag.id)

    # Call any user callback to allow them to modify the tag
    if self._tag_callback:
      self._tag_callback(tag)

    # The tag might not point at anything that still exists (self.from_ref
    # will be None if the commit it pointed to and all its ancestors were
    # pruned due to being empty)
    if tag.from_ref:
      # Print out this tag's information
      if not tag.dumped:
        self._imported_refs.add(b'refs/tags/'+tag.ref)
        tag.dump(self._output)
    else:
      tag.skip()

  def _parse_progress(self):
    """
    Parse input data into a Progress object. Once the Progress has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    progress data. The Progress will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Progress
    message = self._parse_ref_line(b'progress')
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the progress message
    progress = Progress(message)

    # Call any user callback to allow them to modify the progress message
    if self._progress_callback:
      self._progress_callback(progress)

    # NOTE: By default, we do NOT print the progress message; git
    # fast-import would write it to fast_import_pipes which could mess with
    # our parsing of output from the 'ls' and 'get-mark' directives we send
    # to fast-import. If users want these messages, they need to process
    # and handle them in the appropriate callback above.

  def _parse_checkpoint(self):
    """
    Parse input data into a Checkpoint object. Once the Checkpoint has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    checkpoint data. The Checkpoint will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Checkpoint
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the checkpoint
    checkpoint = Checkpoint()

    # Call any user callback to allow them to drop the checkpoint
    if self._checkpoint_callback:
      self._checkpoint_callback(checkpoint)

    # NOTE: By default, we do NOT print the checkpoint message; although
    # we would only realistically get them with --stdin, the fact that we
    # are filtering makes me think the checkpointing is less likely to be
    # reasonable. In fact, I don't think it's necessary in general. If
    # users do want it, they should process it in the checkpoint_callback.

  def _parse_literal_command(self):
    """
    Parse literal command. Then just dump the line as is.
    """
    # Create the literal command object
    command = LiteralCommand(self._currentline)
    self._advance_currentline()

    # Now print the resulting literal command
    if not command.dumped:
      command.dump(self._output)

  def insert(self, obj):
    assert not obj.dumped
    obj.dump(self._output)
    if type(obj) == Commit:
      self._imported_refs.add(obj.branch)
    elif type(obj) in (Reset, Tag):
      self._imported_refs.add(obj.ref)

  def run(self, input, output):
    """
    This method filters fast export output.
    """
    # Set input. If no args provided, use stdin.
    self._input = input
    self._output = output

    # Run over the input and do the filtering
    self._advance_currentline()
    while self._currentline:
      if self._currentline.startswith(b'blob'):
        self._parse_blob()
      elif self._currentline.startswith(b'reset'):
        self._parse_reset()
      elif self._currentline.startswith(b'commit'):
        self._parse_commit()
      elif self._currentline.startswith(b'tag'):
        self._parse_tag()
      elif self._currentline.startswith(b'progress'):
        self._parse_progress()
      elif self._currentline.startswith(b'checkpoint'):
        self._parse_checkpoint()
      elif self._currentline.startswith(b'feature'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'option'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'done'):
        if self._done_callback:
          self._done_callback()
        self._parse_literal_command()
        # Prevent confusion from others writing additional stuff that'll just
        # be ignored
        self._output.close()
      elif self._currentline.startswith(b'#'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'get-mark') or \
           self._currentline.startswith(b'cat-blob') or \
           self._currentline.startswith(b'ls'):
        raise SystemExit(_("Unsupported command: '%s'") % self._currentline)
      else:
        raise SystemExit(_("Could not parse line: '%s'") % self._currentline)

  def get_exported_and_imported_refs(self):
    return self._exported_refs, self._imported_refs
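
# Example (illustrative sketch, not part of the upstream file): a bare-bones
# driver wiring FastExportParser between the two git subprocesses; RepoFilter,
# later in this file, is the fully-featured version of this idea.
#
#   exporter = subproc.Popen(['git', 'fast-export', '--show-original-ids',
#                             '--all'], stdout=subprocess.PIPE)
#   importer = subproc.Popen(['git', 'fast-import', '--force', '--quiet'],
#                            stdin=subprocess.PIPE)
#   parser = FastExportParser(commit_callback=lambda commit, metadata: None)
#   parser.run(exporter.stdout, importer.stdin)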

def record_id_rename(old_id, new_id):
  """
  Register a new translation
  """
  handle_transitivity = True
  _IDS.record_rename(old_id, new_id, handle_transitivity)

# Internal globals
_IDS = _IDs()
_SKIPPED_COMMITS = set()
HASH_TO_ID = {}
ID_TO_HASH = {}

class SubprocessWrapper(object):
  @staticmethod
  def decodify(args):
    if type(args) == str:
      return args
    else:
      assert type(args) == list
      return [decode(x) if type(x)==bytes else x for x in args]

  @staticmethod
  def call(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_output(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_call(*args, **kwargs): # pragma: no cover  # used by filter-lamely
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def Popen(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs)

subproc = subprocess
if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ:
  subproc = SubprocessWrapper

class GitUtils(object):
  @staticmethod
  def get_commit_count(repo, *args):
    """
    Return the number of commits that have been made on repo.
    """
    if not args:
      args = ['--all']
    if len(args) == 1 and isinstance(args[0], list):
      args = args[0]
    p = subproc.Popen(["git", "rev-list", "--count"] + args,
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      cwd=repo)
    if p.wait() != 0:
      raise SystemExit(_("%s does not appear to be a valid git repository")
                       % decode(repo))
    return int(p.stdout.read())

  @staticmethod
  def get_total_objects(repo):
    """
    Return the number of objects (both packed and unpacked)
    """
    p1 = subproc.Popen(["git", "count-objects", "-v"],
                       stdout=subprocess.PIPE, cwd=repo)
    lines = p1.stdout.read().splitlines()
    # Return unpacked objects + packed-objects
    return int(lines[0].split()[1]) + int(lines[2].split()[1])

  @staticmethod
  def is_repository_bare(repo_working_dir):
    out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
                               cwd=repo_working_dir)
    return (out.strip() == b'true')

  @staticmethod
  def determine_git_dir(repo_working_dir):
    d = subproc.check_output('git rev-parse --git-dir'.split(),
                             cwd=repo_working_dir).strip()
    if repo_working_dir==b'.' or d.startswith(b'/'):
      return d
    return os.path.join(repo_working_dir, d)

  @staticmethod
  def get_refs(repo_working_dir):
    try:
      output = subproc.check_output('git show-ref'.split(),
                                    cwd=repo_working_dir)
    except subprocess.CalledProcessError as e:
      # If error code is 1, there just aren't any refs; i.e. new repo.
      # If error code is other than 1, some other error (e.g. not a git repo)
      if e.returncode != 1:
        raise SystemExit('fatal: {}'.format(e))
      output = ''
    return dict(reversed(x.split()) for x in output.splitlines())

  @staticmethod
  def get_blob_sizes(quiet = False):
    blob_size_progress = ProgressWriter()
    num_blobs = 0
    processed_blobs_msg = _("Processed %d blob sizes")

    # Get sizes of blobs by sha1
    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
          '%(objectsize) %(objectsize:disk)'
    cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
                       bufsize = -1,
                       stdout = subprocess.PIPE)
    unpacked_size = {}
    packed_size = {}
    for line in cf.stdout:
      sha, objtype, objsize, objdisksize = line.split()
      objsize, objdisksize = int(objsize), int(objdisksize)
      if objtype == b'blob':
        unpacked_size[sha] = objsize
        packed_size[sha] = objdisksize
      num_blobs += 1
      if not quiet:
        blob_size_progress.show(processed_blobs_msg % num_blobs)
    cf.wait()
    if not quiet:
      blob_size_progress.finish()
    return unpacked_size, packed_size

  @staticmethod
  def get_file_changes(repo, parent_hash, commit_hash):
    """
    Return a FileChanges list with the differences between parent_hash
    and commit_hash
    """
    file_changes = []

    cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
    output = subproc.check_output(cmd, cwd=repo)
    for line in output.splitlines():
      fileinfo, path = line.split(b'\t', 1)
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      oldmode, mode, oldhash, newhash, changetype = fileinfo.split()
      if changetype == b'D':
        file_changes.append(FileChange(b'D', path))
      elif changetype in (b'A', b'M', b'T'):
        identifier = HASH_TO_ID.get(newhash, newhash)
        file_changes.append(FileChange(b'M', path, identifier, mode))
      else: # pragma: no cover
        raise SystemExit("Unknown change type for line {}".format(line))

    return file_changes
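
# Example (illustrative, not part of the upstream file); the two hashes are
# placeholders for real commit ids in the repository at repo path b'.':
#
#   changes = GitUtils.get_file_changes(b'.', b'<parent-sha>', b'<commit-sha>')
#   for change in changes:
#     print(change.type, change.mode, change.filename)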

  @staticmethod
  def print_my_version():
    with open(__file__, 'br') as f:
      contents = f.read()
    # If people replaced @@LOCALEDIR@@ string to point at their local
    # directory, undo it so we can get original source version.
    contents = re.sub(br'\A#\!.*',
                      br'#!/usr/bin/env python3', contents)
    contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"',
                      br'\1@@LOCALEDIR@@"', contents)

    cmd = 'git hash-object --stdin'.split()
    version = subproc.check_output(cmd, input=contents).strip()
    print(decode(version[0:12]))

class FilteringOptions(object):
  default_replace_text = b'***REMOVED***'
  class AppendFilter(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
      user_path = values
      suffix = option_string[len('--path-'):] or 'match'
      if suffix.startswith('rename'):
        mod_type = 'rename'
        match_type = option_string[len('--path-rename-'):] or 'match'
        values = values.split(b':')
        if len(values) != 2:
          raise SystemExit(_("Error: --path-rename expects one colon in its"
                             " argument: <old_name:new_name>."))
        if values[0] and values[1] and not (
            values[0].endswith(b'/') == values[1].endswith(b'/')):
          raise SystemExit(_("Error: With --path-rename, if OLD_NAME and "
                             "NEW_NAME are both non-empty and either ends "
                             "with a slash then both must."))
        if any(v.startswith(b'/') for v in values):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
        components = values[0].split(b'/') + values[1].split(b'/')
      else:
        mod_type = 'filter'
        match_type = suffix
        components = values.split(b'/')
        if values.startswith(b'/'):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
      for illegal_path in [b'.', b'..']:
        if illegal_path in components:
          raise SystemExit(_("Error: Invalid path component '%s' found in '%s'")
                           % (decode(illegal_path), decode(user_path)))
      if match_type == 'regex':
        values = re.compile(values)
      items = getattr(namespace, self.dest, []) or []
      items.append((mod_type, match_type, values))
      if (match_type, mod_type) == ('glob', 'filter'):
        if not values.endswith(b'*'):
          extension = b'*' if values.endswith(b'/') else b'/*'
          items.append((mod_type, match_type, values+extension))
      setattr(namespace, self.dest, items)

  class HelperFilter(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
      af = FilteringOptions.AppendFilter(dest='path_changes',
                                         option_strings=None)
      dirname = values if values[-1:] == b'/' else values+b'/'
      if option_string == '--subdirectory-filter':
        af(parser, namespace, dirname, '--path-match')
        af(parser, namespace, dirname+b':', '--path-rename')
      elif option_string == '--to-subdirectory-filter':
        af(parser, namespace, b':'+dirname, '--path-rename')
      else:
        raise SystemExit(_("Error: HelperFilter given invalid option_string: %s")
                         % option_string) # pragma: no cover

  class FileWithPathsFilter(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
      if not namespace.path_changes:
        namespace.path_changes = []
      namespace.path_changes += FilteringOptions.get_paths_from_file(values)
|
|
|
|
@staticmethod
|
|
def create_arg_parser():
|
|
# Include usage in the summary, so we can put the description first
|
|
summary = _('''Rewrite (or analyze) repository history
|
|
|
|
git-filter-repo destructively rewrites history (unless --analyze or
|
|
--dry-run are given) according to specified rules. It refuses to do any
|
|
rewriting unless either run from a clean fresh clone, or --force was
|
|
given.
|
|
|
|
Basic Usage:
|
|
git-filter-repo --analyze
|
|
git-filter-repo [FILTER/RENAME/CONTROL OPTIONS]
|
|
|
|
See EXAMPLES section for details.
|
|
''').rstrip()
|
|
|
|
# Provide a long helpful examples section
|
|
example_text = _('''CALLBACKS
|
|
|
|
All callback functions are of the same general format. For a command line
|
|
argument like
|
|
--foo-callback 'BODY'
|
|
|
|
the following code will be compiled and called:
|
|
def foo_callback(foo):
|
|
BODY
|
|
|
|
Thus, to replace 'Jon' with 'John' in author/committer/tagger names:
|
|
git filter-repo --name-callback 'return name.replace(b"Jon", b"John")'
|
|
|
|
To remove all 'Tested-by' tags in commit (or tag) messages:
|
|
git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", b"", message)'
|
|
|
|
To remove all .DS_Store files:
|
|
git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename'
|
|
|
|
Note that if BODY resolves to a filename, then the contents of that file
|
|
will be used as the BODY in the callback function.
|
|
|
|
For more detailed examples and explanations AND caveats, see
|
|
https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS
|
|
|
|
EXAMPLES
|
|
|
|
To get a bunch of reports mentioning renames that have occurred in
|
|
your repo and listing sizes of objects aggregated by any of path,
|
|
directory, extension, or blob-id:
|
|
git filter-repo --analyze
|
|
|
|
(These reports can help you choose how to filter your repo; it can
|
|
be useful to re-run this command after filtering to regenerate the
|
|
report and verify the changes look correct.)
|
|
|
|
To extract the history that touched just 'guides' and 'tools/releases':
|
|
git filter-repo --path guides/ --path tools/releases
|
|
|
|
To remove foo.zip and bar/baz/zips from every revision in history:
|
|
git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths
|
|
|
|
To replace the text 'password' with 'p455w0rd':
|
|
git filter-repo --replace-text <(echo "password==>p455w0rd")
|
|
|
|
To use the current version of the .mailmap file to update authors,
|
|
committers, and taggers throughout history and make it permanent:
|
|
git filter-repo --use-mailmap
|
|
|
|
To extract the history of 'src/', rename all files to have a new leading
|
|
directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and
|
|
add a 'my-module-' prefix to all tags:
|
|
git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-'
|
|
|
|
For more detailed examples and explanations, see
|
|
https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''')
|
|
|
|
# Create the basic parser
|
|
parser = argparse.ArgumentParser(description=summary,
|
|
usage = argparse.SUPPRESS,
|
|
add_help = False,
|
|
epilog = example_text,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
|
|
analyze = parser.add_argument_group(title=_("Analysis"))
|
|
analyze.add_argument('--analyze', action='store_true',
|
|
help=_("Analyze repository history and create a report that may be "
|
|
"useful in determining what to filter in a subsequent run. "
|
|
"Will not modify your repo."))
|
|
analyze.add_argument('--report-dir',
|
|
metavar='DIR_OR_FILE',
|
|
type=os.fsencode,
|
|
dest='report_dir',
|
|
help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis,"
|
|
"refuses to run if exists, --force delete existing dir first."))
|
|
|
|
path = parser.add_argument_group(title=_("Filtering based on paths "
|
|
"(see also --filename-callback)"),
|
|
description=textwrap.dedent(_("""
|
|
These options specify the paths to select. Note that much like git
|
|
itself, renames are NOT followed so you may need to specify multiple
|
|
paths, e.g. `--path olddir/ --path newdir/`
|
|
"""[1:])))
|
|
|
|
path.add_argument('--invert-paths', action='store_false', dest='inclusive',
|
|
help=_("Invert the selection of files from the specified "
|
|
"--path-{match,glob,regex} options below, i.e. only select "
|
|
"files matching none of those options."))
|
|
|
|
path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
|
|
type=os.fsencode,
|
|
action=FilteringOptions.AppendFilter, dest='path_changes',
|
|
help=_("Exact paths (files or directories) to include in filtered "
|
|
"history. Multiple --path options can be specified to get "
|
|
"a union of paths."))
|
|
path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode,
|
|
action=FilteringOptions.AppendFilter, dest='path_changes',
|
|
help=_("Glob of paths to include in filtered history. Multiple "
|
|
"--path-glob options can be specified to get a union of "
|
|
"paths."))
|
|
path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode,
|
|
action=FilteringOptions.AppendFilter, dest='path_changes',
|
|
help=_("Regex of paths to include in filtered history. Multiple "
|
|
"--path-regex options can be specified to get a union of "
|
|
"paths"))
|
|
path.add_argument('--use-base-name', action='store_true',
|
|
help=_("Match on file base name instead of full path from the top "
|
|
"of the repo. Incompatible with --path-rename, and "
|
|
"incompatible with matching against directory names."))
|
|
|
|
rename = parser.add_argument_group(title=_("Renaming based on paths "
|
|
"(see also --filename-callback)"))
|
|
rename.add_argument('--path-rename', '--path-rename-match',
|
|
metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode,
|
|
action=FilteringOptions.AppendFilter,
|
|
help=_("Path to rename; if filename or directory matches OLD_NAME "
|
|
"rename to NEW_NAME. Multiple --path-rename options can be "
|
|
"specified. NOTE: If you combine filtering options with "
|
|
"renaming ones, do not rely on a rename argument to select "
|
|
"paths; you also need a filter to select them."))
|
|
|
|
helpers = parser.add_argument_group(title=_("Path shortcuts"))
|
|
helpers.add_argument('--paths-from-file', metavar='FILENAME',
|
|
type=os.fsencode,
|
|
action=FilteringOptions.FileWithPathsFilter, dest='path_changes',
|
|
help=_("Specify several path filtering and renaming directives, one "
|
|
"per line. Lines with '==>' in them specify path renames, "
|
|
"and lines can begin with 'literal:' (the default), 'glob:', "
|
|
"or 'regex:' to specify different matching styles. Blank "
|
|
"lines and lines starting with a '#' are ignored."))
|
|
helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY',
|
|
action=FilteringOptions.HelperFilter, type=os.fsencode,
|
|
help=_("Only look at history that touches the given subdirectory "
|
|
"and treat that directory as the project root. Equivalent "
|
|
"to using '--path DIRECTORY/ --path-rename DIRECTORY/:'"))
|
|
helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY',
|
|
action=FilteringOptions.HelperFilter, type=os.fsencode,
|
|
help=_("Treat the project root as instead being under DIRECTORY. "
|
|
"Equivalent to using '--path-rename :DIRECTORY/'"))
|
|
|
|
contents = parser.add_argument_group(title=_("Content editing filters "
|
|
"(see also --blob-callback)"))
|
|
contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
|
|
help=_("A file with expressions that, if found, will be replaced. "
|
|
"By default, each expression is treated as literal text, "
|
|
"but 'regex:' and 'glob:' prefixes are supported. You can "
|
|
"end the line with '==>' and some replacement text to "
|
|
"choose a replacement choice other than the default of '{}'."
|
|
.format(decode(FilteringOptions.default_replace_text))))
|
|
contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE',
|
|
dest='max_blob_size', default=0,
|
|
help=_("Strip blobs (files) bigger than specified size (e.g. '5M', "
|
|
"'2G', etc)"))
|
|
contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME',
|
|
help=_("Read git object ids from each line of the given file, and "
|
|
"strip all of them from history"))
|
|
|
|
refrename = parser.add_argument_group(title=_("Renaming of refs "
|
|
"(see also --refname-callback)"))
|
|
refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode,
|
|
help=_("Rename tags starting with OLD to start with NEW. For "
|
|
"example, --tag-rename foo:bar will rename tag foo-1.2.3 "
|
|
"to bar-1.2.3; either OLD or NEW can be empty."))
|
|
|
|
messages = parser.add_argument_group(title=_("Filtering of commit messages "
|
|
"(see also --message-callback)"))
|
|
messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE',
|
|
help=_("A file with expressions that, if found in commit messages, "
|
|
"will be replaced. This file uses the same syntax as "
|
|
"--replace-text."))
|
|
messages.add_argument('--preserve-commit-hashes', action='store_true',
|
|
help=_("By default, since commits are rewritten and thus gain new "
|
|
"hashes, references to old commit hashes in commit messages "
|
|
"are replaced with new commit hashes (abbreviated to the same "
|
|
"length as the old reference). Use this flag to turn off "
|
|
"updating commit hashes in commit messages."))
|
|
messages.add_argument('--preserve-commit-encoding', action='store_true',
|
|
help=_("Do not reencode commit messages into UTF-8. By default, if "
|
|
"the commit object specifies an encoding for the commit "
|
|
"message, the message is re-encoded into UTF-8."))
|
|
|
|
people = parser.add_argument_group(title=_("Filtering of names & emails "
|
|
"(see also --name-callback "
|
|
"and --email-callback)"))
|
|
people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME',
|
|
type=os.fsencode,
|
|
help=_("Use specified mailmap file (see git-shortlog(1) for "
|
|
"details on the format) when rewriting author, committer, "
|
|
"and tagger names and emails. If the specified file is "
|
|
"part of git history, historical versions of the file will "
|
|
"be ignored; only the current contents are consulted."))
|
|
people.add_argument('--use-mailmap', dest='mailmap',
|
|
action='store_const', const=b'.mailmap',
|
|
help=_("Same as: '--mailmap .mailmap' "))
|
|
|
|
parents = parser.add_argument_group(title=_("Parent rewriting"))
|
|
parents.add_argument('--replace-refs', default=None,
|
|
choices=['delete-no-add', 'delete-and-add',
|
|
'update-no-add', 'update-or-add',
|
|
'update-and-add'],
|
|
help=_("Replace refs (see git-replace(1)) are used to rewrite "
|
|
"parents (unless turned off by the usual git mechanism); this "
|
|
"flag specifies what do do with those refs afterward. "
|
|
"Replace refs can either be deleted or updated to point at new "
|
|
"commit hashes. Also, new replace refs can be added for each "
|
|
"commit rewrite. With 'update-or-add', new replace refs are "
|
|
"only added for commit rewrites that aren't used to update an "
|
|
"existing replace ref. default is 'update-and-add' if "
|
|
"$GIT_DIR/filter-repo/already_ran does not exist; "
|
|
"'update-or-add' otherwise."))
|
|
parents.add_argument('--prune-empty', default='auto',
|
|
choices=['always', 'auto', 'never'],
|
|
help=_("Whether to prune empty commits. 'auto' (the default) means "
|
|
"only prune commits which become empty (not commits which were "
|
|
"empty in the original repo, unless their parent was pruned). "
|
|
"When the parent of a commit is pruned, the first non-pruned "
|
|
"ancestor becomes the new parent."))
|
|
parents.add_argument('--prune-degenerate', default='auto',
|
|
choices=['always', 'auto', 'never'],
|
|
help=_("Since merge commits are needed for history topology, they "
|
|
"are typically exempt from pruning. However, they can become "
|
|
"degenerate with the pruning of other commits (having fewer "
|
|
"than two parents, having one commit serve as both parents, or "
|
|
"having one parent as the ancestor of the other.) If such "
|
|
"merge commits have no file changes, they can be pruned. The "
|
|
"default ('auto') is to only prune empty merge commits which "
|
|
"become degenerate (not which started as such)."))
|
|
parents.add_argument('--no-ff', action='store_true',
|
|
help=_("Even if the first parent is or becomes an ancestor of another "
|
|
"parent, do not prune it. This modifies how "
|
|
"--prune-degenerate behaves, and may be useful in projects who "
|
|
"always use merge --no-ff."))
|
|
|
|
callback = parser.add_argument_group(title=_("Generic callback code snippets"))
|
|
callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing filenames; see CALLBACKS "
|
|
"sections below."))
|
|
callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing messages (both commit "
|
|
"messages and tag messages); see CALLBACKS section below."))
|
|
callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing names of people; see "
|
|
"CALLBACKS section below."))
|
|
callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing emails addresses; see "
|
|
"CALLBACKS section below."))
|
|
callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing refnames; see CALLBACKS "
|
|
"section below."))
|
|
|
|
callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing blob objects; see "
|
|
"CALLBACKS section below."))
|
|
callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing commit objects; see "
|
|
"CALLBACKS section below."))
|
|
callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing tag objects; see CALLBACKS "
|
|
"section below."))
|
|
callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE",
|
|
help=_("Python code body for processing reset objects; see "
|
|
"CALLBACKS section below."))
|
|
|
|
desc = _(
|
|
"Specifying alternate source or target locations implies --partial,\n"
|
|
"except that the normal default for --replace-refs is used. However,\n"
|
|
"unlike normal uses of --partial, this doesn't risk mixing old and new\n"
|
|
"history since the old and new histories are in different repositories.")
|
|
location = parser.add_argument_group(title=_("Location to filter from/to"),
|
|
description=desc)
|
|
location.add_argument('--source', type=os.fsencode,
|
|
help=_("Git repository to read from"))
|
|
location.add_argument('--target', type=os.fsencode,
|
|
help=_("Git repository to overwrite with filtered history"))
|
|
|
|
misc = parser.add_argument_group(title=_("Miscellaneous options"))
|
|
misc.add_argument('--help', '-h', action='store_true',
|
|
help=_("Show this help message and exit."))
|
|
misc.add_argument('--version', action='store_true',
|
|
help=_("Display filter-repo's version and exit."))
|
|
misc.add_argument('--force', '-f', action='store_true',
|
|
help=_("Rewrite repository history even if the current repo does not "
|
|
"look like a fresh clone. History rewriting is irreversible "
|
|
"(and includes immediate pruning of reflogs and old objects), "
|
|
"so be cautious about using this flag."))
|
|
misc.add_argument('--partial', action='store_true',
|
|
help=_("Do a partial history rewrite, resulting in the mixture of "
|
|
"old and new history. This implies a default of "
|
|
"update-no-add for --replace-refs, disables rewriting "
|
|
"refs/remotes/origin/* to refs/heads/*, disables removing "
|
|
"of the 'origin' remote, disables removing unexported refs, "
|
|
"disables expiring the reflog, and disables the automatic "
|
|
"post-filter gc. Also, this modifies --tag-rename and "
|
|
"--refname-callback options such that instead of replacing "
|
|
"old refs with new refnames, it will instead create new "
|
|
"refs and keep the old ones around. Use with caution."))
|
|
# WARNING: --refs presents a problem with become-degenerate pruning:
|
|
# * Excluding a commit also excludes its ancestors so when some other
|
|
# commit has an excluded ancestor as a parent we have no way of
|
|
# knowing what it is an ancestor of without doing a special
|
|
# full-graph walk.
|
|
misc.add_argument('--refs', nargs='+',
|
|
help=_("Limit history rewriting to the specified refs. Implies "
|
|
"--partial. In addition to the normal caveats of --partial "
|
|
"(mixing old and new history, no automatic remapping of "
|
|
"refs/remotes/origin/* to refs/heads/*, etc.), this also may "
|
|
"cause problems for pruning of degenerate empty merge "
|
|
"commits when negative revisions are specified."))
|
|
|
|
misc.add_argument('--dry-run', action='store_true',
|
|
help=_("Do not change the repository. Run `git fast-export` and "
|
|
"filter its output, and save both the original and the "
|
|
"filtered version for comparison. This also disables "
|
|
"rewriting commit messages due to not knowing new commit "
|
|
"IDs and disables filtering of some empty commits due to "
|
|
"inability to query the fast-import backend." ))
|
|
misc.add_argument('--debug', action='store_true',
|
|
help=_("Print additional information about operations being "
|
|
"performed and commands being run. When used together "
|
|
"with --dry-run, also show extra information about what "
|
|
"would be run."))
|
|
# WARNING: --state-branch has some problems:
|
|
# * It does not work well with manually inserted objects (user creating
|
|
# Blob() or Commit() or Tag() objects and calling
|
|
# RepoFilter.insert(obj) on them).
|
|
# * It does not work well with multiple source or multiple target repos
|
|
# * It doesn't work so well with pruning become-empty commits (though
|
|
# --refs doesn't work so well with it either)
|
|
# These are probably fixable, given some work (e.g. re-importing the
|
|
# graph at the beginning to get the AncestryGraph right, doing our own
|
|
# export of marks instead of using fast-export --export-marks, etc.), but
|
|
# for now just hide the option.
|
|
misc.add_argument('--state-branch',
|
|
#help=_("Enable incremental filtering by saving the mapping of old "
|
|
# "to new objects to the specified branch upon exit, and"
|
|
# "loading that mapping from that branch (if it exists) "
|
|
# "upon startup."))
|
|
help=argparse.SUPPRESS)
|
|
misc.add_argument('--stdin', action='store_true',
|
|
help=_("Instead of running `git fast-export` and filtering its "
|
|
"output, filter the fast-export stream from stdin. The "
|
|
"stdin must be in the expected input format (e.g. it needs "
|
|
"to include original-oid directives)."))
|
|
misc.add_argument('--quiet', action='store_true',
|
|
help=_("Pass --quiet to other git commands called"))
|
|
return parser
|
|
|
|
@staticmethod
|
|
def sanity_check_args(args):
|
|
if args.analyze and args.path_changes:
|
|
raise SystemExit(_("Error: --analyze is incompatible with --path* flags; "
|
|
"it's a read-only operation."))
|
|
if args.analyze and args.stdin:
|
|
raise SystemExit(_("Error: --analyze is incompatible with --stdin."))
|
|
# If no path_changes are found, initialize with empty list but mark as
|
|
# not inclusive so that all files match
|
|
if args.path_changes is None:
|
|
args.path_changes = []
|
|
args.inclusive = False
|
|
else:
|
|
# Similarly, if we have no filtering paths, then no path should be
|
|
# filtered out. Based on how newname() works, the easiest way to
|
|
# achieve that is setting args.inclusive to False.
|
|
if not any(x[0] == 'filter' for x in args.path_changes):
|
|
args.inclusive = False
|
|
# Also check for incompatible --use-base-name and --path-rename flags.
|
|
if args.use_base_name:
|
|
if any(x[0] == 'rename' for x in args.path_changes):
|
|
raise SystemExit(_("Error: --use-base-name and --path-rename are "
|
|
"incompatible."))
|
|
# Also throw some sanity checks on git version here;
|
|
# PERF: remove these checks once new enough git versions are common
|
|
p = subproc.Popen('git fast-export -h'.split(),
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
|
output = p.stdout.read()
|
|
if b'--anonymize-map' not in output: # pragma: no cover
|
|
global date_format_permissive
|
|
date_format_permissive = False
|
|
if b'--mark-tags' not in output: # pragma: no cover
|
|
global write_marks
|
|
write_marks = False
|
|
if args.state_branch:
|
|
# We need a version of git-fast-export with --mark-tags
|
|
raise SystemExit(_("Error: need git >= 2.24.0"))
|
|
if b'--reencode' not in output: # pragma: no cover
|
|
if args.preserve_commit_encoding:
|
|
# We need a version of git-fast-export with --reencode
|
|
raise SystemExit(_("Error: need git >= 2.23.0"))
|
|
else:
|
|
# Set args.preserve_commit_encoding to None which we'll check for later
|
|
# to avoid passing --reencode=yes to fast-export (that option was the
|
|
# default prior to git-2.23)
|
|
args.preserve_commit_encoding = None
|
|
# If we don't have fast-export --reencode, we may also be missing
|
|
# diff-tree --combined-all-paths, which is even more important...
|
|
p = subproc.Popen('git diff-tree -h'.split(),
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
|
output = p.stdout.read()
|
|
if b'--combined-all-paths' not in output:
|
|
# We need a version of git-diff-tree with --combined-all-paths
|
|
raise SystemExit(_("Error: need git >= 2.22.0"))
|
|
# End of sanity checks on git version
|
|
if args.max_blob_size:
|
|
suffix = args.max_blob_size[-1]
|
|
if suffix not in '1234567890':
|
|
mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
|
|
if suffix not in mult:
|
|
raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than"
|
|
" argument %s")
|
|
% args.max_blob_size)
|
|
args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix]
|
|
else:
|
|
args.max_blob_size = int(args.max_blob_size)
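# For example, '--strip-blobs-bigger-than 5M' yields 5 * 1024**2 = 5242880
# bytes, '2G' yields 2 * 1024**3, and a bare number such as '123456' is used
# as-is.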
|
|
|
|
@staticmethod
|
|
def get_replace_text(filename):
|
|
replace_literals = []
|
|
replace_regexes = []
|
|
with open(filename, 'br') as f:
|
|
for line in f:
|
|
line = line.rstrip(b'\r\n')
|
|
|
|
# Determine the replacement
|
|
replacement = FilteringOptions.default_replace_text
|
|
if b'==>' in line:
|
|
line, replacement = line.rsplit(b'==>', 1)
|
|
|
|
# See if we need to match via regex
|
|
regex = None
|
|
if line.startswith(b'regex:'):
|
|
regex = line[6:]
|
|
elif line.startswith(b'glob:'):
|
|
regex = glob_to_regex(line[5:])
|
|
if regex:
|
|
replace_regexes.append((re.compile(regex), replacement))
|
|
else:
|
|
# Otherwise, find the literal we need to replace
|
|
if line.startswith(b'literal:'):
|
|
line = line[8:]
|
|
if not line:
|
|
continue
|
|
replace_literals.append((line, replacement))
|
|
return {'literals': replace_literals, 'regexes': replace_regexes}
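# Illustrative --replace-text file (hypothetical contents):
#   password==>p455w0rd
#   regex:\bAKIA[0-9A-Z]{16}\b
# would parse to roughly
#   {'literals': [(b'password', b'p455w0rd')],
#    'regexes': [(re.compile(br'\bAKIA[0-9A-Z]{16}\b'), b'***REMOVED***')]}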
|
|
|
|
@staticmethod
|
|
def get_paths_from_file(filename):
|
|
new_path_changes = []
|
|
with open(filename, 'br') as f:
|
|
for line in f:
|
|
line = line.rstrip(b'\r\n')
|
|
|
|
# Skip blank lines
|
|
if not line:
|
|
continue
|
|
# Skip comment lines
|
|
if line.startswith(b'#'):
|
|
continue
|
|
|
|
# Determine the replacement
|
|
match_type, repl = 'literal', None
|
|
if b'==>' in line:
|
|
line, repl = line.rsplit(b'==>', 1)
|
|
|
|
# See if we need to match via regex
|
|
match_type = 'match' # a.k.a. 'literal'
|
|
if line.startswith(b'regex:'):
|
|
match_type = 'regex'
|
|
match = re.compile(line[6:])
|
|
elif line.startswith(b'glob:'):
|
|
match_type = 'glob'
|
|
match = line[5:]
|
|
if repl:
|
|
raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename)))
|
|
else:
|
|
if line.startswith(b'literal:'):
|
|
match = line[8:]
|
|
else:
|
|
match = line
|
|
if repl is not None:
|
|
if match and repl and match.endswith(b'/') != repl.endswith(b'/'):
|
|
raise SystemExit(_("Error: When rename directories, if OLDNAME "
|
|
"and NEW_NAME are both non-empty and either "
|
|
"ends with a slash then both must."))
|
|
|
|
# Record the filter or rename
|
|
if repl is not None:
|
|
new_path_changes.append(['rename', match_type, (match, repl)])
|
|
else:
|
|
new_path_changes.append(['filter', match_type, match])
|
|
if match_type == 'glob' and not match.endswith(b'*'):
|
|
extension = b'*' if match.endswith(b'/') else b'/*'
|
|
new_path_changes.append(['filter', match_type, match+extension])
|
|
return new_path_changes
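# Illustrative --paths-from-file input (hypothetical contents):
#   src/
#   glob:*.png
#   docs/old.txt==>docs/new.txt
# would yield roughly
#   [['filter', 'match', b'src/'],
#    ['filter', 'glob', b'*.png'], ['filter', 'glob', b'*.png/*'],
#    ['rename', 'match', (b'docs/old.txt', b'docs/new.txt')]]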
|
|
|
|
@staticmethod
|
|
def default_options():
|
|
return FilteringOptions.parse_args([], error_on_empty = False)
|
|
|
|
@staticmethod
|
|
def parse_args(input_args, error_on_empty = True):
|
|
parser = FilteringOptions.create_arg_parser()
|
|
if not input_args and error_on_empty:
|
|
parser.print_usage()
|
|
raise SystemExit(_("No arguments specified."))
|
|
args = parser.parse_args(input_args)
|
|
if args.help:
|
|
parser.print_help()
|
|
raise SystemExit()
|
|
if args.version:
|
|
GitUtils.print_my_version()
|
|
raise SystemExit()
|
|
FilteringOptions.sanity_check_args(args)
|
|
if args.mailmap:
|
|
args.mailmap = MailmapInfo(args.mailmap)
|
|
if args.replace_text:
|
|
args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
|
|
if args.replace_message:
|
|
args.replace_message = FilteringOptions.get_replace_text(args.replace_message)
|
|
if args.strip_blobs_with_ids:
|
|
with open(args.strip_blobs_with_ids, 'br') as f:
|
|
args.strip_blobs_with_ids = set(f.read().split())
|
|
else:
|
|
args.strip_blobs_with_ids = set()
|
|
if (args.partial or args.refs) and not args.replace_refs:
|
|
args.replace_refs = 'update-no-add'
|
|
args.repack = not (args.partial or args.refs)
|
|
if args.refs or args.source or args.target:
|
|
args.partial = True
|
|
if not args.refs:
|
|
args.refs = ['--all']
|
|
return args
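# Sketch of library usage (see the API caveat at the top of this file; the
# option values here are hypothetical):
#   import git_filter_repo as fr
#   args = fr.FilteringOptions.parse_args(['--force', '--path', 'src/'])
#   fr.RepoFilter(args).run()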
|
|
|
|
class RepoAnalyze(object):
|
|
|
|
# First, several helper functions for analyze_commit()
|
|
|
|
@staticmethod
|
|
def equiv_class(stats, filename):
|
|
return stats['equivalence'].get(filename, (filename,))
|
|
|
|
@staticmethod
|
|
def setup_equivalence_for_rename(stats, oldname, newname):
|
|
# if A is renamed to B and B is renamed to C, then the user thinks of
|
|
# A, B, and C as all being different names for the same 'file'. We record
|
|
# this as an equivalence class:
|
|
# stats['equivalence'][name] = (A,B,C)
|
|
# for name being each of A, B, and C.
|
|
old_tuple = stats['equivalence'].get(oldname, ())
|
|
if newname in old_tuple:
|
|
return
|
|
elif old_tuple:
|
|
new_tuple = tuple(list(old_tuple)+[newname])
|
|
else:
|
|
new_tuple = (oldname, newname)
|
|
for f in new_tuple:
|
|
stats['equivalence'][f] = new_tuple
|
|
|
|
@staticmethod
|
|
def setup_or_update_rename_history(stats, commit, oldname, newname):
|
|
rename_commits = stats['rename_history'].get(oldname, set())
|
|
rename_commits.add(commit)
|
|
stats['rename_history'][oldname] = rename_commits
|
|
|
|
@staticmethod
|
|
def handle_renames(stats, commit, change_types, filenames):
|
|
for index, change_type in enumerate(change_types):
|
|
if change_type == ord(b'R'):
|
|
oldname, newname = filenames[index], filenames[-1]
|
|
RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
|
|
RepoAnalyze.setup_or_update_rename_history(stats, commit,
|
|
oldname, newname)
|
|
|
|
@staticmethod
|
|
def handle_file(stats, graph, commit, modes, shas, filenames):
|
|
mode, sha, filename = modes[-1], shas[-1], filenames[-1]
|
|
|
|
# Figure out kind of deletions to undo for this file, and update lists
|
|
# of all-names-by-sha and all-filenames
|
|
delmode = 'tree_deletions'
|
|
if mode != b'040000':
|
|
delmode = 'file_deletions'
|
|
stats['names'][sha].add(filename)
|
|
stats['allnames'].add(filename)
|
|
|
|
# If the file (or equivalence class of files) was recorded as deleted,
|
|
# clearly it isn't anymore
|
|
equiv = RepoAnalyze.equiv_class(stats, filename)
|
|
for f in equiv:
|
|
stats[delmode].pop(f, None)
|
|
|
|
# If we get a modify/add for a path that was renamed, we may need to break
|
|
# the equivalence class. However, if the modify/add was on a branch that
|
|
# doesn't have the rename in its history, we are still okay.
|
|
need_to_break_equivalence = False
|
|
if equiv[-1] != filename:
|
|
for rename_commit in stats['rename_history'][filename]:
|
|
if graph.is_ancestor(rename_commit, commit):
|
|
need_to_break_equivalence = True
|
|
|
|
if need_to_break_equivalence:
|
|
for f in equiv:
|
|
if f in stats['equivalence']:
|
|
del stats['equivalence'][f]
|
|
|
|
@staticmethod
|
|
def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
|
graph.add_commit_and_parents(commit, parents)
|
|
for change in file_changes:
|
|
modes, shas, change_types, filenames = change
|
|
if len(parents) == 1 and change_types.startswith(b'R'):
|
|
change_types = b'R' # remove the rename score; we don't care
|
|
if modes[-1] == b'160000':
|
|
continue
|
|
elif modes[-1] == b'000000':
|
|
# Track when files/directories are deleted
|
|
for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
|
|
if any(x == b'040000' for x in modes[0:-1]):
|
|
stats['tree_deletions'][f] = date
|
|
else:
|
|
stats['file_deletions'][f] = date
|
|
elif change_types.strip(b'AMT') == b'':
|
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
|
elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'':
|
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
|
elif change_types.strip(b'RAMT') == b'':
|
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
|
RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
|
|
else:
|
|
raise SystemExit(_("Unhandled change type(s): %(change_type)s "
|
|
"(in commit %(commit)s)")
|
|
% ({'change_type': change_types, 'commit': commit})
|
|
) # pragma: no cover
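# Note: the change_types letters come from `git diff-tree --raw`: A=added,
# M=modified, T=typechange, R=renamed (the similarity score is stripped above
# for single-parent commits). A final mode of 160000 marks a submodule
# (gitlink) entry, which is skipped, and a final mode of 000000 marks a
# deletion.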
|
|
|
|
@staticmethod
|
|
def gather_data(args):
|
|
unpacked_size, packed_size = GitUtils.get_blob_sizes()
|
|
stats = {'names': collections.defaultdict(set),
|
|
'allnames' : set(),
|
|
'file_deletions': {},
|
|
'tree_deletions': {},
|
|
'equivalence': {},
|
|
'rename_history': collections.defaultdict(set),
|
|
'unpacked_size': unpacked_size,
|
|
'packed_size': packed_size,
|
|
'num_commits': 0}
|
|
|
|
# Setup the rev-list/diff-tree process
|
|
processed_commits_msg = _("Processed %d commits")
|
|
commit_parse_progress = ProgressWriter()
|
|
num_commits = 0
|
|
cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
|
|
' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
|
|
' --date=short -M -t -c --raw --combined-all-paths')
|
|
dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
|
|
f = dtp.stdout
|
|
line = f.readline()
|
|
if not line:
|
|
raise SystemExit(_("Nothing to analyze; repository is empty."))
|
|
cont = bool(line)
|
|
graph = AncestryGraph()
|
|
while cont:
|
|
commit = line.rstrip()
|
|
parents = f.readline().split()
|
|
date = f.readline().rstrip()
|
|
|
|
# We expect a blank line next; if we get a non-blank line then
|
|
# this commit modified no files and we need to move on to the next.
|
|
# If there is no line, we've reached end-of-input.
|
|
line = f.readline()
|
|
if not line:
|
|
cont = False
|
|
line = line.rstrip()
|
|
|
|
# If we haven't reached end of input, and we got a blank line meaning
|
|
# a commit that has modified files, then get the file changes associated
|
|
# with this commit.
|
|
file_changes = []
|
|
if cont and not line:
|
|
cont = False
|
|
for line in f:
|
|
if not line.startswith(b':'):
|
|
cont = True
|
|
break
|
|
n = 1+max(1, len(parents))
|
|
assert line.startswith(b':'*(n-1))
|
|
relevant = line[n-1:-1]
|
|
splits = relevant.split(None, n)
|
|
modes = splits[0:n]
|
|
splits = splits[n].split(None, n)
|
|
shas = splits[0:n]
|
|
splits = splits[n].split(b'\t')
|
|
change_types = splits[0]
|
|
filenames = [PathQuoting.dequote(x) for x in splits[1:]]
|
|
file_changes.append([modes, shas, change_types, filenames])
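# Illustrative raw line for an ordinary single-parent commit (hypothetical
# hashes), as split apart by the loop above:
#   :100644 100644 <oldsha> <newsha> M\tsrc/foo.c
# giving modes=[b'100644', b'100644'], shas=[<oldsha>, <newsha>],
# change_types=b'M', filenames=[b'src/foo.c']. Merge commits carry one extra
# leading ':' plus one extra mode and sha per additional parent.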
|
|
|
|
# If someone is trying to analyze a subset of the history, make sure
|
|
# to avoid dying on commits with parents that we haven't seen before
|
|
if args.refs:
|
|
graph.record_external_commits([p for p in parents
|
|
if p not in graph.value])
|
|
|
|
# Analyze this commit and update progress
|
|
RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
|
|
file_changes)
|
|
num_commits += 1
|
|
commit_parse_progress.show(processed_commits_msg % num_commits)
|
|
|
|
# Show the final commits processed message and record the number of commits
|
|
commit_parse_progress.finish()
|
|
stats['num_commits'] = num_commits
|
|
|
|
# Close the output, ensure rev-list|diff-tree pipeline completed successfully
|
|
dtp.stdout.close()
|
|
if dtp.wait():
|
|
raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover
|
|
|
|
return stats
|
|
|
|
@staticmethod
|
|
def write_report(reportdir, stats):
|
|
def datestr(datetimestr):
|
|
return datetimestr if datetimestr else _('<present>').encode()
|
|
|
|
def dirnames(path):
|
|
while True:
|
|
path = os.path.dirname(path)
|
|
yield path
|
|
if path == b'':
|
|
break
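# For example, dirnames(b'a/b/c.txt') yields b'a/b', then b'a', then b''
# (the toplevel), so a blob's size gets attributed to every enclosing
# directory below.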
|
|
|
|
# Compute aggregate size information for paths, extensions, and dirs
|
|
total_size = {'packed': 0, 'unpacked': 0}
|
|
path_size = {'packed': collections.defaultdict(int),
|
|
'unpacked': collections.defaultdict(int)}
|
|
ext_size = {'packed': collections.defaultdict(int),
|
|
'unpacked': collections.defaultdict(int)}
|
|
dir_size = {'packed': collections.defaultdict(int),
|
|
'unpacked': collections.defaultdict(int)}
|
|
for sha in stats['names']:
|
|
size = {'packed': stats['packed_size'][sha],
|
|
'unpacked': stats['unpacked_size'][sha]}
|
|
for which in ('packed', 'unpacked'):
|
|
for name in stats['names'][sha]:
|
|
total_size[which] += size[which]
|
|
path_size[which][name] += size[which]
|
|
basename, ext = os.path.splitext(name)
|
|
ext_size[which][ext] += size[which]
|
|
for dirname in dirnames(name):
|
|
dir_size[which][dirname] += size[which]
|
|
|
|
# Determine if and when extensions and directories were deleted
|
|
ext_deleted_data = {}
|
|
for name in stats['allnames']:
|
|
when = stats['file_deletions'].get(name, None)
|
|
|
|
# Update the extension
|
|
basename, ext = os.path.splitext(name)
|
|
if when is None:
|
|
ext_deleted_data[ext] = None
|
|
elif ext in ext_deleted_data:
|
|
if ext_deleted_data[ext] is not None:
|
|
ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
|
|
else:
|
|
ext_deleted_data[ext] = when
|
|
|
|
dir_deleted_data = {}
|
|
for name in dir_size['packed']:
|
|
dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
|
|
|
|
with open(os.path.join(reportdir, b"README"), 'bw') as f:
|
|
# Give a basic overview of this file
|
|
f.write(b"== %s ==\n" % _("Overall Statistics").encode())
|
|
f.write((" %s: %d\n" % (_("Number of commits"),
|
|
stats['num_commits'])).encode())
|
|
f.write((" %s: %d\n" % (_("Number of filenames"),
|
|
len(path_size['packed']))).encode())
|
|
f.write((" %s: %d\n" % (_("Number of directories"),
|
|
len(dir_size['packed']))).encode())
|
|
f.write((" %s: %d\n" % (_("Number of file extensions"),
|
|
len(ext_size['packed']))).encode())
|
|
f.write(b"\n")
|
|
f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"),
|
|
total_size['unpacked'])).encode())
|
|
f.write((" %s: %d\n" % (_("Total packed size (bytes)"),
|
|
total_size['packed'])).encode())
|
|
f.write(b"\n")
|
|
|
|
# Mention issues with the report
|
|
f.write(("== %s ==\n" % _("Caveats")).encode())
|
|
f.write(("=== %s ===\n" % _("Sizes")).encode())
|
|
f.write(textwrap.dedent(_("""
|
|
Packed size represents what size your repository would be if no
|
|
trees, commits, tags, or other metadata were included (though it may
|
|
fail to represent de-duplication; see below). It also represents the
|
|
current packing, which may be suboptimal if you haven't gc'ed for a
|
|
while.
|
|
|
|
Unpacked size represents what size your repository would be if no
|
|
trees, commits, tags, or other metadata were included AND if no
|
|
files were packed; i.e., without delta-ing or compression.
|
|
|
|
Both unpacked and packed sizes can be slightly misleading. Deleting
|
|
a blob from history will not save as much space as the unpacked size,
|
|
because it is normally stored in packed form. Also,
|
|
deleting a blob from history may not save as much space as its packed
|
|
size either, because another blob could be stored as a delta against
|
|
that blob, so when you remove one blob another blob's packed size may
|
|
grow.
|
|
|
|
Also, the sum of the packed sizes can add up to more than the
|
|
repository size; if the same contents appeared in the repository in
|
|
multiple places, git will automatically de-dupe and store only one
|
|
copy, while the way sizes are added in this analysis adds the size
|
|
for each file path that has those contents. Further, if a file is
|
|
ever reverted to a previous version's contents, the previous
|
|
version's size will be counted multiple times in this analysis, even
|
|
though git will only store it once.
|
|
""")[1:]).encode())
|
|
f.write(b"\n")
|
|
f.write(("=== %s ===\n" % _("Deletions")).encode())
|
|
f.write(textwrap.dedent(_("""
|
|
Whether a file is deleted is not a binary quality, since it can be
|
|
deleted on some branches but still exist in others. Also, it might
|
|
exist in an old tag, but have been deleted in versions newer than
|
|
that. More thorough tracking could be done, including looking at
|
|
merge commits where one side of history deleted and the other modified,
|
|
in order to give a more holistic picture of deletions. However, that
|
|
algorithm would not only be more complex to implement, it'd also be
|
|
quite difficult to present and interpret by users. Since --analyze
|
|
is just about getting a high-level rough picture of history, it instead
|
|
implements the simplistic rule that is good enough for 98% of cases:
|
|
A file is marked as deleted if the last commit in the fast-export
|
|
stream that mentions the file lists it as deleted.
|
|
This makes it dependent on topological ordering, but generally gives
|
|
the "right" answer.
|
|
""")[1:]).encode())
|
|
f.write(b"\n")
|
|
f.write(("=== %s ===\n" % _("Renames")).encode())
|
|
f.write(textwrap.dedent(_("""
|
|
Renames share the same non-binary nature that deletions do, plus
|
|
additional challenges:
|
|
* If the renamed file is renamed again, instead of just two names for
|
|
a path you can have three or more.
|
|
* Rename pairs of the form (oldname, newname) that we consider to be
|
|
different names of the "same file" might only be valid over certain
|
|
commit ranges. For example, if a new commit reintroduces a file
|
|
named oldname, then new versions of oldname aren't the "same file"
|
|
anymore. We could try to portray this to the user, but it's easier
|
|
to just break the pairing and only report unbroken
|
|
rename pairings to the user.
|
|
* The ability for users to rename files differently in different
|
|
branches means that our chains of renames will not necessarily be
|
|
linear but may branch out.
|
|
""")[1:]).encode())
|
|
f.write(b"\n")
|
|
|
|
# Equivalence classes for names, so if folks only want to keep a
|
|
# certain set of paths, they know the old names they want to include
|
|
# too.
|
|
with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f:
|
|
seen = set()
|
|
for pathname,equiv_group in sorted(stats['equivalence'].items(),
|
|
key=lambda x:(x[1], x[0])):
|
|
if equiv_group in seen:
|
|
continue
|
|
seen.add(equiv_group)
|
|
f.write(("{} ->\n ".format(decode(equiv_group[0])) +
|
|
"\n ".join(decode(x) for x in equiv_group[1:]) +
|
|
"\n").encode())
|
|
|
|
# List directories in reverse sorted order of unpacked size
|
|
with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f:
|
|
msg = "=== %s ===\n" % _("Deleted directories by reverse size")
|
|
f.write(msg.encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
|
|
f.write(msg.encode())
|
|
for dirname, size in sorted(dir_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
if (dir_deleted_data[dirname]):
|
|
f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
|
|
size,
|
|
datestr(dir_deleted_data[dirname]),
|
|
dirname or _('<toplevel>').encode()))
|
|
|
|
with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f:
|
|
f.write(("=== %s ===\n" % _("All directories by reverse size")).encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
|
|
f.write(msg.encode())
|
|
for dirname, size in sorted(dir_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
|
|
size,
|
|
datestr(dir_deleted_data[dirname]),
|
|
dirname or _("<toplevel>").encode()))
|
|
|
|
# List extensions in reverse sorted order of unpacked size
|
|
with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f:
|
|
msg = "=== %s ===\n" % _("Deleted extensions by reverse size")
|
|
f.write(msg.encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
|
|
f.write(msg.encode())
|
|
for extname, size in sorted(ext_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
if (ext_deleted_data[extname]):
|
|
f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
|
|
size,
|
|
datestr(ext_deleted_data[extname]),
|
|
extname or _('<no extension>').encode()))
|
|
|
|
with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f:
|
|
f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
|
|
f.write(msg.encode())
|
|
for extname, size in sorted(ext_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
|
|
size,
|
|
datestr(ext_deleted_data[extname]),
|
|
extname or _('<no extension>').encode()))
|
|
|
|
# List files in reverse sorted order of unpacked size
|
|
with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f:
|
|
msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size")
|
|
f.write(msg.encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n")
|
|
f.write(msg.encode())
|
|
for pathname, size in sorted(path_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
when = stats['file_deletions'].get(pathname, None)
|
|
if when:
|
|
f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
|
|
size,
|
|
datestr(when),
|
|
pathname))
|
|
|
|
with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f:
|
|
msg = "=== %s ===\n" % _("All paths by reverse accumulated size")
|
|
f.write(msg.encode())
|
|
msg = _("Format: unpacked size, packed size, date deleted, path name\n")
|
|
f.write(msg.encode())
|
|
for pathname, size in sorted(path_size['packed'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
when = stats['file_deletions'].get(pathname, None)
|
|
f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
|
|
size,
|
|
datestr(when),
|
|
pathname))
|
|
|
|
# List of filenames and sizes in descending order
|
|
with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f:
|
|
f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode())
|
|
f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode())
|
|
for sha, size in sorted(stats['packed_size'].items(),
|
|
key=lambda x:(x[1],x[0]), reverse=True):
|
|
if sha not in stats['names']:
|
|
# Some objects in the repository might not be referenced, or not
|
|
# referenced by the branches/tags the user cares about; skip them.
|
|
continue
|
|
names_with_sha = stats['names'][sha]
|
|
if len(names_with_sha) == 1:
|
|
names_with_sha = names_with_sha.pop()
|
|
else:
|
|
names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']'
|
|
f.write(b" %s %10d %10d %s\n" % (sha,
|
|
stats['unpacked_size'][sha],
|
|
size,
|
|
names_with_sha))
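# An illustrative blob-shas-and-paths.txt line (hypothetical hash and sizes):
#   7ae63e59...    10485760   10240000 [big.iso, mirror/big.iso]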
|
|
|
|
@staticmethod
|
|
def run(args):
|
|
if args.report_dir:
|
|
reportdir = args.report_dir
|
|
else:
|
|
git_dir = GitUtils.determine_git_dir(b'.')
|
|
|
|
# Create the report directory as necessary
|
|
results_tmp_dir = os.path.join(git_dir, b'filter-repo')
|
|
if not os.path.isdir(results_tmp_dir):
|
|
os.mkdir(results_tmp_dir)
|
|
reportdir = os.path.join(results_tmp_dir, b"analysis")
|
|
|
|
if os.path.isdir(reportdir):
|
|
if args.force:
|
|
sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir))
|
|
shutil.rmtree(reportdir)
|
|
else:
|
|
sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir))
|
|
sys.exit(1)
|
|
|
|
os.mkdir(reportdir)
|
|
|
|
# Gather the data we need
|
|
stats = RepoAnalyze.gather_data(args)
|
|
|
|
# Write the reports
|
|
sys.stdout.write(_("Writing reports to %s...") % decode(reportdir))
|
|
sys.stdout.flush()
|
|
RepoAnalyze.write_report(reportdir, stats)
|
|
sys.stdout.write(_("done.\n"))
|
|
|
|
class InputFileBackup:
|
|
def __init__(self, input_file, output_file):
|
|
self.input_file = input_file
|
|
self.output_file = output_file
|
|
|
|
def close(self):
|
|
self.input_file.close()
|
|
self.output_file.close()
|
|
|
|
def read(self, size):
|
|
output = self.input_file.read(size)
|
|
self.output_file.write(output)
|
|
return output
|
|
|
|
def readline(self):
|
|
line = self.input_file.readline()
|
|
self.output_file.write(line)
|
|
return line
|
|
|
|
class DualFileWriter:
|
|
def __init__(self, file1, file2):
|
|
self.file1 = file1
|
|
self.file2 = file2
|
|
|
|
def write(self, *args):
|
|
self.file1.write(*args)
|
|
self.file2.write(*args)
|
|
|
|
def flush(self):
|
|
self.file1.flush()
|
|
self.file2.flush()
|
|
|
|
def close(self):
|
|
self.file1.close()
|
|
self.file2.close()
|
|
|
|
class RepoFilter(object):
|
|
def __init__(self,
|
|
args,
|
|
filename_callback = None,
|
|
message_callback = None,
|
|
name_callback = None,
|
|
email_callback = None,
|
|
refname_callback = None,
|
|
blob_callback = None,
|
|
commit_callback = None,
|
|
tag_callback = None,
|
|
reset_callback = None,
|
|
done_callback = None):
|
|
|
|
self._args = args
|
|
|
|
# Repo we are exporting
|
|
self._repo_working_dir = None
|
|
|
|
# Store callbacks for acting on objects printed by FastExport
|
|
self._blob_callback = blob_callback
|
|
self._commit_callback = commit_callback
|
|
self._tag_callback = tag_callback
|
|
self._reset_callback = reset_callback
|
|
self._done_callback = done_callback
|
|
|
|
# Store callbacks for acting on slices of FastExport objects
|
|
self._filename_callback = filename_callback # filenames from commits
|
|
self._message_callback = message_callback # commit OR tag message
|
|
self._name_callback = name_callback # author, committer, tagger
|
|
self._email_callback = email_callback # author, committer, tagger
|
|
self._refname_callback = refname_callback # from commit/tag/reset
|
|
self._handle_arg_callbacks()
|
|
|
|
# Defaults for input
|
|
self._input = None
|
|
self._fep = None # Fast Export Process
|
|
self._fe_orig = None # Path to where original fast-export output stored
|
|
self._fe_filt = None # Path to where filtered fast-export output stored
|
|
self._parser = None # FastExportParser object we are working with
|
|
|
|
# Defaults for output
|
|
self._output = None
|
|
self._fip = None # Fast Import Process
|
|
self._import_pipes = None
|
|
self._managed_output = True
|
|
|
|
# A tuple of (depth, list-of-ancestors). Commits and ancestors are
|
|
# identified by their id (their 'mark' in fast-export or fast-import
|
|
# speak). The depth of a commit is one more than the max depth of any
|
|
# of its ancestors.
|
|
self._graph = AncestryGraph()
|
|
# Another one, for ancestry of commits in the original repo
|
|
self._orig_graph = AncestryGraph()
|
|
|
|
# Names of files that were tweaked in any commit; such paths could lead
|
|
# to subsequent commits being empty
|
|
self._files_tweaked = set()
|
|
|
|
# A set of commit hash pairs (oldhash, newhash) which used to be merge
|
|
# commits but due to filtering were turned into non-merge commits.
|
|
# The commits probably have suboptimal commit messages (e.g. "Merge branch
|
|
# next into master").
|
|
self._commits_no_longer_merges = []
|
|
|
|
# A dict of original_ids to new_ids; filtering commits means getting
|
|
# new commit hash (sha1sums), and we record the mapping both for
|
|
# diagnostic purposes and so we can rewrite commit messages. Note that
|
|
# the new_id can be None rather than a commit hash if the original
|
|
# commit became empty and was pruned or was otherwise dropped.
|
|
self._commit_renames = {}
|
|
|
|
# A set of original_ids for which we have not yet gotten the
|
|
# new_ids; we use OrderedDict because we need to know the order of
|
|
# insertion, but the values are always ignored (and set to None).
|
|
# If there was an OrderedSet class, I'd use it instead.
|
|
self._pending_renames = collections.OrderedDict()
|
|
|
|
# A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
|
|
#
|
|
# It's common for commit messages to refer to commits by abbreviated
|
|
# commit hashes, as short as 7 characters. To facilitate translating
|
|
# such short hashes, we have a mapping of prefixes to full old hashes.
|
|
self._commit_short_old_hashes = collections.defaultdict(set)
|
|
|
|
# A set of commit hash references appearing in commit messages which
|
|
# mapped to a valid commit that was removed entirely in the filtering
|
|
# process. The commit message will continue to reference the
|
|
# now-missing commit hash, since there was nothing to map it to.
|
|
self._commits_referenced_but_removed = set()
|
|
|
|
# Progress handling (number of commits parsed, etc.)
|
|
self._progress_writer = ProgressWriter()
|
|
self._num_commits = 0
|
|
|
|
# Size of blobs in the repo
|
|
self._unpacked_size = {}
|
|
|
|
# Other vars
|
|
self._sanity_checks_handled = False
|
|
self._finalize_handled = False
|
|
self._orig_refs = None
|
|
self._newnames = {}
|
|
|
|
# Cache a few message translations for performance reasons
|
|
self._parsed_message = _("Parsed %d commits")
|
|
|
|
# Compile some regexes and cache those
|
|
self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)')
|
|
|
|
def _handle_arg_callbacks(self):
|
|
def make_callback(argname, body):
|
|
exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+
|
|
' '+'\n '.join(body.splitlines()), globals())
|
|
return callback #namespace['callback']
|
|
def handle(type):
|
|
callback_field = '_{}_callback'.format(type)
|
|
code_string = getattr(self._args, type+'_callback')
|
|
if code_string:
|
|
if os.path.exists(code_string):
|
|
with open(code_string, 'r', encoding='utf-8') as f:
|
|
code_string = f.read()
|
|
if getattr(self, callback_field):
|
|
raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter "
|
|
"AND pass --%s-callback"
|
|
% (type, type)))
|
|
if 'return ' not in code_string and \
|
|
type not in ('blob', 'commit', 'tag', 'reset'):
|
|
raise SystemExit(_("Error: --%s-callback should have a return statement")
|
|
% type)
|
|
setattr(self, callback_field, make_callback(type, code_string))
|
|
handle('filename')
|
|
handle('message')
|
|
handle('name')
|
|
handle('email')
|
|
handle('refname')
|
|
handle('blob')
|
|
handle('commit')
|
|
handle('tag')
|
|
handle('reset')
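# Illustrative compilation (hypothetical callback body): passing
#   --message-callback 'return message.replace(b"foo", b"bar")'
# causes make_callback() above to exec roughly:
#   def callback(message, _do_not_use_this_var = None):
#     return message.replace(b"foo", b"bar")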
|
|
|
|
def _run_sanity_checks(self):
|
|
self._sanity_checks_handled = True
|
|
if not self._managed_output:
|
|
if not self._args.replace_refs:
|
|
# If not _managed_output we don't want to make extra changes to the
|
|
# repo, so set default to no-op 'update-no-add'
|
|
self._args.replace_refs = 'update-no-add'
|
|
return
|
|
|
|
if self._args.debug:
|
|
print("[DEBUG] Passed arguments:\n{}".format(self._args))
|
|
|
|
# Determine basic repository information
|
|
target_working_dir = self._args.target or b'.'
|
|
self._orig_refs = GitUtils.get_refs(target_working_dir)
|
|
is_bare = GitUtils.is_repository_bare(target_working_dir)
|
|
|
|
# Determine if this is second or later run of filter-repo
|
|
tmp_dir = self.results_tmp_dir(create_if_missing=False)
|
|
already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran'))
|
|
|
|
# Default for --replace-refs
|
|
if not self._args.replace_refs:
|
|
self._args.replace_refs = ('update-or-add' if already_ran
|
|
else 'update-and-add')
|
|
|
|
# Do sanity checks from the correct directory
|
|
if not self._args.force and not already_ran:
|
|
cwd = os.getcwd()
|
|
os.chdir(target_working_dir)
|
|
RepoFilter.sanity_check(self._orig_refs, is_bare)
|
|
os.chdir(cwd)
|
|
|
|
@staticmethod
|
|
def sanity_check(refs, is_bare):
|
|
def abort(reason):
|
|
try:
|
|
cmd = 'git config remote.origin.url'
|
|
output = subproc.check_output(cmd.split()).strip()
|
|
except subprocess.CalledProcessError as e:
|
|
output = None
|
|
msg = ""
|
|
if output and os.path.isdir(output):
|
|
msg = _("Note: when cloning local repositories, you need to pass\n"
|
|
" --no-local to git clone to avoid this issue.\n")
|
|
raise SystemExit(
|
|
_("Aborting: Refusing to destructively overwrite repo history since\n"
|
|
"this does not look like a fresh clone.\n"
|
|
" (%s)\n%s"
|
|
"Please operate on a fresh clone instead. If you want to proceed\n"
|
|
"anyway, use --force.") % (reason, msg))
|
|
|
|
# Make sure repo is fully packed, just like a fresh clone would be.
|
|
# Note that transfer.unpackLimit defaults to 100, meaning that a
|
|
# repository with no packs and less than 100 objects should be considered
|
|
# fully packed.
|
|
output = subproc.check_output('git count-objects -v'.split())
|
|
stats = dict(x.split(b': ') for x in output.splitlines())
|
|
num_packs = int(stats[b'packs'])
|
|
num_loose_objects = int(stats[b'count'])
|
|
if num_packs > 1 or \
|
|
(num_packs == 1 and num_loose_objects > 0) or \
|
|
num_loose_objects >= 100:
|
|
abort(_("expected freshly packed repo"))
|
|
|
|
# Make sure there is precisely one remote, named "origin"...or that this
|
|
# is a new bare repo with no packs and no remotes
|
|
output = subproc.check_output('git remote'.split()).strip()
|
|
if not (output == b"origin" or (num_packs == 0 and not output)):
|
|
abort(_("expected one remote, origin"))
|
|
|
|
# Guard against people running with weird setups that override GIT_DIR to point
|
|
# somewhere unexpected
|
|
git_dir = GitUtils.determine_git_dir(b'.')
|
|
if is_bare and git_dir != b'.':
|
|
abort(_("GIT_DIR must be ."))
|
|
elif not is_bare and git_dir != b'.git':
|
|
abort(_("GIT_DIR must be .git"))
|
|
|
|
# Make sure that all reflogs have precisely one entry
|
|
reflog_dir=os.path.join(git_dir, b'logs')
|
|
for root, dirs, files in os.walk(reflog_dir):
|
|
for filename in files:
|
|
pathname = os.path.join(root, filename)
|
|
with open(pathname, 'br') as f:
|
|
if len(f.read().splitlines()) > 1:
|
|
shortpath = pathname[len(reflog_dir)+1:]
|
|
abort(_("expected at most one entry in the reflog for %s") %
|
|
decode(shortpath))
|
|
|
|
# Make sure there are no stashed changes
|
|
if b'refs/stash' in refs:
|
|
abort(_("has stashed changes"))
|
|
|
|
# Do extra checks in non-bare repos
|
|
if not is_bare:
|
|
# Avoid uncommitted, unstaged, or untracked changes
|
|
if subproc.call('git diff --staged --quiet'.split()):
|
|
abort(_("you have uncommitted changes"))
|
|
if subproc.call('git diff --quiet'.split()):
|
|
abort(_("you have unstaged changes"))
|
|
if len(subproc.check_output('git ls-files -o'.split())) > 0:
|
|
abort(_("you have untracked changes"))
|
|
|
|
# Avoid unpushed changes
|
|
for refname, rev in refs.items():
|
|
if not refname.startswith(b'refs/heads/'):
|
|
continue
|
|
origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
|
|
if origin_ref not in refs:
|
|
abort(_('%s exists, but %s not found') % (decode(refname),
|
|
decode(origin_ref)))
|
|
if rev != refs[origin_ref]:
|
|
abort(_('%s does not match %s') % (decode(refname),
|
|
decode(origin_ref)))
|
|
|
|
      # Make sure there is only one worktree
      output = subproc.check_output('git worktree list'.split())
      if len(output.splitlines()) > 1:
        abort(_('you have multiple worktrees'))

  @staticmethod
  def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False):
    ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now.
        if reset then do a reset --hard.  Optionally also curb output if
        run_quietly is True, or go the opposite direction and show extra
        output if show_debuginfo is True. '''
    assert not (run_quietly and show_debuginfo)

    if (repack and not run_quietly and not show_debuginfo):
      print(_("Repacking your repo and cleaning out old unneeded objects"))
    quiet_flags = '--quiet' if run_quietly else ''
    cleanup_cmds = []
    if repack:
      cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
                      'git gc {} --prune=now'.format(quiet_flags).split()]
    if reset:
      cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
    location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
    for cmd in cleanup_cmds:
      if show_debuginfo:
        print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
      subproc.call(cmd, cwd=repo)

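  # Illustrative sketch (added commentary, not part of the upstream code):
  # for a typical non-bare repo, cleanup(b'.', repack=True, reset=True) ends
  # up running, in order:
  #   git reset --hard
  #   git reflog expire --expire=now --all
  #   git gc --prune=now
  # with --quiet added to the reset and gc invocations when run_quietly=True.
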
  def _get_rename(self, old_hash):
    # If we already know the rename, just return it
    new_hash = self._commit_renames.get(old_hash, None)
    if new_hash:
      return new_hash

    # If it's not in the remaining pending renames, we don't know it
    if old_hash is not None and old_hash not in self._pending_renames:
      return None

    # Read through the pending renames until we find it or we've read them
    # all, and return whatever we might find
    self._flush_renames(old_hash)
    return self._commit_renames.get(old_hash, None)

def _flush_renames(self, old_hash=None, limit=0):
|
|
# Parse through self._pending_renames until we have read enough. We have
|
|
# read enough if:
|
|
# self._pending_renames is empty
|
|
# old_hash != None and we found a rename for old_hash
|
|
# limit > 0 and len(self._pending_renames) started less than 2*limit
|
|
# limit > 0 and len(self._pending_renames) < limit
|
|
if limit and len(self._pending_renames) < 2 * limit:
|
|
return
|
|
fi_input, fi_output = self._import_pipes
|
|
while self._pending_renames:
|
|
orig_id, ignore = self._pending_renames.popitem(last=False)
|
|
new_id = fi_output.readline().rstrip()
|
|
self._commit_renames[orig_id] = new_id
|
|
if old_hash == orig_id:
|
|
return
|
|
if limit and len(self._pending_renames) < limit:
|
|
return
|
|
|
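  # Illustrative note (added commentary, not part of the upstream code): with
  # the default limit=0, _flush_renames() drains every pending rename; with
  # e.g. limit=40 it returns immediately while fewer than 80 renames are
  # pending, and otherwise reads fast-import output only until fewer than 40
  # remain, so the pipe is drained incrementally instead of on every commit.
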
|
def _translate_commit_hash(self, matchobj_or_oldhash):
|
|
old_hash = matchobj_or_oldhash
|
|
if not isinstance(matchobj_or_oldhash, bytes):
|
|
old_hash = matchobj_or_oldhash.group(1)
|
|
orig_len = len(old_hash)
|
|
new_hash = self._get_rename(old_hash)
|
|
if new_hash is None:
|
|
if old_hash[0:7] not in self._commit_short_old_hashes:
|
|
self._commits_referenced_but_removed.add(old_hash)
|
|
return old_hash
|
|
possibilities = self._commit_short_old_hashes[old_hash[0:7]]
|
|
matches = [x for x in possibilities
|
|
if x[0:orig_len] == old_hash]
|
|
if len(matches) != 1:
|
|
self._commits_referenced_but_removed.add(old_hash)
|
|
return old_hash
|
|
old_hash = matches[0]
|
|
new_hash = self._get_rename(old_hash)
|
|
|
|
assert new_hash is not None
|
|
return new_hash[0:orig_len]
|
|
|
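  # Illustrative sketch (added commentary, not part of the upstream code;
  # hashes are made up): if a commit whose old hash starts with deadbeef was
  # rewritten to one starting with 1234abcd, a commit message containing the
  # abbreviation b'deadbeef' is looked up via
  # self._commit_short_old_hashes[b'deadbee'], resolved to the full old hash,
  # and rewritten to b'1234abcd' (truncated to the referenced length).
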
|
def _trim_extra_parents(self, orig_parents, parents):
|
|
'''Due to pruning of empty commits, some parents could be non-existent
|
|
(None) or otherwise redundant. Remove the non-existent parents, and
|
|
remove redundant parents so long as that doesn't transform a merge
|
|
commit into a non-merge commit.
|
|
|
|
Returns a tuple:
|
|
(parents, new_first_parent_if_would_become_non_merge)'''
|
|
|
|
always_prune = (self._args.prune_degenerate == 'always')
|
|
|
|
# Pruning of empty commits means multiple things:
|
|
# * An original parent of this commit may have been pruned causing the
|
|
# need to rewrite the reported parent to the nearest ancestor. We
|
|
# want to know when we're dealing with such a parent.
|
|
# * Further, there may be no "nearest ancestor" if the entire history
|
|
# of that parent was also pruned. (Detectable by the parent being
|
|
# 'None')
|
|
# Remove all parents rewritten to None, and keep track of which parents
|
|
# were rewritten to an ancestor.
|
|
tmp = zip(parents,
|
|
orig_parents,
|
|
[(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
|
|
tmp2 = [x for x in tmp if x[0] is not None]
|
|
if not tmp2:
|
|
# All ancestors have been pruned; we have no parents.
|
|
return [], None
|
|
parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]
|
|
|
|
# We can't have redundant parents if we don't have at least 2 parents
|
|
if len(parents) < 2:
|
|
return parents, None
|
|
|
|
# Don't remove redundant parents if user doesn't want us to
|
|
if self._args.prune_degenerate == 'never':
|
|
return parents, None
|
|
|
|
# Remove duplicate parents (if both sides of history have lots of commits
|
|
# which become empty due to pruning, the most recent ancestor on both
|
|
# sides may be the same commit), except only remove parents that have
|
|
# been rewritten due to previous empty pruning.
|
|
seen = set()
|
|
seen_add = seen.add
|
|
# Deleting duplicate rewritten parents means keeping parents if either
|
|
# they have not been seen or they are ones that have not been rewritten.
|
|
parents_copy = parents
|
|
uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
|
|
if not (p in seen or seen_add(p)) or not is_rewritten[i]]
|
|
parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
|
|
if len(parents) < 2:
|
|
return parents_copy, parents[0]
|
|
|
|
# Flatten unnecessary merges. (If one side of history is entirely
|
|
# empty commits that were pruned, we may end up attempting to
|
|
# merge a commit with its ancestor. Remove parents that are an
|
|
# ancestor of another parent.)
|
|
num_parents = len(parents)
|
|
to_remove = []
|
|
for cur in range(num_parents):
|
|
if not is_rewritten[cur]:
|
|
continue
|
|
for other in range(num_parents):
|
|
if cur == other:
|
|
continue
|
|
if not self._graph.is_ancestor(parents[cur], parents[other]):
|
|
continue
|
|
# parents[cur] is an ancestor of parents[other], so parents[cur]
|
|
# seems redundant. However, if it was intentionally redundant
|
|
# (e.g. a no-ff merge) in the original, then we want to keep it.
|
|
if not always_prune and \
|
|
self._orig_graph.is_ancestor(orig_parents[cur],
|
|
orig_parents[other]):
|
|
continue
|
|
# Some folks want their history to have all first parents be merge
|
|
# commits (except for any root commits), and always do a merge --no-ff.
|
|
# For such folks, don't remove the first parent even if it's an
|
|
# ancestor of other commits.
|
|
if self._args.no_ff and cur == 0:
|
|
continue
|
|
# Okay so the cur-th parent is an ancestor of the other-th parent,
|
|
# and it wasn't that way in the original repository; mark the
|
|
# cur-th parent as removable.
|
|
to_remove.append(cur)
|
|
break # cur removed, so skip rest of others -- i.e. check cur+=1
|
|
for x in reversed(to_remove):
|
|
parents.pop(x)
|
|
if len(parents) < 2:
|
|
return parents_copy, parents[0]
|
|
|
|
return parents, None
|
|
|
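  # Illustrative note (added commentary, not part of the upstream code): if a
  # merge's second parent was rewritten (due to empty-commit pruning) to a
  # commit that is now an ancestor of the first parent, removing it would turn
  # the merge into a non-merge; rather than doing so here, the unreduced
  # parent list is returned together with the surviving first parent as
  # new_first_parent_if_would_become_non_merge, and _prunable() decides
  # whether the degenerate merge can really be dropped.
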
|
def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
|
|
parents = commit.parents
|
|
|
|
if self._args.prune_empty == 'never':
|
|
return False
|
|
always_prune = (self._args.prune_empty == 'always')
|
|
|
|
# For merge commits, unless there are prunable (redundant) parents, we
|
|
# do not want to prune
|
|
if len(parents) >= 2 and not new_1st_parent:
|
|
return False
|
|
|
|
if len(parents) < 2:
|
|
# Special logic for commits that started empty...
|
|
if not had_file_changes and not always_prune:
|
|
had_parents_pruned = (len(parents) < len(orig_parents) or
|
|
(len(orig_parents) == 1 and
|
|
orig_parents[0] in _SKIPPED_COMMITS))
|
|
# If the commit remains empty and had parents which were pruned,
|
|
# then prune this commit; otherwise, retain it
|
|
return (not commit.file_changes and had_parents_pruned)
|
|
|
|
# We can only get here if the commit didn't start empty, so if it's
|
|
# empty now, it obviously became empty
|
|
if not commit.file_changes:
|
|
return True
|
|
|
|
# If there are no parents of this commit and we didn't match the case
|
|
# above, then this commit cannot be pruned. Since we have no parent(s)
|
|
# to compare to, abort now to prevent future checks from failing.
|
|
if not parents:
|
|
return False
|
|
|
|
# Similarly, we cannot handle the hard cases if we don't have a pipe
|
|
# to communicate with fast-import
|
|
if not self._import_pipes:
|
|
return False
|
|
|
|
# If there have not been renames/remappings of IDs (due to insertion of
|
|
# new blobs), then we can sometimes know things aren't prunable with a
|
|
# simple check
|
|
if not _IDS.has_renames():
|
|
# non-merge commits can only be empty if blob/file-change editing caused
|
|
# all file changes in the commit to have the same file contents as
|
|
# the parent.
|
|
changed_files = set(change.filename for change in commit.file_changes)
|
|
if len(orig_parents) < 2 and changed_files - self._files_tweaked:
|
|
return False
|
|
|
|
# Finally, the hard case: due to either blob rewriting, or due to pruning
|
|
# of empty commits wiping out the first parent history back to the merge
|
|
# base, the list of file_changes we have may not actually differ from our
|
|
# (new) first parent's version of the files, i.e. this would actually be
|
|
# an empty commit. Check by comparing the contents of this commit to its
|
|
# (remaining) parent.
|
|
#
|
|
# NOTE on why this works, for the case of original first parent history
|
|
# having been pruned away due to being empty:
|
|
# The first parent history having been pruned away due to being
|
|
# empty implies the original first parent would have a tree (after
|
|
# filtering) that matched the merge base's tree. Since
|
|
# file_changes has the changes needed to go from what would have
|
|
# been the first parent to our new commit, and what would have been
|
|
# our first parent has a tree that matches the merge base, then if
|
|
# the new first parent has a tree matching the versions of files in
|
|
# file_changes, then this new commit is empty and thus prunable.
|
|
fi_input, fi_output = self._import_pipes
|
|
self._flush_renames() # Avoid fi_output having other stuff present
|
|
# Optimization note: we could have two loops over file_changes, the
|
|
# first doing all the self._output.write() calls, and the second doing
|
|
# the rest. But I'm worried about fast-import blocking on fi_output
|
|
# buffers filling up so I instead read from it as I go.
|
|
for change in commit.file_changes:
|
|
parent = new_1st_parent or commit.parents[0] # exists due to above checks
|
|
quoted_filename = PathQuoting.enquote(change.filename)
|
|
if isinstance(parent, int):
|
|
self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
|
|
else:
|
|
self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
|
|
self._output.flush()
|
|
parent_version = fi_output.readline().split()
|
|
if change.type == b'D':
|
|
if parent_version != [b'missing', quoted_filename]:
|
|
return False
|
|
else:
|
|
blob_sha = change.blob_id
|
|
if isinstance(change.blob_id, int):
|
|
self._output.write(b"get-mark :%d\n" % change.blob_id)
|
|
self._output.flush()
|
|
blob_sha = fi_output.readline().rstrip()
|
|
if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
|
|
return False
|
|
|
|
return True
|
|
|
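  # Illustrative sketch (added commentary, not part of the upstream code) of
  # the fast-import dialogue used above: for every file change we write
  #   ls :<parent-mark> <quoted-path>     (or `ls <parent-hash> ...`)
  # and read back either "<mode> blob <sha> <path>" or "missing <path>".  A
  # 'D' change must come back missing, and an 'M' change must match mode and
  # blob id exactly, for the commit to be judged empty and thus prunable.
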
|
def _record_remapping(self, commit, orig_parents):
|
|
new_id = None
|
|
# Record the mapping of old commit hash to new one
|
|
if commit.original_id and self._import_pipes:
|
|
fi_input, fi_output = self._import_pipes
|
|
self._output.write(b"get-mark :%d\n" % commit.id)
|
|
self._output.flush()
|
|
orig_id = commit.original_id
|
|
self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
|
|
# Note that we have queued up an id for later reading; flush a
|
|
# few of the older ones if we have too many queued up
|
|
self._pending_renames[orig_id] = None
|
|
self._flush_renames(None, limit=40)
|
|
# Also, record if this was a merge commit that turned into a non-merge
|
|
# commit.
|
|
if len(orig_parents) >= 2 and len(commit.parents) < 2:
|
|
self._commits_no_longer_merges.append((commit.original_id, new_id))
|
|
|
|
def callback_metadata(self, extra_items = dict()):
|
|
return {'commit_rename_func': self._translate_commit_hash,
|
|
'ancestry_graph': self._graph,
|
|
'original_ancestry_graph': self._orig_graph,
|
|
**extra_items}
|
|
|
|
def _tweak_blob(self, blob):
|
|
if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
|
|
blob.skip()
|
|
|
|
if blob.original_id in self._args.strip_blobs_with_ids:
|
|
blob.skip()
|
|
|
|
if ( self._args.replace_text
|
|
         # skip binary blobs: treat any blob with a NUL byte in its first 8kB as binary data
|
|
and not b"\0" in blob.data[0:8192]
|
|
):
|
|
for literal, replacement in self._args.replace_text['literals']:
|
|
blob.data = blob.data.replace(literal, replacement)
|
|
for regex, replacement in self._args.replace_text['regexes']:
|
|
blob.data = regex.sub(replacement, blob.data)
|
|
|
|
if self._blob_callback:
|
|
self._blob_callback(blob, self.callback_metadata())
|
|
|
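  # Illustrative sketch (added commentary, not part of the upstream code):
  # with --replace-text rules equivalent to
  #   literals: [(b'p4ssw0rd', b'***REMOVED***')]
  #   regexes:  [(re.compile(b'token-[0-9]+'), b'TOKEN')]
  # each non-binary blob (no NUL in its first 8kB) gets the literal
  # substitutions first, then the regex substitutions; blobs exceeding
  # --max-blob-size or listed via --strip-blobs-with-ids are skipped outright.
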
|
def _filter_files(self, commit):
|
|
def filename_matches(path_expression, pathname):
|
|
''' Returns whether path_expression matches pathname or a leading
|
|
directory thereof, allowing path_expression to not have a trailing
|
|
slash even if it is meant to match a leading directory. '''
|
|
if path_expression == b'':
|
|
return True
|
|
n = len(path_expression)
|
|
if (pathname.startswith(path_expression) and
|
|
(path_expression[n-1:n] == b'/' or
|
|
len(pathname) == n or
|
|
pathname[n:n+1] == b'/')):
|
|
return True
|
|
return False
|
|
|
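    # Illustrative examples (added commentary, not part of the upstream code),
    # using byte-string paths as elsewhere in this file:
    #   filename_matches(b'src',  b'src/main.c') -> True   (leading directory)
    #   filename_matches(b'src/', b'src/main.c') -> True   (trailing slash ok)
    #   filename_matches(b'src',  b'src')        -> True   (exact match)
    #   filename_matches(b'src',  b'srcs/x.c')   -> False  (not a component)
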
|
def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
|
|
''' Applies filtering and rename changes from path_changes to pathname,
|
|
returning any of None (file isn't wanted), original filename (file
|
|
is wanted with original name), or new filename. '''
|
|
wanted = False
|
|
full_pathname = pathname
|
|
if use_base_name:
|
|
pathname = os.path.basename(pathname)
|
|
for (mod_type, match_type, path_exp) in path_changes:
|
|
if mod_type == 'filter' and not wanted:
|
|
assert match_type in ('match', 'glob', 'regex')
|
|
if match_type == 'match' and filename_matches(path_exp, pathname):
|
|
wanted = True
|
|
if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
|
|
wanted = True
|
|
if match_type == 'regex' and path_exp.search(pathname):
|
|
wanted = True
|
|
elif mod_type == 'rename':
|
|
match, repl = path_exp
|
|
assert match_type in ('match','regex') # glob was translated to regex
|
|
if match_type == 'match' and filename_matches(match, full_pathname):
|
|
full_pathname = full_pathname.replace(match, repl, 1)
|
|
if match_type == 'regex':
|
|
full_pathname = match.sub(repl, full_pathname)
|
|
return full_pathname if (wanted == filtering_is_inclusive) else None
|
|
|
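    # Illustrative sketch (added commentary, not part of the upstream code):
    # with use_base_name=False, filtering_is_inclusive=True, and
    #   path_changes = [('filter', 'match', b'src'),
    #                   ('rename', 'match', (b'src/', b'lib/'))]
    # newname() keeps b'src/foo.c' and rewrites it to b'lib/foo.c', while
    # returning None for b'README'; with filtering_is_inclusive=False those
    # keep/None outcomes are inverted.
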
|
args = self._args
|
|
new_file_changes = {} # Assumes no renames or copies, otherwise collisions
|
|
for change in commit.file_changes:
|
|
# NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
|
|
# parse that output, we'll need to modify this block; `--full-tree`
|
|
# issues a deleteall directive which has no filename, and thus this
|
|
# block would normally strip it. Of course, FileChange() and
|
|
# _parse_optional_filechange() would need updates too.
|
|
if change.type == b'DELETEALL':
|
|
new_file_changes[b''] = change
|
|
continue
|
|
if change.filename in self._newnames:
|
|
change.filename = self._newnames[change.filename]
|
|
else:
|
|
original_filename = change.filename
|
|
change.filename = newname(args.path_changes, change.filename,
|
|
args.use_base_name, args.inclusive)
|
|
if self._filename_callback:
|
|
change.filename = self._filename_callback(change.filename)
|
|
self._newnames[original_filename] = change.filename
|
|
if not change.filename:
|
|
continue # Filtering criteria excluded this file; move on to next one
|
|
if change.filename in new_file_changes:
|
|
# Getting here means that path renaming is in effect, and caused one
|
|
# path to collide with another. That's usually bad, but can be okay
|
|
# under two circumstances:
|
|
# 1) Sometimes people have a file named OLDFILE in old revisions of
|
|
# history, and they rename to NEWFILE, and would like to rewrite
|
|
# history so that all revisions refer to it as NEWFILE. As such,
|
|
# we can allow a collision when (at least) one of the two paths
|
|
# is a deletion. Note that if OLDFILE and NEWFILE are unrelated
|
|
# this also allows the rewrite to continue, which makes sense
|
|
# since OLDFILE is no longer in the way.
|
|
# 2) If OLDFILE and NEWFILE are exactly equal, then writing them
|
|
# both to the same location poses no problem; we only need one
|
|
# file. (This could come up if someone copied a file in some
|
|
# commit, then later either deleted the file or kept it exactly
|
|
# in sync with the original with any changes, and then decides
|
|
# they want to rewrite history to only have one of the two files)
|
|
colliding_change = new_file_changes[change.filename]
|
|
if change.type == b'D':
|
|
# We can just throw this one away and keep the other
|
|
continue
|
|
elif change.type == b'M' and (
|
|
change.mode == colliding_change.mode and
|
|
change.blob_id == colliding_change.blob_id):
|
|
# The two are identical, so we can throw this one away and keep other
|
|
continue
|
|
elif new_file_changes[change.filename].type != b'D':
|
|
raise SystemExit(_("File renaming caused colliding pathnames!\n") +
|
|
_(" Commit: {}\n").format(commit.original_id) +
|
|
_(" Filename: {}").format(change.filename))
|
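      # Illustrative summary (added commentary, not part of the upstream
      # code) of the collision rules above: a collision is tolerated when at
      # least one of the two entries is a deletion, or when both are 'M'
      # changes with identical mode and blob id; any other collision aborts
      # with "File renaming caused colliding pathnames!".
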
|
# Strip files that are too large
|
|
if self._args.max_blob_size and \
|
|
self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
|
|
continue
|
|
if self._args.strip_blobs_with_ids and \
|
|
change.blob_id in self._args.strip_blobs_with_ids:
|
|
continue
|
|
# Otherwise, record the change
|
|
new_file_changes[change.filename] = change
|
|
commit.file_changes = [v for k,v in sorted(new_file_changes.items())]
|
|
|
|
def _tweak_commit(self, commit, aux_info):
|
|
# Change the commit message according to callback
|
|
if not self._args.preserve_commit_hashes:
|
|
commit.message = self._hash_re.sub(self._translate_commit_hash,
|
|
commit.message)
|
|
if self._args.replace_message:
|
|
for literal, replacement in self._args.replace_message['literals']:
|
|
commit.message = commit.message.replace(literal, replacement)
|
|
for regex, replacement in self._args.replace_message['regexes']:
|
|
commit.message = regex.sub(replacement, commit.message)
|
|
if self._message_callback:
|
|
commit.message = self._message_callback(commit.message)
|
|
|
|
# Change the author & committer according to mailmap rules
|
|
args = self._args
|
|
if args.mailmap:
|
|
commit.author_name, commit.author_email = \
|
|
args.mailmap.translate(commit.author_name, commit.author_email)
|
|
commit.committer_name, commit.committer_email = \
|
|
args.mailmap.translate(commit.committer_name, commit.committer_email)
|
|
# Change author & committer according to callbacks
|
|
if self._name_callback:
|
|
commit.author_name = self._name_callback(commit.author_name)
|
|
commit.committer_name = self._name_callback(commit.committer_name)
|
|
if self._email_callback:
|
|
commit.author_email = self._email_callback(commit.author_email)
|
|
commit.committer_email = self._email_callback(commit.committer_email)
|
|
|
|
# Sometimes the 'branch' given is a tag; if so, rename it as requested so
|
|
# we don't get any old tagnames
|
|
if self._args.tag_rename:
|
|
commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
|
|
if self._refname_callback:
|
|
commit.branch = self._refname_callback(commit.branch)
|
|
|
|
# Filter or rename the list of file changes
|
|
orig_file_changes = set(commit.file_changes)
|
|
self._filter_files(commit)
|
|
|
|
# Record ancestry graph
|
|
parents, orig_parents = commit.parents, aux_info['orig_parents']
|
|
if self._args.state_branch:
|
|
external_parents = parents
|
|
else:
|
|
external_parents = [p for p in parents if not isinstance(p, int)]
|
|
self._graph.record_external_commits(external_parents)
|
|
self._orig_graph.record_external_commits(external_parents)
|
|
self._graph.add_commit_and_parents(commit.id, parents)
|
|
self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents)
|
|
|
|
# Prune parents (due to pruning of empty commits) if relevant
|
|
old_1st_parent = parents[0] if parents else None
|
|
parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents)
|
|
commit.parents = parents
|
|
|
|
# If parents were pruned, then we need our file changes to be relative
|
|
# to the new first parent
|
|
if parents and old_1st_parent != parents[0]:
|
|
commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
|
|
ID_TO_HASH[parents[0]],
|
|
commit.original_id)
|
|
orig_file_changes = set(commit.file_changes)
|
|
self._filter_files(commit)
|
|
|
|
# Find out which files were modified by the callbacks. Such paths could
|
|
# lead to subsequent commits being empty (e.g. if removing a line containing
|
|
# a password from every version of a file that had the password, and some
|
|
# later commit did nothing more than remove that line)
|
|
final_file_changes = set(commit.file_changes)
|
|
if self._args.replace_text or self._blob_callback:
|
|
differences = orig_file_changes.union(final_file_changes)
|
|
else:
|
|
differences = orig_file_changes.symmetric_difference(final_file_changes)
|
|
self._files_tweaked.update(x.filename for x in differences)
|
|
|
|
# Call the user-defined callback, if any
|
|
if self._commit_callback:
|
|
self._commit_callback(commit, self.callback_metadata(aux_info))
|
|
|
|
# Now print the resulting commit, or if prunable skip it
|
|
if not commit.dumped:
|
|
if not self._prunable(commit, new_1st_parent,
|
|
aux_info['had_file_changes'], orig_parents):
|
|
self._insert_into_stream(commit)
|
|
self._record_remapping(commit, orig_parents)
|
|
else:
|
|
rewrite_to = new_1st_parent or commit.first_parent()
|
|
commit.skip(new_id = rewrite_to)
|
|
if self._args.state_branch:
|
|
alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
|
|
self._insert_into_stream(alias)
|
|
reset = Reset(commit.branch, rewrite_to or deleted_hash)
|
|
self._insert_into_stream(reset)
|
|
self._commit_renames[commit.original_id] = None
|
|
|
|
# Show progress
|
|
self._num_commits += 1
|
|
if not self._args.quiet:
|
|
self._progress_writer.show(self._parsed_message % self._num_commits)
|
|
|
|
  @staticmethod
  def _do_tag_rename(rename_pair, tagname):
    old, new = rename_pair.split(b':', 1)
    old, new = b'refs/tags/'+old, b'refs/tags/'+new
    if tagname.startswith(old):
      return tagname.replace(old, new, 1)
    return tagname

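  # Illustrative example (added commentary, not part of the upstream code):
  # with --tag-rename foo-:bar- (rename_pair == b'foo-:bar-'),
  # _do_tag_rename(b'foo-:bar-', b'refs/tags/foo-1.0') returns
  # b'refs/tags/bar-1.0'; refs that do not start with refs/tags/foo- are
  # returned unchanged.
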
def _tweak_tag(self, tag):
|
|
# Tweak the tag message according to callbacks
|
|
if self._args.replace_message:
|
|
for literal, replacement in self._args.replace_message['literals']:
|
|
tag.message = tag.message.replace(literal, replacement)
|
|
for regex, replacement in self._args.replace_message['regexes']:
|
|
tag.message = regex.sub(replacement, tag.message)
|
|
if self._message_callback:
|
|
tag.message = self._message_callback(tag.message)
|
|
|
|
# Tweak the tag name according to tag-name-related callbacks
|
|
tag_prefix = b'refs/tags/'
|
|
fullref = tag_prefix+tag.ref
|
|
if self._args.tag_rename:
|
|
fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
|
|
if self._refname_callback:
|
|
fullref = self._refname_callback(fullref)
|
|
if not fullref.startswith(tag_prefix):
|
|
msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
|
|
msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref)
|
|
raise SystemExit(msg)
|
|
tag.ref = fullref[len(tag_prefix):]
|
|
|
|
# Tweak the tagger according to callbacks
|
|
if self._args.mailmap:
|
|
tag.tagger_name, tag.tagger_email = \
|
|
self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
|
|
if self._name_callback:
|
|
tag.tagger_name = self._name_callback(tag.tagger_name)
|
|
if self._email_callback:
|
|
tag.tagger_email = self._email_callback(tag.tagger_email)
|
|
|
|
# Call general purpose tag callback
|
|
if self._tag_callback:
|
|
self._tag_callback(tag, self.callback_metadata())
|
|
|
|
def _tweak_reset(self, reset):
|
|
if self._args.tag_rename:
|
|
reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
|
|
if self._refname_callback:
|
|
reset.ref = self._refname_callback(reset.ref)
|
|
if self._reset_callback:
|
|
self._reset_callback(reset, self.callback_metadata())
|
|
|
|
  def results_tmp_dir(self, create_if_missing=True):
    target_working_dir = self._args.target or b'.'
    git_dir = GitUtils.determine_git_dir(target_working_dir)
    d = os.path.join(git_dir, b'filter-repo')
    if create_if_missing and not os.path.isdir(d):
      os.mkdir(d)
    return d

def _load_marks_file(self, marks_basename):
|
|
full_branch = 'refs/heads/{}'.format(self._args.state_branch)
|
|
marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
|
|
working_dir = self._args.target or b'.'
|
|
cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
|
|
contents = b''
|
|
if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
|
|
cmd = ['git', '-C', working_dir, 'show',
|
|
'%s:%s' % (full_branch, decode(marks_basename))]
|
|
try:
|
|
contents = subproc.check_output(cmd)
|
|
except subprocess.CalledProcessError as e: # pragma: no cover
|
|
raise SystemExit(_("Failed loading %s from %s") %
|
|
(decode(marks_basename), full_branch))
|
|
if contents:
|
|
biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
|
|
_IDS._next_id = max(_IDS._next_id, biggest_id+1)
|
|
with open(marks_file, 'bw') as f:
|
|
f.write(contents)
|
|
return marks_file
|
|
|
|
def _save_marks_files(self):
|
|
basenames = [b'source-marks', b'target-marks']
|
|
working_dir = self._args.target or b'.'
|
|
|
|
# Check whether the branch exists
|
|
parent = []
|
|
full_branch = 'refs/heads/{}'.format(self._args.state_branch)
|
|
cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
|
|
if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
|
|
parent = ['-p', full_branch]
|
|
|
|
# Run 'git hash-object $MARKS_FILE' for each marks file, save result
|
|
blob_hashes = {}
|
|
for marks_basename in basenames:
|
|
marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
|
|
if not os.path.isfile(marks_file): # pragma: no cover
|
|
raise SystemExit(_("Failed to find %s to save to %s")
|
|
% (marks_file, self._args.state_branch))
|
|
cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
|
|
blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
|
|
|
|
# Run 'git mktree' to create a tree out of it
|
|
p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE)
|
|
for b in basenames:
|
|
p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
|
|
p.stdin.close()
|
|
p.wait()
|
|
tree = p.stdout.read().strip()
|
|
|
|
# Create the new commit
|
|
cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
|
|
tree] + parent)
|
|
commit = subproc.check_output(cmd).strip()
|
|
subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
|
|
|
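  # Illustrative note (added commentary, not part of the upstream code): with
  # --state-branch STATE, the source-marks and target-marks files are written
  # as blobs via `git hash-object -w`, combined into a tree with `git mktree`,
  # committed with `git commit-tree` (using any previous STATE commit as the
  # parent), and refs/heads/STATE is then pointed at the new commit.
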
|
def importer_only(self):
|
|
self._run_sanity_checks()
|
|
self._setup_output()
|
|
|
|
def set_output(self, outputRepoFilter):
|
|
assert outputRepoFilter._output
|
|
|
|
# set_output implies this RepoFilter is doing exporting, though may not
|
|
# be the only one.
|
|
self._setup_input(use_done_feature = False)
|
|
|
|
# Set our output management up to pipe to outputRepoFilter's locations
|
|
self._managed_output = False
|
|
self._output = outputRepoFilter._output
|
|
self._import_pipes = outputRepoFilter._import_pipes
|
|
|
|
# Handle sanity checks, though currently none needed for export-only cases
|
|
self._run_sanity_checks()
|
|
|
|
def _setup_input(self, use_done_feature):
|
|
if self._args.stdin:
|
|
self._input = sys.stdin.detach()
|
|
sys.stdin = None # Make sure no one tries to accidentally use it
|
|
self._fe_orig = None
|
|
else:
|
|
skip_blobs = (self._blob_callback is None and
|
|
self._args.replace_text is None and
|
|
self._args.source == self._args.target)
|
|
extra_flags = []
|
|
if skip_blobs:
|
|
extra_flags.append('--no-data')
|
|
if self._args.max_blob_size:
|
|
self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
|
|
if use_done_feature:
|
|
extra_flags.append('--use-done-feature')
|
|
if write_marks:
|
|
extra_flags.append(b'--mark-tags')
|
|
if self._args.state_branch:
|
|
assert(write_marks)
|
|
source_marks_file = self._load_marks_file(b'source-marks')
|
|
extra_flags.extend([b'--export-marks='+source_marks_file,
|
|
b'--import-marks='+source_marks_file])
|
|
if self._args.preserve_commit_encoding is not None: # pragma: no cover
|
|
reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
|
|
extra_flags.append('--reencode='+reencode)
|
|
location = ['-C', self._args.source] if self._args.source else []
|
|
fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
|
|
'--signed-tags=strip', '--tag-of-filtered-object=rewrite',
|
|
'--fake-missing-tagger', '--reference-excluded-parents'
|
|
] + extra_flags + self._args.refs
|
|
self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
|
|
self._input = self._fep.stdout
|
|
if self._args.dry_run or self._args.debug:
|
|
self._fe_orig = os.path.join(self.results_tmp_dir(),
|
|
b'fast-export.original')
|
|
output = open(self._fe_orig, 'bw')
|
|
self._input = InputFileBackup(self._input, output)
|
|
if self._args.debug:
|
|
tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
|
|
print("[DEBUG] Running: {}".format(' '.join(tmp)))
|
|
print(" (saving a copy of the output at {})"
|
|
.format(decode(self._fe_orig)))
|
|
|
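  # Illustrative sketch (added commentary, not part of the upstream code):
  # for a plain run in the current repo the fast-export command assembled
  # above looks roughly like
  #   git fast-export --show-original-ids --signed-tags=strip
  #       --tag-of-filtered-object=rewrite --fake-missing-tagger
  #       --reference-excluded-parents --no-data --use-done-feature
  #       --mark-tags <refs>
  # with --no-data omitted whenever blob contents must be examined (e.g.
  # --replace-text or a blob callback).
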
|
def _setup_output(self):
|
|
if not self._args.dry_run:
|
|
location = ['-C', self._args.target] if self._args.target else []
|
|
fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
|
|
'fast-import', '--force', '--quiet']
|
|
if date_format_permissive:
|
|
fip_cmd.append('--date-format=raw-permissive')
|
|
if self._args.state_branch:
|
|
target_marks_file = self._load_marks_file(b'target-marks')
|
|
fip_cmd.extend([b'--export-marks='+target_marks_file,
|
|
b'--import-marks='+target_marks_file])
|
|
self._fip = subproc.Popen(fip_cmd, bufsize=-1,
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE)
|
|
self._import_pipes = (self._fip.stdin, self._fip.stdout)
|
|
if self._args.dry_run or self._args.debug:
|
|
self._fe_filt = os.path.join(self.results_tmp_dir(),
|
|
b'fast-export.filtered')
|
|
self._output = open(self._fe_filt, 'bw')
|
|
else:
|
|
self._output = self._fip.stdin
|
|
if self._args.debug and not self._args.dry_run:
|
|
self._output = DualFileWriter(self._fip.stdin, self._output)
|
|
tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
|
|
print("[DEBUG] Running: {}".format(' '.join(tmp)))
|
|
print(" (using the following file as input: {})"
|
|
.format(decode(self._fe_filt)))
|
|
|
|
def _migrate_origin_to_heads(self):
|
|
refs_to_migrate = set(x for x in self._orig_refs
|
|
if x.startswith(b'refs/remotes/origin/'))
|
|
if not refs_to_migrate:
|
|
return
|
|
if self._args.debug:
|
|
print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
|
|
target_working_dir = self._args.target or b'.'
|
|
p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
|
|
stdin=subprocess.PIPE, cwd=target_working_dir)
|
|
for ref in refs_to_migrate:
|
|
if ref == b'refs/remotes/origin/HEAD':
|
|
p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
|
|
del self._orig_refs[ref]
|
|
continue
|
|
newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
|
|
if newref not in self._orig_refs:
|
|
p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
|
|
p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
|
|
self._orig_refs[newref] = self._orig_refs[ref]
|
|
del self._orig_refs[ref]
|
|
p.stdin.close()
|
|
if p.wait():
|
|
raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
|
|
|
|
# Now remove
|
|
if self._args.debug:
|
|
print("[DEBUG] Removing 'origin' remote (rewritten history will no ")
|
|
print(" longer be related; consider re-pushing it elsewhere.")
|
|
subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
|
|
|
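  # Illustrative sketch (added commentary, not part of the upstream code):
  # the batch fed to `git update-ref --no-deref --stdin` above consists of
  # lines like
  #   create refs/heads/topic <sha>
  #   delete refs/remotes/origin/topic <sha>
  # plus a plain delete for refs/remotes/origin/HEAD, after which the
  # 'origin' remote itself is removed.
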
|
def _final_commands(self):
|
|
self._finalize_handled = True
|
|
self._done_callback and self._done_callback()
|
|
|
|
if not self._args.quiet:
|
|
self._progress_writer.finish()
|
|
|
|
def _ref_update(self, target_working_dir):
|
|
# Start the update-ref process
|
|
p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
|
|
stdin=subprocess.PIPE,
|
|
cwd=target_working_dir)
|
|
|
|
# Remove replace_refs from _orig_refs
|
|
replace_refs = {k:v for k, v in self._orig_refs.items()
|
|
if k.startswith(b'refs/replace/')}
|
|
reverse_replace_refs = collections.defaultdict(list)
|
|
for k,v in replace_refs.items():
|
|
reverse_replace_refs[v].append(k)
|
|
all(map(self._orig_refs.pop, replace_refs))
|
|
|
|
# Remove unused refs
|
|
exported_refs, imported_refs = self.get_exported_and_imported_refs()
|
|
refs_to_nuke = exported_refs - imported_refs
|
|
if self._args.partial:
|
|
refs_to_nuke = set()
|
|
if refs_to_nuke and self._args.debug:
|
|
print("[DEBUG] Deleting the following refs:\n "+
|
|
decode(b"\n ".join(refs_to_nuke)))
|
|
p.stdin.write(b''.join([b"delete %s\n" % x
|
|
for x in refs_to_nuke]))
|
|
|
|
# Delete or update and add replace_refs; note that fast-export automatically
|
|
# handles 'update-no-add', we only need to take action for the other four
|
|
# choices for replace_refs.
|
|
self._flush_renames()
|
|
actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
|
|
if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
|
|
# Delete old replace refs, if unwanted
|
|
replace_refs_to_nuke = set(replace_refs)
|
|
if self._args.replace_refs == 'delete-and-add':
|
|
# git-update-ref won't allow us to update a ref twice, so be careful
|
|
# to avoid deleting refs we'll later update
|
|
replace_refs_to_nuke = replace_refs_to_nuke.difference(
|
|
[b'refs/replace/'+x for x in actual_renames])
|
|
p.stdin.write(b''.join([b"delete %s\n" % x
|
|
for x in replace_refs_to_nuke]))
|
|
if self._args.replace_refs in ['delete-and-add', 'update-or-add',
|
|
'update-and-add']:
|
|
# Add new replace refs
|
|
update_only = (self._args.replace_refs == 'update-or-add')
|
|
p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
|
|
for old,new in actual_renames.items()
|
|
if new and not (update_only and
|
|
old in reverse_replace_refs)]))
|
|
|
|
# Complete the update-ref process
|
|
p.stdin.close()
|
|
if p.wait():
|
|
raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
|
|
|
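  # Illustrative note (added commentary, not part of the upstream code): with
  # --replace-refs delete-and-add, the batch above first deletes pre-existing
  # refs/replace/* entries (except those about to be rewritten) and then
  # writes one
  #   update refs/replace/<old-sha> <new-sha>
  # line per commit whose hash actually changed, so old commit ids continue
  # to resolve in the rewritten repository.
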
|
def _record_metadata(self, metadata_dir, orig_refs):
|
|
self._flush_renames()
|
|
with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
|
|
f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
|
|
for (old,new) in self._commit_renames.items():
|
|
msg = b'%s %s\n' % (old, new if new != None else deleted_hash)
|
|
f.write(msg)
|
|
|
|
exported_refs, imported_refs = self.get_exported_and_imported_refs()
|
|
|
|
batch_check_process = None
|
|
batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$')
|
|
with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
|
|
f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode())
|
|
for refname, old_hash in orig_refs.items():
|
|
if refname not in exported_refs:
|
|
continue
|
|
if refname not in imported_refs:
|
|
new_hash = deleted_hash
|
|
elif old_hash in self._commit_renames:
|
|
new_hash = self._commit_renames[old_hash]
|
|
new_hash = new_hash if new_hash != None else deleted_hash
|
|
else: # Must be either an annotated tag, or a ref whose tip was pruned
|
|
if not batch_check_process:
|
|
cmd = 'git cat-file --batch-check'.split()
|
|
target_working_dir = self._args.target or b'.'
|
|
batch_check_process = subproc.Popen(cmd,
|
|
stdin=subprocess.PIPE,
|
|
stdout=subprocess.PIPE,
|
|
cwd=target_working_dir)
|
|
batch_check_process.stdin.write(refname+b"\n")
|
|
batch_check_process.stdin.flush()
|
|
line = batch_check_process.stdout.readline()
|
|
m = batch_check_output_re.match(line)
|
|
if m and m.group(2) in (b'tag', b'commit'):
|
|
new_hash = m.group(1)
|
|
elif line.endswith(b' missing\n'):
|
|
new_hash = deleted_hash
|
|
else:
|
|
raise SystemExit(_("Failed to find new id for %(refname)s "
|
|
"(old id was %(old_hash)s)")
|
|
% ({'refname': refname, 'old_hash': old_hash})
|
|
) # pragma: no cover
|
|
f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
|
|
if self._args.source or self._args.target:
|
|
new_refs = GitUtils.get_refs(self._args.target or b'.')
|
|
for ref, new_hash in new_refs.items():
|
|
if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
|
|
old_hash = b'0'*len(new_hash)
|
|
f.write(b'%s %s %s\n' % (old_hash, new_hash, ref))
|
|
if batch_check_process:
|
|
batch_check_process.stdin.close()
|
|
batch_check_process.wait()
|
|
|
|
with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
|
|
issues_found = False
|
|
if self._commits_no_longer_merges:
|
|
issues_found = True
|
|
|
|
f.write(textwrap.dedent(_('''
|
|
The following commits used to be merge commits but due to filtering
|
|
are now regular commits; they likely have suboptimal commit messages
|
|
(e.g. "Merge branch next into master"). Original commit hash on the
|
|
left, commit hash after filtering/rewriting on the right:
|
|
''')[1:]).encode())
|
|
for oldhash, newhash in self._commits_no_longer_merges:
|
|
f.write(' {} {}\n'.format(oldhash, newhash).encode())
|
|
f.write(b'\n')
|
|
|
|
if self._commits_referenced_but_removed:
|
|
issues_found = True
|
|
f.write(textwrap.dedent(_('''
|
|
The following commits were filtered out, but referenced in another
|
|
commit message. The reference to the now-nonexistent commit hash
|
|
(or a substring thereof) was left as-is in any commit messages:
|
|
''')[1:]).encode())
|
|
for bad_commit_reference in self._commits_referenced_but_removed:
|
|
f.write(' {}\n'.format(bad_commit_reference).encode())
|
|
f.write(b'\n')
|
|
|
|
if not issues_found:
|
|
f.write(_("No filtering problems encountered.\n").encode())
|
|
|
|
with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
|
|
f.write(_("This file exists to allow you to filter again without --force.\n").encode())
|
|
|
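  # Illustrative summary (added commentary, not part of the upstream code) of
  # the files written above (typically under .git/filter-repo/): commit-map
  # holds one "<old-sha> <new-sha>" line per commit (all zeros for pruned
  # commits), ref-map holds "<old-sha> <new-sha> <refname>" lines for exported
  # refs, and suboptimal-issues lists merges that became regular commits plus
  # commit messages still referencing filtered-out commits.
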
|
def finish(self):
|
|
''' Alternative to run() when there is no input of our own to parse,
|
|
meaning that run only really needs to close the handle to fast-import
|
|
and let it finish, thus making a call to "run" feel like a misnomer. '''
|
|
assert not self._input
|
|
assert self._managed_output
|
|
self.run()
|
|
|
|
  def insert(self, obj, direct_insertion = False):
    if not direct_insertion:
      if type(obj) == Blob:
        self._tweak_blob(obj)
      elif type(obj) == Commit:
        aux_info = {'orig_parents': obj.parents,
                    'had_file_changes': bool(obj.file_changes)}
        self._tweak_commit(obj, aux_info)
      elif type(obj) == Reset:
        self._tweak_reset(obj)
      elif type(obj) == Tag:
        self._tweak_tag(obj)
    self._insert_into_stream(obj)

  def _insert_into_stream(self, obj):
    if not obj.dumped:
      if self._parser:
        self._parser.insert(obj)
      else:
        obj.dump(self._output)

  def get_exported_and_imported_refs(self):
    return self._parser.get_exported_and_imported_refs()

def run(self):
|
|
start = time.time()
|
|
if not self._input and not self._output:
|
|
self._run_sanity_checks()
|
|
if not self._args.dry_run and not self._args.partial:
|
|
self._migrate_origin_to_heads()
|
|
self._setup_input(use_done_feature = True)
|
|
self._setup_output()
|
|
assert self._sanity_checks_handled
|
|
|
|
if self._input:
|
|
# Create and run the filter
|
|
self._repo_working_dir = self._args.source or b'.'
|
|
self._parser = FastExportParser(blob_callback = self._tweak_blob,
|
|
commit_callback = self._tweak_commit,
|
|
tag_callback = self._tweak_tag,
|
|
reset_callback = self._tweak_reset,
|
|
done_callback = self._final_commands)
|
|
self._parser.run(self._input, self._output)
|
|
if not self._finalize_handled:
|
|
self._final_commands()
|
|
|
|
# Make sure fast-export completed successfully
|
|
if not self._args.stdin and self._fep.wait():
|
|
raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover
|
|
self._input.close()
|
|
|
|
# If we're not the manager of self._output, we should avoid post-run cleanup
|
|
if not self._managed_output:
|
|
return
|
|
|
|
# Close the output and ensure fast-import successfully completes
|
|
self._output.close()
|
|
if not self._args.dry_run and self._fip.wait():
|
|
raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover
|
|
|
|
# With fast-export and fast-import complete, update state if requested
|
|
if self._args.state_branch:
|
|
self._save_marks_files()
|
|
|
|
# Notify user how long it took, before doing a gc and such
|
|
msg = "New history written in {:.2f} seconds..."
|
|
if self._args.repack:
|
|
msg = "New history written in {:.2f} seconds; now repacking/cleaning..."
|
|
print(msg.format(time.time()-start))
|
|
|
|
# Exit early, if requested
|
|
if self._args.dry_run:
|
|
print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed."))
|
|
if self._fe_orig:
|
|
print(_(" Requested filtering can be seen by comparing:"))
|
|
print(" " + decode(self._fe_orig))
|
|
else:
|
|
print(_(" Requested filtering can be seen at:"))
|
|
print(" " + decode(self._fe_filt))
|
|
return
|
|
|
|
target_working_dir = self._args.target or b'.'
|
|
if self._input:
|
|
self._ref_update(target_working_dir)
|
|
|
|
# Write out data about run
|
|
self._record_metadata(self.results_tmp_dir(), self._orig_refs)
|
|
|
|
# Final cleanup:
|
|
# If we need a repack, then nuke the reflogs and repack.
|
|
# If we need a reset, do a reset --hard
|
|
reset = not GitUtils.is_repository_bare(target_working_dir)
|
|
RepoFilter.cleanup(target_working_dir, self._args.repack, reset,
|
|
run_quietly=self._args.quiet,
|
|
show_debuginfo=self._args.debug)
|
|
|
|
# Let user know how long it took
|
|
print(_("Completely finished after {:.2f} seconds.")
|
|
.format(time.time()-start))
|
|
|
|
def main():
  setup_gettext()
  args = FilteringOptions.parse_args(sys.argv[1:])
  if args.analyze:
    RepoAnalyze.run(args)
  else:
    filter = RepoFilter(args)
    filter.run()

if __name__ == '__main__':
  main()