# HG changeset patch # User Gregory Szorc # Date 1363758241 25200 # Node ID d6b9c30a3e0f432977c92cf3c949c01f49ea6dee # Parent 16b4a689fee1c1826b21df0d2ad4350893abd2fc Export Git objects from incremental Mercurial changes This replaces the brute force Mercurial to Git export with one that is incremental. It results in a decent performance win and paves the road for parallel export via using multiple incremental exporters. diff --git a/hggit/git_handler.py b/hggit/git_handler.py --- a/hggit/git_handler.py +++ b/hggit/git_handler.py @@ -2,7 +2,6 @@ import stat, posixpath, StringIO from dulwich.errors import HangupException, GitProtocolError, UpdateRefsError -from dulwich.index import commit_tree from dulwich.objects import Blob, Commit, Tag, Tree, parse_timezone, S_IFGITLINK from dulwich.pack import create_delta, apply_delta from dulwich.repo import Repo @@ -26,9 +25,11 @@ from mercurial import error import _ssh +import hg2git import util from overlay import overlayrepo + RE_GIT_AUTHOR = re.compile('^(.*?) ?\<(.*?)(?:\>(.*))?$') RE_GIT_SANITIZE_AUTHOR = re.compile('[<>\n]') @@ -350,6 +351,12 @@ total = len(export) if total: self.ui.note(_("exporting hg objects to git\n")) + + # By only exporting deltas, the assertion is that all previous objects + # for all other changesets are already present in the Git repository. + # This assertion is necessary to prevent redundant work. + exporter = hg2git.IncrementalChangesetExporter(self.repo) + for i, rev in enumerate(export): util.progress(self.ui, 'exporting', i, total=total) ctx = self.repo.changectx(rev) @@ -358,14 +365,14 @@ self.ui.debug("revision %d is a part " "of octopus explosion\n" % ctx.rev()) continue - self.export_hg_commit(rev) + self.export_hg_commit(rev, exporter) util.progress(self.ui, 'importing', None, total=total) # convert this commit into git objects # go through the manifest, convert all blobs/trees we don't have # write the commit object (with metadata info) - def export_hg_commit(self, rev): + def export_hg_commit(self, rev, exporter): self.ui.note(_("converting revision %s\n") % hex(rev)) oldenc = self.swap_out_encoding() @@ -423,7 +430,11 @@ if 'encoding' in extra: commit.encoding = extra['encoding'] - tree_sha = commit_tree(self.git.object_store, self.iterblobs(ctx)) + for obj, nodeid in exporter.update_changeset(ctx): + self.git.object_store.add_object(obj) + + tree_sha = exporter.root_tree_sha + if tree_sha not in self.git.object_store: raise hgutil.Abort(_('Tree SHA-1 not present in Git repo: %s' % tree_sha)) @@ -566,43 +577,6 @@ return message - def iterblobs(self, ctx): - if '.hgsubstate' in ctx: - hgsub = util.OrderedDict() - if '.hgsub' in ctx: - hgsub = util.parse_hgsub(ctx['.hgsub'].data().splitlines()) - hgsubstate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines()) - for path, sha in hgsubstate.iteritems(): - try: - if path in hgsub and not hgsub[path].startswith('[git]'): - # some other kind of a repository (e.g. [hg]) - # that keeps its state in .hgsubstate, shall ignore - continue - yield path, sha, S_IFGITLINK - except ValueError: - pass - - for f in ctx: - if f == '.hgsubstate' or f == '.hgsub': - continue - fctx = ctx[f] - blobid = self.map_git_get(hex(fctx.filenode())) - - if not blobid: - blob = Blob.from_string(fctx.data()) - self.git.object_store.add_object(blob) - self.map_set(blob.id, hex(fctx.filenode())) - blobid = blob.id - - if 'l' in ctx.flags(f): - mode = 0120000 - elif 'x' in ctx.flags(f): - mode = 0100755 - else: - mode = 0100644 - - yield f, blobid, mode - def getnewgitcommits(self, refs=None): self.init_if_missing() diff --git a/hggit/hg2git.py b/hggit/hg2git.py new file mode 100644 --- /dev/null +++ b/hggit/hg2git.py @@ -0,0 +1,274 @@ +# This file contains code dealing specifically with converting Mercurial +# repositories to Git repositories. Code in this file is meant to be a generic +# library and should be usable outside the context of hg-git or an hg command. + +import os +import stat + +import dulwich.objects as dulobjs +import mercurial.node + +import util + + +class IncrementalChangesetExporter(object): + """Incrementally export Mercurial changesets to Git trees. + + The purpose of this class is to facilitate Git tree export that is more + optimal than brute force. + + A "dumb" implementations of Mercurial to Git export would iterate over + every file present in a Mercurial changeset and would convert each to + a Git blob and then conditionally add it to a Git repository if it didn't + yet exist. This is suboptimal because the overhead associated with + obtaining every file's raw content and converting it to a Git blob is + not trivial! + + This class works around the suboptimality of brute force export by + leveraging the information stored in Mercurial - the knowledge of what + changed between changesets - to only export Git objects corresponding to + changes in Mercurial. In the context of converting Mercurial repositories + to Git repositories, we only export objects Git (possibly) hasn't seen yet. + This prevents a lot of redundant work and is thus faster. + + Callers instantiate an instance of this class against a mercurial.localrepo + instance. They then associate it with a specific changesets by calling + update_changeset(). On each call to update_changeset(), the instance + computes the difference between the current and new changesets and emits + Git objects that haven't yet been encountered during the lifetime of the + class instance. In other words, it expresses Mercurial changeset deltas in + terms of Git objects. Callers then (usually) take this set of Git objects + and add them to the Git repository. + + This class only emits Git blobs and trees, not commits. + + The tree calculation part of this class is essentially a reimplementation + of dulwich.index.commit_tree. However, since our implementation reuses + Tree instances and only recalculates SHA-1 when things change, we are + more efficient. + """ + + def __init__(self, hg_repo): + """Create an instance against a mercurial.localrepo.""" + self._hg = hg_repo + + # Our current revision. + self._rev = mercurial.node.nullrev + + # Path to dulwich.objects.Tree. + self._dirs = {} + + # Mercurial file nodeid to Git blob SHA-1. Used to prevent redundant + # blob calculation. + self._blob_cache = {} + + @property + def root_tree_sha(self): + """The SHA-1 of the root Git tree. + + This is needed to construct a Git commit object. + """ + return self._dirs[''].id + + def update_changeset(self, ctx): + """Set the tree to track a new Mercurial changeset. + + This is a generator of 2-tuples. The first item in each tuple is a + dulwich object, either a Blob or a Tree. The second item is the + corresponding Mercurial nodeid for the item, if any. Only blobs will + have nodeids. Trees do not correspond to a specific nodeid, so it does + not make sense to emit a nodeid for them. + + When exporting trees from Mercurial, callers typically write the + returned dulwich object to the Git repo via the store's add_object(). + + Some emitted objects may already exist in the Git repository. This + class does not know about the Git repository, so it's up to the caller + to conditionally add the object, etc. + + Emitted objects are those that have changed since the last call to + update_changeset. If this is the first call to update_chanageset, all + objects in the tree are emitted. + """ + # Our general strategy is to accumulate dulwich.objects.Blob and + # dulwich.objects.Tree instances for the current Mercurial changeset. + # We do this incremental by iterating over the Mercurial-reported + # changeset delta. We rely on the behavior of Mercurial to lazy + # calculate a Tree's SHA-1 when we modify it. This is critical to + # performance. + + # In theory we should be able to look at changectx.files(). This is + # *much* faster. However, it may not be accurate, especially with older + # repositories, which may not record things like deleted files + # explicitly in the manifest (which is where files() gets its data). + # The only reliable way to get the full set of changes is by looking at + # the full manifest. And, the easy way to compare two manifests is + # localrepo.status(). + modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3] + + # We first process file removals so we can prune dead trees. + for path in sorted(removed, key=len, reverse=True): + d = os.path.dirname(path) + tree = self._dirs.get(d, dulobjs.Tree()) + + del tree[os.path.basename(path)] + + # If removing this file made the tree empty, we should delete this + # tree. This could result in parent trees losing their only child + # and so on. + if not len(tree): + self._remove_tree(d) + continue + + self._dirs[d] = tree + + # For every file that changed or was added, we need to calculate the + # corresponding Git blob and its tree entry. We emit the blob + # immediately and update trees to be aware of its presence. + for path in sorted(set(modified) | set(added), key=len, reverse=True): + # Handle special Mercurial paths. + if path == '.hgsubstate': + self._handle_subrepos(ctx) + continue + + if path == '.hgsub': + continue + + d = os.path.dirname(path) + tree = self._dirs.setdefault(d, dulobjs.Tree()) + + fctx = ctx[path] + + entry, blob = IncrementalChangesetExporter.tree_entry(fctx, + self._blob_cache) + if blob is not None: + yield (blob, fctx.filenode()) + + tree.add(*entry) + + # Now that all the trees represent the current changeset, recalculate + # the tree IDs and emit them. Note that we wait until now to calculate + # tree SHA-1s. This is an important difference between us and + # dulwich.index.commit_tree(), which builds new Tree instances for each + # series of blobs. + for obj in self._populate_tree_entries(): + yield (obj, None) + + self._rev = ctx.rev() + + def _remove_tree(self, path): + """Remove a (presumably empty) tree from the current changeset. + + A now-empty tree may be the only child of its parent. So, we traverse + up the chain to the root tree, deleting any empty trees along the way. + """ + try: + del self._dirs[path] + except KeyError: + return + + # Now we traverse up to the parent and delete any references. + if path == '': + return + + basename = os.path.basename(path) + parent = os.path.dirname(path) + while True: + tree = self._dirs.get(parent, None) + + # No parent entry. Nothing to remove or update. + if tree is None: + return + + try: + del tree[basename] + except KeyError: + return + + if len(tree): + return + + # The parent tree is empty. Se, we can delete it. + del self._dirs[parent] + + if parent == '': + return + + basename = os.path.basename(parent) + parent = os.path.dirname(parent) + + def _populate_tree_entries(self): + self._dirs.setdefault('', dulobjs.Tree()) + + # Fill in missing directories. + for path in self._dirs.keys(): + parent = os.path.dirname(path) + + while parent != '': + parent_tree = self._dirs.get(parent, None) + + if parent_tree is not None: + break + + self._dirs[parent] = dulobjs.Tree() + parent = os.path.dirname(parent) + + # TODO only emit trees that have been modified. + for d in sorted(self._dirs.keys(), key=len, reverse=True): + tree = self._dirs[d] + yield tree + + if d == '': + continue + + parent_tree = self._dirs[os.path.dirname(d)] + + # Accessing the tree's ID is what triggers SHA-1 calculation and is + # the expensive part (at least if the tree has been modified since + # the last time we retrieved its ID). + parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id) + + def _handle_subrepos(self, ctx): + substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines()) + sub = util.OrderedDict() + + if '.hgsub' in ctx: + sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines()) + + for path, sha in substate.iteritems(): + # Ignore non-Git repositories keeping state in .hgsubstate. + if path in sub and not sub[path].startswith('[git]'): + continue + + d = os.path.dirname(path) + tree = self._dirs.setdefault(d, dulobjs.Tree()) + tree.add(os.path.basename(path), dulobjs.S_IFGITLINK, sha) + + @staticmethod + def tree_entry(fctx, blob_cache): + """Compute a dulwich TreeEntry from a filectx. + + A side effect is the TreeEntry is stored in the passed cache. + + Returns a 2-tuple of (dulwich.objects.TreeEntry, dulwich.objects.Blob). + """ + blob_id = blob_cache.get(fctx.filenode(), None) + blob = None + + if blob_id is None: + blob = dulobjs.Blob.from_string(fctx.data()) + blob_id = blob.id + blob_cache[fctx.filenode()] = blob_id + + flags = fctx.flags() + + if 'l' in flags: + mode = 0120000 + elif 'x' in flags: + mode = 0100755 + else: + mode = 0100644 + + return (dulobjs.TreeEntry(os.path.basename(fctx.path()), mode, blob_id), + blob) +