# HG changeset patch
# User Gregory Szorc <gregory.szorc@gmail.com>
# Date 1363758241 25200
# Node ID d6b9c30a3e0f432977c92cf3c949c01f49ea6dee
# Parent  16b4a689fee1c1826b21df0d2ad4350893abd2fc
Export Git objects from incremental Mercurial changes

This replaces the brute force Mercurial to Git export with one that is
incremental. It results in a decent performance win and paves the road
for parallel export via using multiple incremental exporters.

diff --git a/hggit/git_handler.py b/hggit/git_handler.py
--- a/hggit/git_handler.py
+++ b/hggit/git_handler.py
@@ -2,7 +2,6 @@
 import stat, posixpath, StringIO
 
 from dulwich.errors import HangupException, GitProtocolError, UpdateRefsError
-from dulwich.index import commit_tree
 from dulwich.objects import Blob, Commit, Tag, Tree, parse_timezone, S_IFGITLINK
 from dulwich.pack import create_delta, apply_delta
 from dulwich.repo import Repo
@@ -26,9 +25,11 @@
 from mercurial import error
 
 import _ssh
+import hg2git
 import util
 from overlay import overlayrepo
 
+
 RE_GIT_AUTHOR = re.compile('^(.*?) ?\<(.*?)(?:\>(.*))?$')
 
 RE_GIT_SANITIZE_AUTHOR = re.compile('[<>\n]')
@@ -350,6 +351,12 @@
         total = len(export)
         if total:
             self.ui.note(_("exporting hg objects to git\n"))
+
+        # By only exporting deltas, the assertion is that all previous objects
+        # for all other changesets are already present in the Git repository.
+        # This assertion is necessary to prevent redundant work.
+        exporter = hg2git.IncrementalChangesetExporter(self.repo)
+
         for i, rev in enumerate(export):
             util.progress(self.ui, 'exporting', i, total=total)
             ctx = self.repo.changectx(rev)
@@ -358,14 +365,14 @@
                 self.ui.debug("revision %d is a part "
                               "of octopus explosion\n" % ctx.rev())
                 continue
-            self.export_hg_commit(rev)
+            self.export_hg_commit(rev, exporter)
         util.progress(self.ui, 'importing', None, total=total)
 
 
     # convert this commit into git objects
     # go through the manifest, convert all blobs/trees we don't have
     # write the commit object (with metadata info)
-    def export_hg_commit(self, rev):
+    def export_hg_commit(self, rev, exporter):
         self.ui.note(_("converting revision %s\n") % hex(rev))
 
         oldenc = self.swap_out_encoding()
@@ -423,7 +430,11 @@
         if 'encoding' in extra:
             commit.encoding = extra['encoding']
 
-        tree_sha = commit_tree(self.git.object_store, self.iterblobs(ctx))
+        for obj, nodeid in exporter.update_changeset(ctx):
+            self.git.object_store.add_object(obj)
+
+        tree_sha = exporter.root_tree_sha
+
         if tree_sha not in self.git.object_store:
             raise hgutil.Abort(_('Tree SHA-1 not present in Git repo: %s' %
                 tree_sha))
@@ -566,43 +577,6 @@
 
         return message
 
-    def iterblobs(self, ctx):
-        if '.hgsubstate' in ctx:
-            hgsub = util.OrderedDict()
-            if '.hgsub' in ctx:
-                hgsub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
-            hgsubstate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
-            for path, sha in hgsubstate.iteritems():
-                try:
-                    if path in hgsub and not hgsub[path].startswith('[git]'):
-                        # some other kind of a repository (e.g. [hg])
-                        # that keeps its state in .hgsubstate, shall ignore
-                        continue
-                    yield path, sha, S_IFGITLINK
-                except ValueError:
-                    pass
-
-        for f in ctx:
-            if f == '.hgsubstate' or f == '.hgsub':
-                continue
-            fctx = ctx[f]
-            blobid = self.map_git_get(hex(fctx.filenode()))
-
-            if not blobid:
-                blob = Blob.from_string(fctx.data())
-                self.git.object_store.add_object(blob)
-                self.map_set(blob.id, hex(fctx.filenode()))
-                blobid = blob.id
-
-            if 'l' in ctx.flags(f):
-                mode = 0120000
-            elif 'x' in ctx.flags(f):
-                mode = 0100755
-            else:
-                mode = 0100644
-
-            yield f, blobid, mode
-
     def getnewgitcommits(self, refs=None):
         self.init_if_missing()
 
diff --git a/hggit/hg2git.py b/hggit/hg2git.py
new file mode 100644
--- /dev/null
+++ b/hggit/hg2git.py
@@ -0,0 +1,274 @@
+# This file contains code dealing specifically with converting Mercurial
+# repositories to Git repositories. Code in this file is meant to be a generic
+# library and should be usable outside the context of hg-git or an hg command.
+
+import os
+import stat
+
+import dulwich.objects as dulobjs
+import mercurial.node
+
+import util
+
+
+class IncrementalChangesetExporter(object):
+    """Incrementally export Mercurial changesets to Git trees.
+
+    The purpose of this class is to facilitate Git tree export that is more
+    optimal than brute force.
+
+    A "dumb" implementations of Mercurial to Git export would iterate over
+    every file present in a Mercurial changeset and would convert each to
+    a Git blob and then conditionally add it to a Git repository if it didn't
+    yet exist. This is suboptimal because the overhead associated with
+    obtaining every file's raw content and converting it to a Git blob is
+    not trivial!
+
+    This class works around the suboptimality of brute force export by
+    leveraging the information stored in Mercurial - the knowledge of what
+    changed between changesets - to only export Git objects corresponding to
+    changes in Mercurial. In the context of converting Mercurial repositories
+    to Git repositories, we only export objects Git (possibly) hasn't seen yet.
+    This prevents a lot of redundant work and is thus faster.
+
+    Callers instantiate an instance of this class against a mercurial.localrepo
+    instance. They then associate it with a specific changesets by calling
+    update_changeset(). On each call to update_changeset(), the instance
+    computes the difference between the current and new changesets and emits
+    Git objects that haven't yet been encountered during the lifetime of the
+    class instance. In other words, it expresses Mercurial changeset deltas in
+    terms of Git objects. Callers then (usually) take this set of Git objects
+    and add them to the Git repository.
+
+    This class only emits Git blobs and trees, not commits.
+
+    The tree calculation part of this class is essentially a reimplementation
+    of dulwich.index.commit_tree. However, since our implementation reuses
+    Tree instances and only recalculates SHA-1 when things change, we are
+    more efficient.
+    """
+
+    def __init__(self, hg_repo):
+        """Create an instance against a mercurial.localrepo."""
+        self._hg = hg_repo
+
+        # Our current revision.
+        self._rev = mercurial.node.nullrev
+
+        # Path to dulwich.objects.Tree.
+        self._dirs = {}
+
+        # Mercurial file nodeid to Git blob SHA-1. Used to prevent redundant
+        # blob calculation.
+        self._blob_cache = {}
+
+    @property
+    def root_tree_sha(self):
+        """The SHA-1 of the root Git tree.
+
+        This is needed to construct a Git commit object.
+        """
+        return self._dirs[''].id
+
+    def update_changeset(self, ctx):
+        """Set the tree to track a new Mercurial changeset.
+
+        This is a generator of 2-tuples. The first item in each tuple is a
+        dulwich object, either a Blob or a Tree. The second item is the
+        corresponding Mercurial nodeid for the item, if any. Only blobs will
+        have nodeids. Trees do not correspond to a specific nodeid, so it does
+        not make sense to emit a nodeid for them.
+
+        When exporting trees from Mercurial, callers typically write the
+        returned dulwich object to the Git repo via the store's add_object().
+
+        Some emitted objects may already exist in the Git repository. This
+        class does not know about the Git repository, so it's up to the caller
+        to conditionally add the object, etc.
+
+        Emitted objects are those that have changed since the last call to
+        update_changeset. If this is the first call to update_chanageset, all
+        objects in the tree are emitted.
+        """
+        # Our general strategy is to accumulate dulwich.objects.Blob and
+        # dulwich.objects.Tree instances for the current Mercurial changeset.
+        # We do this incremental by iterating over the Mercurial-reported
+        # changeset delta. We rely on the behavior of Mercurial to lazy
+        # calculate a Tree's SHA-1 when we modify it. This is critical to
+        # performance.
+
+        # In theory we should be able to look at changectx.files(). This is
+        # *much* faster. However, it may not be accurate, especially with older
+        # repositories, which may not record things like deleted files
+        # explicitly in the manifest (which is where files() gets its data).
+        # The only reliable way to get the full set of changes is by looking at
+        # the full manifest. And, the easy way to compare two manifests is
+        # localrepo.status().
+        modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3]
+
+        # We first process file removals so we can prune dead trees.
+        for path in sorted(removed, key=len, reverse=True):
+            d = os.path.dirname(path)
+            tree = self._dirs.get(d, dulobjs.Tree())
+
+            del tree[os.path.basename(path)]
+
+            # If removing this file made the tree empty, we should delete this
+            # tree. This could result in parent trees losing their only child
+            # and so on.
+            if not len(tree):
+                self._remove_tree(d)
+                continue
+
+            self._dirs[d] = tree
+
+        # For every file that changed or was added, we need to calculate the
+        # corresponding Git blob and its tree entry. We emit the blob
+        # immediately and update trees to be aware of its presence.
+        for path in sorted(set(modified) | set(added), key=len, reverse=True):
+            # Handle special Mercurial paths.
+            if path == '.hgsubstate':
+                self._handle_subrepos(ctx)
+                continue
+
+            if path == '.hgsub':
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.setdefault(d, dulobjs.Tree())
+
+            fctx = ctx[path]
+
+            entry, blob = IncrementalChangesetExporter.tree_entry(fctx,
+                self._blob_cache)
+            if blob is not None:
+                yield (blob, fctx.filenode())
+
+            tree.add(*entry)
+
+        # Now that all the trees represent the current changeset, recalculate
+        # the tree IDs and emit them. Note that we wait until now to calculate
+        # tree SHA-1s. This is an important difference between us and
+        # dulwich.index.commit_tree(), which builds new Tree instances for each
+        # series of blobs.
+        for obj in self._populate_tree_entries():
+            yield (obj, None)
+
+        self._rev = ctx.rev()
+
+    def _remove_tree(self, path):
+        """Remove a (presumably empty) tree from the current changeset.
+
+        A now-empty tree may be the only child of its parent. So, we traverse
+        up the chain to the root tree, deleting any empty trees along the way.
+        """
+        try:
+            del self._dirs[path]
+        except KeyError:
+            return
+
+        # Now we traverse up to the parent and delete any references.
+        if path == '':
+            return
+
+        basename = os.path.basename(path)
+        parent = os.path.dirname(path)
+        while True:
+            tree = self._dirs.get(parent, None)
+
+            # No parent entry. Nothing to remove or update.
+            if tree is None:
+                return
+
+            try:
+                del tree[basename]
+            except KeyError:
+                return
+
+            if len(tree):
+                return
+
+            # The parent tree is empty. Se, we can delete it.
+            del self._dirs[parent]
+
+            if parent == '':
+                return
+
+            basename = os.path.basename(parent)
+            parent = os.path.dirname(parent)
+
+    def _populate_tree_entries(self):
+        self._dirs.setdefault('', dulobjs.Tree())
+
+        # Fill in missing directories.
+        for path in self._dirs.keys():
+            parent = os.path.dirname(path)
+
+            while parent != '':
+                parent_tree = self._dirs.get(parent, None)
+
+                if parent_tree is not None:
+                    break
+
+                self._dirs[parent] = dulobjs.Tree()
+                parent = os.path.dirname(parent)
+
+        # TODO only emit trees that have been modified.
+        for d in sorted(self._dirs.keys(), key=len, reverse=True):
+            tree = self._dirs[d]
+            yield tree
+
+            if d == '':
+                continue
+
+            parent_tree = self._dirs[os.path.dirname(d)]
+
+            # Accessing the tree's ID is what triggers SHA-1 calculation and is
+            # the expensive part (at least if the tree has been modified since
+            # the last time we retrieved its ID).
+            parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id)
+
+    def _handle_subrepos(self, ctx):
+        substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
+        sub = util.OrderedDict()
+
+        if '.hgsub' in ctx:
+            sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
+
+        for path, sha in substate.iteritems():
+            # Ignore non-Git repositories keeping state in .hgsubstate.
+            if path in sub and not sub[path].startswith('[git]'):
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.setdefault(d, dulobjs.Tree())
+            tree.add(os.path.basename(path), dulobjs.S_IFGITLINK, sha)
+
+    @staticmethod
+    def tree_entry(fctx, blob_cache):
+        """Compute a dulwich TreeEntry from a filectx.
+
+        A side effect is the TreeEntry is stored in the passed cache.
+
+        Returns a 2-tuple of (dulwich.objects.TreeEntry, dulwich.objects.Blob).
+        """
+        blob_id = blob_cache.get(fctx.filenode(), None)
+        blob = None
+
+        if blob_id is None:
+            blob = dulobjs.Blob.from_string(fctx.data())
+            blob_id = blob.id
+            blob_cache[fctx.filenode()] = blob_id
+
+        flags = fctx.flags()
+
+        if 'l' in flags:
+            mode = 0120000
+        elif 'x' in flags:
+            mode = 0100755
+        else:
+            mode = 0100644
+
+        return (dulobjs.TreeEntry(os.path.basename(fctx.path()), mode, blob_id),
+                blob)
+