changeset 710:5c7943ca051f

hg2git: start incremental conversion from a known commit

Previously, we'd spin up the Mercurial incremental exporter from the null commit and build up state from there. This meant that for the first exported commit, we'd have to read all the files in that commit and compute Git blobs and trees based on that.

The current Mercurial to Git conversion scheme makes most sense with Mercurial's current default storage format, where manifests are diffed against the numerically previous revision. At some point in the future, the default will switch to generaldelta, where manifests would be diffed against one of their parents. In that world it might make more sense to have a stateless exporter that diffed each commit against its generaldelta parent and calculated dirty trees based on that instead. However, more experiments need to be done to see what export scheme is best.

For a repo with around 50,000 files, this brings down an incremental 'hg gexport' of one commit from 18 seconds with a hot file cache (and tens of minutes with a cold one) to around 2 seconds with a hot file cache.
author Siddharth Agarwal <sid0@fb.com>
date Fri, 14 Mar 2014 20:45:09 -0700
parents 4f0a154ae374
children 623cb724c3d0
files hggit/git_handler.py hggit/hg2git.py
diffstat 2 files changed, 29 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/hggit/git_handler.py
+++ b/hggit/git_handler.py
@@ -363,8 +363,24 @@
 
         # By only exporting deltas, the assertion is that all previous objects
         # for all other changesets are already present in the Git repository.
-        # This assertion is necessary to prevent redundant work.
-        exporter = hg2git.IncrementalChangesetExporter(self.repo)
+        # This assertion is necessary to prevent redundant work. Here, nodes,
+        # and therefore export, is in topological order. By definition,
+        # export[0]'s parents must be present in Git, so we start the
+        # incremental exporter from there.
+        pctx = self.repo[export[0]].p1()
+        pnode = pctx.node()
+        if pnode == nullid:
+            gitcommit = None
+        else:
+            gitsha = self._map_hg[hex(pnode)]
+            try:
+                gitcommit = self.git[gitsha]
+            except KeyError:
+                raise hgutil.Abort(_('Parent SHA-1 not present in Git'
+                                     'repo: %s' % gitsha))
+
+        exporter = hg2git.IncrementalChangesetExporter(
+            self.repo, pctx, self.git.object_store, gitcommit)
 
         for i, rev in enumerate(export):
             self.ui.progress('exporting', i, total=total)
--- a/hggit/hg2git.py
+++ b/hggit/hg2git.py
@@ -7,8 +7,6 @@
 
 import dulwich.objects as dulobjs
 from dulwich import diff_tree
-import mercurial.node
-import mercurial.context
 
 import util
 
@@ -59,15 +57,22 @@
     more efficient.
     """
 
-    def __init__(self, hg_repo):
-        """Create an instance against a mercurial.localrepo."""
+    def __init__(self, hg_repo, start_ctx, git_store, git_commit):
+        """Create an instance against a mercurial.localrepo.
+
+        start_ctx is the context for a Mercurial commit that has a Git
+        equivalent, passed in as git_commit. The incremental computation will be
+        started from this commit. git_store is the Git object store the commit
+        comes from. start_ctx can be repo[nullid], in which case git_commit
+        should be None.
+        """
         self._hg = hg_repo
 
         # Our current revision's context.
-        self._ctx = mercurial.context.changectx(hg_repo, 'null')
+        self._ctx = start_ctx
 
         # Path to dulwich.objects.Tree.
-        self._dirs = {}
+        self._init_dirs(git_store, git_commit)
 
         # Mercurial file nodeid to Git blob SHA-1. Used to prevent redundant
         # blob calculation.