changeset 186:f4caf22b87cd

Handle git repositories with legacy encodings.
author Abderrahim Kitouni <a.kitouni@gmail.com>
date Thu, 18 Jun 2009 16:49:13 +0100
parents 1224d118ac92
children 5f196f80ffb3
files git_handler.py tests/latin-1-encoding tests/test-encoding
diffstat 3 files changed, 154 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/git_handler.py
+++ b/git_handler.py
@@ -197,24 +197,32 @@
         commit['tree'] = tree_sha
         (time, timezone) = ctx.date()
 
-        # hg authors might not have emails
-        author = ctx.user()
+        if 'git-author' in extra:
+            author = extra['git-author']
+        else:
+            # hg authors might not have emails
+            author = ctx.user()
 
-        # check for git author pattern compliance
-        regex = re.compile('^(.*?) \<(.*?)\>(.*)$')
-        a = regex.match(author)
+            # check for git author pattern compliance
+            regex = re.compile('^(.*?) \<(.*?)\>(.*)$')
+            a = regex.match(author)
 
-        if a:
-            name = a.group(1)
-            email = a.group(2)
-            if len(a.group(3)) > 0:
-                name += ' ext:(' + urllib.quote(a.group(3)) + ')'
-            author = name + ' <' + email + '>'
+            if a:
+                name = a.group(1)
+                email = a.group(2)
+                if len(a.group(3)) > 0:
+                    name += ' ext:(' + urllib.quote(a.group(3)) + ')'
+                author = name + ' <' + email + '>'
+            else:
+                author = author + ' <none@none>'
+
+        commit['author'] = author + ' ' + str(int(time)) + ' ' + format_timezone(-timezone)
+
+        if 'git-commit-message' in extra:
+            commit['message'] = extra['git-commit-message']
         else:
-            author = author + ' <none@none>'
-        commit['author'] = author + ' ' + str(int(time)) + ' ' + format_timezone(-timezone)
-        message = ctx.description()
-        commit['message'] = ctx.description() + "\n"
+            message = ctx.description()
+            commit['message'] = ctx.description() + "\n"
 
         if 'committer' in extra:
             # fixup timezone
@@ -443,6 +451,35 @@
         date = (commit.author_time, -commit.author_timezone)
         text = strip_message
 
+        try:
+            text.decode('utf-8')
+        except UnicodeDecodeError:
+            extra['git-commit-message'] = text
+            text = self.decode_guess(text, commit._encoding)
+
+        author = commit.author
+
+        # unquote the 'ext:(...)' data and move it back to the end of the author string
+        if ' ext:' in commit.author:
+            regex = re.compile('^(.*?)\ ext:\((.*)\) <(.*)\>$')
+            m = regex.match(commit.author)
+            if m:
+                name = m.group(1)
+                ex = urllib.unquote(m.group(2))
+                email = m.group(3)
+                author = name + ' <' + email + '>' + ex
+
+        if ' <none@none>' in commit.author:
+            author = commit.author[:-12]
+
+        try:
+            author.decode('utf-8')
+        except UnicodeDecodeError:
+            extra['git-author'] = author
+            author = self.decode_guess(author, commit._encoding)
+
+        oldenc = self.swap_out_encoding()
+
         def getfilectx(repo, memctx, f):
             try:
                 (mode, sha, data) = self.git.get_file(commit, f)
@@ -463,7 +500,7 @@
             # merge, possibly octopus
             def commit_octopus(p1, p2):
                 ctx = context.memctx(self.repo, (p1, p2), text, files, getfilectx,
-                                     commit.author, date, {'hg-git': 'octopus'})
+                                     author, date, {'hg-git': 'octopus'})
                 return hex(self.repo.commitctx(ctx))
 
             octopus = len(gparents) > 2
@@ -484,21 +521,6 @@
             node2 = self.repo.changectx(p2)
             pa = node1.ancestor(node2)
 
-        author = commit.author
-
-        # convert extra data back to the end
-        if ' ext:' in commit.author:
-            regex = re.compile('^(.*?)\ ext:\((.*)\) <(.*)\>$')
-            m = regex.match(commit.author)
-            if m:
-                name = m.group(1)
-                ex = urllib.unquote(m.group(2))
-                email = m.group(3)
-                author = name + ' <' + email + '>' + ex
-
-        if ' <none@none>' in commit.author:
-            author = commit.author[:-12]
-
         # if named branch, add to extra
         if hg_branch:
             extra['branch'] = hg_branch
@@ -521,6 +543,8 @@
 
         node = self.repo.commit_import_ctx(ctx, pa, force_files)
 
+        self.swap_out_encoding(oldenc)
+
         # save changeset to mapping file
         cs = hex(node)
         self.map_set(commit.id, cs)
@@ -790,6 +814,30 @@
         if names:
             return names[0]
 
+    # Stolen from hgsubversion
+    def swap_out_encoding(self, new_encoding='UTF-8'):
+        try:
+            from mercurial import encoding
+            old = encoding.encoding
+            encoding.encoding = new_encoding
+        except ImportError:
+            old = hgutil._encoding
+            hgutil._encoding = new_encoding
+        return old
+
+    def decode_guess(self, string, encoding):
+        # string is not valid utf-8: try the given encoding first, then fall back to latin-1
+        if encoding:
+            try:
+                return string.decode(encoding).encode('utf-8')
+            except UnicodeDecodeError:
+                pass
+
+        try:
+            return string.decode('latin-1').encode('utf-8')
+        except UnicodeDecodeError:
+            return string.decode('ascii', 'replace').encode('utf-8')
+
     def check_bookmarks(self):
         if self.ui.config('extensions', 'hgext.bookmarks') is not None:
             self.ui.warn("YOU NEED TO SETUP BOOKMARKS\n")
new file mode 100644
--- /dev/null
+++ b/tests/latin-1-encoding
@@ -0,0 +1,18 @@
+# -*- coding: latin-1 -*-
+
+# this file contains some latin-1 messages for test-encoding
+
+GIT_AUTHOR_NAME='tést èncödîng'; export GIT_AUTHOR_NAME
+echo beta > beta
+git add beta
+commit -m 'add beta'
+
+echo gamma > gamma
+git add gamma
+commit -m 'add gämmâ'
+
+# test the commit encoding field
+git config i18n.commitencoding latin-1
+echo delta > delta
+git add delta
+commit -m 'add déltà'
new file mode 100755
--- /dev/null
+++ b/tests/test-encoding
@@ -0,0 +1,57 @@
+#!/bin/sh
+
+# -*- coding: utf-8 -*-
+
+# Fails for some reason; needs investigation
+# "$TESTDIR/hghave" git || exit 80
+
+# bail early if the user is already running git-daemon
+echo hi | nc localhost 9418 2>/dev/null && exit 80
+
+echo "[extensions]" >> $HGRCPATH
+echo "hggit=$(echo $(dirname $(dirname $0)))" >> $HGRCPATH
+echo 'hgext.graphlog =' >> $HGRCPATH
+echo 'hgext.bookmarks =' >> $HGRCPATH
+
+GIT_AUTHOR_NAME='test'; export GIT_AUTHOR_NAME
+GIT_AUTHOR_EMAIL='test@example.org'; export GIT_AUTHOR_EMAIL
+GIT_AUTHOR_DATE="2007-01-01 00:00:00 +0000"; export GIT_AUTHOR_DATE
+GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"; export GIT_COMMITTER_NAME
+GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"; export GIT_COMMITTER_EMAIL
+GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE"; export GIT_COMMITTER_DATE
+
+count=10
+commit()
+{
+    GIT_AUTHOR_DATE="2007-01-01 00:00:$count +0000"
+    GIT_COMMITTER_DATE="$GIT_AUTHOR_DATE"
+    git commit "$@" >/dev/null || echo "git commit error"
+    count=`expr $count + 1`
+}
+
+mkdir gitrepo
+cd gitrepo
+git init | python -c "import sys; print sys.stdin.read().replace('$(dirname $(pwd))/', '')"
+
+# utf-8 encoded commit message
+echo alpha > alpha
+git add alpha
+commit -m 'add älphà'
+
+. $TESTDIR/latin-1-encoding
+
+# dulwich does not presently support local git repos; work around it by serving the repo with git daemon
+cd ..
+git daemon --base-path="$(pwd)"\
+ --listen=localhost\
+ --export-all\
+  --pid-file=gitdaemon.pid \
+ --detach --reuseaddr
+
+hg clone git://localhost/gitrepo hgrepo
+cd hgrepo
+
+HGENCODING=utf-8 hg log --graph --debug
+
+cd ..
+kill `cat gitdaemon.pid`