[Gpa-commits] r775 - trunk/src

Wed May 2 21:05:02 CEST 2007

Author: werner
Date: 2007-05-02 21:05:02 +0200 (Wed, 02 May 2007)
New Revision: 775

Modified:
   trunk/src/ChangeLog
   trunk/src/gpgmetools.c
Log:
Fixed a UTF-8 issue.


Modified: trunk/src/ChangeLog
===================================================================

--- trunk/src/ChangeLog	2007-04-25 13:13:49 UTC (rev 774)
+++ trunk/src/ChangeLog	2007-05-02 19:05:02 UTC (rev 775)
@@ -1,3 +1,7 @@
+2007-05-02  Werner Koch  <wk at g10code.com>
+
+	* gpgmetools.c (string_to_utf8): Rewritten.
+
 2007-04-25  Werner Koch  <wk at g10code.com>
 
 	* keyring.c (keyring_details_page_fill_num_keys): Use ngettext.

Modified: trunk/src/gpgmetools.c
===================================================================
--- trunk/src/gpgmetools.c	2007-04-25 13:13:49 UTC (rev 774)
+++ trunk/src/gpgmetools.c	2007-05-02 19:05:02 UTC (rev 775)
@@ -729,24 +729,41 @@
 static gchar *
 string_to_utf8 (const gchar *string)
 {
-  const gchar *s;
-
+  const char *s;
+  
   if (!string)
-    {
-      return NULL;
-    }
-  /* Make sure the encoding is UTF-8.  Test structure suggested by
-     Werner Koch.  */
+    return NULL;
+  
+  /* Due to a bug in old and not so old PGP versions user IDs have
+     been copied verbatim into the key.  Thus many users with Umlauts
+     et al. in their name will see their names garbled.  Although this
+     is not an issue for me (;-)), I have a couple of friends with
+     Umlauts in their name, so let's try to make their life easier by
+     detecting invalid encodings and converting them from Latin-1.  */
   for (s = string; *s && !(*s & 0x80); s++)
     ;
-  if (*s && !strchr (string, 0xc3))
+  if (*s && ((s[1] & 0xc0) == 0x80) && ( ((*s & 0xe0) == 0xc0)
+                                         || ((*s & 0xf0) == 0xe0)
+                                         || ((*s & 0xf8) == 0xf0)
+                                         || ((*s & 0xfc) == 0xf8)
+                                         || ((*s & 0xfe) == 0xfc)) )
+    {  
+      /* Possible utf-8 character followed by continuation byte.
+         Although this might still be Latin-1 we better assume that it
+         is valid utf-8. */
+      return g_strdup (string);
+     }
+  else if (*s && !strchr (string, 0xc3))
     {
-      /* The string is Latin-1.  */
-      return  g_convert (string, -1, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
+      /* No 0xC3 character in the string; assume that it is Latin-1.  */
+      return g_convert (string, -1, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
     }
   else
     {
-      /* The string is already in UTF-8.  */
+      /* Everything else is assumed to be UTF-8.  We do this even though
+         we know the encoding is not valid.  However as we only test
+         the first non-ascii character, valid encodings might
+         follow.  */
       return g_strdup (string);
     }
 }