Fix problem with Unicode in Bibtex generation.

Generating BibTeX from strings containing Unicode characters failed with an exception. A new test function tests a few cases, but several cases are still not tested.
fnielsen · Nov 5, 2019 · 44dc7b8d974f61c0b3b8799ac5ee643932ce88f2 · 44dc7b8
1 parent 0a19113
commit 44dc7b8d974f61c0b3b8799ac5ee643932ce88f2
Unified Split

Showing with 1,004 additions and 24 deletions.

+31 −24 scholia/tex.py

+973 −0 tests/test_tex.py
diff --git a/scholia/tex.py b/scholia/tex.py
@@ -29,12 +29,14 @@
 
 from __future__ import print_function
 
+import os
+from os import write
 from os.path import splitext
 
 import re
 import unicodedata
 
-from six import u
+from six import ensure_text, u
 
 from .api import (
     entity_to_authors, entity_to_classes, entity_to_doi,
@@ -63,14 +65,14 @@
 }
 
 STRING_TO_TEX_URL = {
-    '{': r'\{',
-    '}': r'\}',
-    '#': r'\#',
-    '&': r'\&',
-    '^': r'\^{}',
-    '%': r'\%',
-    '$': r'\$',
-    '_': r'\_',
+    '{': u(r'\{'),
+    '}': u(r'\}'),
+    '#': u(r'\#'),
+    '&': u(r'\&'),
+    '^': u(r'\^{}'),
+    '%': u(r'\%'),
+    '$': u(r'\$'),
+    '_': u(r'\_'),
 }
 
 COMBINING_DIACRITIC_TO_TEX = {
@@ -92,15 +94,15 @@
 }
 
 STRING_TO_TEX_PATTERN = re.compile(
-    u'|'.join(re.escape(key) for key in STRING_TO_TEX),
+    u('|').join(re.escape(key) for key in STRING_TO_TEX),
     flags=re.UNICODE)
 
 STRING_TO_TEX_URL_PATTERN = re.compile(
-    u'|'.join(re.escape(key) for key in STRING_TO_TEX_URL),
+    u('|').join(re.escape(key) for key in STRING_TO_TEX_URL),
     flags=re.UNICODE)
 
 COMBINING_DIACRITIC_TO_TEX_PATTERN = re.compile(
-    u'(.)({})'.format(
+    u('(.)({})').format(
         u'|'.join(re.escape(key)for key in COMBINING_DIACRITIC_TO_TEX)),
     flags=re.UNICODE)
 
@@ -137,7 +139,10 @@ def escape_to_tex(string, escape_type='normal'):
 
     """
     if string is None:
-        return ''
+        return u('')
+
+    string = ensure_text(string)
+
     if escape_type == 'normal':
         escaped_string = STRING_TO_TEX_PATTERN.sub(
             lambda match: STRING_TO_TEX[match.group()], string)
@@ -149,10 +154,10 @@ def escape_to_tex(string, escape_type='normal'):
             escape_type))
 
     escaped_string = COMBINING_DIACRITIC_TO_TEX_PATTERN.sub(
-        lambda match: '{{{} {}}}'.format(
+        lambda match: u('{{{} {}}}').format(
             COMBINING_DIACRITIC_TO_TEX[match.group(2)],
             match.group(1)),
-        unicodedata.normalize('NFD', u(escaped_string)))
+        unicodedata.normalize('NFD', escaped_string))
     return escaped_string
 
 
@@ -279,15 +284,15 @@ def authors_to_bibtex_authors(authors):
     Returns
     -------
     entry : str
-        Bibtex entry.
+        Bibtex entry in Unicode string.
 
     """
     bibtex_authors = []
     for n, (author, humanness) in enumerate(authors):
         if humanness:
             bibtex_authors.append(escape_to_tex(author))
         else:
-            bibtex_authors.append('{' + escape_to_tex(author) + '}')
+            bibtex_authors.append(u('{') + escape_to_tex(author) + '}')
     return bibtex_authors
 
 
@@ -299,18 +304,18 @@ def entity_to_bibtex_entry(entity, key=None):
     entity : dict
         Wikidata entity as hierarchical structure.
     key : str
-        Bibtex key
+        Bibtex key.
 
     Returns
     -------
     entry : str
-        Bibtex entry.
+        Bibtex entry in Unicode string.
 
     """
     if key is None:
-        entry = "@Article{%s,\n" % entity['id']
+        entry = u("@Article{%s,\n") % entity['id']
     else:
-        entry = "@Article{%s,\n" % escape_to_tex(key)
+        entry = u("@Article{%s,\n") % escape_to_tex(key)
     authors = authors_to_bibtex_authors(
         entity_to_authors(entity, return_humanness=True))
     entry += "  author =   {%s},\n" % u" and ".join(authors)
@@ -393,14 +398,16 @@ def main():
 
         entities = wb_get_entities(qs)
 
-        bib = ""
+        bib = u("")
         for q, key in zip(qs, keys):
             entity = entities[q]
             bib += entity_to_bibtex_entry(entity, key=key)
             bib += '\n'
 
-        with open(bib_filename, 'w') as f:
-            f.write(bib.encode('utf-8'))
+        # Write BibTeX-formatted string to file
+        output_file = os.open(bib_filename, os.O_RDWR | os.O_CREAT)
+        output_encoding = "utf-8"
+        write(output_file, bib.encode(output_encoding))
 
 
 if __name__ == '__main__':