Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Fix problem with Unicode in Bibtex generation.

Strings containing Unicode characters could not be converted, and
Bibtex generation failed with an exception.
A new test function tests a few cases, but several cases are not
yet tested.
  • Loading branch information...
fnielsen committed Nov 5, 2019
1 parent 0a19113 commit 44dc7b8d974f61c0b3b8799ac5ee643932ce88f2
Showing with 1,004 additions and 24 deletions.
  1. +31 −24 scholia/tex.py
  2. +973 −0 tests/test_tex.py
@@ -29,12 +29,14 @@

from __future__ import print_function

import os
from os import write
from os.path import splitext

import re
import unicodedata

from six import u
from six import ensure_text, u

from .api import (
entity_to_authors, entity_to_classes, entity_to_doi,
@@ -63,14 +65,14 @@
}

STRING_TO_TEX_URL = {
'{': r'\{',
'}': r'\}',
'#': r'\#',
'&': r'\&',
'^': r'\^{}',
'%': r'\%',
'$': r'\$',
'_': r'\_',
'{': u(r'\{'),
'}': u(r'\}'),
'#': u(r'\#'),
'&': u(r'\&'),
'^': u(r'\^{}'),
'%': u(r'\%'),
'$': u(r'\$'),
'_': u(r'\_'),
}

COMBINING_DIACRITIC_TO_TEX = {
@@ -92,15 +94,15 @@
}

STRING_TO_TEX_PATTERN = re.compile(
u'|'.join(re.escape(key) for key in STRING_TO_TEX),
u('|').join(re.escape(key) for key in STRING_TO_TEX),
flags=re.UNICODE)

STRING_TO_TEX_URL_PATTERN = re.compile(
u'|'.join(re.escape(key) for key in STRING_TO_TEX_URL),
u('|').join(re.escape(key) for key in STRING_TO_TEX_URL),
flags=re.UNICODE)

COMBINING_DIACRITIC_TO_TEX_PATTERN = re.compile(
u'(.)({})'.format(
u('(.)({})').format(
u'|'.join(re.escape(key)for key in COMBINING_DIACRITIC_TO_TEX)),
flags=re.UNICODE)

@@ -137,7 +139,10 @@ def escape_to_tex(string, escape_type='normal'):
"""
if string is None:
return ''
return u('')

string = ensure_text(string)

if escape_type == 'normal':
escaped_string = STRING_TO_TEX_PATTERN.sub(
lambda match: STRING_TO_TEX[match.group()], string)
@@ -149,10 +154,10 @@ def escape_to_tex(string, escape_type='normal'):
escape_type))

escaped_string = COMBINING_DIACRITIC_TO_TEX_PATTERN.sub(
lambda match: '{{{} {}}}'.format(
lambda match: u('{{{} {}}}').format(
COMBINING_DIACRITIC_TO_TEX[match.group(2)],
match.group(1)),
unicodedata.normalize('NFD', u(escaped_string)))
unicodedata.normalize('NFD', escaped_string))
return escaped_string


@@ -279,15 +284,15 @@ def authors_to_bibtex_authors(authors):
Returns
-------
entry : str
Bibtex entry.
Bibtex entry in Unicode string.
"""
bibtex_authors = []
for n, (author, humanness) in enumerate(authors):
if humanness:
bibtex_authors.append(escape_to_tex(author))
else:
bibtex_authors.append('{' + escape_to_tex(author) + '}')
bibtex_authors.append(u('{') + escape_to_tex(author) + '}')
return bibtex_authors


@@ -299,18 +304,18 @@ def entity_to_bibtex_entry(entity, key=None):
entity : dict
Wikidata entity as hierarchical structure.
key : str
Bibtex key
Bibtex key.
Returns
-------
entry : str
Bibtex entry.
Bibtex entry in Unicode string.
"""
if key is None:
entry = "@Article{%s,\n" % entity['id']
entry = u("@Article{%s,\n") % entity['id']
else:
entry = "@Article{%s,\n" % escape_to_tex(key)
entry = u("@Article{%s,\n") % escape_to_tex(key)
authors = authors_to_bibtex_authors(
entity_to_authors(entity, return_humanness=True))
entry += " author = {%s},\n" % u" and ".join(authors)
@@ -393,14 +398,16 @@ def main():

entities = wb_get_entities(qs)

bib = ""
bib = u("")
for q, key in zip(qs, keys):
entity = entities[q]
bib += entity_to_bibtex_entry(entity, key=key)
bib += '\n'

with open(bib_filename, 'w') as f:
f.write(bib.encode('utf-8'))
# Write BibTeX-formatted string to file
output_file = os.open(bib_filename, os.O_RDWR | os.O_CREAT)
output_encoding = "utf-8"
write(output_file, bib.encode(output_encoding))


if __name__ == '__main__':

0 comments on commit 44dc7b8

Please sign in to comment.
You can’t perform that action at this time.