Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Amend to previous commit

  • Loading branch information...
fnielsen committed Feb 12, 2019
1 parent 3ac8878 commit 9b974626b01f6c2ed40bf93d22042e0bbe4581fb
Showing with 328 additions and 0 deletions.
  1. +74 −0 scholia/qs.py
  2. +243 −0 scholia/scrape/ojs.py
  3. +11 −0 tests/scrape/test_ojs.py
@@ -0,0 +1,74 @@
"""Quickstatements."""


from six import u

from .utils import escape_string


def paper_to_quickstatements(paper):
    """Convert paper to Quickstatements.

    Convert a paper represented as a dict in to Magnus Manske's
    Quickstatement format for entry into Wikidata.

    Parameters
    ----------
    paper : dict
        Scraped paper represented as a dict.

    Returns
    -------
    qs : str
        Quickstatements as a string

    References
    ----------
    https://tools.wmflabs.org/wikidata-todo/quick_statements.php

    Notes
    -----
    title, authors (list), date, year, language_q, url, full_text_url,
    published_in_q are recognized.

    `date` takes precedence over `year`.

    Only `title` and `authors` are required; any other missing key is
    skipped rather than raising a KeyError.
    """
    qs = u("CREATE\n")

    title = escape_string(paper['title'])
    qs += u('LAST\tLen\t"{}"\n').format(title)

    # Instance of scientific article
    qs += 'LAST\tP31\tQ13442814\n'

    # Title
    qs += u('LAST\tP1476\ten:"{}"\n').format(title)

    # Authors as author name strings (P2093) with series ordinal (P1545)
    for n, author in enumerate(paper['authors'], start=1):
        qs += u('LAST\tP2093\t"{}"\tP1545\t"{}"\n').format(author, n)

    # Publication date; `date` takes precedence over `year`
    if 'date' in paper:
        # Day precision
        qs += 'LAST\tP577\t+{}T00:00:00Z/11\n'.format(paper['date'])
    elif 'year' in paper:
        # Year precision
        qs += 'LAST\tP577\t+{}-01-01T00:00:00Z/9\n'.format(paper['year'])

    # Language
    if 'language_q' in paper:
        qs += 'LAST\tP407\t{}\n'.format(paper['language_q'])

    # Homepage. Guarded so papers scraped without a URL still convert.
    if 'url' in paper:
        qs += 'LAST\tP856\t"{}"\n'.format(paper['url'])

    # Fulltext URL. Guarded like `url` above.
    if 'full_text_url' in paper:
        qs += 'LAST\tP953\t"{}"\n'.format(paper['full_text_url'])

    # Published in
    if 'published_in_q' in paper and paper['published_in_q']:
        qs += 'LAST\tP1433\t{}\n'.format(paper['published_in_q'])

    return qs
@@ -0,0 +1,243 @@
r"""Scraping Open Journal Systems.
Usage:
scholia.scrape.ojs scrape-paper-from-url <url>
scholia.scrape.ojs paper-url-to-q <url>
scholia.scrape.ojs paper-url-to-quickstatements [options] <url>
Options:
-o --output=file Output filename, default output to stdout
--oe=encoding Output encoding [default: utf-8]
Examples
--------
$ python -m scholia.scrape.ojs paper-url-to-quickstatements \
https://journals.uio.no/index.php/osla/article/view/5855
"""


import json

import os

import signal

from six import b, print_, u

from lxml import etree

import requests

from ..qs import paper_to_quickstatements
from ..query import iso639_to_q, issn_to_qs
from ..utils import escape_string


# User-Agent string identifying this tool in outgoing HTTP requests.
USER_AGENT = 'Scholia'

# Default headers for HTTP requests made by this module.
HEADERS = {'User-Agent': USER_AGENT}

# SPARQL template to look a paper up by English label, title (P1476) or
# full-text URL (P953). The {label}, {title} and {url} placeholders are
# filled in by str.format; doubled braces produce literal SPARQL braces.
# NOTE(review): ?full_text_url in the COALESCE is never bound by any
# OPTIONAL pattern above, so it is always unbound — presumably a leftover
# from an earlier version of the query; confirm intent.
PAPER_TO_Q_QUERY = u("""
SELECT ?paper WHERE {{
OPTIONAL {{ ?label rdfs:label "{label}"@en . }}
OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }}
OPTIONAL {{ ?url wdt:P953 <{url}> . }}
BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper)
}}
""")

# SPARQL Endpoint for Wikidata Query Service
WDQS_URL = 'https://query.wikidata.org/sparql'


def paper_to_q(paper):
    """Find Q identifier for paper.

    Parameters
    ----------
    paper : dict
        Paper represented as dictionary.

    Returns
    -------
    q : str or None
        Q identifier in Wikidata. None is returned if the paper is not
        found.

    Notes
    -----
    This function might be used to test if a scraped OJS paper is already
    present in Wikidata.

    The match on title is using an exact query, meaning that any variation
    in lowercase/uppercase will not find the Wikidata item.

    Examples
    --------
    >>> paper = {
    ...     'title': ('Linguistic Deviations in the Written Academic Register '
    ...               'of Danish University Students'),
    ...     'url': 'https://journals.uio.no/index.php/osla/article/view/5855'}
    >>> paper_to_q(paper)
    'Q61708017'
    """
    title = escape_string(paper['title'])
    query = PAPER_TO_Q_QUERY.format(
        label=title, title=title,
        url=paper['url'])

    response = requests.get(WDQS_URL,
                            params={'query': query, 'format': 'json'},
                            headers=HEADERS)
    data = response.json()['results']['bindings']

    if len(data) == 0 or not data[0]:
        # Not found
        return None

    # The binding is a full entity URI such as
    # http://www.wikidata.org/entity/Q61708017. Take the last path
    # component instead of relying on a hard-coded prefix length.
    return str(data[0]['paper']['value'].rsplit('/', 1)[-1])


def paper_url_to_q(url):
    """Return Q identifier based on URL.

    Scrape the OJS HTML page at `url` and use the extracted metadata in a
    query against the Wikidata Query Service to resolve the Wikidata Q
    identifier.

    Parameters
    ----------
    url : str
        URL to OJS HTML page.

    Returns
    -------
    q : str or None
        Q identifier for Wikidata or None if not found.

    Examples
    --------
    >>> url = 'https://journals.uio.no/index.php/osla/article/view/5855'
    >>> paper_url_to_q(url)
    'Q61708017'
    """
    return paper_to_q(scrape_paper_from_url(url))


def paper_url_to_quickstatements(url):
    """Scrape OJS paper and return quickstatements.

    Given a URL to a HTML web page representing a paper formatted by the
    Open Journal Systems, return quickstatements for data entry in
    Wikidata with the Magnus Manske Quickstatement tool.

    Parameters
    ----------
    url : str
        URL to OJS paper as a string.

    Returns
    -------
    qs : str
        Quickstatements for paper as a string.
    """
    return paper_to_quickstatements(scrape_paper_from_url(url))


def scrape_paper_from_url(url):
    """Scrape OJS paper from URL.

    The metadata is read from the ``citation_*`` ``<meta>`` tags of the
    fetched HTML page.

    Parameters
    ----------
    url : str
        URL to paper as a string

    Returns
    -------
    paper : dict
        Paper represented as a dictionary.
    """
    def _field_to_content(field):
        # Content attribute of the first <meta> tag with the given name.
        # Raises IndexError if the page lacks the tag.
        elements = tree.xpath("//meta[@name='{}']".format(field))
        content = elements[0].attrib['content']
        return content

    entry = {'url': url}

    # Send the module-wide User-Agent header for consistency with the
    # Wikidata Query Service request in `paper_to_q`.
    response = requests.get(url, headers=HEADERS)
    tree = etree.HTML(response.content)

    entry['authors'] = [
        author_element.attrib['content']
        for author_element in tree.xpath("//meta[@name='citation_author']")
    ]

    entry['title'] = _field_to_content('citation_title')
    # citation_date is slash-separated; convert to ISO-style dashes.
    entry['date'] = _field_to_content('citation_date').replace('/', '-')
    entry['volume'] = _field_to_content('citation_volume')
    entry['issue'] = _field_to_content('citation_issue')
    entry['full_text_url'] = _field_to_content('citation_pdf_url')

    language_as_iso639 = _field_to_content('citation_language')
    language_q = iso639_to_q(language_as_iso639)
    if language_q:
        entry['language_q'] = language_q

    entry['published_in_title'] = _field_to_content('citation_journal_title')

    issn = _field_to_content('citation_issn')
    if len(issn) == 8:
        # Oslo Studies in Language OJS does not have a dash between the numbers
        issn = issn[:4] + '-' + issn[4:]
    qs = issn_to_qs(issn)
    # Only record the journal when the ISSN maps to exactly one item;
    # an ambiguous mapping is left out rather than guessed at.
    if len(qs) == 1:
        entry['published_in_q'] = qs[0]

    return entry


def main():
    """Handle command-line interface."""
    from docopt import docopt

    arguments = docopt(__doc__)

    if arguments['--output']:
        output_filename = arguments['--output']
        # O_TRUNC: truncate an existing file so stale bytes beyond the
        # new output are not left behind. Write-only is sufficient.
        output_file = os.open(output_filename,
                              os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
    else:
        # stdout
        output_file = 1
    output_encoding = arguments['--oe']

    # Ignore broken pipe errors. SIGPIPE does not exist on Windows, so
    # only install the handler where it is available.
    if hasattr(signal, 'SIGPIPE'):
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    try:
        if arguments['paper-url-to-q']:
            url = arguments['<url>']
            entry = paper_url_to_q(url)
            print_(entry)

        elif arguments['paper-url-to-quickstatements']:
            url = arguments['<url>']
            qs = paper_url_to_quickstatements(url)
            os.write(output_file, qs.encode(output_encoding) + b('\n'))

        elif arguments['scrape-paper-from-url']:
            url = arguments['<url>']
            entry = scrape_paper_from_url(url)
            print_(json.dumps(entry))

        else:
            assert False
    finally:
        # Close the descriptor we opened; never close stdout (fd 1).
        if output_file != 1:
            os.close(output_file)


if __name__ == "__main__":
main()
@@ -0,0 +1,11 @@
"""Test OJS."""


from scholia.scrape.ojs import paper_url_to_q


def test_paper_url_to_q():
    """Test paper_url_to_q.

    Integration test: requires network access to both the OJS journal
    site and the Wikidata Query Service, and assumes the item still
    exists in Wikidata.
    """
    # https://www.wikidata.org/wiki/Q61708017
    url = "https://journals.uio.no/index.php/osla/article/view/5855"
    assert paper_url_to_q(url) == "Q61708017"

0 comments on commit 9b97462

Please sign in to comment.
You can’t perform that action at this time.