Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Amend to previous commit

  • Loading branch information...
fnielsen committed Feb 12, 2019
1 parent 3ac8878 commit 9b974626b01f6c2ed40bf93d22042e0bbe4581fb
Showing with 328 additions and 0 deletions.
  1. +74 −0 scholia/qs.py
  2. +243 −0 scholia/scrape/ojs.py
  3. +11 −0 tests/scrape/test_ojs.py
@@ -0,0 +1,74 @@
"""Quickstatements."""


from six import u

from .utils import escape_string


def paper_to_quickstatements(paper):
    """Convert paper to Quickstatements.

    Convert a paper represented as a dict in to Magnus Manske's
    Quickstatement format for entry into Wikidata.

    Parameters
    ----------
    paper : dict
        Scraped paper represented as a dict.

    Returns
    -------
    qs : str
        Quickstatements as a string

    References
    ----------
    https://tools.wmflabs.org/wikidata-todo/quick_statements.php

    Notes
    -----
    title, authors (list), date, year, language_q, url, full_text_url,
    published_in_q are recognized.

    `date` takes precedence over `year`.

    Only `title` and `authors` are required; any other missing key is
    skipped rather than raising a KeyError.
    """
    qs = u("CREATE\n")

    title = escape_string(paper['title'])
    qs += u('LAST\tLen\t"{}"\n').format(title)

    # Instance of scientific article
    qs += 'LAST\tP31\tQ13442814\n'

    # Title
    qs += u('LAST\tP1476\ten:"{}"\n').format(title)

    # Authors as author name strings (P2093) with series ordinal (P1545)
    for n, author in enumerate(paper['authors'], start=1):
        qs += u('LAST\tP2093\t"{}"\tP1545\t"{}"\n').format(author, n)

    # Publication date; `date` takes precedence over `year`
    if 'date' in paper:
        # Day precision
        qs += 'LAST\tP577\t+{}T00:00:00Z/11\n'.format(paper['date'])
    elif 'year' in paper:
        # Year precision
        qs += 'LAST\tP577\t+{}-01-01T00:00:00Z/9\n'.format(paper['year'])

    # Language
    if 'language_q' in paper:
        qs += 'LAST\tP407\t{}\n'.format(paper['language_q'])

    # Homepage. Guarded so papers scraped without a URL still convert.
    if 'url' in paper:
        qs += 'LAST\tP856\t"{}"\n'.format(paper['url'])

    # Fulltext URL. Guarded like `url` above.
    if 'full_text_url' in paper:
        qs += 'LAST\tP953\t"{}"\n'.format(paper['full_text_url'])

    # Published in
    if 'published_in_q' in paper and paper['published_in_q']:
        qs += 'LAST\tP1433\t{}\n'.format(paper['published_in_q'])

    return qs
@@ -0,0 +1,243 @@
r"""Scraping Open Journal Systems.
Usage:
scholia.scrape.ojs scrape-paper-from-url <url>
scholia.scrape.ojs paper-url-to-q <url>
scholia.scrape.ojs paper-url-to-quickstatements [options] <url>
Options:
-o --output=file Output filename, default output to stdout
--oe=encoding Output encoding [default: utf-8]
Examples
--------
$ python -m scholia.scrape.ojs paper-url-to-quickstatements \
https://journals.uio.no/index.php/osla/article/view/5855
"""


import json

import os

import signal

from six import b, print_, u

from lxml import etree

import requests

from ..qs import paper_to_quickstatements
from ..query import iso639_to_q, issn_to_qs
from ..utils import escape_string


# User-Agent string identifying this tool in outgoing HTTP requests.
USER_AGENT = 'Scholia'

# Default headers for HTTP requests made by this module.
HEADERS = {'User-Agent': USER_AGENT}

# SPARQL template to look a paper up by English label, title (P1476) or
# full-text URL (P953). The {label}, {title} and {url} placeholders are
# filled in by str.format; doubled braces produce literal SPARQL braces.
# NOTE(review): ?full_text_url in the COALESCE is never bound by any
# OPTIONAL pattern above, so it is always unbound — presumably a leftover
# from an earlier version of the query; confirm intent.
PAPER_TO_Q_QUERY = u("""
SELECT ?paper WHERE {{
OPTIONAL {{ ?label rdfs:label "{label}"@en . }}
OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }}
OPTIONAL {{ ?url wdt:P953 <{url}> . }}
BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper)
}}
""")

# SPARQL Endpoint for Wikidata Query Service
WDQS_URL = 'https://query.wikidata.org/sparql'


def paper_to_q(paper):
    """Find Q identifier for paper.

    Parameters
    ----------
    paper : dict
        Paper represented as dictionary.

    Returns
    -------
    q : str or None
        Q identifier in Wikidata. None is returned if the paper is not
        found.

    Notes
    -----
    This function might be used to test if a scraped OJS paper is already
    present in Wikidata.

    The match on title is using an exact query, meaning that any variation
    in lowercase/uppercase will not find the Wikidata item.

    Examples
    --------
    >>> paper = {
    ...     'title': ('Linguistic Deviations in the Written Academic Register '
    ...               'of Danish University Students'),
    ...     'url': 'https://journals.uio.no/index.php/osla/article/view/5855'}
    >>> paper_to_q(paper)
    'Q61708017'
    """
    title = escape_string(paper['title'])
    query = PAPER_TO_Q_QUERY.format(
        label=title, title=title,
        url=paper['url'])

    response = requests.get(WDQS_URL,
                            params={'query': query, 'format': 'json'},
                            headers=HEADERS)
    data = response.json()['results']['bindings']

    if len(data) == 0 or not data[0]:
        # Not found
        return None

    # The binding is a full entity URI such as
    # http://www.wikidata.org/entity/Q61708017. Take the last path
    # component instead of relying on a hard-coded prefix length.
    return str(data[0]['paper']['value'].rsplit('/', 1)[-1])


def paper_url_to_q(url):
    """Return Q identifier based on URL.

    Scrape the OJS HTML page at `url` and use the extracted metadata in a
    query against the Wikidata Query Service to resolve the Wikidata Q
    identifier.

    Parameters
    ----------
    url : str
        URL to OJS HTML page.

    Returns
    -------
    q : str or None
        Q identifier for Wikidata or None if not found.

    Examples
    --------
    >>> url = 'https://journals.uio.no/index.php/osla/article/view/5855'
    >>> paper_url_to_q(url)
    'Q61708017'
    """
    return paper_to_q(scrape_paper_from_url(url))


def paper_url_to_quickstatements(url):
    """Scrape OJS paper and return quickstatements.

    Given a URL to a HTML web page representing a paper formatted by the
    Open Journal Systems, return quickstatements for data entry in
    Wikidata with the Magnus Manske Quickstatement tool.

    Parameters
    ----------
    url : str
        URL to OJS paper as a string.

    Returns
    -------
    qs : str
        Quickstatements for paper as a string.
    """
    return paper_to_quickstatements(scrape_paper_from_url(url))


def scrape_paper_from_url(url):
    """Scrape OJS paper from URL.

    The metadata is read from the ``citation_*`` ``<meta>`` tags of the
    fetched HTML page.

    Parameters
    ----------
    url : str
        URL to paper as a string

    Returns
    -------
    paper : dict
        Paper represented as a dictionary.
    """
    def _field_to_content(field):
        # Content attribute of the first <meta> tag with the given name.
        # Raises IndexError if the page lacks the tag.
        elements = tree.xpath("//meta[@name='{}']".format(field))
        content = elements[0].attrib['content']
        return content

    entry = {'url': url}

    # Send the module-wide User-Agent header for consistency with the
    # Wikidata Query Service request in `paper_to_q`.
    response = requests.get(url, headers=HEADERS)
    tree = etree.HTML(response.content)

    entry['authors'] = [
        author_element.attrib['content']
        for author_element in tree.xpath("//meta[@name='citation_author']")
    ]

    entry['title'] = _field_to_content('citation_title')
    # citation_date is slash-separated; convert to ISO-style dashes.
    entry['date'] = _field_to_content('citation_date').replace('/', '-')
    entry['volume'] = _field_to_content('citation_volume')
    entry['issue'] = _field_to_content('citation_issue')
    entry['full_text_url'] = _field_to_content('citation_pdf_url')

    language_as_iso639 = _field_to_content('citation_language')
    language_q = iso639_to_q(language_as_iso639)
    if language_q:
        entry['language_q'] = language_q

    entry['published_in_title'] = _field_to_content('citation_journal_title')

    issn = _field_to_content('citation_issn')
    if len(issn) == 8:
        # Oslo Studies in Language OJS does not have a dash between the numbers
        issn = issn[:4] + '-' + issn[4:]
    qs = issn_to_qs(issn)
    # Only record the journal when the ISSN maps to exactly one item;
    # an ambiguous mapping is left out rather than guessed at.
    if len(qs) == 1:
        entry['published_in_q'] = qs[0]

    return entry


def main():
    """Handle command-line interface."""
    from docopt import docopt

    arguments = docopt(__doc__)

    if arguments['--output']:
        output_filename = arguments['--output']
        # O_TRUNC: truncate an existing file so stale bytes beyond the
        # new output are not left behind. Write-only is sufficient.
        output_file = os.open(output_filename,
                              os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
    else:
        # stdout
        output_file = 1
    output_encoding = arguments['--oe']

    # Ignore broken pipe errors. SIGPIPE does not exist on Windows, so
    # only install the handler where it is available.
    if hasattr(signal, 'SIGPIPE'):
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    try:
        if arguments['paper-url-to-q']:
            url = arguments['<url>']
            entry = paper_url_to_q(url)
            print_(entry)

        elif arguments['paper-url-to-quickstatements']:
            url = arguments['<url>']
            qs = paper_url_to_quickstatements(url)
            os.write(output_file, qs.encode(output_encoding) + b('\n'))

        elif arguments['scrape-paper-from-url']:
            url = arguments['<url>']
            entry = scrape_paper_from_url(url)
            print_(json.dumps(entry))

        else:
            assert False
    finally:
        # Close the descriptor we opened; never close stdout (fd 1).
        if output_file != 1:
            os.close(output_file)


if __name__ == "__main__":
main()
@@ -0,0 +1,11 @@
"""Test OJS."""


from scholia.scrape.ojs import paper_url_to_q


def test_paper_url_to_q():
    """Test paper_url_to_q.

    Integration test: requires network access to both the OJS journal
    site and the Wikidata Query Service, and assumes the item still
    exists in Wikidata.
    """
    # https://www.wikidata.org/wiki/Q61708017
    url = "https://journals.uio.no/index.php/osla/article/view/5855"
    assert paper_url_to_q(url) == "Q61708017"

0 comments on commit 9b97462

Please sign in to comment.
You can’t perform that action at this time.