Skip to content
Permalink
Browse files

Amend to previous commit

  • Loading branch information...
fnielsen committed Feb 12, 2019
1 parent 3ac8878 commit 9b974626b01f6c2ed40bf93d22042e0bbe4581fb
Showing with 328 additions and 0 deletions.
  1. +74 −0 scholia/qs.py
  2. +243 −0 scholia/scrape/ojs.py
  3. +11 −0 tests/scrape/test_ojs.py
@@ -0,0 +1,74 @@
"""Quickstatements."""


from six import u

from .utils import escape_string


def paper_to_quickstatements(paper):
    """Convert paper to Quickstatements.

    Convert a paper represented as a dict in to Magnus Manske's
    Quickstatement format for entry into Wikidata.

    Parameters
    ----------
    paper : dict
        Scraped paper represented as a dict.

    Returns
    -------
    qs : str
        Quickstatements as a string

    References
    ----------
    https://tools.wmflabs.org/wikidata-todo/quick_statements.php

    Notes
    -----
    title, authors (list), date, year, language_q, url, full_text_url,
    published_in_q are recognized.

    `date` takes precedence over `year`.
    """
    qs = u("CREATE\n")

    title = escape_string(paper['title'])
    qs += u('LAST\tLen\t"{}"\n').format(title)

    # Instance of scientific article
    qs += 'LAST\tP31\tQ13442814\n'

    # Title
    qs += u('LAST\tP1476\ten:"{}"\n').format(title)

    # Authors as author name strings (P2093) with a series ordinal
    # qualifier (P1545) recording their order.
    # NOTE(review): author names are not passed through escape_string;
    # a name containing a double quote would break the statement.
    for n, author in enumerate(paper.get('authors', []), start=1):
        qs += u('LAST\tP2093\t"{}"\tP1545\t"{}"\n').format(author, n)

    # Publication date: `date` (day precision, /11) takes precedence
    # over `year` (year precision, /9).
    if 'date' in paper:
        qs += 'LAST\tP577\t+{}T00:00:00Z/11\n'.format(paper['date'])
    elif 'year' in paper:
        qs += 'LAST\tP577\t+{}-01-01T00:00:00Z/9\n'.format(paper['year'])

    # Language
    if 'language_q' in paper:
        qs += 'LAST\tP407\t{}\n'.format(paper['language_q'])

    # Homepage.  Guarded: the docstring lists `url` as merely
    # "recognized", so a missing key must not raise KeyError.
    if 'url' in paper:
        qs += 'LAST\tP856\t"{}"\n'.format(paper['url'])

    # Fulltext URL, likewise optional.
    if 'full_text_url' in paper:
        qs += 'LAST\tP953\t"{}"\n'.format(paper['full_text_url'])

    # Published in
    if 'published_in_q' in paper and paper['published_in_q']:
        qs += 'LAST\tP1433\t{}\n'.format(paper['published_in_q'])

    return qs
@@ -0,0 +1,243 @@
r"""Scraping Open Journal Systems.
Usage:
scholia.scrape.ojs scrape-paper-from-url <url>
scholia.scrape.ojs paper-url-to-q <url>
scholia.scrape.ojs paper-url-to-quickstatements [options] <url>
Options:
-o --output=file Output filename, default output to stdout
--oe=encoding Output encoding [default: utf-8]
Examples
--------
$ python -m scholia.scrape.ojs paper-url-to-quickstatements \
https://journals.uio.no/index.php/osla/article/view/5855
"""


import json

import os

import signal

from six import b, print_, u

from lxml import etree

import requests

from ..qs import paper_to_quickstatements
from ..query import iso639_to_q, issn_to_qs
from ..utils import escape_string


# User-Agent string sent with requests to the Wikidata Query Service,
# identifying this scraper.
USER_AGENT = 'Scholia'

HEADERS = {'User-Agent': USER_AGENT}

# SPARQL template used by `paper_to_q` to find a paper item by English
# label, title (P1476) or URL; each OPTIONAL binds the matching item
# (if any) and COALESCE picks the first bound one.
# NOTE(review): ?full_text_url appears in COALESCE but is never bound by
# any pattern above, so it can never contribute — looks like a leftover;
# confirm whether a fourth OPTIONAL was intended.
# NOTE(review): the ?url pattern matches wdt:P953 (full-work URL)
# against the landing-page URL — verify this is the intended property.
PAPER_TO_Q_QUERY = u("""
SELECT ?paper WHERE {{
OPTIONAL {{ ?label rdfs:label "{label}"@en . }}
OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }}
OPTIONAL {{ ?url wdt:P953 <{url}> . }}
BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper)
}}
""")

# SPARQL Endpoint for Wikidata Query Service
WDQS_URL = 'https://query.wikidata.org/sparql'


def paper_to_q(paper):
    """Find Q identifier for paper.

    Parameters
    ----------
    paper : dict
        Paper represented as dictionary.

    Returns
    -------
    q : str or None
        Q identifier in Wikidata. None is returned if the paper is not found.

    Notes
    -----
    This function might be used to test if a scraped OJS paper is already
    present in Wikidata.

    The match on title is using an exact query, meaning that any variation in
    lowercase/uppercase will not find the Wikidata item.

    Examples
    --------
    >>> paper = {
    ...     'title': ('Linguistic Deviations in the Written Academic Register '
    ...               'of Danish University Students'),
    ...     'url': 'https://journals.uio.no/index.php/osla/article/view/5855'}
    >>> paper_to_q(paper)
    'Q61708017'
    """
    title = escape_string(paper['title'])
    query = PAPER_TO_Q_QUERY.format(
        label=title, title=title,
        url=paper['url'])

    response = requests.get(WDQS_URL,
                            params={'query': query, 'format': 'json'},
                            headers=HEADERS)
    data = response.json()['results']['bindings']

    # An empty result list, or a single row with an empty binding (the
    # COALESCE found nothing), both mean the paper was not found.
    if len(data) == 0 or not data[0]:
        # Not found
        return None

    # The binding is a full entity URI such as
    # http://www.wikidata.org/entity/Q61708017.  Take the last path
    # component rather than a fixed-offset slice ([31:]) so a change of
    # scheme or host length cannot silently corrupt the Q identifier.
    return data[0]['paper']['value'].rsplit('/', 1)[-1]


def paper_url_to_q(url):
    """Return Q identifier based on URL.

    Scrape OJS HTML page with paper and use the extracted information on a
    query on Wikidata Query Service to find the Wikidata Q identifier.

    Parameters
    ----------
    url : str
        URL to NIPS HTML page.

    Returns
    -------
    q : str or None
        Q identifier for Wikidata or None if not found.

    Examples
    --------
    >>> url = 'https://journals.uio.no/index.php/osla/article/view/5855'
    >>> paper_url_to_q(url)
    'Q61708017'
    """
    # Scrape the page, then look the metadata up on Wikidata.
    return paper_to_q(scrape_paper_from_url(url))


def paper_url_to_quickstatements(url):
    """Scrape OJS paper and return quickstatements.

    Given a URL to a HTML web page representing a paper formatted by the Open
    Journal Systems, return quickstatements for data entry in Wikidata with the
    Magnus Manske Quicksatement tool.

    Parameters
    ----------
    url : str
        URL to OJS paper as a string.

    Returns
    -------
    qs : str
        Quickstatements for paper as a string.
    """
    # Scrape first, then serialize the scraped metadata.
    scraped_paper = scrape_paper_from_url(url)
    return paper_to_quickstatements(scraped_paper)


def scrape_paper_from_url(url):
    """Scrape OJS paper from URL.

    Fetch the OJS HTML page and collect its "citation_*" meta tags into
    a paper dictionary.

    Arguments
    ---------
    url : str
        URL to paper as a string

    Returns
    -------
    paper : dict
        Paper represented as a dictionary.
    """
    def _field_to_content(field):
        # Content of the first matching meta tag, or None when the page
        # lacks the tag (previously raised IndexError on elements[0]).
        elements = tree.xpath("//meta[@name='{}']".format(field))
        if not elements:
            return None
        return elements[0].attrib['content']

    entry = {'url': url}

    # Send the module's User-Agent header, consistent with the WDQS
    # request in `paper_to_q`.
    response = requests.get(url, headers=HEADERS)
    tree = etree.HTML(response.content)

    entry['authors'] = [
        author_element.attrib['content']
        for author_element in tree.xpath("//meta[@name='citation_author']")
    ]

    title = _field_to_content('citation_title')
    if title is not None:
        entry['title'] = title

    date = _field_to_content('citation_date')
    if date is not None:
        # Normalize the OJS slash-separated date to ISO-style dashes.
        entry['date'] = date.replace('/', '-')

    # Only set keys for fields the page actually provides, so downstream
    # `'field' in paper` checks behave as intended.
    for key, field in [('volume', 'citation_volume'),
                       ('issue', 'citation_issue'),
                       ('full_text_url', 'citation_pdf_url'),
                       ('published_in_title', 'citation_journal_title')]:
        content = _field_to_content(field)
        if content is not None:
            entry[key] = content

    language_as_iso639 = _field_to_content('citation_language')
    if language_as_iso639 is not None:
        language_q = iso639_to_q(language_as_iso639)
        if language_q:
            entry['language_q'] = language_q

    issn = _field_to_content('citation_issn')
    if issn is not None:
        if len(issn) == 8:
            # Oslo Studies in Language OJS does not have a dash between
            # the numbers
            issn = issn[:4] + '-' + issn[4:]
        qs = issn_to_qs(issn)
        # Only record the journal item when the ISSN maps to exactly one
        # Wikidata item; an ambiguous match is left out.
        if len(qs) == 1:
            entry['published_in_q'] = qs[0]

    return entry


def main():
    """Handle command-line interface."""
    from docopt import docopt

    arguments = docopt(__doc__)

    if arguments['--output']:
        output_filename = arguments['--output']
        # O_TRUNC is required: the original O_RDWR | O_CREAT overwrote
        # an existing file in place, leaving stale trailing bytes when
        # the new output is shorter than the old file.
        output_file = os.open(output_filename,
                              os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
    else:
        # stdout
        output_file = 1
    output_encoding = arguments['--oe']

    # Ignore broken pipe errors (e.g., piping the output into `head`).
    # SIGPIPE does not exist on Windows, hence the guard.
    if hasattr(signal, 'SIGPIPE'):
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    try:
        if arguments['paper-url-to-q']:
            url = arguments['<url>']
            entry = paper_url_to_q(url)
            print_(entry)

        elif arguments['paper-url-to-quickstatements']:
            url = arguments['<url>']
            qs = paper_url_to_quickstatements(url)
            os.write(output_file, qs.encode(output_encoding) + b('\n'))

        elif arguments['scrape-paper-from-url']:
            url = arguments['<url>']
            entry = scrape_paper_from_url(url)
            print_(json.dumps(entry))

        else:
            assert False

    finally:
        # Close only descriptors opened by this function; never close
        # stdout (fd 1).
        if output_file != 1:
            os.close(output_file)


if __name__ == "__main__":
    main()
@@ -0,0 +1,11 @@
"""Test OJS."""


from scholia.scrape.ojs import paper_url_to_q


def test_paper_url_to_q():
    """Test paper_url_to_q against a live OJS page and Wikidata."""
    # Expected item: https://www.wikidata.org/wiki/Q61708017
    ojs_url = "https://journals.uio.no/index.php/osla/article/view/5855"
    result = paper_url_to_q(ojs_url)
    assert result == "Q61708017"

0 comments on commit 9b97462

Please sign in to comment.
You can’t perform that action at this time.