Permalink
Please
sign in to comment.
Showing
with
328 additions
and 0 deletions.
- +74 −0 scholia/qs.py
- +243 −0 scholia/scrape/ojs.py
- +11 −0 tests/scrape/test_ojs.py
@@ -0,0 +1,74 @@ | |||
"""Quickstatements.""" | |||
|
|||
|
|||
from six import u | |||
|
|||
from .utils import escape_string | |||
|
|||
|
|||
def paper_to_quickstatements(paper): | |||
"""Convert paper to Quickstatements. | |||
Convert a paper represented as a dict in to Magnus Manske's | |||
Quickstatement format for entry into Wikidata. | |||
Parameters | |||
---------- | |||
paper : dict | |||
Scraped paper represented as a dict. | |||
Returns | |||
------- | |||
qs : str | |||
Quickstatements as a string | |||
References | |||
---------- | |||
https://tools.wmflabs.org/wikidata-todo/quick_statements.php | |||
Notes | |||
----- | |||
title, authors (list), date, year, language_q, url, full_text_url, | |||
published_in_q are recognized. | |||
`date` takes precedence over `year`. | |||
""" | |||
qs = u("CREATE\n") | |||
|
|||
title = escape_string(paper['title']) | |||
qs += u('LAST\tLen\t"{}"\n').format(title) | |||
|
|||
# Instance of scientific article | |||
qs += 'LAST\tP31\tQ13442814\n' | |||
|
|||
# Title | |||
qs += u('LAST\tP1476\ten:"{}"\n').format(title) | |||
|
|||
# Authors | |||
for n, author in enumerate(paper['authors'], start=1): | |||
qs += u('LAST\tP2093\t"{}"\tP1545\t"{}"\n').format(author, n) | |||
|
|||
# Published in | |||
if 'date' in paper: | |||
# Day precision | |||
qs += 'LAST\tP577\t+{}T00:00:00Z/11\n'.format(paper['date']) | |||
elif 'year' in paper: | |||
# Year precision | |||
qs += 'LAST\tP577\t+{}-01-01T00:00:00Z/9\n'.format(paper['year']) | |||
|
|||
# Language | |||
if 'language_q' in paper: | |||
qs += 'LAST\tP407\t{}\n'.format(paper['language_q']) | |||
|
|||
# Homepage | |||
qs += 'LAST\tP856\t"{}"\n'.format(paper['url']) | |||
|
|||
# Fulltext URL | |||
qs += 'LAST\tP953\t"{}"\n'.format(paper['full_text_url']) | |||
|
|||
# Published in | |||
if 'published_in_q' in paper and paper['published_in_q']: | |||
qs += 'LAST\tP1433\t{}\n'.format(paper['published_in_q']) | |||
|
|||
return qs |
@@ -0,0 +1,243 @@ | |||
r"""Scraping Open Journal Systems. | |||
Usage: | |||
scholia.scrape.ojs scrape-paper-from-url <url> | |||
scholia.scrape.ojs paper-url-to-q <url> | |||
scholia.scrape.ojs paper-url-to-quickstatements [options] <url> | |||
Options: | |||
-o --output=file Output filename, default output to stdout | |||
--oe=encoding Output encoding [default: utf-8] | |||
Examples | |||
-------- | |||
$ python -m scholia.scrape.ojs paper-url-to-quickstatements \ | |||
https://journals.uio.no/index.php/osla/article/view/5855 | |||
""" | |||
|
|||
|
|||
import json | |||
|
|||
import os | |||
|
|||
import signal | |||
|
|||
from six import b, print_, u | |||
|
|||
from lxml import etree | |||
|
|||
import requests | |||
|
|||
from ..qs import paper_to_quickstatements | |||
from ..query import iso639_to_q, issn_to_qs | |||
from ..utils import escape_string | |||
|
|||
|
|||
USER_AGENT = 'Scholia' | |||
|
|||
HEADERS = {'User-Agent': USER_AGENT} | |||
|
|||
PAPER_TO_Q_QUERY = u(""" | |||
SELECT ?paper WHERE {{ | |||
OPTIONAL {{ ?label rdfs:label "{label}"@en . }} | |||
OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }} | |||
OPTIONAL {{ ?url wdt:P953 <{url}> . }} | |||
BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper) | |||
}} | |||
""") | |||
|
|||
# SPARQL Endpoint for Wikidata Query Service | |||
WDQS_URL = 'https://query.wikidata.org/sparql' | |||
|
|||
|
|||
def paper_to_q(paper): | |||
"""Find Q identifier for paper. | |||
Parameters | |||
---------- | |||
paper : dict | |||
Paper represented as dictionary. | |||
Returns | |||
------- | |||
q : str or None | |||
Q identifier in Wikidata. None is returned if the paper is not found. | |||
Notes | |||
----- | |||
This function might be used to test if a scraped OJS paper is already | |||
present in Wikidata. | |||
The match on title is using an exact query, meaning that any variation in | |||
lowercase/uppercase will not find the Wikidata item. | |||
Examples | |||
-------- | |||
>>> paper = { | |||
... 'title': ('Linguistic Deviations in the Written Academic Register ' | |||
... 'of Danish University Students'), | |||
... 'url': 'https://journals.uio.no/index.php/osla/article/view/5855'} | |||
>>> paper_to_q(paper) | |||
'Q61708017' | |||
""" | |||
title = escape_string(paper['title']) | |||
query = PAPER_TO_Q_QUERY.format( | |||
label=title, title=title, | |||
url=paper['url']) | |||
|
|||
response = requests.get(WDQS_URL, | |||
params={'query': query, 'format': 'json'}, | |||
headers=HEADERS) | |||
data = response.json()['results']['bindings'] | |||
|
|||
if len(data) == 0 or not data[0]: | |||
# Not found | |||
return None | |||
|
|||
return str(data[0]['paper']['value'][31:]) | |||
|
|||
|
|||
def paper_url_to_q(url): | |||
"""Return Q identifier based on URL. | |||
Scrape OJS HTML page with paper and use the extracted information on a | |||
query on Wikidata Query Service to find the Wikidata Q identifier. | |||
Parameters | |||
---------- | |||
url : str | |||
URL to NIPS HTML page. | |||
Returns | |||
------- | |||
q : str or None | |||
Q identifier for Wikidata or None if not found. | |||
Examples | |||
-------- | |||
>>> url = 'https://journals.uio.no/index.php/osla/article/view/5855' | |||
>>> paper_url_to_q(url) | |||
'Q61708017' | |||
""" | |||
paper = scrape_paper_from_url(url) | |||
q = paper_to_q(paper) | |||
return q | |||
|
|||
|
|||
def paper_url_to_quickstatements(url): | |||
"""Scrape OJS paper and return quickstatements. | |||
Given a URL to a HTML web page representing a paper formatted by the Open | |||
Journal Systems, return quickstatements for data entry in Wikidata with the | |||
Magnus Manske Quicksatement tool. | |||
Parameters | |||
---------- | |||
url : str | |||
URL to OJS paper as a string. | |||
Returns | |||
------- | |||
qs : str | |||
Quickstatements for paper as a string. | |||
""" | |||
paper = scrape_paper_from_url(url) | |||
qs = paper_to_quickstatements(paper) | |||
return qs | |||
|
|||
|
|||
def scrape_paper_from_url(url): | |||
"""Scrape OJS paper from URL. | |||
Arguments | |||
--------- | |||
url : str | |||
URL to paper as a string | |||
Returns | |||
------- | |||
paper : dict | |||
Paper represented as a dictionary. | |||
""" | |||
def _field_to_content(field): | |||
elements = tree.xpath("//meta[@name='{}']".format(field)) | |||
content = elements[0].attrib['content'] | |||
return content | |||
|
|||
entry = {'url': url} | |||
|
|||
response = requests.get(url) | |||
tree = etree.HTML(response.content) | |||
|
|||
entry['authors'] = [ | |||
author_element.attrib['content'] | |||
for author_element in tree.xpath("//meta[@name='citation_author']") | |||
] | |||
|
|||
entry['title'] = _field_to_content('citation_title') | |||
entry['date'] = _field_to_content('citation_date').replace('/', '-') | |||
entry['volume'] = _field_to_content('citation_volume') | |||
entry['issue'] = _field_to_content('citation_issue') | |||
entry['full_text_url'] = _field_to_content('citation_pdf_url') | |||
|
|||
language_as_iso639 = _field_to_content('citation_language') | |||
language_q = iso639_to_q(language_as_iso639) | |||
if language_q: | |||
entry['language_q'] = language_q | |||
|
|||
entry['published_in_title'] = _field_to_content('citation_journal_title') | |||
|
|||
issn = _field_to_content('citation_issn') | |||
if len(issn) == 8: | |||
# Oslo Studies in Language OJS does not have a dash between the numbers | |||
issn = issn[:4] + '-' + issn[4:] | |||
qs = issn_to_qs(issn) | |||
if len(qs) == 1: | |||
entry['published_in_q'] = qs[0] | |||
|
|||
return entry | |||
|
|||
|
|||
def main(): | |||
"""Handle command-line interface.""" | |||
from docopt import docopt | |||
|
|||
arguments = docopt(__doc__) | |||
|
|||
if arguments['--output']: | |||
output_filename = arguments['--output'] | |||
output_file = os.open(output_filename, os.O_RDWR | os.O_CREAT) | |||
else: | |||
# stdout | |||
output_file = 1 | |||
output_encoding = arguments['--oe'] | |||
|
|||
# Ignore broken pipe errors | |||
signal.signal(signal.SIGPIPE, signal.SIG_DFL) | |||
|
|||
if arguments['paper-url-to-q']: | |||
url = arguments['<url>'] | |||
entry = paper_url_to_q(url) | |||
print_(entry) | |||
|
|||
elif arguments['paper-url-to-quickstatements']: | |||
url = arguments['<url>'] | |||
qs = paper_url_to_quickstatements(url) | |||
os.write(output_file, qs.encode(output_encoding) + b('\n')) | |||
|
|||
elif arguments['scrape-paper-from-url']: | |||
url = arguments['<url>'] | |||
entry = scrape_paper_from_url(url) | |||
print_(json.dumps(entry)) | |||
|
|||
else: | |||
assert False | |||
|
|||
|
|||
if __name__ == "__main__": | |||
main() |
@@ -0,0 +1,11 @@ | |||
"""Test OJS.""" | |||
|
|||
|
|||
from scholia.scrape.ojs import paper_url_to_q | |||
|
|||
|
|||
def test_paper_url_to_q(): | |||
"""Test paper_url_to_q.""" | |||
# https://www.wikidata.org/wiki/Q61708017 | |||
url = "https://journals.uio.no/index.php/osla/article/view/5855" | |||
assert paper_url_to_q(url) == "Q61708017" |
0 comments on commit
9b97462