Permalink
Browse files

Rewrite ACL translator (#1577)

  • Loading branch information...
GuyAglionby authored and zuphilip committed Mar 24, 2018
1 parent cd5557a commit 9fba327a037aeb85559b94d5f422155dad634edd
Showing with 171 additions and 147 deletions.
  1. +171 −147 ACLWeb.js
View
318 ACLWeb.js
@@ -1,184 +1,208 @@
{
"translatorID": "f4a5876a-3e53-40e2-9032-d99a30d7a6fc",
"label": "ACL",
"creator": "Nathan Schneider",
"target": "^https?://(www[.])?aclweb\\.org/anthology/[^#]+",
"minVersion": "1.0.7",
"label": "ACLWeb",
"creator": "Nathan Schneider, Guy Aglionby",
"target": "^https?://(www\\.)?aclweb\\.org/anthology/[^#]+",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsbv",
"lastUpdated": "2013-09-16 00:20:13"
}
// based on ACM translator
function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return prefix; else return null;
} : namespace;
var bibXpath = "//a[./text() = 'bib']"
if(doc.evaluate(bibXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
return "multiple"
}
//commenting out single stuff
// if (url.indexOf("/anthology-new/J/")>-1)
// return "journalArticle";
// else
// return "conferencePaper";
"browserSupport": "gcsibv",
"lastUpdated": "2018-03-24 09:47:15"
}
/*
***** BEGIN LICENSE BLOCK *****
function scrapeIndex(doc, items) {
var results;
var doImport;
Copyright © 2018 Guy Aglionby
This file is part of Zotero.
if (items != null) { // Import user-selected item(s)
results = items;
doImport = true;
}
else {
bibFileNodes = doc.evaluate('//a[substring(@href, string-length(@href)-3, 4) = ".bib"]', doc, null, XPathResult.ANY_TYPE, null);
results = [];
doImport = false;
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
var bibFileNode = bibFileNodes.iterateNext();
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
while (bibFileNode) {
var bibFileName = bibFileNode.getAttribute("href");
var bibFile = bibFileName.substring(0, bibFileName.length-4);
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
var bNodes = doc.evaluate('//a[@href="' + bibFileName + '"]/following-sibling::b[position()=1]', doc, null, XPathResult.ANY_TYPE, null); // These nodes contain author information
***** END LICENSE BLOCK *****
*/
// Extract authors' last names
var authorLasts = new Array();
/**
 * Tell Zotero what kind of page it is looking at.
 *
 * A PDF document, or any URL ending in ".bib", is treated as a single paper.
 * Its type is read from the first character of the last URL path segment:
 * "J" or "Q" means a journal article, anything else a conference paper.
 * Every other page is reported as a listing of multiple items.
 *
 * @param {Document} doc - Page being inspected; only `contentType` is read.
 * @param {string} url - Address of the page.
 * @returns {string} 'journalArticle', 'conferencePaper' or 'multiple'.
 */
function detectWeb(doc, url) {
	const isSingleItem = doc.contentType === 'application/pdf' || url.endsWith('.bib');
	if (!isSingleItem) {
		return 'multiple';
	}

	// The paper ID is the last path segment, e.g. "P10-4014" or "J93-1001.bib".
	const segments = url.split('/');
	const venueLetter = segments[segments.length - 1][0];
	if (venueLetter == 'J' || venueLetter == 'Q') {
		return 'journalArticle';
	}
	return 'conferencePaper';
}
var bNode = bNodes.iterateNext();
var authorsS = bNode.innerHTML; // may include markup: potentially <author>, <first>, <von>, and/or <last> tags
authorsS = authorsS.replace(/[<][/]?author[>]/g, "");
var authors = authorsS.split("; ");
for (var a in authors) {
var authorS = authors[a];
var m = authorS.match(/[<]von[>]([^<]+)[<][/]von[>]/);
var last = "";
if (m!=null) // we expect there is a <last> tag if there is a <von> tag
last = m[1] + " ";
m = authorS.match(/[<]last[>]([^<]+)[<][/]last[>]/);
if (m!=null)
last += m[1];
else {
var name = authorS.replace(/[<][^>]+[>]/g, ""); // remove all markup
if (name=="Entire volume")
last = name;
else {
var parts = name.split(" ");
last = parts[parts.length-1];
if (parts.length>1) {
var penultInitial = parts[parts.length-2].substr(0,1);
if (penultInitial.toUpperCase()!=penultInitial) // e.g. van Dyke
last = name[parts.length-2] + " " + last;
}
}
/**
 * Translator entry point: save the item(s) found on the current page.
 *
 * Three page shapes are handled:
 *  - a listing page (detectWeb reports 'multiple'): the user picks papers,
 *    and each one is imported from its .bib link when the page has one, or
 *    scraped from the listing itself otherwise;
 *  - a .bib page: the BibTeX is read from the page's <pre> element;
 *  - a PDF: the sibling .bib file is fetched, falling back to the
 *    proceedings listing when that request returns a 404 page.
 *
 * Stale statements left over from the removed scrapeIndex implementation
 * (author-surname bookkeeping and result-string building) used to sit inside
 * this function; they referenced variables that no longer exist and
 * unbalanced the braces, so they have been dropped.
 *
 * @param {Document} doc - The page being translated.
 * @param {string} url - Address of that page.
 */
function doWeb(doc, url) {
	if (detectWeb(doc, url) === 'multiple') {
		Zotero.selectItems(extractFullProceedings(doc), function (selected) {
			if (!selected) {
				return true;
			}
			Object.keys(selected).forEach(function (id) {
				let bibtexElement = ZU.xpath(doc, '//a[contains(@href, "' + id + '.bib")]');
				// Sometimes there won't be a BibTeX link, so we need to check
				// and scrape directly from the proceedings page if there isn't.
				if (bibtexElement.length) {
					let bibtexURL = bibtexElement[0].href;
					// Pass the URL we requested rather than a callback argument
					// that shadows the outer `url`; this matches the PDF branch.
					ZU.doGet(bibtexURL, function (responseString) {
						scrapeBibtex(responseString, bibtexURL);
					});
				} else {
					scrapeProceedings(doc, id);
				}
			});
		});
	} else if (url.endsWith('.bib')) {
		// e.g. http://www.aclweb.org/anthology/P10-4014.bib
		let bibtex = ZU.xpath(doc, '//pre')[0].textContent;
		scrapeBibtex(bibtex, url);
	} else if (doc.contentType === 'application/pdf') {
		let bibtexURL = url.replace('.pdf', '') + '.bib';
		ZU.doGet(bibtexURL, function (responseString, responseObj) {
			// Some items don't have .bib entries. In those cases we need to go
			// to the proceedings page and scrape the information from there,
			// given that we have the ID of the paper from the URL.
			let is404 = responseString.includes('<title>404 Not Found</title>');
			if (is404) {
				// e.g. http://www.aclweb.org/anthology/Q14-1019
				let id = url.split('/').pop().replace('.pdf', '');
				ZU.processDocuments(constructProceedingsURL(id), function (proceedingsDoc) {
					scrapeProceedings(proceedingsDoc, id);
				});
			} else {
				// e.g. http://www.aclweb.org/anthology/P10-4014
				scrapeBibtex(responseString, bibtexURL);
			}
		});
	}
}
if (!doImport)
return results;
for (var i in results) {
var ii = results[i].indexOf(" ");
var fileRelPath = results[i].substring(0, ii);
var authorsShort = results[i].substring(ii+1);
var fileName = fileRelPath.substring(fileRelPath.lastIndexOf("/")+1);
var bibFile = fileRelPath + ".bib";
var pageurl = doc.location.href;
var lastSlash = pageurl.lastIndexOf("/");
var dirInUrl = pageurl.substring(0, lastSlash+1);
var fileInUrl = pageurl.substring(lastSlash+1, pageurl.indexOf("#", lastSlash));
var bib = dirInUrl + fileRelPath + ".bib";
var pdf = dirInUrl + fileRelPath + ".pdf";
var j = fileRelPath.lastIndexOf("-");
var yearShort = fileRelPath.substring(j-2, j);
var year = "";
if (new Number(yearShort) < 50)
year = "20" + yearShort;
else
year = "19" + yearShort;
var attachments = new Array();
attachments.push({title:authorsShort + " " + year + ".pdf", mimeType:"application/pdf", url:pdf});
var type = "";
if (pageurl.indexOf("/anthology-new/J/")>-1)
type = "journalArticle";
else
type = "conferencePaper";
if (doImport)
callTranslator(bib, type, attachments);
function extractFullProceedings(doc) {
let unwantedTitles = ['Front Matter', 'Author Index', 'Keyword Index'].map(function(title) {
return 'not(contains(., "' + title + '"))';
}).join(' and ');
let baseXpath = '//div[@id="content"]/p[i[' + unwantedTitles + ']]/';
let ids = ZU.xpath(doc, baseXpath + 'a[@href = concat(text(), ".pdf")]');
ids = ids.map(function(id) { return id.textContent; });
let authors = ZU.xpath(doc, baseXpath + 'b');
authors = authors.map(function(author) { return author.textContent; });
let titles = ZU.xpath(doc, baseXpath + 'i');
titles = titles.map(function(title) { return title.textContent; });
let items = {};
for (let i = 0; i < ids.length; i++) {
let articleAuthors = authors[i].split('; ');
let authorSurname = articleAuthors[0].split(' ').pop();
let etAl = articleAuthors.length > 1 ? ' et al.' : '';
let author = authorSurname + etAl;
items[ids[i]] = ids[i] + ' (' + author + '): ' + titles[i];
}
return items;
}
function callTranslator(bibFileURL, type, attachments) {
Zotero.Utilities.HTTP.doGet(bibFileURL, function(text) {
// load BibTex translator
var translator = Zotero.loadTranslator("import");
translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
translator.setString(text);
translator.setHandler("itemDone", function(obj, item) {
item.itemType = type;
item.attachments = attachments;
item.repository = "Association for Computational Linguistics"
item.complete();
/**
 * Import a BibTeX record through the import translator with ID
 * 9cb70025-a888-4a29-a210-93ec52da40d4 and attach the paper's PDF to every
 * item it produces.
 *
 * The PDF address is derived from the BibTeX address by swapping ".bib" for
 * ".pdf". Translation is started exactly once, after the handler has been
 * registered; the earlier extra translate() call inside the itemDone handler
 * would have restarted the translation for every item it produced.
 *
 * @param {string} responseString - BibTeX source text.
 * @param {string} bibtexURL - Address the BibTeX came from.
 */
function scrapeBibtex(responseString, bibtexURL) {
	let pdfURL = bibtexURL.replace('.bib', '.pdf');
	let translator = Zotero.loadTranslator("import");
	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	translator.setString(responseString);
	translator.setHandler("itemDone", function (obj, item) {
		item.attachments.push({
			url: pdfURL,
			title: 'Full Text PDF',
			mimeType: 'application/pdf'
		});
		// NOTE(review): itemID is presumably the BibTeX cite key set by the
		// import translator and not wanted on the saved item - confirm.
		delete item.itemID;
		item.complete();
	});
	translator.translate();
}
function doWeb(doc, url) {
var searchResult = true;
if(searchResult) {
var possibleItems = scrapeIndex(doc, null); // items to present to user
Zotero.selectItems(possibleItems, function (items) {
if (!items) {
return true;
}
scrapeIndex(doc, items)
});
/**
 * Build an item for one paper straight from a proceedings listing page, for
 * papers that have no BibTeX entry.
 *
 * The paper's paragraph is the <p> containing a link whose text is the paper
 * ID. Title (<i>), authors (<b>, separated by "; ") and the PDF link are read
 * from that paragraph; the venue comes from the last <h1> preceding it.
 * IDs starting with "J" or "Q" are journal articles, all others conference
 * papers. The year is the two digits after the venue letter, read as 20xx
 * below 50 and 19xx otherwise.
 *
 * Changes from the previous version: a leftover call to a scrape() function
 * that this file never defines was removed from the journal branch; volume
 * and issue numbers are captured in full rather than as a single digit; the
 * TACL journal name is spelled "Association for"; and missing heading, title,
 * author or PDF nodes are skipped instead of throwing.
 *
 * @param {Document} doc - Proceedings listing page containing the paper.
 * @param {string} id - Paper ID, e.g. "Q14-1019".
 */
function scrapeProceedings(doc, id) {
	const isJournal = id[0] == 'J' || id[0] == 'Q';
	const itemType = isJournal ? 'journalArticle' : 'conferencePaper';
	const newItem = new Zotero.Item(itemType);
	const paragraphXpath = '//p[a[text()="' + id + '"]]/';

	const pdfURL = ZU.xpathText(doc, paragraphXpath + 'a[contains(@href, "pdf")]/@href');
	if (pdfURL) {
		newItem.attachments.push({
			title: "Full Text PDF",
			mimeType: "application/pdf",
			url: pdfURL
		});
	}

	// The same proceedings list page can have multiple titles on it, so get the
	// one relevant to this paper ID (the last heading before its paragraph).
	// e.g. http://www.aclweb.org/anthology/Y/Y16/
	const headings = ZU.xpath(doc, paragraphXpath + 'preceding-sibling::h1');
	const heading = headings.length ? headings[headings.length - 1].textContent : '';

	if (!isJournal) {
		if (heading) {
			newItem.proceedingsTitle = heading;
		}
		newItem.publisher = 'Association for Computational Linguistics';
	} else {
		newItem.publicationTitle = id[0] == 'J'
			? 'Computational Linguistics'
			: 'Transactions of the Association for Computational Linguistics';
		// \d+ so that multi-digit volumes and issues are kept whole.
		const matchVolume = heading.match(/Volume (\d+)/);
		if (matchVolume) newItem.volume = matchVolume[1];
		const matchIssue = heading.match(/(?:Issue|Number) (\d+)/);
		if (matchIssue) newItem.issue = matchIssue[1];
	}

	newItem.url = constructProceedingsURL(id) + '/' + id;

	const titleElement = ZU.xpath(doc, paragraphXpath + 'i')[0];
	if (titleElement) {
		newItem.title = titleElement.textContent;
	}

	const authorElement = ZU.xpath(doc, paragraphXpath + 'b')[0];
	if (authorElement) {
		newItem.creators = authorElement.textContent.split('; ').map(function (author) {
			return ZU.cleanAuthor(author, 'author');
		});
	}

	// "P93-1001" -> "93" -> "1993"; "Q14-1019" -> "14" -> "2014".
	const shortYear = id.split('-')[0].substring(1);
	newItem.date = (Number(shortYear) < 50 ? '20' : '19') + shortYear;

	newItem.complete();
}
/**
 * Derive the address of the proceedings listing page for a paper ID.
 *
 * The part of the ID before the first "-" names the listing, and its first
 * character names the parent directory: "P10-4014" maps to
 * http://aclweb.org/anthology/P/P10.
 *
 * @param {string} id - Paper ID such as "P10-4014".
 * @returns {string} Listing page URL, without a trailing slash.
 */
function constructProceedingsURL(id) {
	const [volumeKey] = id.split('-');
	const venueLetter = volumeKey[0];
	return 'http://aclweb.org/anthology/' + venueLetter + '/' + volumeKey;
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://aclweb.org/anthology/P/P93/",
"items": "multiple"
},
{
"type": "web",
"url": "http://aclweb.org/anthology/Y/Y16/",
"items": "multiple"
}
]
/** END TEST CASES **/
/** END TEST CASES **/

0 comments on commit 9fba327

Please sign in to comment.