Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
translators/ACLWeb.js
Find file
Copy path
Fetching contributors…
Cannot retrieve contributors at this time
{
	"translatorID": "f4a5876a-3e53-40e2-9032-d99a30d7a6fc",
	"label": "ACLWeb",
	"creator": "Nathan Schneider, Guy Aglionby",
	"target": "^https?://(www\\.)?aclweb\\.org/anthology/[^#]+",
	"minVersion": "3.0",
	"maxVersion": "",
	"priority": 100,
	"inRepository": true,
	"translatorType": 4,
	"browserSupport": "gcsibv",
	"lastUpdated": "2018-03-24 09:47:15"
}
/*
	***** BEGIN LICENSE BLOCK *****

	Copyright © 2018 Guy Aglionby

	This file is part of Zotero.

	Zotero is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	Zotero is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with Zotero. If not, see <http://www.gnu.org/licenses/>.

	***** END LICENSE BLOCK *****
*/
/**
 * Decide what kind of item(s) the current page offers.
 *
 * A PDF document or a URL ending in ".bib" describes a single paper. Its ID is
 * the last path segment of the URL; IDs starting with "J" or "Q" are reported
 * as journal articles, everything else as conference papers. Any other page is
 * reported as a listing of several items.
 *
 * @param {Document} doc - Page being inspected; only `contentType` is read.
 * @param {string} url - Address of the page.
 * @returns {string} 'journalArticle', 'conferencePaper' or 'multiple'.
 */
function detectWeb(doc, url) {
	const isSinglePaper = doc.contentType === 'application/pdf' || url.endsWith('.bib');
	if (!isSinglePaper) {
		return 'multiple';
	}
	const id = url.split('/').pop();
	const isJournal = id[0] === 'J' || id[0] === 'Q';
	return isJournal ? 'journalArticle' : 'conferencePaper';
}
/**
 * Translate the current page into Zotero items.
 *
 * Three kinds of page are handled:
 *  - a listing ('multiple'): the user picks papers; each one is imported from
 *    its BibTeX link when the listing has one, otherwise scraped from the
 *    listing itself;
 *  - a ".bib" page: the BibTeX text shown in the document is imported;
 *  - a PDF: the sibling ".bib" URL is fetched, and if that yields a
 *    "404 Not Found" page the paper is scraped from its proceedings page.
 *
 * @param {Document} doc - Page being translated.
 * @param {string} url - Address of the page.
 */
function doWeb(doc, url) {
	if (detectWeb(doc, url) === 'multiple') {
		Zotero.selectItems(extractFullProceedings(doc), function (selected) {
			if (!selected) {
				return true;
			}
			Object.keys(selected).forEach(function (id) {
				const bibtexLinks = ZU.xpath(doc, '//a[contains(@href, "' + id + '.bib")]');
				// Sometimes there won't be a BibTeX link, so we need to check
				// and scrape directly from the proceedings page if there isn't.
				if (bibtexLinks.length) {
					ZU.doGet(bibtexLinks[0].href, function (responseString, responseObj, responseURL) {
						scrapeBibtex(responseString, responseURL);
					});
				} else {
					scrapeProceedings(doc, id);
				}
			});
		});
	} else if (url.endsWith('.bib')) {
		// e.g. http://www.aclweb.org/anthology/P10-4014.bib
		// The BibTeX is normally shown inside a <pre>; fall back to the whole
		// document's text instead of throwing when no <pre> is present.
		const preElements = ZU.xpath(doc, '//pre');
		const container = preElements.length ? preElements[0] : doc.documentElement;
		scrapeBibtex(container.textContent, url);
	} else if (doc.contentType === 'application/pdf') {
		const bibtexURL = url.replace('.pdf', '') + '.bib';
		ZU.doGet(bibtexURL, function (responseString) {
			// Some items don't have .bib entries. In those cases we need to go
			// to the proceedings page and scrape the information from there,
			// given that we have the ID of the paper from the URL.
			const is404 = responseString.includes('<title>404 Not Found</title>');
			if (is404) {
				// e.g. http://www.aclweb.org/anthology/Q14-1019
				const id = url.split('/').pop().replace('.pdf', '');
				ZU.processDocuments(constructProceedingsURL(id), function (proceedingsDoc) {
					scrapeProceedings(proceedingsDoc, id);
				});
			} else {
				// e.g. http://www.aclweb.org/anthology/P10-4014
				scrapeBibtex(responseString, bibtexURL);
			}
		});
	}
}
function extractFullProceedings(doc) { | |
let unwantedTitles = ['Front Matter', 'Author Index', 'Keyword Index'].map(function(title) { | |
return 'not(contains(., "' + title + '"))'; | |
}).join(' and '); | |
let baseXpath = '//div[@id="content"]/p[i[' + unwantedTitles + ']]/'; | |
let ids = ZU.xpath(doc, baseXpath + 'a[@href = concat(text(), ".pdf")]'); | |
ids = ids.map(function(id) { return id.textContent; }); | |
let authors = ZU.xpath(doc, baseXpath + 'b'); | |
authors = authors.map(function(author) { return author.textContent; }); | |
let titles = ZU.xpath(doc, baseXpath + 'i'); | |
titles = titles.map(function(title) { return title.textContent; }); | |
let items = {}; | |
for (let i = 0; i < ids.length; i++) { | |
let articleAuthors = authors[i].split('; '); | |
let authorSurname = articleAuthors[0].split(' ').pop(); | |
let etAl = articleAuthors.length > 1 ? ' et al.' : ''; | |
let author = authorSurname + etAl; | |
items[ids[i]] = ids[i] + ' (' + author + '): ' + titles[i]; | |
} | |
return items; | |
} | |
/**
 * Import a paper from BibTeX text and attach its PDF.
 *
 * The text is handed to an import translator; each item it produces gets a
 * "Full Text PDF" attachment whose URL is `bibtexURL` with the first ".bib"
 * replaced by ".pdf", has its `itemID` removed, and is then completed.
 *
 * @param {string} responseString - BibTeX source to import.
 * @param {string} bibtexURL - URL the BibTeX came from.
 */
function scrapeBibtex(responseString, bibtexURL) {
	const pdfURL = bibtexURL.replace('.bib', '.pdf');
	const translator = Zotero.loadTranslator('import');
	// ID of the import translator to run (presumably Zotero's BibTeX importer).
	translator.setTranslator('9cb70025-a888-4a29-a210-93ec52da40d4');
	translator.setString(responseString);
	translator.setHandler('itemDone', function (obj, item) {
		item.attachments.push({
			url: pdfURL,
			title: 'Full Text PDF',
			mimeType: 'application/pdf'
		});
		delete item.itemID;
		item.complete();
	});
	translator.translate();
}
/**
 * Create an item for one paper from its entry on a proceedings listing page.
 *
 * The paper's paragraph is the `<p>` containing an anchor whose text equals
 * `id`. IDs starting with "J" or "Q" become journal articles, all others
 * conference papers. The year is taken from the two digits after the ID's
 * leading letter (below 50 means 20xx, otherwise 19xx).
 *
 * Missing pieces (PDF link, heading, title, authors) are skipped rather than
 * causing an exception.
 *
 * @param {Document} doc - Proceedings listing page containing the paper.
 * @param {string} id - Paper ID, e.g. "Q14-1019".
 */
function scrapeProceedings(doc, id) {
	const isJournal = id[0] === 'J' || id[0] === 'Q';
	const newItem = new Zotero.Item(isJournal ? 'journalArticle' : 'conferencePaper');
	const paragraphXpath = '//p[a[text()="' + id + '"]]/';

	const pdfURL = ZU.xpathText(doc, paragraphXpath + 'a[contains(@href, "pdf")]/@href');
	if (pdfURL) {
		newItem.attachments.push({
			title: "Full Text PDF",
			mimeType: "application/pdf",
			url: pdfURL
		});
	}

	// The same proceedings list page can have multiple titles on it, so get the
	// one relevant to this paper ID: the closest <h1> before its paragraph.
	// e.g. http://www.aclweb.org/anthology/Y/Y16/
	const headings = ZU.xpath(doc, paragraphXpath + 'preceding-sibling::h1');
	const heading = headings.length ? headings[headings.length - 1].textContent : '';

	if (isJournal) {
		newItem.publicationTitle = id[0] === 'J'
			? 'Computational Linguistics'
			: 'Transactions of the Association for Computational Linguistics';
		// \d+ so that multi-digit volume and issue numbers are kept whole.
		const matchVolume = heading.match(/Volume (\d+)/);
		if (matchVolume) {
			newItem.volume = matchVolume[1];
		}
		const matchIssue = heading.match(/(?:Issue|Number) (\d+)/);
		if (matchIssue) {
			newItem.issue = matchIssue[1];
		}
	} else {
		if (heading) {
			newItem.proceedingsTitle = heading;
		}
		newItem.publisher = 'Association for Computational Linguistics';
	}

	newItem.url = constructProceedingsURL(id) + '/' + id;

	const titleElement = ZU.xpath(doc, paragraphXpath + 'i')[0];
	if (titleElement) {
		newItem.title = titleElement.textContent;
	}

	// Authors are listed in one <b> element, separated by "; ".
	const authorElement = ZU.xpath(doc, paragraphXpath + 'b')[0];
	const authorText = authorElement ? authorElement.textContent : '';
	newItem.creators = authorText
		? authorText.split('; ').map(function (author) {
			return ZU.cleanAuthor(author, 'author');
		})
		: [];

	const yearDigits = id.split('-')[0].substring(1);
	newItem.date = (Number(yearDigits) < 50 ? '20' : '19') + yearDigits;

	newItem.complete();
}
/**
 * Build the URL of the listing page that contains a given paper.
 *
 * The part of the ID before the first "-" names the volume; the URL is the
 * anthology root followed by that part's first character and then the part
 * itself, e.g. "P93-1001" -> "http://aclweb.org/anthology/P/P93".
 *
 * @param {string} id - Paper ID.
 * @returns {string} Listing page URL, without a trailing slash.
 */
function constructProceedingsURL(id) {
	const STUB_URL = 'http://aclweb.org/anthology/';
	const volumeKey = id.split('-')[0];
	return STUB_URL + volumeKey[0] + '/' + volumeKey;
}
// Fixtures for this translator: two listing pages, each expected to yield
// multiple items. NOTE(review): the text between the two markers is presumably
// read as JSON by Zotero's test tooling, so keep comments outside the markers.
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "http://aclweb.org/anthology/P/P93/",
		"items": "multiple"
	},
	{
		"type": "web",
		"url": "http://aclweb.org/anthology/Y/Y16/",
		"items": "multiple"
	}
]
/** END TEST CASES **/