Permalink
Please sign in to comment.
@@ -1,184 +1,208 @@ | ||
{ | ||
"translatorID": "f4a5876a-3e53-40e2-9032-d99a30d7a6fc", | ||
"label": "ACL", | ||
"creator": "Nathan Schneider", | ||
"target": "^https?://(www[.])?aclweb\\.org/anthology/[^#]+", | ||
"minVersion": "1.0.7", | ||
"label": "ACLWeb", | ||
"creator": "Nathan Schneider, Guy Aglionby", | ||
"target": "^https?://(www\\.)?aclweb\\.org/anthology/[^#]+", | ||
"minVersion": "3.0", | ||
"maxVersion": "", | ||
"priority": 100, | ||
"inRepository": true, | ||
"translatorType": 4, | ||
"browserSupport": "gcsbv", | ||
"lastUpdated": "2013-09-16 00:20:13" | ||
} | ||
// based on ACM translator | ||
function detectWeb(doc, url) { | ||
var namespace = doc.documentElement.namespaceURI; | ||
var nsResolver = namespace ? function(prefix) { | ||
if (prefix == 'x') return prefix; else return null; | ||
} : namespace; | ||
var bibXpath = "//a[./text() = 'bib']" | ||
if(doc.evaluate(bibXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { | ||
return "multiple" | ||
} | ||
//commenting out single stuff | ||
// if (url.indexOf("/anthology-new/J/")>-1) | ||
// return "journalArticle"; | ||
// else | ||
// return "conferencePaper"; | ||
"browserSupport": "gcsibv", | ||
"lastUpdated": "2018-03-24 09:47:15" | ||
} | ||
/* | ||
***** BEGIN LICENSE BLOCK ***** | ||
function scrapeIndex(doc, items) { | ||
var results; | ||
var doImport; | ||
Copyright © 2018 Guy Aglionby | ||
This file is part of Zotero. | ||
if (items != null) { // Import user-selected item(s) | ||
results = items; | ||
doImport = true; | ||
} | ||
else { | ||
bibFileNodes = doc.evaluate('//a[substring(@href, string-length(@href)-3, 4) = ".bib"]', doc, null, XPathResult.ANY_TYPE, null); | ||
results = []; | ||
doImport = false; | ||
Zotero is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU Affero General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
var bibFileNode = bibFileNodes.iterateNext(); | ||
Zotero is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU Affero General Public License for more details. | ||
while (bibFileNode) { | ||
var bibFileName = bibFileNode.getAttribute("href"); | ||
var bibFile = bibFileName.substring(0, bibFileName.length-4); | ||
You should have received a copy of the GNU Affero General Public License | ||
along with Zotero. If not, see <http://www.gnu.org/licenses/>. | ||
var bNodes = doc.evaluate('//a[@href="' + bibFileName + '"]/following-sibling::b[position()=1]', doc, null, XPathResult.ANY_TYPE, null); // These nodes contain author information | ||
***** END LICENSE BLOCK ***** | ||
*/ | ||
// Extract authors' last names | ||
var authorLasts = new Array(); | ||
/**
 * Classifies the page being viewed.
 *
 * A PDF document, or a URL whose path ends in ".bib", is treated as a single
 * paper. Its ID is the last path segment: IDs starting with "J" or "Q" are
 * reported as journal articles, all others as conference papers. Every other
 * page is reported as a listing.
 *
 * @param {Object} doc - Document being viewed; only `contentType` is read.
 * @param {string} url - Address of the document.
 * @returns {string} 'journalArticle', 'conferencePaper' or 'multiple'.
 */
function detectWeb(doc, url) {
	// Drop any query string or fragment so it can neither hide the ".bib"
	// suffix nor leak into the ID.
	const path = url.split(/[?#]/)[0];
	const isSinglePaper = doc.contentType === 'application/pdf' || path.endsWith('.bib');
	if (!isSinglePaper) {
		return 'multiple';
	}
	const id = path.split('/').pop();
	const isJournal = id[0] === 'J' || id[0] === 'Q';
	return isJournal ? 'journalArticle' : 'conferencePaper';
}
var bNode = bNodes.iterateNext(); | ||
var authorsS = bNode.innerHTML; // may include markup: potentially <author>, <first>, <von>, and/or <last> tags | ||
authorsS = authorsS.replace(/[<][/]?author[>]/g, ""); | ||
var authors = authorsS.split("; "); | ||
for (var a in authors) { | ||
var authorS = authors[a]; | ||
var m = authorS.match(/[<]von[>]([^<]+)[<][/]von[>]/); | ||
var last = ""; | ||
if (m!=null) // we expect there is a <last> tag if there is a <von> tag | ||
last = m[1] + " "; | ||
m = authorS.match(/[<]last[>]([^<]+)[<][/]last[>]/); | ||
if (m!=null) | ||
last += m[1]; | ||
else { | ||
var name = authorS.replace(/[<][^>]+[>]/g, ""); // remove all markup | ||
if (name=="Entire volume") | ||
last = name; | ||
else { | ||
var parts = name.split(" "); | ||
last = parts[parts.length-1]; | ||
if (parts.length>1) { | ||
var penultInitial = parts[parts.length-2].substr(0,1); | ||
if (penultInitial.toUpperCase()!=penultInitial) // e.g. van Dyke | ||
last = name[parts.length-2] + " " + last; | ||
} | ||
} | ||
/**
 * Entry point for saving items from the current page.
 *
 * - Listing page: lets the user pick papers, then imports each one from its
 *   BibTeX file when the page links to one, or scrapes the listing otherwise.
 * - ".bib" page: imports the BibTeX text shown in the page's <pre> element.
 * - PDF: requests the sibling ".bib" file and falls back to scraping the
 *   proceedings page when that request returns a 404 page.
 *
 * @param {Document} doc - Document being viewed.
 * @param {string} url - Address of the document.
 */
function doWeb(doc, url) {
	if (detectWeb(doc, url) === 'multiple') {
		Zotero.selectItems(extractFullProceedings(doc), function (selected) {
			if (!selected) {
				return true;
			}
			Object.keys(selected).forEach(function (id) {
				let bibtexElement = ZU.xpath(doc, '//a[contains(@href, "' + id + '.bib")]');
				// Sometimes there won't be a BibTeX link, so we need to check
				// and scrape directly from the proceedings page if there isn't.
				if (bibtexElement.length) {
					let bibtexURL = bibtexElement[0].href;
					ZU.doGet(bibtexURL, function (responseString, responseObj, fetchedURL) {
						scrapeBibtex(responseString, fetchedURL);
					});
				} else {
					scrapeProceedings(doc, id);
				}
			});
		});
	} else if (url.endsWith('.bib')) {
		// e.g. http://www.aclweb.org/anthology/P10-4014.bib
		let bibtex = ZU.xpath(doc, '//pre')[0].textContent;
		scrapeBibtex(bibtex, url);
	} else if (doc.contentType === 'application/pdf') {
		let bibtexURL = url.replace('.pdf', '') + '.bib';
		ZU.doGet(bibtexURL, function (responseString, responseObj) {
			// Some items don't have .bib entries. In those cases we need to go
			// to the proceedings page and scrape the information from there,
			// given that we have the ID of the paper from the URL.
			let is404 = responseString.includes('<title>404 Not Found</title>');
			if (is404) {
				// e.g. http://www.aclweb.org/anthology/Q14-1019
				let id = url.split('/').pop().replace('.pdf', '');
				ZU.processDocuments(constructProceedingsURL(id), function (proceedingsDoc) {
					scrapeProceedings(proceedingsDoc, id);
				});
			} else {
				// e.g. http://www.aclweb.org/anthology/P10-4014
				scrapeBibtex(responseString, bibtexURL);
			}
		});
	}
}
if (!doImport) | ||
return results; | ||
for (var i in results) { | ||
var ii = results[i].indexOf(" "); | ||
var fileRelPath = results[i].substring(0, ii); | ||
var authorsShort = results[i].substring(ii+1); | ||
var fileName = fileRelPath.substring(fileRelPath.lastIndexOf("/")+1); | ||
var bibFile = fileRelPath + ".bib"; | ||
var pageurl = doc.location.href; | ||
var lastSlash = pageurl.lastIndexOf("/"); | ||
var dirInUrl = pageurl.substring(0, lastSlash+1); | ||
var fileInUrl = pageurl.substring(lastSlash+1, pageurl.indexOf("#", lastSlash)); | ||
var bib = dirInUrl + fileRelPath + ".bib"; | ||
var pdf = dirInUrl + fileRelPath + ".pdf"; | ||
var j = fileRelPath.lastIndexOf("-"); | ||
var yearShort = fileRelPath.substring(j-2, j); | ||
var year = ""; | ||
if (new Number(yearShort) < 50) | ||
year = "20" + yearShort; | ||
else | ||
year = "19" + yearShort; | ||
var attachments = new Array(); | ||
attachments.push({title:authorsShort + " " + year + ".pdf", mimeType:"application/pdf", url:pdf}); | ||
var type = ""; | ||
if (pageurl.indexOf("/anthology-new/J/")>-1) | ||
type = "journalArticle"; | ||
else | ||
type = "conferencePaper"; | ||
if (doImport) | ||
callTranslator(bib, type, attachments); | ||
/**
 * Builds the choices offered to the user on a listing page.
 *
 * Paragraphs under div#content whose <i> text does not contain
 * "Front Matter", "Author Index" or "Keyword Index" are considered. Each
 * paragraph is read on its own, so one paragraph lacking a <b> element or a
 * PDF link cannot shift the IDs, authors and titles of later entries.
 *
 * @param {Document} doc - Listing page.
 * @returns {Object} Map from paper ID to a label of the form
 *     "ID (Surname[ et al.]): Title"; the parenthetical is omitted when the
 *     paragraph has no <b> element.
 */
function extractFullProceedings(doc) {
	const unwantedTitles = ['Front Matter', 'Author Index', 'Keyword Index'].map(function (title) {
		return 'not(contains(., "' + title + '"))';
	}).join(' and ');
	const paragraphs = ZU.xpath(doc, '//div[@id="content"]/p[i[' + unwantedTitles + ']]');

	const items = {};
	paragraphs.forEach(function (paragraph) {
		// The ID link is the one whose text plus ".pdf" equals its href.
		const idLink = ZU.xpath(paragraph, 'a[@href = concat(text(), ".pdf")]')[0];
		const titleElement = ZU.xpath(paragraph, 'i')[0];
		if (!idLink || !titleElement) {
			return;
		}
		const id = idLink.textContent;
		let label = id;

		const authorElement = ZU.xpath(paragraph, 'b')[0];
		if (authorElement) {
			const articleAuthors = authorElement.textContent.split('; ');
			const authorSurname = articleAuthors[0].split(' ').pop();
			const etAl = articleAuthors.length > 1 ? ' et al.' : '';
			label += ' (' + authorSurname + etAl + ')';
		}
		items[id] = label + ': ' + titleElement.textContent;
	});
	return items;
}
function callTranslator(bibFileURL, type, attachments) { | ||
Zotero.Utilities.HTTP.doGet(bibFileURL, function(text) { | ||
// load BibTex translator | ||
var translator = Zotero.loadTranslator("import"); | ||
translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4"); | ||
translator.setString(text); | ||
translator.setHandler("itemDone", function(obj, item) { | ||
item.itemType = type; | ||
item.attachments = attachments; | ||
item.repository = "Association for Computational Linguistics" | ||
item.complete(); | ||
/**
 * Imports a BibTeX record through the BibTeX import translator and attaches
 * the paper's PDF to each resulting item.
 *
 * @param {string} responseString - BibTeX source text.
 * @param {string} bibtexURL - Address the BibTeX came from; the PDF address is
 *     derived from it by replacing ".bib" with ".pdf".
 */
function scrapeBibtex(responseString, bibtexURL) {
	const pdfURL = bibtexURL.replace('.bib', '.pdf');
	const translator = Zotero.loadTranslator("import");
	// BibTeX import translator.
	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	translator.setString(responseString);
	translator.setHandler("itemDone", function (obj, item) {
		item.attachments.push({
			url: pdfURL,
			title: 'Full Text PDF',
			mimeType: 'application/pdf'
		});
		delete item.itemID;
		item.complete();
	});
	// Start translation exactly once, after the handler is registered; the
	// handler itself must not call translate() again.
	translator.translate();
}
function doWeb(doc, url) { | ||
var searchResult = true; | ||
if(searchResult) { | ||
var possibleItems = scrapeIndex(doc, null); // items to present to user | ||
Zotero.selectItems(possibleItems, function (items) { | ||
if (!items) { | ||
return true; | ||
} | ||
scrapeIndex(doc, items) | ||
}); | ||
/**
 * Builds an item for one paper straight from a proceedings listing page, for
 * papers that have no BibTeX file.
 *
 * IDs starting with "J" or "Q" produce journal articles; all others produce
 * conference papers. The two-digit year embedded in the ID is expanded to
 * 20xx when below 50 and to 19xx otherwise.
 *
 * @param {Document} doc - Proceedings listing page containing the paper.
 * @param {string} id - Paper ID, e.g. "Q14-1019".
 */
function scrapeProceedings(doc, id) {
	const isJournal = id[0] === 'J' || id[0] === 'Q';
	const itemType = isJournal ? 'journalArticle' : 'conferencePaper';
	const newItem = new Zotero.Item(itemType);
	// Paragraph holding the link whose text is exactly this paper's ID.
	const paragraphXpath = '//p[a[text()="' + id + '"]]/';

	const pdfURL = ZU.xpathText(doc, paragraphXpath + 'a[contains(@href, "pdf")]/@href');
	newItem.attachments.push({
		title: "Full Text PDF",
		mimeType: "application/pdf",
		url: pdfURL
	});

	// The same proceedings list page can have multiple titles on it, so get the
	// one relevant to this paper ID.
	// e.g. http://www.aclweb.org/anthology/Y/Y16/
	const headings = ZU.xpath(doc, paragraphXpath + 'preceding-sibling::h1');
	const heading = headings.length ? headings[headings.length - 1].textContent : '';

	if (!isJournal) {
		if (heading) {
			newItem.proceedingsTitle = heading;
		}
		newItem.publisher = 'Association for Computational Linguistics';
	} else {
		newItem.publicationTitle = id[0] === 'J'
			? 'Computational Linguistics'
			: 'Transactions of the Association for Computational Linguistics';
		// \d+ so that multi-digit volumes and issues are captured whole.
		const matchVolume = heading.match(/Volume (\d+)/);
		if (matchVolume) newItem.volume = matchVolume[1];
		const matchIssue = heading.match(/(Issue|Number) (\d+)/);
		if (matchIssue) newItem.issue = matchIssue[2];
	}

	newItem.url = constructProceedingsURL(id) + '/' + id;

	const titleElement = ZU.xpath(doc, paragraphXpath + 'i')[0];
	if (titleElement) {
		newItem.title = titleElement.textContent;
	}

	const authorElement = ZU.xpath(doc, paragraphXpath + 'b')[0];
	if (authorElement) {
		newItem.creators = authorElement.textContent.split('; ').map(function (author) {
			return ZU.cleanAuthor(author, 'author');
		});
	}

	// "Q14-1019" -> "14" -> "2014"
	const shortYear = id.split('-')[0].substring(1);
	newItem.date = (Number(shortYear) < 50 ? '20' : '19') + shortYear;

	newItem.complete();
}
/**
 * Returns the address of the listing page that contains a given paper.
 *
 * The part of the ID before the first "-" is the final path segment, nested
 * under its own first character: "P10-4014" ->
 * "http://aclweb.org/anthology/P/P10".
 *
 * @param {string} id - Paper ID, e.g. "P10-4014".
 * @returns {string} Listing page URL, without a trailing slash.
 */
function constructProceedingsURL(id) {
	const STUB_URL = 'http://aclweb.org/anthology/';
	const volumeKey = id.split('-')[0];
	return STUB_URL + volumeKey[0] + '/' + volumeKey;
}
/** BEGIN TEST CASES **/ | ||
var testCases = [ | ||
{ | ||
"type": "web", | ||
"url": "http://aclweb.org/anthology/P/P93/", | ||
"items": "multiple" | ||
}, | ||
{ | ||
"type": "web", | ||
"url": "http://aclweb.org/anthology/Y/Y16/", | ||
"items": "multiple" | ||
} | ||
] | ||
/** END TEST CASES **/ | ||
/** END TEST CASES **/ |
0 comments on commit
9fba327