Rewrite ACL translator (#1577)

adam3smith · Mar 24, 2018 · 9fba327a037aeb85559b94d5f422155dad634edd · 9fba327
1 parent cd5557a
commit 9fba327a037aeb85559b94d5f422155dad634edd
Unified Split

Showing with 171 additions and 147 deletions.

+171 −147 ACLWeb.js
diff --git a/ACLWeb.js b/ACLWeb.js
@@ -1,184 +1,208 @@
 {
 	"translatorID": "f4a5876a-3e53-40e2-9032-d99a30d7a6fc",
-	"label": "ACL",
-	"creator": "Nathan Schneider",
-	"target": "^https?://(www[.])?aclweb\\.org/anthology/[^#]+",
-	"minVersion": "1.0.7",
+	"label": "ACLWeb",
+	"creator": "Nathan Schneider, Guy Aglionby",
+	"target": "^https?://(www\\.)?aclweb\\.org/anthology/[^#]+",
+	"minVersion": "3.0",
 	"maxVersion": "",
 	"priority": 100,
 	"inRepository": true,
 	"translatorType": 4,
-	"browserSupport": "gcsbv",
-	"lastUpdated": "2013-09-16 00:20:13"
-}
-
-// based on ACM translator
-function detectWeb(doc, url) {
-  var namespace = doc.documentElement.namespaceURI;
-	var nsResolver = namespace ? function(prefix) {
-		if (prefix == 'x') return prefix; else return null;
-	} : namespace;
-
-	var bibXpath = "//a[./text() = 'bib']"
-	if(doc.evaluate(bibXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
-	  return "multiple"
-	}
-  //commenting out single stuff
-  // if (url.indexOf("/anthology-new/J/")>-1)
-  //  return "journalArticle";
-  // else
-  //  return "conferencePaper";
+	"browserSupport": "gcsibv",
+	"lastUpdated": "2018-03-24 09:47:15"
 }

+/*
+	***** BEGIN LICENSE BLOCK *****

-function scrapeIndex(doc, items) {
-	var results;
-	var doImport;
+	Copyright © 2018 Guy Aglionby
+	This file is part of Zotero.

-	if (items != null) {	// Import user-selected item(s)
-		results = items;
-		doImport = true;
-	}
-	else {
-		bibFileNodes = doc.evaluate('//a[substring(@href, string-length(@href)-3, 4) = ".bib"]', doc, null, XPathResult.ANY_TYPE, null);
-
-		results = [];
-		doImport = false;
+	Zotero is free software: you can redistribute it and/or modify
+	it under the terms of the GNU Affero General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.

-		var bibFileNode = bibFileNodes.iterateNext();
+	Zotero is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU Affero General Public License for more details.

-		while (bibFileNode) {
-			var bibFileName = bibFileNode.getAttribute("href");
-			var bibFile = bibFileName.substring(0, bibFileName.length-4);
+	You should have received a copy of the GNU Affero General Public License
+	along with Zotero.  If not, see <http://www.gnu.org/licenses/>.

-			var bNodes = doc.evaluate('//a[@href="' + bibFileName + '"]/following-sibling::b[position()=1]', doc, null, XPathResult.ANY_TYPE, null);	// These nodes contain author information
+	***** END LICENSE BLOCK *****
+*/

-			// Extract authors' last names
-			var authorLasts = new Array();
+function detectWeb(doc, url) { // Report the item type for this page, or multiple when it is treated as a listing.
+	if (doc.contentType === 'application/pdf' || url.endsWith('.bib')) { // single paper: a PDF document or a URL ending in .bib
+		let id = url.split('/').pop(); // last path segment of the URL
+		return id[0] == 'J' || id[0] == 'Q' ? 'journalArticle' : 'conferencePaper'; // ids starting with J or Q are journal articles
+	} else {
+		return 'multiple'; // every other matched page is handled as a list of papers
+	}
+}

-			var bNode = bNodes.iterateNext();
-			var authorsS = bNode.innerHTML;	// may include markup: potentially <author>, <first>, <von>, and/or <last> tags
-			authorsS = authorsS.replace(/[<][/]?author[>]/g, "");
-			var authors = authorsS.split("; ");
-			for (var a in authors) {
-				var authorS = authors[a];
-				var m = authorS.match(/[<]von[>]([^<]+)[<][/]von[>]/);
-				var last = "";
-				if (m!=null)	// we expect there is a <last> tag if there is a <von> tag
-					last = m[1] + " ";
-				m = authorS.match(/[<]last[>]([^<]+)[<][/]last[>]/);
-				if (m!=null)
-					last += m[1];
-				else {
-					var name = authorS.replace(/[<][^>]+[>]/g, "");	// remove all markup
-					if (name=="Entire volume")
-						last = name;
-					else {
-						var parts = name.split(" ");
-						last = parts[parts.length-1];
-						if (parts.length>1) {
-							var penultInitial = parts[parts.length-2].substr(0,1);
-							if (penultInitial.toUpperCase()!=penultInitial)	// e.g. van Dyke
-								last = name[parts.length-2] + " " + last;
-						}
-					}
+function doWeb(doc, url) { // Entry point: save items from a listing page, a .bib page, or a PDF page.
+	if (detectWeb(doc, url) === 'multiple') {
+		Zotero.selectItems(extractFullProceedings(doc), function (selected) { // offer the id-to-label map built from the page
+			if (!selected) {
+				return true;
+			}
+			
+			Object.keys(selected).forEach(function (id) { // keys of the selection are paper ids
+				let bibtexElement = ZU.xpath(doc, '//a[contains(@href, "' + id + '.bib")]');
+				
+				// Sometimes there won't be a BibTeX link, so we need to check
+				// and scrape directly from the proceedings page if there isn't.
+				if (bibtexElement.length) {
+					let bibtexURL = bibtexElement[0].href;
+					ZU.doGet(bibtexURL, function(responseString, responseObj, url) { // NOTE(review): this url shadows the outer one; assumed to be the fetched URL - confirm
+						scrapeBibtex(responseString, url);
+					});
+				} else {
+					scrapeProceedings(doc, id);
 				}
-				authorLasts.push(last);
+			});
+		});
+	} else if(url.endsWith('.bib')) {
+		// e.g. http://www.aclweb.org/anthology/P10-4014.bib
+		let bibtex = ZU.xpath(doc, '//pre')[0].textContent; // BibTeX text is read from the first pre element
+		scrapeBibtex(bibtex, url);
+	} else if (doc.contentType === 'application/pdf') {
+		let bibtexURL = url.replace('.pdf', '') + '.bib'; // drop the first .pdf occurrence, then append .bib
+		ZU.doGet(bibtexURL, function(responseString, responseObj) {
+			// Some items don't have .bib entries. In those cases we need to go
+			// to the proceedings page and scrape the information from there,
+			// given that we have the ID of the paper from the URL.
+			let is404 = responseString.includes('<title>404 Not Found</title>'); // a missing entry is detected from the response body text
+			if (is404) {
+				// e.g. http://www.aclweb.org/anthology/Q14-1019
+				let id = url.split('/').pop().replace('.pdf', ''); // paper id is the last path segment without .pdf
+				ZU.processDocuments(constructProceedingsURL(id), function(doc) {
+					scrapeProceedings(doc, id);
+				});
+			} else {
+				// e.g. http://www.aclweb.org/anthology/P10-4014
+				scrapeBibtex(responseString, bibtexURL);
 			}
-
-			// Prepare result for this item, which consists of the relative path to the .bib file (minus the extension)
-			// followed by a space and the authors' last names (abbreviated format)
-			var result = bibFile + " ";
-
-			if (authorLasts.length<3)
-				result += authorLasts.join(" & ");
-			else
-				result += authorLasts[0] + "+";
-
-			results.push(result);
-			bibFileNode = bibFileNodes.iterateNext();
-		}
+		});
 	}
+}

-
-	if (!doImport)
-		return results;
-
-	for (var i in results) {
-		var ii = results[i].indexOf(" ");
-		var fileRelPath = results[i].substring(0, ii);
-		var authorsShort = results[i].substring(ii+1);
-		var fileName = fileRelPath.substring(fileRelPath.lastIndexOf("/")+1);
-		var bibFile = fileRelPath + ".bib";
-
-		var pageurl = doc.location.href;
-		var lastSlash = pageurl.lastIndexOf("/");
-		var dirInUrl = pageurl.substring(0, lastSlash+1);
-		var fileInUrl = pageurl.substring(lastSlash+1, pageurl.indexOf("#", lastSlash));
-		var bib = dirInUrl + fileRelPath + ".bib";
-		var pdf = dirInUrl + fileRelPath + ".pdf";
-		var j = fileRelPath.lastIndexOf("-");
-		var yearShort = fileRelPath.substring(j-2, j);
-		var year = "";
-		if (new Number(yearShort) < 50)
-			year = "20" + yearShort;
-		else
-			year = "19" + yearShort;
-
-		var attachments = new Array();
-		attachments.push({title:authorsShort + " " + year + ".pdf", mimeType:"application/pdf", url:pdf});
-
-		var type = "";
-		if (pageurl.indexOf("/anthology-new/J/")>-1)
-			type = "journalArticle";
-		else
-			type = "conferencePaper";
-
-		if (doImport)
-			callTranslator(bib, type, attachments);
-
+function extractFullProceedings(doc) { // Build an object mapping each paper id to a display label of the form id (author): title.
+	let unwantedTitles = ['Front Matter', 'Author Index', 'Keyword Index'].map(function(title) { // one XPath exclusion test per unwanted title
+		return 'not(contains(., "' + title + '"))';
+	}).join(' and ');
+	
+	let baseXpath = '//div[@id="content"]/p[i[' + unwantedTitles + ']]/';
+	
+	let ids = ZU.xpath(doc, baseXpath + 'a[@href = concat(text(), ".pdf")]'); // anchors whose href equals their own text plus .pdf
+	ids = ids.map(function(id) { return id.textContent; });
+	
+	let authors = ZU.xpath(doc, baseXpath + 'b'); // b elements in the same paragraphs hold the author list
+	authors = authors.map(function(author) { return author.textContent; });
+	
+	let titles = ZU.xpath(doc, baseXpath + 'i'); // i elements in the same paragraphs hold the title
+	titles = titles.map(function(title) { return title.textContent; });
+	
+	let items = {};
+
+	for (let i = 0; i < ids.length; i++) { // NOTE(review): ids, authors and titles are paired by position; assumes the three result lists line up - confirm
+		let articleAuthors = authors[i].split('; ');
+		let authorSurname = articleAuthors[0].split(' ').pop(); // last space-separated word of the first author
+		let etAl = articleAuthors.length > 1 ? ' et al.' : '';
+		let author = authorSurname + etAl;
+		items[ids[i]] = ids[i] + ' (' + author + '): ' + titles[i];
 	}
+	
+	return items;
 }

-function callTranslator(bibFileURL, type, attachments) {
-	Zotero.Utilities.HTTP.doGet(bibFileURL, function(text) {
-
-		// load BibTex translator
-		var translator = Zotero.loadTranslator("import");
-		translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
-		translator.setString(text);
-		translator.setHandler("itemDone", function(obj, item) {
-			item.itemType = type;
-			item.attachments = attachments;
-			item.repository = "Association for Computational Linguistics"
-			item.complete();
+function scrapeBibtex(responseString, bibtexURL) { // Run an import translator over a BibTeX string and attach the matching PDF link to each item.
+	let pdfURL = bibtexURL.replace('.bib', '.pdf'); // PDF URL is the BibTeX URL with its first .bib replaced by .pdf
+			
+	let translator = Zotero.loadTranslator("import");
+	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4"); // NOTE(review): presumably the BibTeX import translator id - confirm
+	translator.setString(responseString);
+	translator.setHandler("itemDone", function (obj, item) { // called for each item the import translator produces
+		item.attachments.push({
+			url: pdfURL,
+			title: 'Full Text PDF',
+			mimeType: 'application/pdf'
 		});
-		translator.translate();
-
+		delete item.itemID; // drop the itemID field set by the import before saving
+		item.complete();
 	});
+	translator.translate();
 }

-function doWeb(doc, url) {
-	var searchResult = true;
-	if(searchResult) {
-		var possibleItems = scrapeIndex(doc, null);	// items to present to user
-		Zotero.selectItems(possibleItems, function (items) {
-			if (!items) {
-				return true;
-			}
-			scrapeIndex(doc, items)	
-		});
+function scrapeProceedings(doc, id) { // Build and save an item for paper id from its paragraph on a proceedings listing page.
+	let itemType = id[0] == 'J' || id[0] == 'Q' ? 'journalArticle' : 'conferencePaper'; // ids starting with J or Q are journal articles
+	let newItem = new Zotero.Item(itemType);
+	
+	let paragraphXpath = '//p[a[text()="' + id + '"]]/';
+	
+	let pdfURL = ZU.xpathText(doc, paragraphXpath + 'a[contains(@href, "pdf")]/@href');
+	newItem.attachments.push({
+		title: "Full Text PDF",
+		mimeType: "application/pdf",
+		url: pdfURL
+	});
+	
+	// The same proceedings list page can have multiple titles on it, so get the
+	// one relevant to this paper ID.
+	// e.g. http://www.aclweb.org/anthology/Y/Y16/
+	let titles = ZU.xpath(doc, paragraphXpath + 'preceding-sibling::h1'); // the last entry is the heading closest above the paragraph
+	
+	if (itemType == 'conferencePaper') {
+		newItem.proceedingsTitle = titles[titles.length - 1].textContent;
+		newItem.publisher = 'Association for Computational Linguistics';
 	} else {
-	  //not implemented yet
-		scrape(doc);
+		let publicationName = id[0] == 'J' 
+			? 'Computational Linguistics'
+			: 'Transactions of the Association for Computational Linguistics';
+		newItem.publicationTitle = publicationName;
+		let journalInfo = titles[titles.length - 1].textContent;
+		let matchVolume = journalInfo.match(/Volume (\d+)/); // \d+ keeps every digit of a multi-digit volume
+		if (matchVolume) newItem.volume = matchVolume[1];
+		let matchIssue = journalInfo.match(/(Issue|Number) (\d+)/); // \d+ keeps every digit of a multi-digit issue
+		if (matchIssue) newItem.issue = matchIssue[2];
 	}
+	
+	newItem.url = constructProceedingsURL(id) + '/' + id; // listing URL followed by the paper id
+	
+	let titleElement = ZU.xpath(doc, paragraphXpath + 'i')[0];
+	newItem.title = titleElement.textContent;
+	
+	let authorElement = ZU.xpath(doc, paragraphXpath + 'b')[0];
+	let authors = authorElement.textContent.split('; '); // author names are separated by a semicolon and a space
+	newItem.creators = authors.map(function(author) {
+		return ZU.cleanAuthor(author, 'author');
+	});
+	
+	let year = id.split('-')[0].substring(1); // two-digit year follows the leading letter of the id
+	year = year < 50 ? '20' + year : '19' + year; // values below 50 map to 20xx, the rest to 19xx
+	newItem.date = year;
+	newItem.complete();
+}
+
+function constructProceedingsURL(id) { // Build the listing URL for a paper id, e.g. P10-4014 gives http://aclweb.org/anthology/P/P10
+	const STUB_URL = 'http://aclweb.org/anthology/';
+	let idComponents = id.split('-'); // the part before the first hyphen is the leading letter plus the year digits
+	return STUB_URL + idComponents[0][0] + '/' + idComponents[0]; // path is the leading letter, a slash, then the full prefix
 }
 /** BEGIN TEST CASES **/
 var testCases = [
 	{
 		"type": "web",
 		"url": "http://aclweb.org/anthology/P/P93/",
 		"items": "multiple"
+	},
+	{
+		"type": "web",
+		"url": "http://aclweb.org/anthology/Y/Y16/",
+		"items": "multiple"
 	}
 ]
-/** END TEST CASES **/
+/** END TEST CASES **/