Improvements for Dataset & DOI handling (#1338)

Note for SAE Papers: this translator currently appears to be broken due to invalid HTML.
zotero · Jun 22, 2017 · be399e30e0d06015d58bcf60898a6519e5805194 · be399e3
1 parent a463a3a
commit be399e30e0d06015d58bcf60898a6519e5805194
Unified Split

Showing with 300 additions and 213 deletions.

+52 −41 Embedded Metadata.js

+10 −22 Figshare.js

+14 −1 RDF.js

+9 −13 RSC Publishing.js

+2 −4 SAE Papers.js

+2 −5 Scopus.js

+211 −127 Zenodo.js
diff --git a/Embedded Metadata.js b/Embedded Metadata.js
@@ -9,7 +9,7 @@
 	"inRepository": true,
 	"translatorType": 4,
 	"browserSupport": "gcsibv",
-  
-	"lastUpdated": "2017-06-06 18:09:35"
+  
+	"lastUpdated": "2017-06-16 18:52:30"
 }
 /*
@@ -144,7 +144,7 @@ function getPrefixes(doc) {
 			}
 		}
 	}
-  
-	
+  
+
 	//also look in html and head elements
 	var prefixes = (doc.documentElement.getAttribute('prefix') || '')
 		+ (doc.head.getAttribute('prefix') || '');
@@ -196,7 +196,7 @@ function processFields(doc, item, fieldMap, strict) {
 function completeItem(doc, newItem) {
 	// Strip off potential junk from RDF
 	newItem.seeAlso = [];
-  
-	
+  
+
 	addHighwireMetadata(doc, newItem);
 	addOtherMetadata(doc, newItem);
 	addLowQualityMetadata(doc, newItem);
@@ -205,7 +205,7 @@ function completeItem(doc, newItem) {
 	if(CUSTOM_FIELD_MAPPINGS) {
 		processFields(doc, newItem, CUSTOM_FIELD_MAPPINGS, true);
 	}
-  
-	
+  
+
 	newItem.complete();
 }
@@ -259,14 +259,13 @@ function init(doc, url, callback, forceLoadRDF) {
 			if(_prefixes[prefix]) {
 				var prop = tag.substr(delimIndex+1, 1).toLowerCase()+tag.substr(delimIndex+2);
-  
-
 				//bib and bibo types are special, they use rdf:type to define type
 				var specialNS = [_prefixes['bib'], _prefixes['bibo']];
 				if(prop == 'type' && specialNS.indexOf(_prefixes[prefix]) != -1) {
 					value = _prefixes[prefix] + value;
 					prefix = 'rdf';
 				}
-  
-				
+  
+
 				// This debug is for seeing what is being sent to RDF
 				//Zotero.debug(_prefixes[prefix]+prop +"=>"+value);
 				statements.push([url, _prefixes[prefix]+prop, value]);
@@ -275,7 +274,7 @@ function init(doc, url, callback, forceLoadRDF) {
 				if(lcValue.indexOf('blogger') != -1
 					|| lcValue.indexOf('wordpress') != -1
 					|| lcValue.indexOf('wooframework') != -1
-  
-				) {	
+  
+				) {
 					generatorType = 'blogPost';
 				}
 			} else {
@@ -308,7 +307,7 @@ function init(doc, url, callback, forceLoadRDF) {
 			}
 		}
 	}
-  
-	
+  
+
 	if(statements.length || forceLoadRDF) {
 		// load RDF translator, so that we don't need to replicate import code
 		var translator = Zotero.loadTranslator("import");
@@ -317,13 +316,12 @@ function init(doc, url, callback, forceLoadRDF) {
 			_haveItem = true;
 			completeItem(doc, newItem);
 		});
-  
-		
+  
+
 		translator.getTranslatorObject(function(rdf) {
 			for(var i=0; i<statements.length; i++) {
-  
-				var statement = statements[i];			
+  
+				var statement = statements[i];
 				rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2], true);
 			}
-  
-
 			var nodes = rdf.getNodes(true);
 			rdf.defaultUnknownType = hwType || hwTypeGuess || generatorType ||
 				//if we have RDF data, then default to webpage
@@ -462,13 +460,13 @@ function addHighwireMetadata(doc, newItem) {
 		newItem.pages = firstpage +
 			( ( lastpage && ( lastpage = lastpage.trim() ) )?'-' + lastpage : '' );
 	}
-  
-	
+  
+
 	//fall back to some other date options
 	if(!newItem.date) {
 		newItem.date = getContentText(doc, 'citation_online_date')
 			|| getContentText(doc, 'citation_year');
 	}
-  
-	
+  
+
 	//prefer ISSN over eISSN
 	var issn = getContentText(doc, 'citation_issn') ||
 			getContentText(doc, 'citation_eIssn');
@@ -493,16 +491,16 @@ function addHighwireMetadata(doc, newItem) {
 	//add snapshot
 	newItem.attachments.push({document:doc, title:"Snapshot"});
-  
-	
+  
+
 	//store PMID in Extra and as a link attachment
 	//e.g. http://www.sciencemag.org/content/332/6032/977.full
 	var PMID = getContentText(doc, 'citation_pmid');
 	if(PMID) {
 		if(newItem.extra) newItem.extra += '\n';
 		else newItem.extra = '';
-  
-		
+  
+
 		newItem.extra += 'PMID: ' + PMID;
-  
-		
+  
+
 		newItem.attachments.push({
 			title: "PubMed entry",
 			url: "http://www.ncbi.nlm.nih.gov/pubmed/" + PMID,
@@ -525,16 +523,16 @@ function addOtherMetadata(doc, newItem) {
 		try {
 			var parsely = JSON.parse(parselyJSON);
 		} catch(e) {}
-  
-		
+  
+
 		if(parsely) {
 			if(!newItem.title && parsely.title) {
 				newItem.title = parsely.title;
 			}
-  
-			
+  
+
 			if(!newItem.url && parsely.url) {
 				newItem.url = parsely.url;
 			}
-  
-			
+  
+
 			if(!newItem.date && parsely.pub_date) {
 				var date = new Date(parsely.pub_date);
 				if(!isNaN(date.getUTCFullYear())) {
@@ -545,11 +543,11 @@ function addOtherMetadata(doc, newItem) {
 					}, true);
 				}
 			}
-  
-			
+  
+
 			if(!newItem.creators.length && parsely.author) {
 				newItem.creators.push(ZU.cleanAuthor(''+parsely.author, 'author'));
 			}
-  
-			
+  
+
 			if(!newItem.tags.length && parsely.tags && parsely.tags.length) {
 				newItem.tags = parsely.tags;
 			}
@@ -564,7 +562,7 @@ function addLowQualityMetadata(doc, newItem) {
 		Z.debug("Title was not found in meta tags. Using document title as title");
 		newItem.title = doc.title;
 	}
-  
-	
+  
+
 	if(newItem.title) {
 		newItem.title = newItem.title.replace(/\s+/g, ' '); //make sure all spaces are \u0020
 		if(newItem.publicationTitle) {
@@ -597,23 +595,23 @@ function addLowQualityMetadata(doc, newItem) {
 	if(!newItem.tags.length) {
 		 newItem.tags = ZU.xpathText(doc, '//x:meta[@name="keywords"]/@content', namespaces);
 	}
-  
-	
+  
+
 	//We can try getting abstract from 'description'
 	if(!newItem.abstractNote) {
 		newItem.abstractNote = ZU.trimInternal(
 			ZU.xpathText(doc, '//x:meta[@name="description"]/@content', namespaces) || '');
 	}
-  
-	
+  
+
 	if(!newItem.url) {
 		newItem.url = ZU.xpathText(doc, '//head/link[@rel="canonical"]/@href');
 	}
 	if(!newItem.url) {
 		newItem.url = doc.location.href;
 	}
-  
-	
+  
+
 	newItem.libraryCatalog = doc.location.host;
-  
-	
+  
+
 	// add access date
 	newItem.accessDate = 'CURRENT_TIMESTAMP';
 }
@@ -641,11 +639,11 @@ function getAuthorFromByline(doc, newItem) {
 		Z.debug("Found " + byline.length + " elements with '" + bylineClasses[i] + "' class");
 		for(var j=0; j<byline.length; j++) {
 			if (!byline[j].textContent.trim()) continue;
-  
-			
+  
+
 			bylines.push(byline[j]);
 		}
 	}
-  
-	
+  
+
 	var actualByline;
 	if(!bylines.length) {
 		Z.debug("No byline found.");
@@ -656,12 +654,12 @@ function getAuthorFromByline(doc, newItem) {
 		Z.debug(bylines.length + " bylines found:");
 		Z.debug(bylines.map(function(n) { return ZU.trimInternal(n.textContent)}).join('\n'));
 		Z.debug("Locating the one closest to title.");
-  
-		
+  
+
 		//find the closest one to the title (in DOM)
 		actualByline = false;
 		var parentLevel = 1;
 		var skipList = [];
-  
-		
+  
+
 		// Wrap title in quotes so we can use it in the xpath
 		var xpathTitle = newItem.title.toLowerCase();
 		if(xpathTitle.indexOf('"') != -1) {
@@ -676,15 +674,15 @@ function getAuthorFromByline(doc, newItem) {
 		} else {
 			xpathTitle = '"' + xpathTitle + '"';
 		}
-  
-		
+  
+
 		var titleXPath = './/*[normalize-space(translate(text(),"ABCDEFGHJIKLMNOPQRSTUVWXYZ\u00a0","abcdefghjiklmnopqrstuvwxyz "))='
 			+ xpathTitle + ']';
 		Z.debug("Looking for title using: " + titleXPath);
 		while(!actualByline && bylines.length != skipList.length && parentLevel < 5) {
 			Z.debug("Parent level " + parentLevel);
 			for(var i=0; i<bylines.length; i++) {
 				if(skipList.indexOf(i) !== -1) continue;
-  
-				
+  
+
 				if(parentLevel == 1) {
 					//skip bylines that contain bylines
 					var containsBylines = false;
@@ -697,7 +695,7 @@ function getAuthorFromByline(doc, newItem) {
 						continue;
 					}
 				}
-  
-				
+  
+
 				var bylineParent = bylines[i];
 				for(var j=0; j<parentLevel; j++) {
 					bylineParent = bylineParent.parentElement;
@@ -707,7 +705,7 @@ function getAuthorFromByline(doc, newItem) {
 					skipList.push(i);
 					continue;
 				}
-  
-				
+  
+
 				if(ZU.xpath(bylineParent, titleXPath).length) {
 					if(actualByline) {
 						//found more than one, bail
@@ -717,11 +715,11 @@ function getAuthorFromByline(doc, newItem) {
 					actualByline = bylines[i];
 				}
 			}
-  
-			
+  
+
 			parentLevel++;
 		}
 	}
-  
-	
+  
+
 	if(actualByline) {
 		var byline = ZU.trimInternal(actualByline.textContent);
 		Z.debug("Extracting author(s) from byline: " + byline);
@@ -744,7 +742,7 @@ function getAuthorFromByline(doc, newItem) {
 						//skip some odd splits and twitter handles
 						continue;
 					}
-  
-					
+  
+
 					if(authors[i].split(/\s/).length == 1) {
 						//probably corporate author
 						newItem.creators.push({
@@ -795,15 +793,28 @@ function finalDataCleanup(doc, newItem) {
 		// because most of the time they are not right
 		newItem.tags = [];
 	}
-  
-	
+  
+
 	//Cleanup DOI
 	if (newItem.DOI){
 		newItem.DOI =newItem.DOI.replace(/^doi:\s*/, "");
 	}
-  
-
+  
+
+  
	// Store the DOI in Extra for item types that do not support a DOI field
+  
+	if (newItem.DOI && !ZU.fieldIsValidForType("DOI", newItem.itemType)) {
+  
+		if (newItem.extra){
+  
+			newItem.extra += "\nDOI: " + newItem.DOI;
+  
+		}
+  
+		else {
+  
+			newItem.extra = "DOI: " + newItem.DOI;
+  
+		}
+  
+	}
+  
+
+  
+
+  
+
+  
+
 	//remove itemID - comes from RDF translator, doesn't make any sense for online data
 	newItem.itemID = "";
-  
-	
+  
+
 	//worst case, if this is not called from another translator, use URL for title
 	if(!newItem.title && !Zotero.parentTranslator) newItem.title = newItem.url;
 }
@@ -1440,4 +1451,4 @@ var testCases = [
 		]
 	}
 ]
-  
-/** END TEST CASES **/
+  
+/** END TEST CASES **/