Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
translators/Nagoya University OPAC.js
Find file
Copy path
Fetching contributors…
Cannot retrieve contributors at this time
{ | |
"translatorID": "b56d756e-814e-4b46-bc58-d61dccc9f32f", | |
"label": "Nagoya University OPAC", | |
"creator": "Frank Bennett", | |
"target": "^https?://opac\\.nul\\.nagoya-u\\.ac\\.jp/webopac/(catdbl\\.do|ctlsrh\\.do)", | |
"minVersion": "2.0b7", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": true, | |
"translatorType": 4, | |
"browserSupport": "gcsibv", | |
"lastUpdated": "2012-07-13 07:33:49" | |
} | |
// ####################### | |
// ##### Sample URLs ##### | |
// ####################### | |
/* | |
* The site is session-based, with page content negotiated | |
* in POST calls. The starting point for an OPAC search is | |
* the URL below. In testing, I tried the following: | |
* | |
* - A search listing of books | |
* - A search listing of journals (no icon) | |
* - A mixed search listing of books and journals | |
* - A journal page (no icon) | |
* - A book page | |
*/ | |
// http://opac.nul.nagoya-u.ac.jp/webopac/catsrk.do | |
// ##################### | |
// ##### Constants ##### | |
// ##################### | |
/*
 * Row labels used to locate each field on a record page. Every entry maps
 * an internal field key to a pair of labels: [Japanese label, English label].
 * scrapePage() looks for a <th> cell containing either label.
 */
var pageStrings = {
	title: [
		"タイトル / 著者",
		"Title / Author"
	],
	year: [
		"出版・頒布",
		"Publication"
	],
	isbn: [
		"ISBN",
		"ISBN"
	],
	authors: [
		"著者名リンク",
		"Author link"
	],
	series: [
		"シリーズ情報",
		"Series information"
	]
};

/*
 * Address of the single-record page; doWeb() appends a pkey query string to it.
 */
var itemUrlBase = "http://opac.nul.nagoya-u.ac.jp/webopac/catdbl.do";
// ############################ | |
// ##### String functions ##### | |
// ############################ | |
/*
 * Parse the byline that follows the last "/" in the raw title string.
 *
 * The byline is split into individual author strings, and every raw string
 * is appended to item.authorstrings (parseJapaneseAuthors consults that
 * list later). When the byline consists only of Roman characters, each name
 * is also cleaned and pushed to item.creators. A leading lower-case hint
 * such as "edited by" or "translated by" switches the creator type; the type
 * then applies to all following names until another hint overrides it.
 *
 * @param {Object} item  Item under construction; must carry `authorstrings`
 *                       and `creators` arrays.
 * @param {Object} data  Scraped page data; data.title[0] is the raw string.
 * @return {*} Truthy when the creators still have to be taken from the
 *             author links (no byline at all, or a non-Roman character was
 *             found in it); null when the creators were saved here.
 */
var parseRomanAuthors = function (item, data) {
	var datastring = data['title'][0];
	// don't bother if there is no author info
	if (datastring.indexOf("/") === -1) {
		return true;
	}
	// cut off the title
	datastring = datastring.replace(/.*\//, "");
	// raise flag if there are japanese characters
	var japanese_check = datastring.match(/.*[^- &0-9()\[\];:,.a-zA-Z].*/);
	// Turn every author separator into a semicolon to prepare for the split.
	// The patterns are global so that bylines with three or more names are
	// split completely, not just at the first separator. A comma only counts
	// as a separator when followed by a word of 3+ letters or by an initial,
	// which leaves suffixes such as ", Jr." attached to the name.
	datastring = datastring.replace(/,(\s+[a-zA-Z]{3,})/g, ";$1");
	datastring = datastring.replace(/,(\s+[a-zA-Z]{1}[^a-zA-Z])/g, ";$1");
	datastring = datastring.replace(/(\s+and\s+)/g, "; ");
	datastring = datastring.replace(/(\s+&\s+)/g, "; ");
	// drop anything after a "|" and split the authors
	var authors = datastring.replace(/\|.*/, "").split(";");
	// this is parsing the authors for a single work. if there is a special byline, we
	// assume that it applies to all subsequent entries until overridden.
	var authortype = 'author';
	for (var i = 0; i < authors.length; i++) {
		var rawAuthor = authors[i];
		item.authorstrings.push(rawAuthor);
		// the hint is the run of lower-case words and punctuation before the name
		var authortypehint = rawAuthor.replace(/^([ ,.:a-z]*).*/, "$1");
		if (authortypehint.match(/.*(edit|organiz).*/)) {
			authortype = "editor";
		} else if (authortypehint.match(/.*trans.*/)) {
			authortype = "translator";
		}
		// strip the leading hint words and any trailing "..." elision
		var author = rawAuthor.replace(/^[ a-z]*/, "").replace(/\.\.\..*/, "");
		// need to test for length because the replacement of commas with semicolons
		// can cause a short split at the end of a byline that originally ended in a comma
		if (!japanese_check && author.length) {
			item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
		}
	}
	return japanese_check;
};
/*
 * Build creators from the author-link strings in data.authors.
 *
 * For each link the creator type is first guessed from the link text itself
 * (編 -> editor, 訳 -> translator, otherwise author). The guess is then
 * checked against the raw byline strings collected by parseRomanAuthors in
 * item.authorstrings: the first byline that contains the last word of the
 * reordered name and ends in a translator or editor marker decides the type.
 * Cruft (asterisks, and everything from the first digit, "<", "(", ")" or
 * "|") is removed and "A, B" is reordered to "B A" before the name is
 * saved to item.creators.
 *
 * item.authorstrings is scratch data; it is deleted once all links have
 * been processed.
 *
 * @param {Object} item  Item under construction; must carry a `creators`
 *                       array and normally an `authorstrings` array.
 * @param {Object} data  Scraped page data; data.authors is the list of
 *                       author-link strings (may be absent).
 */
var parseJapaneseAuthors = function (item, data) {
	var authors = data['authors'] || [];
	var authorstrings = item.authorstrings || [];
	for (var i = 0; i < authors.length; i++) {
		var authortype = 'author';
		if (authors[i].match(/.*編.*/)) {
			authortype = 'editor';
		} else if (authors[i].match(/.*訳.*/)) {
			authortype = 'translator';
		}
		var author = authors[i].replace(/[*]/g, "").replace(/[0-9<()|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
		// Double-check in the raw byline entries for a translator/editor hint.
		// This is an enormous pain, but the original records are a mess, with different
		// conventions for Japanese and foreign records, sometimes mixed up in the same entry.
		// The lookup key is the last word of the reordered name. Surrounding
		// whitespace is ignored so the key is never the empty string, which
		// would match every byline.
		var nameParts = author.trim().split(/\s+/);
		var lastWord = nameParts[nameParts.length - 1];
		if (lastWord) {
			for (var x = 0; x < authorstrings.length; x++) {
				var authorstring = authorstrings[x];
				if (authorstring.indexOf(lastWord) === -1) {
					continue;
				}
				if (authorstring.match(/.*(訳|譯|譯註)$/)) {
					authortype = 'translator';
					break;
				} else if (authorstring.match(/.*(編|編著)$/)) {
					authortype = 'editor';
					break;
				}
			}
		}
		item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
	}
	// Drop the scratch list only after every author has been cross-checked.
	delete item.authorstrings;
};
/*
 * Split the extracted title field in place. The field always starts as a
 * single list item, but can contain entries for several works, as in an
 * omnibus volume of translated works, for example. Such records separate
 * the works with a period surrounded by spaces (" . "), so that is used as
 * the split point. Everything from the first "|" onward (the phonetic
 * information appended in Japanese records) is discarded first.
 *
 * Does nothing when the page yielded no title field.
 *
 * @param {Object} data  Scraped page data; data.title is replaced by the
 *                       list of per-work strings.
 */
function splitTitle(data) {
	var rawTitles = data['title'];
	// a record page without a title row leaves data.title unset
	if (!rawTitles || !rawTitles.length) {
		return;
	}
	var titlestring = rawTitles[0].replace(/\|.*/, "");
	data['title'] = titlestring.split(" . ");
}
// ########################## | |
// ##### Page functions ##### | |
// ########################## | |
/*
 * Look for book rows on a search-result page. A book row is a <td> that
 * holds a div of class "lst_value" whose text contains "Books"; the cell
 * of interest is its following sibling <td>.
 *
 * Without getlist: return the first such cell, or whatever the XPath
 * iterator yields when there is none (a falsy value), so the result can be
 * used as a yes/no test.
 *
 * With getlist true: return an object merging the link maps that
 * Zotero.Utilities.getItemArray produces for every such cell.
 */
var sniffIndexPage = function (doc, getlist) {
	var bookCells = doc.evaluate(
		"//td[div[@class='lst_value' and contains(text(),'Books')]]/following-sibling::td",
		doc,
		null,
		XPathResult.ANY_TYPE,
		null
	);
	var cell = bookCells.iterateNext();

	if (!getlist) {
		return cell;
	}

	var found = {};
	for (; cell; cell = bookCells.iterateNext()) {
		var links = Zotero.Utilities.getItemArray(doc, cell, "document\\.catsrhform\\.pkey.value=");
		for (var href in links) {
			found[href] = links[href];
		}
	}
	return found;
};
/*
 * Collect the book entries of a search-result page by running
 * sniffIndexPage in list mode.
 */
var getBookItems = function (doc) {
	var wantList = true;
	return sniffIndexPage(doc, wantList);
};
/*
 * Extract field values from a record page.
 *
 * For every key in spec, find the <th> cells whose div text contains either
 * of the two labels spec[key][0] / spec[key][1], and collect the text of
 * each div inside the following sibling <td> cells, with whitespace
 * normalised by Zotero.Utilities.trimInternal.
 *
 * @param {Document} doc   Page to read.
 * @param {Object}   spec  Map of field key -> [label, alternative label].
 * @return {Object} Map of field key -> array of strings. Keys for which
 *                  nothing was found are absent.
 */
var scrapePage = function (doc, spec) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;
	var data = {};
	var keys = Object.keys(spec);
	for (var k = 0; k < keys.length; k++) {
		var key = keys[k];
		var labels = spec[key];
		var xpath = "//th[div[contains(text(),'" + labels[0] + "') or contains(text(),'" + labels[1] + "')]]/following-sibling::td/div";
		var check = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		for (var c = check.iterateNext(); c; c = check.iterateNext()) {
			// create the list lazily so that unmatched keys stay undefined
			if (!data[key]) {
				data[key] = [];
			}
			data[key].push(Zotero.Utilities.trimInternal(c.textContent));
		}
	}
	return data;
};
/*
 * Bring it all together: scrape one record page, build a book item from
 * the extracted fields and hand it to Zotero via item.complete().
 *
 * @param {Document} doc  Record page.
 * @param {String}   url  Address of the page (only used for detection).
 * @return {Boolean|undefined} false when detectWeb rejects the page;
 *                             otherwise nothing.
 */
function scrapeAndParse(doc, url) {
	if (!detectWeb(doc, url)) {
		return false;
	}
	var item = new Zotero.Item("book");
	// scratch list of raw byline strings shared by the two author parsers
	item.authorstrings = [];
	var data = scrapePage(doc, pageStrings);

	// Only touch the title once we know the page has one; splitting an
	// absent title would abort the whole scrape.
	if (data['title'] && data['title'].length) {
		splitTitle(data);
		var titles = [];
		for (var t = 0; t < data['title'].length; t++) {
			// keep the part before " /", i.e. drop the byline
			titles.push(data['title'][t].replace(/\s+\/.*/, ""));
		}
		item.title = titles.join(", ");
		var jse_authors = parseRomanAuthors(item, data);
		if (jse_authors) {
			parseJapaneseAuthors(item, data);
		}
	}

	if (data['year']) {
		// sometimes there are multiple "date" fields, some of which are filled
		// with other random information; use the first one that has a run of digits
		for (var y = 0; y < data['year'].length; y++) {
			var year = data['year'][y];
			if (year.match(/.*[0-9]{3}.*/)) {
				item.date = year.replace(/.*?([0-9][.0-9][0-9]+).*/, "$1");
				item.place = year.replace(/:.*/, "").replace(/[\[\]]/g, "");
				item.publisher = year.replace(/.*:(.*),.*/, "$1");
				break;
			}
		}
	}

	if (data['series']) {
		item.series = data['series'][0].replace(/[/|<].*/, "");
	}

	if (data['isbn']) {
		// first run of digits, keeping an "X" check digit if one follows directly
		item.ISBN = data['isbn'][0].replace(/[^0-9]*([0-9]+[Xx]?).*/, "$1");
	}

	item.complete();
}
// ######################### | |
// ##### API functions ##### | |
// ######################### | |
/*
 * Decide what kind of page this is.
 *
 * - Record page (catdbl.do): "book" unless the page has a <th> row whose
 *   label mentions publication frequency or volumes, which marks a journal.
 * - Result list (ctlsrh.do): "multiple" when sniffIndexPage finds a book row.
 * - Anything else: false.
 */
function detectWeb(doc, url) {
	var isRecordPage = url.match(/.*\/webopac\/catdbl.do/);
	if (isRecordPage) {
		var journalMarker = doc.evaluate(
			'//th[div[contains(text(),"Frequency of publication") or contains(text(),"刊行頻度") or contains(text(),"巻号") or contains(text(),"Volumes")]]',
			doc,
			null,
			XPathResult.ANY_TYPE,
			null
		).iterateNext();
		return journalMarker ? false : "book";
	}

	var isResultList = url.match(/.*\/webopac\/ctlsrh.do/);
	if (isResultList && sniffIndexPage(doc)) {
		return "multiple";
	}
	return false;
}
/*
 * Translator entry point.
 *
 * On a result list, let the user pick entries, turn each picked link into
 * the address of its record page (itemUrlBase plus the pkey taken from the
 * link's javascript) and scrape all of those pages. On a single book
 * page, scrape the page directly.
 */
function doWeb(doc, url) {
	var format = detectWeb(doc, url);
	if (format == "multiple") {
		var selected = Zotero.selectItems(getBookItems(doc));
		// keyed by URL so that the same record is fetched only once
		var items = {};
		for (var link in selected) {
			var m = link.match(/.*document\.catsrhform\.pkey\.value=\'([^\']+)\'.*/);
			// skip links that do not carry a record key
			if (!m) {
				continue;
			}
			items[itemUrlBase + "?pkey=" + m[1] + "&initFlg=_RESULT_SET_NOTBIB"] = true;
		}
		var urls = Object.keys(items);
		// pass the whole list, and nothing at all when the selection is empty
		if (urls.length) {
			ZU.processDocuments(urls, scrapeAndParse);
		}
	} else if (format == "book") {
		scrapeAndParse(doc, url);
	}
}
/** BEGIN TEST CASES **/ | |
var testCases = [ | |
{ | |
"type": "web", | |
"url": "http://opac.nul.nagoya-u.ac.jp/webopac/catdbl.do?pkey=TY50091937&initFlg=_RESULT_SET_NOTBIB", | |
"items": [ | |
{ | |
"itemType": "book", | |
"creators": [ | |
{ | |
"firstName": "Jeremy", | |
"lastName": "Adelman", | |
"creatorType": "author" | |
} | |
], | |
"notes": [], | |
"tags": [], | |
"seeAlso": [], | |
"attachments": [], | |
"authorstrings": " Jeremy Adelman", | |
"title": "Frontier development : land, labour, and capital on the wheatlands of Argentina and Canada, 1890-1914", | |
"date": "1994", | |
"place": "Oxford", | |
"publisher": "Clarendon Press", | |
"series": "Oxford historical monographs", | |
"ISBN": "0198204418", | |
"libraryCatalog": "Nagoya University OPAC", | |
"shortTitle": "Frontier development" | |
} | |
] | |
} | |
] | |
/** END TEST CASES **/ |