Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
Fetching contributors…
Cannot retrieve contributors at this time
{
	"translatorID": "6773a9af-5375-3224-d148-d32793884dec",
	"label": "InfoTrac",
	"creator": "Simon Kornblith",
	"target": "^https?://[^/]+/itw/infomark/",
	"minVersion": "1.0.0b3.r1",
	"maxVersion": "",
	"priority": 250,
	"inRepository": true,
	"translatorType": 4,
	"browserSupport": "g",
	"lastUpdated": "2015-06-10 10:51:29"
}
/**
 * Decides which kind of item (if any) the current InfoTrac page represents.
 *
 * @param {Document} doc - The page being inspected.
 * @param {String} url - The page URL (not used by the detection logic).
 * @returns {String|Boolean} A Zotero item type for article pages, "multiple"
 *   for citation lists, or false when the page is not recognized.
 */
function detectWeb(doc, url) {
	// ensure that there is an InfoTrac logo
	var logo = doc.evaluate('//img[substring(@alt, 1, 8) = "InfoTrac"]', doc, null,
		XPathResult.ANY_TYPE, null).iterateNext();
	if (!logo) return false;

	var pageTitle = doc.title;
	if (pageTitle.substring(0, 10) == "Citations ") return "multiple";
	// Anything that is neither a citation list nor an article is unsupported;
	// return an explicit false instead of falling off the end of the function.
	if (pageTitle.substring(0, 8) != "Article ") return false;

	// Pages showing the ncnp logo are always treated as newspaper articles.
	if (ZU.xpathText(doc, '//td//img[contains(@src, "ncnp_logo.gif")]/@title')) {
		return "newspaperArticle";
	}

	// Other pages may announce their type in an HTML comment starting with " Genre".
	var genre = doc.evaluate('//comment()[substring(., 1, 6) = " Genre"]', doc, null,
		XPathResult.ANY_TYPE, null).iterateNext();
	if (genre) {
		var genreTypes = {
			article: "journalArticle",
			book: "book",
			dissertation: "thesis",
			bookitem: "bookSection"
		};
		// skip the 7-character " Genre:" prefix
		var value = Zotero.Utilities.trimInternal(genre.nodeValue.substr(7));
		if (Object.prototype.hasOwnProperty.call(genreTypes, value)) {
			return genreTypes[value];
		}
	}

	// no (known) genre comment
	return "magazineArticle";
}
/**
 * Scrapes a single article page into a newspaperArticle item and completes it.
 *
 * Title, author and date are read from the citation table; the publication
 * title comes from an HTML comment of the form " Journal:<name> ".
 *
 * @param {Document} doc - The article page.
 * @param {String} url - The page URL (not used here).
 */
function scrape(doc, url) {
	var newItem = new Zotero.Item();
	newItem.itemType = "newspaperArticle";

	var citation = ZU.xpath(doc, '//p/table/tbody//td/table/tbody[not(./script)]');
	newItem.title = ZU.xpathText(citation, './/font/b');

	var author = ZU.xpathText(citation, './/td/i');
	if (author) newItem.creators.push(ZU.cleanAuthor(author, "author"));

	// The date is embedded in free text, e.g. "January 5, 1850".
	var dateText = ZU.xpathText(citation, './/td/text()');
	var dateMatch = dateText ? dateText.match(/[A-Z][a-z]+\s\d+,\s\d{4}/) : null;
	if (dateMatch) newItem.date = dateMatch[0];

	var pdfurl = ZU.xpathText(doc, '//blockquote/a[contains(@href, "!pdf")][1]/@href');
	if (pdfurl) {
		newItem.attachments.push({url: pdfurl, title: "Infotrac Full Text PDF", mimeType: "application/pdf"});
	}
	newItem.attachments.push({document: doc, title: "Infotrac Snapshot", mimeType: "text/html"});

	// Metadata lives in comments shaped like " Field:value ".
	var comments = doc.evaluate('/html/body//comment()', doc, null, XPathResult.ANY_TYPE, null);
	var comment;
	while ((comment = comments.iterateNext())) {
		var text = comment.nodeValue;
		var colon = text.indexOf(":");
		if (colon == -1) continue; // not a "Field:value" comment
		if (text.substring(1, colon).toLowerCase() == "journal") {
			// drop the leading "Field:" part and the final character of the comment
			newItem.publicationTitle = text.substring(colon + 1, text.length - 1);
		}
	}

	// A parenthesised part of the publication title is treated as the place,
	// e.g. "The Times (London)". Only split when the parentheses are present;
	// String.search() returns -1 (truthy) on a miss, so it cannot be used as
	// the condition.
	if (newItem.publicationTitle) {
		var placeMatch = newItem.publicationTitle.match(/\((.+)\)/);
		if (placeMatch) {
			newItem.place = placeMatch[1];
			newItem.publicationTitle = newItem.publicationTitle.replace(/\(.+\).*/, "");
		}
	}

	newItem.complete();
}
/**
 * Builds an item from InfoTrac metadata comments and completes it.
 *
 * Each comment is expected to look like " Field:value "; recognised fields
 * are title, journal, booktitle, pi, author, issue, volume, issn, gjd and
 * genre (field names are compared case-insensitively).
 *
 * @param {String} url - URL stored on the item (and used for the snapshot
 *   when no document is supplied).
 * @param {XPathResult} elmts - Iterator over the comment nodes to parse.
 * @param {String} [title] - Fallback title, overridden by a "title" comment.
 * @param {Document} [doc] - If given, attached as the snapshot instead of url.
 */
function extractCitation(url, elmts, title, doc) {
	var newItem = new Zotero.Item();
	newItem.url = url;
	if (title) {
		newItem.title = Zotero.Utilities.superCleanString(title);
	}

	var genreTypes = {
		article: "journalArticle",
		book: "book",
		dissertation: "thesis",
		bookitem: "bookSection"
	};

	var elmt;
	while ((elmt = elmts.iterateNext())) {
		var text = elmt.nodeValue;
		var colon = text.indexOf(":");
		if (colon == -1) continue; // not a "Field:value" comment
		var field = text.substring(1, colon).toLowerCase();
		var value = text.substring(colon + 1, text.length - 1);

		if (field == "title") {
			newItem.title = Zotero.Utilities.superCleanString(value);
		} else if (field == "journal" || field == "booktitle") {
			// field is lower-cased above, so compare against "booktitle"
			newItem.publicationTitle = value;
		} else if (field == "pi") {
			// Publication info: leading date tokens followed by tokens
			// prefixed with v (volume), i (issue) and p (pages).
			var parts = value.split(" ");
			var piDate = "";
			var piField = null;
			for (var j = 0; j < parts.length; j++) {
				var part = parts[j];
				var firstChar = part.substring(0, 1);
				if (firstChar == "v") {
					newItem.itemType = "journalArticle";
					piField = "volume";
				} else if (firstChar == "i") {
					piField = "issue";
				} else if (firstChar == "p") {
					piField = "pages";
					var match = /p(\w+)\((\w+)\)/.exec(part); // weird looking page range
					if (match) { // yup, it's weird
						var firstPage = parseInt(match[1], 10);
						var pageOffset = parseInt(match[2], 10);
						if (!isNaN(firstPage) && !isNaN(pageOffset)) {
							part = "p" + match[1] + "-" + (firstPage + pageOffset);
						} else {
							// non-numeric pieces: keep the start page rather than emit "-NaN"
							part = "p" + match[1];
						}
					} else if (!newItem.itemType) { // no, it's normal
						// check to see if it's numeric, bc newspaper pages aren't
						var justPageNumber = part.substr(1);
						if (parseInt(justPageNumber, 10).toString() != justPageNumber) {
							newItem.itemType = "newspaperArticle";
						}
					}
				} else if (!piField) {
					// date parts at the beginning, before anything else
					piDate += " " + part;
				}

				if (piField) {
					if (part != "pNA") { // make sure it's not an invalid page number
						// chop off the prefix letter
						newItem[piField] = part.substring(1);
					} else if (!newItem.itemType) {
						// only newspapers are missing page numbers on infotrac
						newItem.itemType = "newspaperArticle";
					}
				}
			}
			if (!newItem.itemType) {
				newItem.itemType = "magazineArticle";
			}
			if (piDate != "") {
				newItem.date = piDate.substring(1); // drop the leading space
			}
		} else if (field == "author") {
			var author = Zotero.Utilities.cleanAuthor(value, "author", true);
			// ensure author is not already there
			var add = true;
			for (var i = 0; i < newItem.creators.length; i++) {
				var existingAuthor = newItem.creators[i];
				if (existingAuthor.firstName == author.firstName
						&& existingAuthor.lastName == author.lastName) {
					add = false;
					break;
				}
			}
			if (add) newItem.creators.push(author);
		} else if (field == "issue") {
			newItem.issue = value;
		} else if (field == "volume") {
			newItem.volume = value;
		} else if (field == "issn") {
			newItem.ISSN = value;
		} else if (field == "gjd") {
			var m = value.match(/\(([0-9]{4}[^\)]*)\)(?:, pp\. ([0-9\-]+))?/);
			if (m) {
				newItem.date = m[1];
				// the page group is optional
				if (m[2]) newItem.pages = m[2];
			}
		} else if (field == "genre") {
			var genre = value.toLowerCase();
			if (Object.prototype.hasOwnProperty.call(genreTypes, genre)) {
				newItem.itemType = genreTypes[genre];
			}
		}
	}

	// Neither a "pi" nor a "genre" comment set the type: use the same default
	// the "pi" branch applies.
	if (!newItem.itemType) {
		newItem.itemType = "magazineArticle";
	}

	if (doc) {
		newItem.attachments.push({document: doc, title: "InfoTrac Snapshot"});
	} else {
		newItem.attachments.push({url: url, title: "InfoTrac Snapshot",
			mimeType: "text/html"});
	}
	newItem.complete();
}
/**
 * Translator entry point: saves the article on the current page, or lets the
 * user pick entries from a citation list and saves those.
 *
 * @param {Document} doc - The current page.
 * @param {String} url - The page URL; on ncnp search results it must carry a
 *   "?sw_aep=" query part, which is appended to every result link.
 * @throws {Error} On ncnp search results when the base URL or the "sw_aep"
 *   part cannot be found, since result links cannot be built without them.
 */
function doWeb(doc, url) {
	/* the only Infotrac Site that's still up & I'm aware of is 19th Century Newspapers.
	   But there may well be others, so I'm leaving a lot of legacy code in just in case */
	var ncnp = !!ZU.xpathText(doc, '//td//img[contains(@src, "ncnp_logo.gif")]/@title');
	var uri = doc.location.href;

	if (doc.title.substring(0, 8) == "Article ") { // article
		if (ncnp) {
			scrape(doc, url);
		} else {
			var comments = doc.evaluate('/html/body//comment()', doc, null,
				XPathResult.ANY_TYPE, null);
			extractCitation(uri, comments);
		}
		return;
	}

	// search results
	var items = {};
	var tableRows = doc.evaluate('/html/body//table/tbody/tr/td[b or strong]', doc, null,
		XPathResult.ANY_TYPE, null);
	var tableRow;

	if (ncnp) {
		var baseurl = uri.match(/(.+)\/purl=/);
		var institution = url.match(/\?sw_aep=.+/);
		if (!baseurl || !institution) {
			throw new Error("InfoTrac: cannot build result links from " + uri);
		}
		while ((tableRow = tableRows.iterateNext())) {
			var rowTitle = ZU.xpathText(tableRow, './strong');
			var href = ZU.xpathText(tableRow, './a[1]/@href');
			// the href is a JavaScript call; its first quoted argument is the path
			var target = href ? href.match(/\(\'(\/.+)\',\'/) : null;
			if (!rowTitle || !target) continue; // not a usable result row
			items[baseurl[1] + target[1] + institution[0]] = ZU.trimInternal(rowTitle);
		}
		Zotero.selectItems(items, function (selected) {
			if (!selected) {
				return true;
			}
			var selectedUris = [];
			for (var link in selected) {
				selectedUris.push(link);
			}
			Zotero.Utilities.processDocuments(selectedUris, scrape);
		});
		return;
	}

	// legacy (non-ncnp) result lists: metadata sits in comments inside each row
	var hostMatch = uri.match(/^https?:\/\/[^\/]+/);
	var host = hostMatch ? hostMatch[0] : "";
	var javaScriptRe = /'([^']*)' *, *'([^']*)'/;
	var uris = [];
	var elmts = [];
	var i = 0;
	while ((tableRow = tableRows.iterateNext())) {
		var anchor = doc.evaluate('./a', tableRow, null, XPathResult.ANY_TYPE, null).iterateNext();
		if (!anchor) continue;
		var m = javaScriptRe.exec(anchor.href);
		var article = doc.evaluate('./b/text()|./strong/text()', anchor, null,
			XPathResult.ANY_TYPE, null).iterateNext();
		if (!m || !article) continue; // no link target or no title text

		// NOTE(review): the path and sw_aep prefix below are hard-coded legacy
		// values carried over unchanged — confirm before relying on them.
		uris[i] = host + "/itw/infomark/192/215/90714844w6" + m[1] + "?sw_aep=olr_wad" + m[2];

		var label = article.nodeValue;
		// Chop off final period
		if (label.substr(label.length - 1) == ".") {
			label = label.substr(0, label.length - 1);
		}
		items[i] = label;
		elmts[i] = doc.evaluate(".//comment()", tableRow, null, XPathResult.ANY_TYPE, null);
		i++;
	}
	Zotero.selectItems(items, function (selected) {
		if (!selected) {
			return true;
		}
		for (var key in selected) {
			extractCitation(uris[key], elmts[key], selected[key]);
		}
	});
}