Skip to content
Permalink
Browse files

AIAA - new webpage - rewrite based on ASCE;

Amazon - fix lists;
CLASE - fix xpaths
EEBO - remove date after author
Google Books - fix certain search results
JSTOR - fix basic search results
JT Online - fix search results/target regex
Archives Canada - add bookmarklet support
  • Loading branch information...
adam3smith committed Dec 8, 2013
1 parent 64edb33 commit 12f42c950779e1a2811147db9f627ea93212dcba
Showing with 237 additions and 110 deletions.
  1. +2 −2 Amazon.com.js
  2. +179 −77 American Institute of Aeronautics and Astronautics.js
  3. +2 −2 Archives Canada.js
  4. +14 −15 CLASE.js
  5. +6 −1 Early English Books Online.js
  6. +21 −4 Google Books.js
  7. +6 −4 JSTOR.js
  8. +7 −5 Japan Times Online.js
@@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsbv",
"lastUpdated": "2013-12-02 17:14:39"
"lastUpdated": "2013-12-07 09:15:44"
}

// Matches Amazon search-result, wishlist/registry, list ("lm") and richpub URLs.
// Backslashes are doubled because the pattern is built from a string literal:
// a single "\." inside quotes collapses to ".", which would match ANY character
// instead of a literal dot after "www" and "amazon".
var searchRe = new RegExp('^https?://(?:www\\.)?amazon\\.([^/]+)/(gp/search/|(gp/)?registry/(wishlist|registry)|exec/obidos/search-handle-url/|s/|s\\?|[^/]+/lm/|gp/richpub/)');
@@ -52,7 +52,7 @@ function doWeb(doc, url) {
} else if (doc.location.href.match(/\/lm\//)) { // Show selector for Lists
var xpath = '//span[@id="lm_asinlink95"]//a'
} else { // Show selector for Search results
var xpath = '//div[@class="productTitle"]/a |//div[@id="init-container"]//span[@class="small productTitle"]//a | //div[@class="wedding" or @class="list-items"]//span[@class="small productTitle"]//a |//a[span[@class="srTitle"]] | //div[@class="title"]/a[@class="title"]| //h3[@class="title"]/a[@class="title"] | //h3[@class="newaps"]/a';
var xpath = '//div[@class="productTitle"]/a |//div[@id="init-container"]//span[@class="small productTitle"]//a | //div[@class="wedding" or @class="list-items"]//span[@class="small productTitle"]//a |//a[span[@class="srTitle"]] | //div[@class="title"]/a[@class="title"]| //h3[@class="title"]/a[@class="title"] | //h3[@class="newaps"]/a|//div[@class="a-fixed-right-grid-inner"]//a';
}
var availableItems = {};
var links = ZU.xpath(doc, xpath);
@@ -2,104 +2,206 @@
"translatorID": "75edc5a1-6470-465a-a928-ccb77d95eb72",
"label": "American Institute of Aeronautics and Astronautics",
"creator": "Michael Berkowitz",
"target": "^https?://www\\.aiaa\\.org/",
"minVersion": "1.0.0b4.r5",
"target": "^https?://arc\\.aiaa\\.org/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "g",
"lastUpdated": "2011-10-20 14:11:45"
"lastUpdated": "2013-12-07 20:26:26"
}

/*
AIAA Translator
Copyright (C) 2013 Sebastian Karcher
Based on ASCE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

function detectWeb(doc, url) {
if (doc.evaluate('//td/div[@class="title"]/b/div[@class="centerHeadlines"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
if (url.match(/\/doi\/abs\/10\.|\/doi\/full\/10\./)) {
return "journalArticle";
} else if(url.match(/\/action\/doSearch\?|\/toc\//))
{
return "multiple";
}
}


function doWeb(doc, url) {
var n = doc.documentElement.namespaceURI;
var ns = n ? function(prefix) {
if (prefix == 'x') return n; else return null;
} : null;

var items = new Object();
var oldItems = doc.evaluate('//table/tbody/tr/td[div[@class="title"]]', doc, ns, XPathResult.ANY_TYPE, null);
var nextItem;
while (nextItem = oldItems.iterateNext()) {
var data = new Object();
data['title'] = Zotero.Utilities.trimInternal(doc.evaluate('./div[@class="title"]//div[@class="centerHeadlines"]', nextItem, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
data['pages'] = Zotero.Utilities.trimInternal(doc.evaluate('./div[@class="title"]//div[@class="centerHeadlinesSub2"]', nextItem, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent.match(/[\d\w]+\-[\d\w]+/)[0]);
data['authors'] = Zotero.Utilities.trimInternal(doc.evaluate('./ul/i', nextItem, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
var extra = Zotero.Utilities.trimInternal(doc.evaluate('./ul', nextItem, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
var extra = extra.replace(data['authors'], "");
data['extra'] = Zotero.Utilities.trimInternal(extra);
var pdf = doc.evaluate('.//a', nextItem, ns, XPathResult.ANY_TYPE, null).iterateNext().href;
Zotero.debug(pdf);
data['pdfurl'] = pdf;
items[data['title']] = data;
}
var volume;
var issue;
var date;
if (doc.evaluate('//td[2]/table/tbody/tr/td[1]/strong', doc, ns, XPathResult.ANY_TYPE, null).iterateNext()) {
var voliss = Zotero.Utilities.trimInternal(doc.evaluate('//td[2]/table/tbody/tr/td[1]/strong', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
voliss = voliss.match(/(\d+)\s+vol\.\s*(\d+)\s+no\.\s*(\d+)/);
volume = voliss[2];
issue = voliss[3];
date = voliss[1];
} else if (doc.evaluate('//select', doc, ns, XPathResult.ANY_TYPE, null).iterateNext()) {
var voliss = Zotero.Utilities.trimInternal(doc.evaluate('//select[@name="volume"]/option[@selected]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
var issue = Zotero.Utilities.trimInternal(doc.evaluate('//select[@name="issue"]/option[@selected]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
voliss = voliss.match(/vol\.\s*(\d+)\s*\-\s*(\d+)/);
volume = voliss[1];
date = voliss[2];
}
if (doc.evaluate('//tr[1]/td/b/div[@class="centerHeadlines"]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext()) {
var journal = Zotero.Utilities.trimInternal(doc.evaluate('//tr[1]/td/b/div[@class="centerHeadlines"]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
var ISSN = Zotero.Utilities.trimInternal(doc.evaluate('//tr[1]/td/font[@class="centerHeadlinesSub2"]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/(\(|\))/g, ""));
} else if (doc.evaluate('//div[@class="centerHeadlinesTitle"]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext()) {
var journal = Zotero.Utilities.trimInternal(doc.evaluate('//div[@class="centerHeadlinesTitle"]', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent);
var ISSN = Zotero.Utilities.trimInternal(doc.evaluate('//tr/td[1]/table/tbody/tr[2]/td/div', doc, ns, XPathResult.ANY_TYPE, null).iterateNext().textContent.match(/ISSN\s*([\d\-]+)/)[1]);
if (detectWeb(doc, url) == "multiple") {
var items = new Object();
var rows = ZU.xpath(doc, '//table[@class="articleEntry"]');
var doi;
var title;
for(var i=0, n=rows.length; i<n; i++) {
doi = ZU.xpathText(rows[i], './/a[contains(@href, "/doi/abs/10.")]/@href') //.match(/10\..+/)
//Z.debug(doi)
title = ZU.xpathText(rows[i], './/div[@class="art_title"]')
if(doi && title) {
items[doi.match(/10\.[^\?]+/)[0]] = title;
}
}
//Z.debug(items)
Zotero.selectItems(items, function(selectedItems){
if(!selectedItems) return true;

var dois = new Array();
for (var i in selectedItems) {
dois.push(i);
}
scrape(null, url,dois);
});
} else {
var doi = url.match(/\/doi\/(?:abs|full)\/(10\.[^?#]+)/);
scrape(doc, url,[doi[1]]);
}
var searchItems = new Array();
for (var i in items) {
searchItems.push(i);
}

function finalizeItem(item, doc, doi, baseUrl) {
var pdfurl = '/doi/pdf/';
var absurl = '/doi/abs/';

//add attachments
item.attachments = [{
title: 'AIAA Full Text PDF',
url: pdfurl + doi,
mimeType: 'application/pdf'
}];
if(doc) {
item.attachments.push({
title: 'AIAA Snapshot',
document: doc
});
} else {
item.attachments.push({
title: 'AIAA Snapshot',
url: item.url || absurl + doi,
mimeType: 'text/html'
});
}

searchItems = Zotero.selectItems(searchItems);
for (var i in items) {
for each (var title in searchItems) {
if (i == title) {
var data = items[i];
var item = new Zotero.Item("journalArticle");
item.volume = volume;
item.issue = issue;
item.date = date;
item.title = data['title'];
item.pages = data['pages'];
item.publicationTitle = Zotero.Utilities.capitalizeTitle(journal);
item.ISSN = ISSN;
if (data['authors'].match(/\w+/)) {
var authors = data['authors'].split(/(\band\b|,|;)/);
for each (var aut in authors) {
if (aut.match(/\w+/) && aut != "and") {
item.creators.push(Zotero.Utilities.cleanAuthor(aut, "author"));
}
}
}
item.attachments = [{url:data['pdfurl'], title:"AIAA PDF (first page)", mimeType:"application/pdf"}];
item.complete();
}
}
item.complete();
}

/**
 * Downloads citation data for each DOI from the site's citation-download
 * endpoint and passes the resulting item to finalizeItem().
 *
 * Two POST requests are made per DOI: the BibTeX export supplies the main
 * item, then the RIS export is fetched so that publisher, ISSN and ISBN
 * (missing from the BibTeX data) can be copied onto that item.
 *
 * @param {Document|null} doc  Page document; only forwarded to finalizeItem().
 * @param {String} url         Page URL (not used inside this function).
 * @param {String[]} dois      DOIs to look up, inserted verbatim into the POST body.
 */
function scrape(doc, url, dois) {
	var postUrl = '/action/downloadCitation';
	var postBody = 'downloadFileName=citation&' +
		'direct=true&' +
		'include=abs&' +
		'doi=';
	var risFormat = '&format=ris';
	var bibtexFormat = '&format=bibtex';

	for (var i = 0, n = dois.length; i < n; i++) {
		// The IIFE gives every callback chain its own `doi` binding; with a
		// plain `var` loop all callbacks would otherwise see the last DOI.
		(function(doi) {
			ZU.doPost(postUrl, postBody + doi + bibtexFormat, function(text) {
				var translator = Zotero.loadTranslator("import");
				// Use BibTeX translator
				translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
				translator.setString(text);
				translator.setHandler("itemDone", function(obj, item) {
					item.bookTitle = item.publicationTitle;
					// unfortunately, bibtex is missing some data
					// publisher, ISSN/ISBN
					ZU.doPost(postUrl, postBody + doi + risFormat, function(risText) {
						// Declared with `var`: an undeclared assignment would leak a
						// global shared by all DOIs and throws in strict mode.
						var risTrans = Zotero.loadTranslator("import");
						// Import translator used for the RIS response
						risTrans.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
						risTrans.setString(risText);
						risTrans.setHandler("itemDone", function(obj, risItem) {
							item.publisher = risItem.publisher;
							item.ISSN = risItem.ISSN;
							item.ISBN = risItem.ISBN;
							finalizeItem(item, doc, doi);
						});
						risTrans.translate();
					});
				});
				translator.translate();
			});
		})(dois[i]);
	}
}/** BEGIN TEST CASES **/
}


/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.aiaa.org/content.cfm?pageid=322&lupubid=2",
"url": "http://arc.aiaa.org/action/doSearch?searchText=titanium",
"items": "multiple"
},
{
"type": "web",
"url": "http://arc.aiaa.org/doi/abs/10.2514/1.T3744?prevSearch=&searchHistoryKey=",
"items": [
{
"itemType": "journalArticle",
"creators": [
{
"firstName": "Songping",
"lastName": "Mo",
"creatorType": "author"
},
{
"firstName": "Ying",
"lastName": "Chen",
"creatorType": "author"
},
{
"firstName": "Xing",
"lastName": "Li",
"creatorType": "author"
},
{
"firstName": "Lisi",
"lastName": "Jia",
"creatorType": "author"
}
],
"notes": [],
"tags": [],
"seeAlso": [],
"attachments": [
{
"title": "Full Text PDF",
"mimeType": "application/pdf"
},
{
"title": "Snapshot"
}
],
"itemID": "doi:10.2514/1.T3744",
"title": "Solidification Characteristics of Titania Nanofluids",
"publicationTitle": "Journal of Thermophysics and Heat Transfer",
"volume": "26",
"issue": "1",
"pages": "192-196",
"date": "2012",
"DOI": "10.2514/1.T3744",
"url": "http://arc.aiaa.org/doi/abs/10.2514/1.T3744",
"bookTitle": "Journal of Thermophysics and Heat Transfer",
"publisher": "American Institute of Aeronautics and Astronautics",
"ISSN": "0887-8722",
"libraryCatalog": "American Institute of Aeronautics and Astronautics",
"accessDate": "CURRENT_TIMESTAMP"
}
]
}
]
/** END TEST CASES **/
@@ -8,8 +8,8 @@
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcs",
"lastUpdated": "2013-02-10 13:25:50"
"browserSupport": "gcsb",
"lastUpdated": "2013-12-07 15:18:12"
}

function detectWeb (doc, url) {
@@ -9,13 +9,12 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2013-06-19 09:52:11"
"lastUpdated": "2013-12-07 15:15:58"
}

/**
 * Classifies the current page from the "func" marker in its URL.
 *
 * @param {Document} doc  Page document (not inspected here).
 * @param {String} url    Page URL.
 * @returns {String|undefined} "journalArticle" for a full-record page,
 *     "multiple" for short/find result pages, otherwise undefined.
 */
function detectWeb(doc, url) {
	// Full-record pages take precedence over result-list markers
	var isFullRecord = url.indexOf("func=full-set") > -1;
	if (isFullRecord) {
		return "journalArticle";
	}

	var isResultList = url.indexOf("func=short") > -1 || url.indexOf("func=find") > -1;
	if (isResultList) {
		return "multiple";
	}
}

function cleanAuthorstring(author) {
@@ -26,15 +25,15 @@ function cleanAuthorstring(author) {

function scrape(doc, url) {
var newItem = new Zotero.Item("journalArticle");
var title = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Título")]/following-sibling::td');
var publication = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Revista")]/following-sibling::td');
var date = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Año de la revista")]/following-sibling::td');
var ISSN = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "ISSN")]/following-sibling::td');
var language = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Idioma") and not(contains(text(), "resumen"))]/following-sibling::td');
var abstract = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Resumen")]/following-sibling::td');
var fulltext = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Texto completo")]/following-sibling::td');
var title = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Título")]/following-sibling::td');
var publication = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Revista")]/following-sibling::td');
var date = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Año de la revista")]/following-sibling::td');
var ISSN = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "ISSN")]/following-sibling::td');
var language = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Idioma") and not(contains(text(), "resumen"))]/following-sibling::td');
var abstract = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Resumen")]/following-sibling::td');
var fulltext = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Texto completo")]/following-sibling::td');
//Descripción field has pages, issue and volume
var description = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Descripción")]/following-sibling::td');
var description = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Descripción")]/following-sibling::td');
if (description) {
var volume = description.match(/V([^\s]+)/);
var issue = description.match(/N([^\s]+)/);
@@ -44,7 +43,7 @@ function scrape(doc, url) {

//Authors and Tags can have multiple rows. In that case the td[1] remains empty we loop through them until that's no longer the case

var author1 = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Autor")]/following-sibling::td');
var author1 = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Autor")]/following-sibling::td');
if (author1) newItem.creators.push(ZU.cleanAuthor(cleanAuthorstring(author1), "author", true))
var authorloop = ZU.xpath(doc, '//tr[td[@id="bold" and contains(text(), "Autor")]]/following-sibling::tr/td[1]')
var author;
@@ -58,9 +57,9 @@ function scrape(doc, url) {
}


var tag1 = ZU.xpathText(doc, '//tr/td[@id="bold" and contains(text(), "Palabra Clave")]/following-sibling::td');
var tag1 = ZU.xpathText(doc, '//tr/th[contains(@class, "txtLeft") and contains(text(), "Palabra Clave")]/following-sibling::td');
if (tag1) newItem.tags.push(tag1.trim())
var tagloop = ZU.xpath(doc, '//tr[td[@id="bold" and contains(text(), "Palabra Clave")]]/following-sibling::tr/td[1]')
var tagloop = ZU.xpath(doc, '//tr[th[contains(@class, "txtLeft") and contains(text(), "Palabra Clave")]]/following-sibling::tr/td[1]')
var tag;
for (var i in tagloop) {
if (tagloop[i].textContent.search(/[^\s]/) == -1) {
@@ -112,7 +111,7 @@ function doWeb(doc, url) {
var items = {};
if (detectWeb(doc, url) == "multiple") {
//multiple detection is enabled in detectWeb for "func=short" and "func=find" URLs
var titles = doc.evaluate('//td/strong/a[contains(@href, "func=full-set-set")]', doc, null, XPathResult.ANY_TYPE, null);
var titles = doc.evaluate('//tr/td/a[contains(@href, "func=full-set-set")][2]', doc, null, XPathResult.ANY_TYPE, null);
var next_title;
while (next_title = titles.iterateNext()) {
items[next_title.href] = next_title.textContent;
Oops, something went wrong.

0 comments on commit 12f42c9

Please sign in to comment.
You can’t perform that action at this time.