Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
translators/Stuff.co.nz.js
Find file
Copy path
Fetching contributors…
Cannot retrieve contributors at this time
{
	"translatorID": "631ff0c7-2e64-4279-a9c9-ad9518d40f2b",
	"label": "Stuff.co.nz",
	"creator": "Sopheak Hean (University of Waikato, Faculty of Education)",
	"target": "^https?://(www\\.)?stuff\\.co\\.nz/",
	"minVersion": "2.1.9",
	"maxVersion": "",
	"priority": 100,
	"inRepository": true,
	"translatorType": 4,
	"browserSupport": "gcsibv",
	"lastUpdated": "2014-08-23 05:56:33"
}
/*
	Stuff.co.nz Translator - Parses Stuff.co.nz articles and creates Zotero-based metadata
	Copyright (C) 2010 Sopheak Hean, University of Waikato, Faculty of Education

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* Stuff.co.nz does not have an ISSN because it is not a newspaper publisher.
   Stuff.co.nz is a collection of newspaper articles from around the country. */
/**
 * Decides which Zotero item type the current page should be saved as.
 *
 * The page is probed for two marker elements, in order: a
 * div with class "blog_content" (blog post) and a div with class
 * "story_landing" (newspaper article).
 *
 * @param {Document} doc - The page being inspected.
 * @param {string} url - The page URL (not used by the detection itself).
 * @returns {string|undefined} "blogPost", "newspaperArticle", or undefined
 *     when neither marker element is present.
 */
function detectWeb(doc, url) {
	// Order matters: the blog marker is checked before the article marker.
	var markers = [
		['//div[@class="blog_content"]', "blogPost"],
		['//div[@class="story_landing"]', "newspaperArticle"]
	];
	for (var i = 0; i < markers.length; i++) {
		var node = doc.evaluate(markers[i][0], doc, null, XPathResult.ANY_TYPE, null).iterateNext();
		if (node) {
			return markers[i][1];
		}
	}
	// Falls through with undefined when the page is neither kind.
}
/**
 * Returns the first node matched by an XPath expression evaluated against
 * the whole document (whatever iterateNext() yields when nothing matches).
 */
function stuffFirstNode(doc, xpath) {
	return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
}

/**
 * Normalises a name such as "DANYA LEVY" to "Danya Levy": every
 * whitespace-separated word gets an upper-case first character and a
 * lower-case remainder. Returns "" for empty or whitespace-only input.
 */
function stuffTitleCaseName(name) {
	var words = name.trim().split(/\s+/);
	var cased = [];
	for (var w = 0; w < words.length; w++) {
		if (words[w]) {
			cased.push(words[w].substring(0, 1).toUpperCase() + words[w].substring(1).toLowerCase());
		}
	}
	return cased.join(" ");
}

/**
 * Extracts the publication name from a story credit of the form
 * "By <author> - <publication>"; falls back to "Stuff.co.nz" when the credit
 * has no such suffix (or the suffix is empty).
 */
function stuffPublicationTitle(credit) {
	if (credit.match(/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g)) {
		var publisher = credit
			.replace(/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g, '')
			.replace(/^\s*|\s*$/g, '');
		if (publisher) {
			return publisher;
		}
	}
	return "Stuff.co.nz";
}

/**
 * Turns the text of a story credit into a list of Zotero creators.
 *
 * Credits that carry a leading "By" or a " - <publication>" suffix are
 * stripped of those parts, split on "and"/", " into individual names, have
 * any trailing " of <affiliation>" removed (single-author case) and are
 * title-cased. Any other credit is passed on as a single author.
 */
function stuffArticleAuthors(credit) {
	var creators = [];
	var name;
	if (credit.match(/[\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*|^\s+\bBy\s*/g)) {
		// Drop the " - Publication" suffix, ".co.nz"/".com" fragments and
		// "-suffix" tails, then the leading "By".
		var authorString = credit
			.replace(/([\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*)|\b\.co\.nz|\b\.com|(-[a-zA-Z0-9]*)/g, '')
			.replace(/^\s+\bBy\s*|^\s+\bBY\s*/g, '');
		if (authorString.match(/\W\band\W+/g)) {
			// "A, B and C" -> ["A", "B", "C"]
			var names = authorString.replace(/\W\band\W+/g, ', ').split(", ");
			for (var n = 0; n < names.length; n++) {
				name = stuffTitleCaseName(names[n]);
				if (name) {
					creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
				}
			}
		} else {
			// "John Smith of The Press" -> "John Smith"; a no-op without " of ".
			name = stuffTitleCaseName(authorString.replace(/\W\bof\W(.*)/g, ''));
			if (name) {
				creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
			}
		}
	} else if (credit.match(/[\s\n\r]+/g)) {
		name = ZU.capitalizeTitle(credit.trim(), true);
		if (name) {
			creators.push(ZU.cleanAuthor(name, "author"));
		}
	} else if (credit) {
		creators.push(Zotero.Utilities.cleanAuthor(credit, "author"));
	}
	return creators;
}

/**
 * Fills in item.section (and item.place where the section implies one) from
 * the page's navigation links.
 */
function stuffApplySection(doc, item) {
	var nzNavSections = ["National", "Crime", "Education", "Health", "Politics", "Environment", "Business"];
	// NOTE(review): "Opinion" is grouped with the sports here, as in the
	// original mapping — confirm that this is intended.
	var sportNavSections = ["Opinion", "Rugby", "Soccer", "Cricket", "Basketball", "Fishing",
		"League", "Scoreboard", "Football", "Golf", "Motorsport", "Netball", "Tennis"];

	var currentNode = stuffFirstNode(doc, '//li/a[@class="current"]');
	var navNode = stuffFirstNode(doc, '//li[@class="current_nav_item"]/a');
	var navText;

	if (currentNode) {
		var currentText = currentNode.textContent;
		var midNode = stuffFirstNode(doc, '//li[@class="mid_nav_item"]/a');
		if (midNode) {
			var midText = midNode.textContent;
			if (["National", "Business", "Sport", "Politics"].indexOf(midText) !== -1) {
				item.place = "New Zealand";
				item.section = currentText;
			} else if (midText === "World") {
				item.place = "World";
				item.section = currentText;
			} else {
				item.section = midText;
			}
		}
		// Check the node itself (not the XPath string) so that a page without
		// this nav item does not throw on .textContent.
		if (navNode) {
			navText = navNode.textContent;
			if (nzNavSections.indexOf(navText) !== -1) {
				item.place = "New Zealand";
				item.section = currentText;
			} else if (sportNavSections.indexOf(navText) !== -1) {
				item.section = "Sport";
			} else {
				item.section = navText;
			}
		}
	} else if (navNode) {
		navText = navNode.textContent;
		if (nzNavSections.indexOf(navText) !== -1) {
			item.place = "New Zealand";
		}
		item.section = navText;
	}
}

/** Builds and completes a blogPost item for a Stuff.co.nz blog page. */
function stuffScrapeBlogPost(doc, url) {
	var newItem = new Zotero.Item('blogPost');
	newItem.url = doc.location.href;
	newItem.publicationTitle = "Stuff.co.nz";
	newItem.language = "English";

	// Author extraction is best-effort: on any failure the item is saved
	// without creators rather than aborting the whole save.
	try {
		var authorNode = stuffFirstNode(doc, "//div[@id='left_col']/span");
		if (authorNode) {
			// Strip a trailing " - <suffix>" and the leading "By ".
			var byline = authorNode.textContent
				.replace(/([\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*)/g, '')
				.replace(/\bBy \b/g, '');
			var blogger = stuffTitleCaseName(byline);
			if (blogger) {
				newItem.creators.push(Zotero.Utilities.cleanAuthor(blogger, "author"));
			}
		}
	} catch (err) {
		Zotero.debug("Stuff.co.nz: could not read blog author: " + err);
		newItem.creators = [];
	}

	// The blog's own name is taken from the /blogs/<slug>/ part of the URL.
	var blogTitle = url.match(/\/blogs\/([^/]+)/);
	if (blogTitle) {
		newItem.blogTitle = ZU.capitalizeTitle(blogTitle[1].replace(/-/g, ' '), true);
	}
	newItem.shortTitle = doShortTitle(doc, url);
	newItem.title = doTitle(doc, url);
	newItem.date = doDate(doc, url);
	newItem.abstractNote = doAbstract(doc, url);
	newItem.websiteType = "Newspaper";
	newItem.attachments.push({document: doc, title: "Stuff.co.nz Snapshot"});
	newItem.complete();
}

/** Builds and completes a newspaperArticle item for a Stuff.co.nz story page. */
function stuffScrapeArticle(doc, url) {
	var newItem = new Zotero.Item('newspaperArticle');
	newItem.url = url;

	// The story credit holds both the author(s) and, optionally, the
	// originating publication.
	var creditNode = stuffFirstNode(doc, '//span[@class="storycredit"]');
	var credit = creditNode ? creditNode.textContent : null;

	newItem.publicationTitle = credit === null ? "Stuff.co.nz" : stuffPublicationTitle(credit);
	newItem.language = "English";
	newItem.shortTitle = doShortTitle(doc, url);
	newItem.abstractNote = doAbstract(doc, url);
	newItem.creators = credit === null ? [] : stuffArticleAuthors(credit);
	newItem.title = doTitle(doc, url);
	stuffApplySection(doc, newItem);
	newItem.attachments.push({document: doc, title: "Stuff.co.nz Snapshot"});
	newItem.date = doDate(doc, url);
	newItem.complete();
}

/**
 * Scrapes the current page into a Zotero item and completes it.
 *
 * The item type comes from detectWeb(); pages that are neither a blog post
 * nor a newspaper article produce no item.
 *
 * @param {Document} doc - The page to scrape.
 * @param {string} url - The page URL.
 */
function scrape(doc, url) {
	var type = detectWeb(doc, url);
	if (type == "blogPost") {
		stuffScrapeBlogPost(doc, url);
	} else if (type == "newspaperArticle") {
		stuffScrapeArticle(doc, url);
	}
}
/**
 * Reads the short title of the page: the text of the first h2 directly
 * under the div with id "left_col", with surrounding whitespace removed.
 *
 * @param {Document} doc - The page to read from.
 * @param {string} url - The page URL (unused).
 * @returns {string} The trimmed heading text, or "" when there is no such h2.
 */
function doShortTitle(doc, url) {
	var heading = doc.evaluate('//div[@id="left_col"]/h2', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	if (!heading) {
		return "";
	}
	return heading.textContent.replace(/^\s*|\s*$/g, '');
}
/**
 * Reads the page abstract from the first <meta name="description"> element.
 *
 * @param {Document} doc - The page to read from.
 * @param {string} url - The page URL (unused).
 * @returns {string} The element's `content` value, or "" when the element is
 *     missing or has no content.
 */
function doAbstract(doc, url) {
	var meta = doc.evaluate("//meta[@name='description']", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	// Guard against a meta element without a content value so callers always
	// receive a string.
	if (meta && meta.content) {
		return meta.content;
	}
	return "";
}
/**
 * Reads the article title: the text of the first h1 anywhere inside the div
 * with id "left_col", with surrounding whitespace removed.
 *
 * @param {Document} doc - The page to read from.
 * @param {string} url - The page URL (unused).
 * @returns {string} The trimmed title, or "" when there is no such h1.
 */
function doTitle(doc, url) {
	var heading = doc.evaluate('//div[@id="left_col"]//h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	if (!heading) {
		return "";
	}
	return heading.textContent.replace(/^\s*|\s*$/g, '');
}
/**
 * Reads the article date from the third div inside the element with id
 * "toolbox" and reformats it from "dd/mm/yyyy" (optionally preceded by
 * "Last updated hh:mm ") to "Mon dd, yyyy", e.g. "Nov 15, 2011".
 *
 * @param {Document} doc - The page to read from.
 * @param {string} url - The page URL (unused).
 * @returns {string} The reformatted date, or "" when the element is missing
 *     or its text is not a recognisable d/m/y date.
 */
function doDate(doc, url) {
	// "July" is intentionally left unabbreviated so that the output for July
	// dates stays what it has always been.
	var monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", "Aug", "Sep", "Oct", "Nov", "Dec"];
	var dateNode = doc.evaluate("//div[@id='toolbox']/div[3]", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	if (!dateNode) {
		return "";
	}
	try {
		var dateText = dateNode.textContent.replace(/\b(Last updated )\d{0,9}:\d{0,9} /g, '');
		var parts = dateText.split('/');
		if (parts.length < 3) {
			return "";
		}
		var day = parts[0].trim();
		var year = parts[2].trim();
		// Only a one- or two-digit month number in 1..12 is accepted; anything
		// else yields "" instead of a string containing "undefined".
		var monthMatch = /^(\d{1,2})$/.exec(parts[1].trim());
		if (!monthMatch) {
			return "";
		}
		var monthIndex = parseInt(monthMatch[1], 10) - 1;
		if (monthIndex < 0 || monthIndex >= monthNames.length) {
			return "";
		}
		return monthNames[monthIndex] + " " + day + ", " + year;
	} catch (err) {
		// Best effort: an unreadable date node yields an empty date.
		return "";
	}
}
/**
 * Translator entry point for saving the current page: hands the page
 * straight to scrape().
 *
 * @param {Document} doc - The page to save.
 * @param {string} url - The page URL.
 */
function doWeb(doc, url) {
	scrape(doc, url);
}
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "http://www.stuff.co.nz/national/politics/campaign-trail/5967550/Green-party-link-to-billboard-attacks",
		"items": [
			{
				"itemType": "newspaperArticle",
				"title": "Green party link to billboard attacks",
				"creators": [
					{
						"firstName": "Danya",
						"lastName": "Levy",
						"creatorType": "author"
					}
				],
				"date": "Nov 15, 2011",
				"abstractNote": "The man who coordinated the vandalism of 700 National billboards says it was an attempt at \"freedom of expression.\"",
				"language": "English",
				"libraryCatalog": "Stuff.co.nz",
				"place": "New Zealand",
				"publicationTitle": "Stuff.co.nz",
				"section": "Politics",
				"url": "http://www.stuff.co.nz/national/politics/campaign-trail/5967550/Green-party-link-to-billboard-attacks",
				"attachments": [
					{
						"title": "Stuff.co.nz Snapshot"
					}
				],
				"tags": [],
				"notes": [],
				"seeAlso": []
			}
		]
	}
]
/** END TEST CASES **/