Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
translators/The Boston Globe.js
Find file
Copy path
Fetching contributors…
Cannot retrieve contributors at this time
{ | |
"translatorID": "1f245496-4c1b-406a-8641-d286b3888231", | |
"label": "The Boston Globe", | |
"creator": "Adam Crymble, Frank Bennett, Sebastian Karcher", | |
"target": "^https?://(www|search|articles|archive)\\.boston\\.com/", | |
"minVersion": "2.1.9", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": true, | |
"translatorType": 4, | |
"browserSupport": "gcsibv", | |
"lastUpdated": "2017-06-29 18:49:20" | |
} | |
/* | |
* Sample URLs | |
* | |
* [Original request -- uncommon page format, no embedded metadata of any kind] | |
* http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant | |
* | |
* [More common page formats, marginally reliable metadata in a comment block] | |
* http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html | |
* http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/ | |
* http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/ | |
* Support for search results will require rewriting scrape(..) to use only regular expressions | |
*/ | |
/**
 * Decide whether the current page can be saved by this translator.
 *
 * Search-result pages (search.boston.com) are deliberately reported as
 * unsupported: search is disabled until cross-domain requests can be dealt
 * with, so the "multiple" branch of doWeb() stays dormant. When search is
 * re-enabled, result links can be detected with
 * //div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"].
 *
 * Any other URL that carries a date, either as a /YYYY/MM/ path segment or
 * as a YYYY-MM-DD token, is treated as a single article.
 *
 * @param {Document} doc - The page document (not inspected at present).
 * @param {string} url - The address of the page.
 * @returns {string|boolean} "newspaperArticle" for article pages, otherwise false.
 */
function detectWeb(doc, url) {
	// Dots are escaped so that only the literal host name matches.
	if (/search\.boston\.com/.test(url)) {
		return false;
	}
	if (/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/.test(url)) {
		return "newspaperArticle";
	}
	return false;
}
//Boston Globe and Boston.com Translator. Original code by Adam Crymble | |
// Rewritten by Frank Bennett, 2011 | |
/**
 * Look for a comment node among the direct children of an element.
 *
 * @param {Node} elem - Element whose childNodes are inspected; may be falsy.
 * @returns {*} The falsy input itself when no element was given, the
 *   nodeValue of the first "#comment" child when one exists, otherwise false.
 */
function sniffComment (elem) {
	if (!elem) {
		return elem;
	}
	var kids = elem.childNodes;
	var total = kids.length;
	var idx = 0;
	while (idx < total) {
		var kid = kids[idx];
		if (kid.nodeName === "#comment") {
			return kid.nodeValue;
		}
		idx += 1;
	}
	return false;
}
/**
 * Locate the comment text that carries the page's embedded metadata.
 *
 * Every element with class "hideMe" is checked in document order; the first
 * one that yields a truthy comment wins. If none does, the element with id
 * "content" is checked instead and whatever sniffComment() reports for it
 * is returned.
 *
 * @param {Document} doc - The page document.
 * @returns {*} The comment text, or the falsy result of the final lookup.
 */
function findMagicComment (doc) {
	var hidden = doc.getElementsByClassName("hideMe");
	var count = hidden.length;
	var idx = 0;
	while (idx < count) {
		var found = sniffComment(hidden.item(idx));
		if (found) {
			return found;
		}
		idx += 1;
	}
	return sniffComment(doc.getElementById("content"));
}
/**
 * Read the raw author string from the page byline.
 *
 * The first element with class "byline" is preferred; the element with id
 * "byline" is the fallback. Newlines in its text become spaces and a leading
 * "By " / "Posted by " prefix is dropped. A "Posted by" byline also switches
 * newItem.itemType to "blogPost".
 *
 * @param {Document} doc - The page document.
 * @param {Object} newItem - Item under construction; its itemType may be changed.
 * @returns {string} The byline text, or "" when the page has no byline element.
 */
function findAuthorString (doc, newItem) {
	var candidates = doc.getElementsByClassName("byline");
	var source = candidates.length ? candidates.item(0) : false;
	if (!source) {
		source = doc.getElementById('byline');
	}
	if (!source) {
		return "";
	}
	var text = source.textContent.replace(/\n/g, " ");
	if (/[Pp]osted\s+by\s+/.test(text)) {
		newItem.itemType = "blogPost";
	}
	return text.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
}
/**
 * Create a newspaperArticle item pre-filled with the fields both page
 * layouts share: publication title, URL and a snapshot attachment.
 */
function createBostonItem (doc) {
	var item = new Zotero.Item("newspaperArticle");
	item.publicationTitle = "Boston.com";
	item.url = doc.location.href;
	item.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
	return item;
}

/**
 * Return the first capture group of `pattern` in `text`, or "" when the
 * pattern does not match at all.
 */
function firstCapture (text, pattern) {
	var m = text.match(pattern);
	return m ? m[1] : "";
}

/**
 * Save an item for pages whose metadata is embedded in a comment block.
 */
function scrapeMagicComment (doc, magicComment) {
	var newItem = createBostonItem(doc);
	// Now try to get some citation details (go ahead, try).
	// Flatten the comment to one line so the tag patterns below, whose "."
	// does not cross newlines, can see the whole block. A regex with the g
	// flag is required: the three-argument string form of replace() only
	// removes the first newline in current engines.
	var info = magicComment.replace(/\n/g, "");
	newItem.title = ZU.xpathText(doc, '//div[@id="headTools"]/h1');
	newItem.date = ZU.xpathText(doc, '//span[@id="dateline"]/text()[2]');
	// The visible byline is preferred; the byline inside the comment is a
	// disaster and is used only if absolutely necessary.
	var authors = findAuthorString(doc, newItem);
	if (!authors) {
		authors = firstCapture(info, /.*<byline>(.*)<\/byline>.*/);
		// A missing or all-lowercase byline is not trusted; fall back to the
		// first word of the teaser text instead.
		if (authors.toLowerCase() === authors) {
			var teaser = firstCapture(info, /.*<teasetext>(.*)<\/teasetext>.*/);
			var m = teaser.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
			authors = m ? m[1] : "";
		}
	}
	// Normalise "A, B and C, Some Affiliation" into a plain list of names:
	// split on "and", drop whatever follows the first comma of the last
	// chunk, then re-split everything on commas.
	authors = authors.split(/,*\s+and\s+/);
	authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
	authors = authors.join(", ").split(/,\s+/);
	for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
		var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
		if (author.lastName) {
			newItem.creators.push(author);
		}
	}
	newItem.complete();
}

/**
 * Save an item for pages that carry a "mod-article-byline" information block.
 */
function scrapeInfoBlock (doc, infoElem) {
	var newItem = createBostonItem(doc);
	// Date: getElementsByClassName returns a collection, so the text has to
	// be read from its first member rather than from the collection itself.
	var dateElems = infoElem.getElementsByClassName('pubdate');
	if (dateElems.length) {
		newItem.date = dateElems[0].textContent;
	}
	// Authors: the lookup yields nothing when the block has no author span,
	// in which case the item is saved without creators.
	var authors = ZU.xpathText(infoElem, './span[@class="separator"]/following-sibling::span');
	if (authors) {
		// Strip a leading "By" and everything from the first comma onwards.
		authors = authors.replace(/^\s*[Bb]y|,.+?$/g, "").trim();
		var names = authors.split(/ and |\s*,\s*/);
		for (var i = 0, ilen = names.length; i < ilen; i += 1) {
			var author = ZU.cleanAuthor(names[i], "author");
			// Skip empty names, as the comment-based path does.
			if (author.lastName) {
				newItem.creators.push(author);
			}
		}
	}
	// Title
	var headerElem = doc.getElementById('mod-article-header');
	if (headerElem) {
		var h = headerElem.getElementsByTagName('h1');
		if (h.length) {
			newItem.title = h[0].textContent;
		}
	}
	newItem.complete();
}

/**
 * Extract citation data from a single article page and save it.
 *
 * The site content is pretty chaotic, we do our best. There are two
 * independent set-and-save paths: one for pages with metadata embedded in a
 * comment, one for pages with a "mod-article-byline" block. Each path that
 * applies completes its own item.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - The page address (unused; doc.location.href is read instead).
 */
function scrape (doc, url) {
	var magicComment = findMagicComment(doc);
	if (magicComment) {
		scrapeMagicComment(doc, magicComment);
	}
	var infoElem = doc.getElementById("mod-article-byline");
	if (infoElem) {
		scrapeInfoBlock(doc, infoElem);
	}
}
/**
 * Translator entry point.
 *
 * When detectWeb() reports "multiple", the result links on the page are
 * offered for selection and every chosen URL is handed to scrape() via
 * ZU.processDocuments. In every other case the current page itself is
 * scraped.
 *
 * @param {Document} doc - The current page.
 * @param {string} url - The address of the current page.
 */
function doWeb (doc, url) {
	var articles = [];
	if (detectWeb(doc, url) != "multiple") {
		scrape(doc, url);
		return;
	}
	// Map each result link's href to its visible title for the picker.
	var items = {};
	var links = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, null, XPathResult.ANY_TYPE, null);
	for (var link = links.iterateNext(); link; link = links.iterateNext()) {
		items[link.href] = link.textContent;
	}
	Zotero.selectItems(items, function (chosen) {
		// Nothing selected (or the dialog was cancelled).
		if (!chosen) {
			return true;
		}
		for (var href in chosen) {
			articles.push(href);
		}
		ZU.processDocuments(articles, scrape);
	});
}
/** BEGIN TEST CASES **/ | |
var testCases = [ | |
{ | |
"type": "web", | |
"url": "http://archive.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/?camp=pm", | |
"items": [ | |
{ | |
"itemType": "newspaperArticle", | |
"title": "Anticipation grows for MFA’s spring flower festival", | |
"creators": [ | |
{ | |
"firstName": "Carol", | |
"lastName": "Stocker", | |
"creatorType": "author" | |
} | |
], | |
"date": "April 28, 2011", | |
"libraryCatalog": "The Boston Globe", | |
"publicationTitle": "Boston.com", | |
"url": "http://archive.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/?camp=pm", | |
"attachments": [ | |
{ | |
"mimetype": "text/html", | |
"snapshot": true, | |
"title": "Boston.com page" | |
} | |
], | |
"tags": [], | |
"notes": [], | |
"seeAlso": [] | |
} | |
] | |
}, | |
{ | |
"type": "web", | |
"url": "http://archive.boston.com/news/nation/washington/articles/2011/05/08/bin_laden_occupied_shrunken_dark_world/", | |
"items": [ | |
{ | |
"itemType": "newspaperArticle", | |
"title": "A peek inside bin Laden’s world: isolation, vanity, power", | |
"creators": [ | |
{ | |
"firstName": "Elisabeth", | |
"lastName": "Bumiller", | |
"creatorType": "author" | |
}, | |
{ | |
"firstName": "Carlotta", | |
"lastName": "Gall", | |
"creatorType": "author" | |
} | |
], | |
"date": "May 8, 2011", | |
"libraryCatalog": "The Boston Globe", | |
"publicationTitle": "Boston.com", | |
"shortTitle": "A peek inside bin Laden’s world", | |
"url": "http://archive.boston.com/news/nation/washington/articles/2011/05/08/bin_laden_occupied_shrunken_dark_world/", | |
"attachments": [ | |
{ | |
"mimetype": "text/html", | |
"snapshot": true, | |
"title": "Boston.com page" | |
} | |
], | |
"tags": [], | |
"notes": [], | |
"seeAlso": [] | |
} | |
] | |
} | |
] | |
/** END TEST CASES **/ |