Permalink
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
Fetching contributors…
Cannot retrieve contributors at this time
{ | |
"translatorID": "312bbb0e-bfb6-4563-a33c-085445d391ed", | |
"label": "Die Zeit", | |
"creator": "Philipp Zumstein", | |
"target": "^https?://www\\.zeit\\.de/", | |
"minVersion": "3.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": true, | |
"translatorType": 4, | |
"browserSupport": "gcsibv", | |
"lastUpdated": "2018-11-25 19:35:23" | |
} | |
/* | |
***** BEGIN LICENSE BLOCK ***** | |
Copyright © 2015 Philipp Zumstein | |
This file is part of Zotero. | |
Zotero is free software: you can redistribute it and/or modify | |
it under the terms of the GNU Affero General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
Zotero is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU Affero General Public License for more details. | |
You should have received a copy of the GNU Affero General Public License | |
along with Zotero. If not, see <http://www.gnu.org/licenses/>. | |
***** END LICENSE BLOCK ***** | |
*/ | |
/* | |
This translator works only partially, because zeit.de uses some strange javascript that makes | |
processDocuments return an error. If I just call scrape(doc, url) on a single document, it works. | |
The way the translator is programmed now, it only works for multiples if JavaScript is turned off in the browser. | |
For example at | |
http://www.zeit.de/suche/index?q=Krise | |
only the first reference can be scraped. | |
*/ | |
function detectWeb(doc, url) { | |
var pageType = doc.body.getAttribute("data-page-type"); | |
if (pageType == "article") { | |
return "newspaperArticle"; | |
} else if (getSearchResults(doc, true)){ //ZU.xpath(doc, '//h4/a|//h2/a').length>0 | |
return "multiple"; | |
} | |
} | |
function getSearchResults(doc, checkOnly) { | |
var items = {}; | |
var found = false; | |
var rows = doc.getElementsByClassName('archiveteaser'); | |
if (rows.length === 0) { | |
rows = doc.getElementsByClassName('teaser-small__container'); | |
} | |
for (var i=0; i<rows.length; i++) { | |
var href = ZU.xpathText(rows[i], '(.//a/@href)[1]'); | |
var title = ZU.trimInternal( ZU.xpathText(rows[i], './/a/h4|.//a[span]') ); | |
if (!href || !title) continue; | |
if (checkOnly) return true; | |
found = true; | |
items[href] = title; | |
} | |
return found ? items : false; | |
} | |
function doWeb(doc, url) { | |
if (detectWeb(doc, url) == "multiple") { | |
Zotero.selectItems(getSearchResults(doc, false), function (items) { | |
if (!items) { | |
return true; | |
} | |
var articles = []; | |
for (var i in items) { | |
articles.push(i); | |
} | |
ZU.processDocuments(articles, scrape); | |
}); | |
} else { | |
scrape(doc, url); | |
} | |
} | |
function scrape(doc, url) { | |
var data = ZU.xpath(doc, '//script[@type="application/ld+json"]'); | |
let json; | |
let found = false; | |
for (let dataItem of data) { | |
json = JSON.parse(dataItem.textContent); | |
if (json["@type"] == "Article") { | |
found = true; | |
break; | |
} | |
} | |
if (!found) { | |
return; | |
} | |
// Z.debug(json); | |
var newItem = new Zotero.Item("newspaperArticle"); | |
newItem.url = url; | |
newItem.title = json.headline; | |
newItem.abstractNote = json.description; | |
if (json.datePublished) { | |
newItem.date = json.datePublished.replace(/T.+/, ""); | |
} | |
if (json.author) { | |
if (!Array.isArray(json.author)) { | |
json.author = [json.author]; | |
} | |
for (let author of json.author) { | |
if (author.name) { | |
newItem.creators.push(ZU.cleanAuthor(author.name, "author")); | |
} | |
} | |
} else { | |
var authorNode = ZU.xpath(doc, '//div[@class="byline"]'); | |
for (let i=0; i<authorNode.length; i++) { | |
var authorName = authorNode[i].textContent; | |
if (authorName){ | |
authorName = authorName.replace(/^\s*Von/, ''); | |
newItem.creators.push(ZU.cleanAuthor(authorName, "author")); | |
} | |
} | |
} | |
for (let i=0; i<newItem.creators.length; i++) { | |
if (newItem.creators[i].firstName === "") { | |
newItem.creators[i].fieldMode = 1; | |
delete newItem.creators[i].firstName; | |
} | |
} | |
var section = doc.getElementsByClassName("nav__ressorts-link--current"); | |
if (section.length > 0) { | |
newItem.section = section[0].textContent; | |
} | |
newItem.publicationTitle = "Die Zeit"; | |
newItem.ISSN = "0044-2070"; | |
newItem.language = "de-DE"; | |
newItem.place = "Hamburg"; | |
var keywordsString = json.keywords; | |
var keywords = keywordsString.split(','); | |
for (let i=0; i<keywords.length; i++) { | |
newItem.tags.push( | |
keywords[i].trim() | |
); | |
} | |
// if present, use the link to show the whole content on a single page | |
var snapshotNode = ZU.xpath(doc, '//li[@class="article-pager__all"]/a'); | |
var snapshotUrl = (snapshotNode.length > 0) ? snapshotNode[0].href : url; | |
newItem.attachments.push({ | |
url : snapshotUrl, | |
title : "Snapshot", | |
mimeType : "text/html" | |
}); | |
newItem.complete(); | |
} | |
/** BEGIN TEST CASES **/ | |
var testCases = [ | |
{ | |
"type": "web", | |
"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid", | |
"items": [ | |
{ | |
"itemType": "newspaperArticle", | |
"title": "Libyen: Rebellen bereiten Angriff auf Bani Walid vor", | |
"creators": [ | |
{ | |
"lastName": "AFP", | |
"creatorType": "author", | |
"fieldMode": 1 | |
}, | |
{ | |
"lastName": "dpa", | |
"creatorType": "author", | |
"fieldMode": 1 | |
} | |
], | |
"date": "2011-09-04", | |
"ISSN": "0044-2070", | |
"abstractNote": "Die von Gadhafi-Anhängern geführte Stadt ist von Rebellentruppen eingekreist. Gespräche über eine friedliche Übergabe sind gescheitert, ein Angriff steht offenbar bevor.", | |
"language": "de-DE", | |
"libraryCatalog": "Die Zeit", | |
"place": "Hamburg", | |
"publicationTitle": "Die Zeit", | |
"section": "Politik", | |
"shortTitle": "Libyen", | |
"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid", | |
"attachments": [ | |
{ | |
"title": "Snapshot", | |
"mimeType": "text/html" | |
} | |
], | |
"tags": [ | |
{ | |
"tag": "Libyen" | |
} | |
], | |
"notes": [], | |
"seeAlso": [] | |
} | |
] | |
}, | |
{ | |
"type": "web", | |
"url": "https://www.zeit.de/2011/36/Interview-Lahm-Rinke", | |
"items": [ | |
{ | |
"itemType": "newspaperArticle", | |
"title": "Philipp Lahm: \"Hast du elf Freunde?\"", | |
"creators": [ | |
{ | |
"firstName": "Moritz", | |
"lastName": "Müller-Wirth", | |
"creatorType": "author" | |
} | |
], | |
"date": "2011-09-01", | |
"ISSN": "0044-2070", | |
"abstractNote": "Tschechow und Robben, Drama im Flutlicht und Wahrhaftigkeit bei der Arbeit. Der Fußballprofi und Autor Philipp Lahm im Gespräch mit dem Schriftsteller und Fußballer Moritz Rinke", | |
"language": "de-DE", | |
"libraryCatalog": "Die Zeit", | |
"place": "Hamburg", | |
"publicationTitle": "Die Zeit", | |
"section": "Sport", | |
"shortTitle": "Philipp Lahm", | |
"url": "https://www.zeit.de/2011/36/Interview-Lahm-Rinke", | |
"attachments": [ | |
{ | |
"title": "Snapshot", | |
"mimeType": "text/html" | |
} | |
], | |
"tags": [ | |
{ | |
"tag": "Andreas Ottl" | |
}, | |
{ | |
"tag": "Aristoteles" | |
}, | |
{ | |
"tag": "Berlin" | |
}, | |
{ | |
"tag": "Bielefeld" | |
}, | |
{ | |
"tag": "FC Bayern München" | |
}, | |
{ | |
"tag": "Fußball" | |
}, | |
{ | |
"tag": "Hertha BSC" | |
}, | |
{ | |
"tag": "Joachim Löw" | |
}, | |
{ | |
"tag": "Max Frisch" | |
}, | |
{ | |
"tag": "Philipp Lahm" | |
} | |
], | |
"notes": [], | |
"seeAlso": [] | |
} | |
] | |
}, | |
{ | |
"type": "web", | |
"url": "http://www.zeit.de/suche/index?q=Krise", | |
"items": "multiple" | |
}, | |
{ | |
"type": "web", | |
"url": "https://www.zeit.de/2009/11/A-Drinnen", | |
"items": [ | |
{ | |
"itemType": "newspaperArticle", | |
"title": "DRINNEN: Mixen aus Prinzip", | |
"creators": [ | |
{ | |
"firstName": "Ernst", | |
"lastName": "Schmiederer", | |
"creatorType": "author" | |
} | |
], | |
"date": "2009-03-05", | |
"ISSN": "0044-2070", | |
"abstractNote": "Ein Iraner in Wien. Der Fotograf Daniel Shaked, 31, gibt Österreichs einziges Hip-Hop-Magazin heraus", | |
"language": "de-DE", | |
"libraryCatalog": "Die Zeit", | |
"place": "Hamburg", | |
"publicationTitle": "Die Zeit", | |
"section": "Politik", | |
"shortTitle": "DRINNEN", | |
"url": "https://www.zeit.de/2009/11/A-Drinnen", | |
"attachments": [ | |
{ | |
"title": "Snapshot", | |
"mimeType": "text/html" | |
} | |
], | |
"tags": [ | |
{ | |
"tag": "Band" | |
}, | |
{ | |
"tag": "Eltern" | |
}, | |
{ | |
"tag": "Familie" | |
}, | |
{ | |
"tag": "Geschwister" | |
}, | |
{ | |
"tag": "Hans Krankl" | |
}, | |
{ | |
"tag": "Hip-Hop" | |
}, | |
{ | |
"tag": "Iran" | |
}, | |
{ | |
"tag": "Israel" | |
}, | |
{ | |
"tag": "Musik" | |
}, | |
{ | |
"tag": "Offenheit" | |
}, | |
{ | |
"tag": "Reise" | |
}, | |
{ | |
"tag": "Revolution" | |
}, | |
{ | |
"tag": "Salzburg" | |
}, | |
{ | |
"tag": "Teheran" | |
}, | |
{ | |
"tag": "Wien" | |
}, | |
{ | |
"tag": "Österreich" | |
} | |
], | |
"notes": [], | |
"seeAlso": [] | |
} | |
] | |
}, | |
{ | |
"type": "web", | |
"url": "http://www.zeit.de/2009/11/index", | |
"items": "multiple" | |
} | |
] | |
/** END TEST CASES **/ |