Skip to content
Permalink
Browse files

Complete rewrite of FAZ.NET.js (#1474)

Closes #1470.
  • Loading branch information...
zuphilip authored and adam3smith committed Nov 17, 2017
1 parent 432125c commit 1a74e09c6bfbe687b9f326573745f7c89967e941
Showing with 121 additions and 78 deletions.
  1. +121 −78 FAZ.NET.js
@@ -1,54 +1,66 @@
{
"translatorID": "4f0d0c90-5da0-11df-a08a-0800200c9a66",
"label": "FAZ.NET",
"creator": "ibex, Sebastian Karcher",
"creator": "Philipp Zumstein",
"target": "^https?://((www\\.)?faz\\.net/.)",
"minVersion": "2.1",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2016-09-19 21:07:47"
"lastUpdated": "2017-11-11 11:24:04"
}

/*
FAZ Translator - Parses FAZ articles and creates Zotero-based metadata.
Copyright (C) 2010-2012 ibex and Sebastian Karcher
***** BEGIN LICENSE BLOCK *****
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Copyright © 2017 Philipp Zumstein
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
GNU Affero General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/


/* Zotero API */
// attr()/text() v2
/**
 * Look up an element by CSS selector and return one of its attributes.
 *
 * @param {Document|Element} docOrElem - node to search within
 * @param {string} selector - CSS selector
 * @param {string} attr - name of the attribute to read
 * @param {number} [index] - which match to use; a falsy value (omitted or 0)
 *     selects the first match via querySelector
 * @returns {?string} the attribute value, or null when no element matched
 */
function attr(docOrElem, selector, attr, index) {
	var node;
	if (index) {
		node = docOrElem.querySelectorAll(selector).item(index);
	} else {
		node = docOrElem.querySelector(selector);
	}
	if (!node) {
		return null;
	}
	return node.getAttribute(attr);
}

/**
 * Look up an element by CSS selector and return its text content.
 *
 * @param {Document|Element} docOrElem - node to search within
 * @param {string} selector - CSS selector
 * @param {number} [index] - which match to use; a falsy value (omitted or 0)
 *     selects the first match via querySelector
 * @returns {?string} the element's textContent, or null when no element matched
 */
function text(docOrElem, selector, index) {
	var node;
	if (index) {
		node = docOrElem.querySelectorAll(selector).item(index);
	} else {
		node = docOrElem.querySelector(selector);
	}
	if (!node) {
		return null;
	}
	return node.textContent;
}

function detectWeb(doc, url) {

//Zotero.debug("ibex detectWeb URL= "+ url);
function detectWeb(doc, url) {
if (doc.title == "Suche und Suchergebnisse - FAZ" && getSearchResults(doc, true)) {
return "multiple";
} else if (ZU.xpathText(doc, '//div[@class = "FAZArtikelEinleitung"]')) {
return "newspaperArticle";
} else if (text(doc, 'div.Artikel')) {
if (text(doc, 'div.Artikel div.VideoBox')) {
return "videoRecording";
} else {
return "newspaperArticle";
}
}
}

function getSearchResults(doc, checkOnly) {
var items = {};
var found = false;
//make sure we don't get media objects
var rows = ZU.xpath(doc, '//div[not(descendant::span[@class="icon-play30"])]/a[@class="TeaserHeadLink"]');
var rows = ZU.xpath(doc, '//div/a[@class="TeaserHeadLink"]');
for (var i=0; i<rows.length; i++) {
// skip paywalled content
if (ZU.xpathText(rows[i], './span[contains(@class, "fazplusIcon")]')) {
continue;
}
var href = rows[i].href;
var title = ZU.trimInternal(rows[i].textContent);
if (!href || !title) continue;
@@ -79,64 +91,39 @@ function doWeb(doc, url) {


function scrape(doc, url) {
var newArticle = new Zotero.Item('newspaperArticle');
newArticle.url = url;
newArticle.title = ZU.trimInternal(ZU.xpathText(doc, '//div[@class = "FAZArtikelEinleitung"]/h2').trim().replace(/\n/g,":")).replace(/^,/, "");
var date = ZU.xpathText(doc, '(//span[@class="Datum"])[1]/@content');
if (date) newArticle.date = ZU.trimInternal(date.replace(/T.+$/, ""));
var teaser = ZU.xpathText(doc, '//div[@class="FAZArtikelEinleitung"]/p[@class = "Copy"]');
if (teaser != null) {
newArticle.abstractNote = Zotero.Utilities.trimInternal(teaser).replace(/^,\s*/, "");
}

//some authors are in /a, some aren't; we need to distinguish to get this right
if (ZU.xpathText(doc, '//div[@class="FAZArtikelEinleitung"]/span[@class = "Autor"]/span[contains(@class, "caps")]/a') != null) {
var xpath = '//div[@class="FAZArtikelEinleitung"]/span[@class = "Autor"]/span[contains(@class, "caps")]/a';
} else {
var xpath = '//div[@class="FAZArtikelEinleitung"]/span[@class ="Autor"]/span/span[contains(@class, "caps")]';
};
var authors = ZU.xpath(doc, xpath);
var type = detectWeb(doc, url);

for (i in authors) {
newArticle.creators.push(Zotero.Utilities.cleanAuthor(authors[i].textContent, "author"));
var translator = Zotero.loadTranslator('web');
// Embedded Metadata
translator.setTranslator('951c027d-74ac-47d4-a107-9c3069ab7b48');
// translator.setDocument(doc);

translator.setHandler('itemDone', function (obj, item) {
// fix authors
item.creators = [];
var authors = doc.querySelectorAll('.atc-Meta .atc-MetaAuthor');
for (let i=0; i<authors.length; i++) {
item.creators.push(ZU.cleanAuthor(authors[i].textContent, "author"));
}

var section = text(doc, '.gh-MainNav_SectionsLink-is-active');
if (section) {
item.section = Zotero.Utilities.trimInternal(section);
}
if (!item.language) {
item.language = "de-DE";
}
item.ISSN = "0174-4909";
item.runningTime = attr(doc, 'meta[itemprop="duration"]', 'content');
item.complete();
});

newArticle.publicationTitle = "Frankfurter Allgemeine Zeitung";

var section = ZU.xpathText(doc, '//ul[@id="nav"]/li/span[@class = "Selected"]');
if (section != null) {
newArticle.section = Zotero.Utilities.trimInternal(section);
}

var source = ZU.xpath(doc, '//div[@id="MainColumn"]/div[@class = "Article"]/p[@class = "ArticleSrc"]').innerHTML;
if (source != null) {
// newArticle.extra = ZU.trimInternal(ZU.cleanTags(source));
}
//language
var language = ZU.xpathText(doc, '//meta[@name="language"]/@content');
if (language != null) newArticle.language = language;
else newArticle.language = "de-DE";

newArticle.ISSN = "0174-4909";
newArticle.attachments.push({
title: "FAZ.NET Article Snapshot",
mimeType: "text/html",
url: doc.location.href,
snapshot: true
translator.getTranslatorObject(function(trans) {
trans.itemType = type;
trans.doWeb(doc, url);
});

newArticle.complete();
}

/**
 * Count the own enumerable properties of an object that is being used as an
 * associative array.
 *
 * Uses Object.keys rather than calling obj.hasOwnProperty() through the
 * object itself, so it also works for prototype-less objects
 * (Object.create(null)) and for objects that carry their own
 * "hasOwnProperty" key.
 *
 * @param {?Object} obj - object to inspect; null/undefined count as empty
 * @returns {number} number of own enumerable string-keyed properties
 */
function countObjectProperties(obj) {
	// A for...in loop over null/undefined iterates nothing, so keep returning 0
	// for those instead of letting Object.keys throw.
	if (obj === null || obj === undefined) {
		return 0;
	}
	return Object.keys(obj).length;
}
/** BEGIN TEST CASES **/
var testCases = [
{
@@ -158,22 +145,45 @@ var testCases = [
"creatorType": "author"
}
],
"date": "2011-06-13",
"date": "2011-06-13T06:00:00+0200",
"ISSN": "0174-4909",
"abstractNote": "Wissenschaft hat eine Geschichte, wie kann sie dann aber rational sein? Im Briefwechsel zwischen Ludwik Fleck und Moritz Schlick deuteten sich bereits Antworten an.",
"language": "de-DE",
"libraryCatalog": "FAZ.NET",
"publicationTitle": "Frankfurter Allgemeine Zeitung",
"libraryCatalog": "www.faz.net",
"publicationTitle": "FAZ.NET",
"shortTitle": "Wissenschaftsphilosophie",
"url": "http://www.faz.net/sonntagszeitung/wissenschaft/wissenschaftsphilosophie-krumme-wege-der-vernunft-1654864.html",
"url": "http://www.faz.net/1.654864",
"attachments": [
{
"title": "FAZ.NET Article Snapshot",
"mimeType": "text/html",
"snapshot": true
"title": "Snapshot"
}
],
"tags": [
{
"tag": "Ludwik Fleck"
},
{
"tag": "Moritz"
},
{
"tag": "Moritz Schlick"
},
{
"tag": "Paul Feyerabend"
},
{
"tag": "Schlick"
},
{
"tag": "Springer-Verlag"
},
{
"tag": "Thomas S. Kuhn"
},
{
"tag": "Wissenschaft"
}
],
"tags": [],
"notes": [],
"seeAlso": []
}
@@ -183,6 +193,39 @@ var testCases = [
"type": "web",
"url": "http://www.faz.net/suche/?query=argentinien&suchbegriffImage.x=0&suchbegriffImage.y=0&resultsPerPage=20",
"items": "multiple"
},
{
"type": "web",
"url": "http://www.faz.net/aktuell/sport/tango-taenzer-kaempfen-in-buenos-aires-um-den-weltmeister-titel-15155586.html",
"items": [
{
"itemType": "videoRecording",
"title": "Argentinien: Tango-Tänzer kämpfen in Buenos Aires um den Weltmeister-Titel",
"creators": [],
"date": "2017-08-17T11:56:48+0200",
"abstractNote": "Mehr als 1200 Tänzer aus 48 Ländern tanzen in Argentinien um den Weltmeister-Titel. Das Finale ist am 22. August.",
"language": "de-DE",
"libraryCatalog": "www.faz.net",
"runningTime": "58",
"shortTitle": "Argentinien",
"url": "http://www.faz.net/aktuell/sport/tango-taenzer-kaempfen-in-buenos-aires-um-den-weltmeister-titel-15155586.html",
"attachments": [
{
"title": "Snapshot"
}
],
"tags": [
{
"tag": "Argentinien"
},
{
"tag": "Buenos Aires"
}
],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/

0 comments on commit 1a74e09

Please sign in to comment.
You can’t perform that action at this time.