Permalink
Browse files

First commit.

- Add README.md
- Add function to set perma.cc API key.
- Add function to check wayback url.
- Add function to check list of urls.
- Add function to parse webpage and check list of urls.
- Add small suite of unit tests.
  • Loading branch information...
greebie committed Dec 15, 2018
0 parents commit 7c927c3ae037e8db8e5c76a667f38c547b68e75d
Showing with 184 additions and 0 deletions.
  1. +53 −0 README.md
  2. +86 −0 archivr.R
  3. +39 −0 archivr_test.R
  4. +6 −0 run_tests.R
@@ -0,0 +1,53 @@
# Archivr

Archivr is a project by the [Qualitative Data Repository](https://qdr.syr.edu/)
that verifies the preservation of urls in Web Archives.

Basic usage (for now):

```
git clone 'https://github.com/QualitativeDataRepository/archivr.git'
cd archivr
```
Then launch R and then:

```
source('archivr.R')
archiv(list("www.example.com"))
```

Examples:

The basic function is `archiv` that takes a list of urls and checks their
availability on the wayback machine. It will return a dataframe with the
original urls followed by the http status (or 000 if no url exists), their
availability (TRUE or FALSE), the wayback machine url, and a timestamp.

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"))
arc_df$status # [1] 200 000 200 / Levels: 000 200
arc_df$wayback_url # [1] http://web.archive.org/web/20181214234252/http://Www.example.com
# [2] url not found
# [3] http://web.archive.org/web/20181215081640/https://github.com/
# 3 Levels: http://web.archive.org/web/20181214234252/http://Www.example.com ...
```

Archiv can also check a webpage for archived urls.

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Future implementations will include checks in perma.cc (or both). Archiv will
also become an R package.

## TESTING

Archivr has a few unit tests that can be run for contributors. To run, use
`r -f run_tests.R` inside the archivr folder.

### Archivr was developed by Ryan Deschamps @greebie
@@ -0,0 +1,86 @@
if(!"jsonlite" %in% rownames(installed.packages())) {
install.packages("jsonlite", repos="http://cran.us.r-project.org")
}
if(!"xml2" %in% rownames(installed.packages())) {
install.packages("xml2", repos="http://cran.us.r-project.org")
}
if(!"rvest" %in% rownames(installed.packages())) {
install.packages("rvest", repos="http://cran.us.r-project.org")
}
if(!"stringr" %in% rownames(installed.packages())) {
install.packages("stringr", repos="http://cran.us.r-project.org")
}

library(jsonlite)
library(xml2)
library(rvest)
library(stringr)

#' Default url for the Wayback Machine
.wb_available_url <- "http://archive.org/wayback/available?url="
#' Global var for the API key for perma.cc
.perma_cc_key <- ""

#' Get archiving data from a list of Urls
#'
#' @param lst A list of urls to check.
#' @param source "wayback", "perma_cc" or "both".
#' @return A dataframe containing the original urls, their http status,
#' availability, the archive url if it exists and a timestamp for the last
#' web crawl.
archiv <- function (lst, source="wayback") {
if (source == "perma_cc") {
print ("This feature is not available... yet")
} else if (source == "both") {
print ("This feature is not available... yet")
} else if (source == "wayback") {
newlst <- lapply(lst, from_wayback)
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T))
colnames(df) <- c("url", "status", "available?", "wayback_url", "timestamp")
return (df)
} else {
print ("Could not confirm source.")
return(FALSE)
}
}

archiv.fromUrl <- function (url, source="wayback") {
return(archiv(get_urls_from_webpage(url), source))
}

#' Check whether a url is available in the Wayback Machine
#'
#' @param url The url to check.
#' @return a jsonlite object where
#' object$url is the original url.
#' object$$archived_snapshots$closest$status is the http status
#' object$archived_snapshots$closest$available is TRUE
#' object$archived_snapshots$closest$url is the archived url.
#' object$archived_snapshots$closest$timestamp is the last time the url
#' was crawled.
from_wayback <- function (url) {
envelop = paste0(.wb_available_url, url)
reply <- fromJSON(envelop)
result <- list(url, "000", FALSE, "url not found", "unknown")
if (length(reply$archived_snapshots)) {
result = reply
}
return (result)
}

#' Set the api key(s) for Perma.cc apis, if required.
#'
#' @param key The Api Key.
#' @return TRUE
#' @examples
#' add("", 1)
set_api_key <- function (key) {
.perma_cc_key <<- key
}

get_urls_from_webpage <- function (url) {
pg <- read_html(url)
lst <- unique(html_attr(html_nodes(pg, "a"), "href"))
Filter(function(x)
startsWith(x, "http"), lst)
}
@@ -0,0 +1,39 @@
test_that("Test setting api key",{
key <- "BOGUSKEY"
test <- set_api_key(key)
expect_equal(key, .perma_cc_key)
})

test_that("Test getting real url from Wayback", {
url <- "www.example.com"
test <- from_wayback(url)
status <- list ("200", TRUE, "http://web.archive.org/web/20181214200505/http://Example.com")
expect_equal(test$url, url)
expect_equal(list(test$archived_snapshots$closest$status,
test$archived_snapshots$closest$available,
test$archived_snapshots$closest$url), status)
})

test_that("Archivr function returns proper df", {
lurls <- c("www.example.com", "NOTAURL", "www.github.com")
test <- archiv(lurls, "wayback")
expectedA <- as.vector(lurls)
expectedB <- c("200", "000", "200")
expectedC <- c("http://web.archive.org/web/20181214200505/http://Example.com",
"url not found", "http://web.archive.org/web/20181214210708/https://github.com/")
expect_equal(as.vector(test[,"url"]), expectedA)
expect_equal(as.vector(test$status), expectedB)
## Cannot check way_back_urls due to timestamp
## expect_equal(as.vector(test$wayback_url), expectedC)
})

test_that("Parses links from markdown text", {
md <- paste0('# A HEADING',
'\n',
'Some text with a [url](http://www.example.com)\n',
'Some text with a [url](http://www.github.com "and a title")\n',
'And a url by itself http://www.google.com\n',
'A url enclosed in diamond brackets <http://www.apple.com\n',

)
})
@@ -0,0 +1,6 @@
if(!"testthat" %in% rownames(installed.packages())) {
install.packages("testthat", repos="http://cran.us.r-project.org")
}
library(testthat)
source("archivr.R")
test_results <- test_file("archivr_test.R", reporter="summary")

0 comments on commit 7c927c3

Please sign in to comment.