Permalink
Please sign in to comment.
Browse files
First commit.
- Add README.md - Add function to set perma.cc API key. - Add function to check wayback url. - Add function to check list of urls. - Add function to parse webpage and check list of urls. - Add small suite of unit tests.
- Loading branch information...
Showing
with
184 additions
and 0 deletions.
- +53 −0 README.md
- +86 −0 archivr.R
- +39 −0 archivr_test.R
- +6 −0 run_tests.R
@@ -0,0 +1,53 @@ | |||
# Archivr | |||
|
|||
Archivr is a project by the [Qualitative Data Repository](https://qdr.syr.edu/) | |||
that verifies the preservation of urls in Web Archives. | |||
|
|||
Basic usage (for now): | |||
|
|||
``` | |||
git clone 'https://github.com/QualitativeDataRepository/archivr.git' | |||
cd archivr | |||
``` | |||
Then launch R and then: | |||
|
|||
``` | |||
source('archivr.R') | |||
archiv(list("www.example.com")) | |||
``` | |||
|
|||
Examples: | |||
|
|||
The basic function is `archiv` that takes a list of urls and checks their | |||
availability on the wayback machine. It will return a dataframe with the | |||
original urls followed by the http status (or 000 if no url exists), their | |||
availability (TRUE or FALSE), the wayback machine url, and a timestamp. | |||
|
|||
``` | |||
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com")) | |||
arc_df$status # [1] 200 000 200 / Levels: 000 200 | |||
arc_df$wayback_url # [1] http://web.archive.org/web/20181214234252/http://Www.example.com | |||
# [2] url not found | |||
# [3] http://web.archive.org/web/20181215081640/https://github.com/ | |||
# 3 Levels: http://web.archive.org/web/20181214234252/http://Www.example.com ... | |||
``` | |||
|
|||
Archiv can also check a webpage for archived urls. | |||
|
|||
``` | |||
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/") | |||
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,] | |||
# arc_url_df.url arc_url_df.wayback_url | |||
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/ | |||
``` | |||
|
|||
Future implementations will include checks in perma.cc (or both). Archiv will | |||
also become an R package. | |||
|
|||
## TESTING | |||
|
|||
Archivr has a few unit tests that can be run for contributors. To run, use | |||
`r -f run_tests.R` inside the archivr folder. | |||
|
|||
### Archivr was developed by Ryan Deschamps @greebie |
@@ -0,0 +1,86 @@ | |||
if(!"jsonlite" %in% rownames(installed.packages())) { | |||
install.packages("jsonlite", repos="http://cran.us.r-project.org") | |||
} | |||
if(!"xml2" %in% rownames(installed.packages())) { | |||
install.packages("xml2", repos="http://cran.us.r-project.org") | |||
} | |||
if(!"rvest" %in% rownames(installed.packages())) { | |||
install.packages("rvest", repos="http://cran.us.r-project.org") | |||
} | |||
if(!"stringr" %in% rownames(installed.packages())) { | |||
install.packages("stringr", repos="http://cran.us.r-project.org") | |||
} | |||
|
|||
library(jsonlite) | |||
library(xml2) | |||
library(rvest) | |||
library(stringr) | |||
|
|||
#' Default url for the Wayback Machine | |||
.wb_available_url <- "http://archive.org/wayback/available?url=" | |||
#' Global var for the API key for perma.cc | |||
.perma_cc_key <- "" | |||
|
|||
#' Get archiving data from a list of Urls | |||
#' | |||
#' @param lst A list of urls to check. | |||
#' @param source "wayback", "perma_cc" or "both". | |||
#' @return A dataframe containing the original urls, their http status, | |||
#' availability, the archive url if it exists and a timestamp for the last | |||
#' web crawl. | |||
archiv <- function (lst, source="wayback") { | |||
if (source == "perma_cc") { | |||
print ("This feature is not available... yet") | |||
} else if (source == "both") { | |||
print ("This feature is not available... yet") | |||
} else if (source == "wayback") { | |||
newlst <- lapply(lst, from_wayback) | |||
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T)) | |||
colnames(df) <- c("url", "status", "available?", "wayback_url", "timestamp") | |||
return (df) | |||
} else { | |||
print ("Could not confirm source.") | |||
return(FALSE) | |||
} | |||
} | |||
|
|||
archiv.fromUrl <- function (url, source="wayback") { | |||
return(archiv(get_urls_from_webpage(url), source)) | |||
} | |||
|
|||
#' Check whether a url is available in the Wayback Machine | |||
#' | |||
#' @param url The url to check. | |||
#' @return a jsonlite object where | |||
#' object$url is the original url. | |||
#' object$$archived_snapshots$closest$status is the http status | |||
#' object$archived_snapshots$closest$available is TRUE | |||
#' object$archived_snapshots$closest$url is the archived url. | |||
#' object$archived_snapshots$closest$timestamp is the last time the url | |||
#' was crawled. | |||
from_wayback <- function (url) { | |||
envelop = paste0(.wb_available_url, url) | |||
reply <- fromJSON(envelop) | |||
result <- list(url, "000", FALSE, "url not found", "unknown") | |||
if (length(reply$archived_snapshots)) { | |||
result = reply | |||
} | |||
return (result) | |||
} | |||
|
|||
#' Set the api key(s) for Perma.cc apis, if required. | |||
#' | |||
#' @param key The Api Key. | |||
#' @return TRUE | |||
#' @examples | |||
#' add("", 1) | |||
set_api_key <- function (key) { | |||
.perma_cc_key <<- key | |||
} | |||
|
|||
get_urls_from_webpage <- function (url) { | |||
pg <- read_html(url) | |||
lst <- unique(html_attr(html_nodes(pg, "a"), "href")) | |||
Filter(function(x) | |||
startsWith(x, "http"), lst) | |||
} |
@@ -0,0 +1,39 @@ | |||
test_that("Test setting api key",{ | |||
key <- "BOGUSKEY" | |||
test <- set_api_key(key) | |||
expect_equal(key, .perma_cc_key) | |||
}) | |||
|
|||
test_that("Test getting real url from Wayback", { | |||
url <- "www.example.com" | |||
test <- from_wayback(url) | |||
status <- list ("200", TRUE, "http://web.archive.org/web/20181214200505/http://Example.com") | |||
expect_equal(test$url, url) | |||
expect_equal(list(test$archived_snapshots$closest$status, | |||
test$archived_snapshots$closest$available, | |||
test$archived_snapshots$closest$url), status) | |||
}) | |||
|
|||
test_that("Archivr function returns proper df", { | |||
lurls <- c("www.example.com", "NOTAURL", "www.github.com") | |||
test <- archiv(lurls, "wayback") | |||
expectedA <- as.vector(lurls) | |||
expectedB <- c("200", "000", "200") | |||
expectedC <- c("http://web.archive.org/web/20181214200505/http://Example.com", | |||
"url not found", "http://web.archive.org/web/20181214210708/https://github.com/") | |||
expect_equal(as.vector(test[,"url"]), expectedA) | |||
expect_equal(as.vector(test$status), expectedB) | |||
## Cannot check way_back_urls due to timestamp | |||
## expect_equal(as.vector(test$wayback_url), expectedC) | |||
}) | |||
|
|||
test_that("Parses links from markdown text", { | |||
md <- paste0('# A HEADING', | |||
'\n', | |||
'Some text with a [url](http://www.example.com)\n', | |||
'Some text with a [url](http://www.github.com "and a title")\n', | |||
'And a url by itself http://www.google.com\n', | |||
'A url enclosed in diamond brackets <http://www.apple.com\n', | |||
|
|||
) | |||
}) |
@@ -0,0 +1,6 @@ | |||
if(!"testthat" %in% rownames(installed.packages())) { | |||
install.packages("testthat", repos="http://cran.us.r-project.org") | |||
} | |||
library(testthat) | |||
source("archivr.R") | |||
test_results <- test_file("archivr_test.R", reporter="summary") |
0 comments on commit
7c927c3