
Add saving function.

- `archiv` now saves a list of urls in perma.cc.
- `view_archiv` takes over the lookup functionality previously provided by `archiv`.
- `save_url` saves a single url (used by `archiv`).
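A minimal sketch of the resulting workflow, assuming a valid perma.cc api key (the key string is a placeholder):

```
set_api_key("YOUR-PERMA-CC-KEY")           # placeholder key, not a real one
arc_df <- archiv(list("www.example.com"))  # saves the url to perma.cc
arc_df$perma_cc_url                        # playback url for the new archive
old_df <- view_archiv(list("www.example.com"), "wayback")  # lookup only
```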
greebie committed Dec 31, 2018
1 parent 0295635 commit a1cd306ebda863972d02020591c139b65ca8e516
Showing with 64 additions and 27 deletions.
  1. +15 −15 README.md
  2. +46 −9 archivr.R
  3. +3 −3 archivr_test.R
@@ -18,33 +18,32 @@ archiv(list("www.example.com"))

Examples:

-The basic function is `archiv` that takes a list of urls and checks their
-availability on the wayback machine. It will return a dataframe with the
-original urls followed by the http status (or 000 if no url exists), their
-availability (TRUE or FALSE), the wayback machine url, and a timestamp.
+The basic function is `archiv`, which takes a list of urls and stores them in
+perma_cc. It returns a dataframe with the original urls followed by the GUID,
+a timestamp, the wayback machine url, and the url for the perma_cc screenshot.

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"))
arc_df$status # [1] 200 000 200 / Levels: 000 200
arc_df$wayback_url # [1] http://web.archive.org/web/20181214234252/http://Www.example.com
# [2] url not found
# [3] http://web.archive.org/web/20181215081640/https://github.com/
# 3 Levels: http://web.archive.org/web/20181214234252/http://Www.example.com ...
arc_df$perma_cc_url # [1] //perma-archives.org/warc/G5EH-JA9M/http://www.google.com
# [2] no url
# [3] //perma-archives.org/warc/QD3H-3FHP/http://www.example.org
# 3 Levels: //perma-archives.org/warc/G5EH-JA9M/http://www.google.com ...
```
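The new save columns can be inspected the same way (column names are taken from `archiv`'s column assignment below; the values shown are placeholders, not captured output):

```
arc_df$GUID                # e.g. G5EH-JA9M
arc_df$timestamp           # creation timestamp reported by perma_cc
arc_df$perma_cc_screenshot # playback url for the screenshot capture
```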
-To check a list of urls in perma_cc, use:
+To check whether a list of urls is archived in perma_cc's public api, use:
```
-arc_url_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
+arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```

-or you may use both:
+or you may check the Wayback machine:
```
-arc_url_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"), "both")
+arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```

Archivr can also check a webpage for archived urls.

```
-arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
+arc_url_df <- view_archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
@@ -59,4 +58,5 @@ df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
Archivr has a few unit tests that contributors can run. To run them, use
`r -f run_tests.R` inside the archivr folder.
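If littler's `r` front end is not installed, running the same file with `Rscript run_tests.R` should be equivalent.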

-### Archivr was developed by Ryan Deschamps @greebie
+### Archivr was developed by Ryan Deschamps @greebie with support from the
+### Qualitative Data Repository at Syracuse University.
@@ -15,27 +15,67 @@ if(!"stringr" %in% rownames(installed.packages())) {
if(!"readtext" %in% rownames(installed.packages())) {
install.packages("readtext", repos="http://cran.us.r-project.org")
}
if(!"curl" %in% rownames(installed.packages())) {
install.packages("curl", repos="http://cran.us.r-project.org")
}

library(readtext)
library(jsonlite)
library(xml2)
library(rvest)
library(stringr)
library(curl)

#' Default url for the Wayback Machine availability api
.wb_available_url <- "http://archive.org/wayback/available?url="
#' Default urls for the perma.cc public (GET) and archiving (POST) apis
.perma_cc_api_url <- "https://api.perma.cc/v1/public/archives/?url="
.perma_cc_post_api_url <- "https://api.perma.cc/v1/archives/?api_key="
#' Global var for the API key for perma.cc
.perma_cc_key <- ""


#' Archive a list of urls in perma_cc.
#'
#' @param url_list A list of urls to archive.
#' @return A dataframe containing the original urls, their GUIDs, timestamps,
#'   archive urls and screenshot urls.
archiv <- function (url_list) {
newlst <- lapply(url_list, save_url)
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T))
colnames(df) <- c("url", "GUID", "timestamp", "perma_cc_url", "perma_cc_screenshot")
return(df)
}

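#' Save a single url in perma.cc (used by `archiv`).
#'
#' @param arc_url The url to archive.
#' @param api The perma.cc api key; defaults to the global `.perma_cc_key`.
#' @param method The archiving service to use (currently only "perma_cc").
#' @return A vector of url, GUID, timestamp, archive url and screenshot url,
#'   or an error message string.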
save_url <- function (arc_url, api=.perma_cc_key, method="perma_cc") {
  if (method == "perma_cc") {
    api_url <- paste0(.perma_cc_post_api_url, api)
  } else {
    stop("Only the 'perma_cc' method is currently supported.")
  }
  setting <- new_handle()
  handle_setopt(setting, customrequest = "POST")
  handle_setform(setting, url = arc_url)
  result <- list(arc_url, "noguid", "unknown", "no url", "no screenshot")
  r <- curl_fetch_memory(api_url, setting)
  reply <- fromJSON(rawToChar(r$content))
  if (!(is.null(reply$detail)) && reply$detail == "Authentication credentials were not provided.") {
    result <- "Please input your api key:\nUse 'set_api_key(API_KEY)'"
  } else if (!(is.null(reply$error))) {
    result <- "Received an error reply, likely because your limit has been exceeded."
  } else if (!(reply$url == "Not a valid URL.")) {
    result <- c(reply$url, reply$guid, reply$archive_timestamp,
      reply$captures[1,]$playback_url, reply$captures[2,]$playback_url)
  }
  return(result)
}

#' Get archiving data from a list of Urls
#'
#' @param lst A list of urls to check.
#' @param source "wayback", "perma_cc" or "both".
#' @return A dataframe containing the original urls, their http status,
#' availability, the archive url if it exists and a timestamp for the last
#' web crawl.
-archiv <- function (lst, source="wayback") {
+view_archiv <- function (lst, source="wayback") {
if (source == "perma_cc") {
newlst <- lapply(lst, from_perma_cc)
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T))
@@ -75,12 +115,12 @@ archiv <- function (lst, source="wayback") {
#' @param source Either "wayback," "perma_cc" or "both".
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
-archiv.fromUrl <- function (url, source="wayback") {
-  return(archiv(get_urls_from_webpage(url), source))
+view_archiv.fromUrl <- function (url, source="wayback") {
+  return(view_archiv(get_urls_from_webpage(url), source))
}

-archiv.fromText <- function (fp, source="wayback") {
-  return(archiv(extract_urls_from_text(fp), source))
+view_archiv.fromText <- function (fp, source="wayback") {
+  return(view_archiv(extract_urls_from_text(fp), source))
}

#' Check whether a url is available in the Wayback Machine
@@ -122,10 +162,7 @@ from_perma_cc <- function (url) {
available <- ifelse(step["captures.status"]=="success" || step["captures.status1"] == "success", TRUE, FALSE)
playback_url <- ifelse(is.na(step["captures.playback_url"]), step["captures.playback_url1"], step["captures.playback_url"])
timestamp <- ifelse(is.na(step["creation_timestamp"]), "unknown", step["creation_timestamp"])
-print (step["captures.playback_url1"])
-print (playback_url)
result <- c(unname(step["url"]), unname(status), unname(available), unname(playback_url), unname(timestamp))
-print(result)
}
return(result)
}
@@ -137,7 +174,7 @@ from_perma_cc <- function (url) {
#' @examples
#' set_api_key("")
set_api_key <- function (key) {
  .perma_cc_key <<- key
}

#' Extracts the urls from a webpage.
@@ -16,7 +16,7 @@ test_that("Test getting real url from Wayback", {

test_that("Archivr function returns proper df", {
lurls <- c("www.example.com", "NOTAURL", "www.github.com")
-test <- archiv(lurls, "wayback")
+test <- view_archiv(lurls, "wayback")
expectedA <- as.vector(lurls)
expectedB <- c("200", "000", "200")
expectedC <- c("http://web.archive.org/web/20181214200505/http://Example.com",
@@ -40,7 +40,7 @@ test_that("Parses links from markdown text", {
expect_equal(unname(test), c("http://www.example.com",
"http://www.github.com",
"http://www.google.com", "http://www.apple.com"))
-test2 <- archiv.fromText(md)
+test2 <- view_archiv.fromText(md)
expect_equal(length(as.vector(test2$wayback_url)), 4)
})

@@ -57,7 +57,7 @@ test_that("Parses links from Latex", {
'\\end{document}')
test <- extract_urls_from_text(latex)
expect_equal(unname(test), c("http://www.sharelatex.com", "https://www.google.com/file/path.html"))
-test2 <- archiv.fromText(latex)
+test2 <- view_archiv.fromText(latex)
expect_equal(length(as.vector(test2$wayback_url)), 2)
})
