
Some code cleaning

Started running roxygen but Windows didn't like it?
adam3smith committed Feb 3, 2019
1 parent fd5c2d0 commit cf581a0016a4e410a7abf26c7952d2061ed590c4
Showing with 95 additions and 107 deletions.
  1. +1 −1 DESCRIPTION
  2. +1 −1 NAMESPACE
  3. +46 −69 R/archivr.R
  4. +47 −36 README.md
@@ -24,5 +24,5 @@ Imports:
textreadr
License: MIT
LazyData: true
RoxygenNote: 6.1.0.9000
RoxygenNote: 6.1.1
Encoding: UTF-8
@@ -3,7 +3,7 @@
export(archiv)
export(archiv.fromText)
export(archiv.fromUrl)
export(archiv_url)
export(archiv_perma)
export(archiv_wayback)
export(check_folder)
export(extract_urls_from_folder)
@@ -1,3 +1,13 @@
#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-run archiving of Qualitative data.
#' It takes a list of urls and uses either the perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' word or html documents for urls to be archived.
#' @docType package
#' @name archivr


#' Copyright <2019> <Qualitative Data Repository, Syracuse University>

#' Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -18,16 +28,6 @@
#' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#' THE SOFTWARE.


#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-run archiving of Qualitative data.
#' It takes a list of urls and uses either the perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' word or html documents for urls to be archived.
#' @docType package
#' @name archivr

library(readtext)
library(jsonlite)
library(xml2)
@@ -42,7 +42,7 @@ archiv_env <- new.env()
archiv_env$perma_cc_key <- ""
archiv_env$perma_cc_folder_id <- NULL

#' Get the folder id and name from all text files in a perma.cc folder
#' Get the folder id of the perma.cc default folder (usually "Personal Links")
#'
#' @importFrom jsonlite fromJSON
#' @export
@@ -51,6 +51,7 @@ archiv_env$perma_cc_folder_id <- NULL
get_default_folder <- function (default=1) {
perma_cc_key <- get('perma_cc_key', envir=archiv_env)
if (perma_cc_key == "") {
stop("Please input your perma.cc api key: Use 'set_api_key(API_KEY)'")
reply <- FALSE
} else {
envelop = paste0(.perma_cc_user_url, perma_cc_key)
@@ -79,25 +80,24 @@ get_default_folder <- function (default=1) {
return (paste0(url, id, key))
}

#' Archive a list of urls in perma_cc.
#' Archive a list of urls in Wayback or perma_cc.
#'
#' @param url_list A list of urls to archive.
#' @param method Either "wayback" or "perma_cc." Defaults to "wayback."
#' @export
#' @return A dataframe containing the original urls, the urls to the
#' archived website, the screenshot and a timestamp.
#' archived website. For Perma.cc, also the URL to the screenshot, the short URL, and a timestamp.
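#' @examples
#' \dontrun{
#' # Hedged usage sketch; the url below is only a placeholder.
#' arc_df <- archiv(list("www.example.com"), method = "wayback")
#' }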
archiv <- function (url_list, method="wayback") {
if (method == "perma_cc") {
fold <- get_folder_id()
if (is.null(fold)) {
if (is.null(fold) || fold == "") {
print("Setting folder based on api key.")
set_folder_id(get_folder_ids()[1,]$id)
set_folder_id(get_default_folder()[1,]$id)
fold <- toString(get_folder_id())
if (is.null(fold)) {
print ("Unable to get the correct folder. Please check that your")
print ("API key is set correctly.")
if (is.null(fold) || fold == "") {
stop("Unable to set perma.cc folder. Make sure you API key is set using 'set_api_key(API_KEY)'")
}}
newlst <- lapply(url_list, archiv_url)
newlst <- lapply(url_list, archiv_perma)
print(newlst)
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T))
colnames(df) <- c("url", "GUID", "timestamp", "perma_cc_url", "perma_cc_screenshot", "perma_cc_short_url")
@@ -110,27 +110,6 @@ archiv <- function (url_list, method="wayback") {
}
}

#' Save a batch of urls to a folder - THIS CURRENTLY DOES NOT WORK.
#' @import curl
#' @param url_list A vector of urls to archive.
#' @param api (Optional api key)
#' @param folder (Mandatory, but defaults to .folder_id)
archiv_batch <- function (url_list, api="", folder="") {
api_url <- paste0(.perma_cc_post_batch_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, urls=list_string(url_list), target_folder=folder)
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
result <- "Please input your api key:\nUse 'set_api_key(API_KEY)'"
} else if ((!(is.null(reply$error)))) {
result <- "Received an error reply, likely because your limit has been exceeded."
} else {
result <- reply$id
return(result)
}
}

#' Creates a json string from a list of urls.
#'
@@ -143,41 +122,39 @@ list_string <- function (url_list) {
return (paste0("'[", string, "]'"))
}

#' Saves a single url in either perma.cc or the wayback machine.
#' Saves a single url in perma.cc.
#'
#' @param arc_url The url to archive.
#' @param method Either "perma_cc" or the default, "wayback."
#' @importFrom jsonlite fromJSON
#' @import curl
#' @export
#' @return A list or object representing the result.
archiv_url <- function (arc_url, method="perma_cc") {
archiv_perma <- function (arc_url, method="perma_cc") {
api <- get_api_key()
fold <- toString(get_folder_id())
if (method == "perma_cc") {
folder_url <- paste0()
api_url <- paste0(.perma_cc_post_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, url = arc_url, folder = fold)
result <- list(arc_url, "noguid", "unknown", "no url", "no screenshot", "no short url")
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
print("Please input your api key:\nUse 'set_api_key(API_KEY)'")
} else if ((!(is.null(reply$error)))) {
print(reply)
print("Received an error reply, likely because your limit has been exceeded.")
} else {
if (!(is.null(reply$url == "Not a valid URL."))) {
result <- c(reply$url, reply$guid, reply$archive_timestamp,
reply$captures[1,]$playback_url, reply$captures[2,]$playback_url,
paste0("https://perma.cc/", reply$guid))
}
return(result)
if (is.null(api) || api == "") {
stop("API key not set for perma.cc. Use 'set_api_key() to set your key before using method='perma_cc'")
}
folder_url <- paste0()
api_url <- paste0(.perma_cc_post_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, url = arc_url, folder = fold)
result <- list(arc_url, "noguid", "unknown", "no url", "no screenshot", "no short url")
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
stop("Please input your api key:\nUse 'set_api_key(API_KEY)'")
} else if ((!(is.null(reply$error)))) {
print(reply)
stop("Received an error reply, likely because your limit has been exceeded.")
} else {
if (!(is.null(reply$url == "Not a valid URL."))) {
result <- c(reply$url, reply$guid, reply$archive_timestamp,
reply$captures[1,]$playback_url, reply$captures[2,]$playback_url,
paste0("https://perma.cc/", reply$guid))
}
} else if (method == "wayback") {
return (archiv_wayback(arc_url))
return(result)
}
}

@@ -490,8 +467,8 @@ get_folder_id <- function () {
get_folder_ids <- function () {
perma_cc_key <- get_api_key()
reply <- NULL
if (is.null(perma_cc_key)) {
print("Please input your api key:\nUse 'set_api_key(API_KEY)'")
if (is.null(perma_cc_key) || perma_cc_key == "") {
stop("Please input your perma.cc api key: Use 'set_api_key(API_KEY)'")
reply <- FALSE
} else {
envelop = paste0(.perma_cc_user_url, perma_cc_key)
@@ -500,7 +477,7 @@ get_folder_ids <- function () {
for (row in 1:nrow(data))
reply <- rbind(reply, check_folder(data[row,]))
} else {
print ("Error in extracting root folders in Perma.cc.")
print ("Error in extracting root folders in perma.cc.")
}
}
return (reply)
@@ -1,22 +1,20 @@
# Archivr

Archivr is a project by the [Qualitative Data Repository](https://qdr.syr.edu/)
that verifies the preservation of urls in Web Archives.
that automates the preservation of urls in Web Archives.

Basic usage (for now):

```
git clone 'https://github.com/QualitativeDataRepository/archivr.git'
cd archivr
```
Then launch R and run:
## Installation

The easiest way to install is directly from this GitHub repository using the `devtools` package:

```
source('archivr.R')
view_archiv(list("www.example.com"))
library(devtools)
install_github("QualitativeDataRepository/archivr")
library(archivr)
```

Examples:
## Usage

The basic function is `archiv`, which takes a list of urls and stores them in
the Wayback Machine. It will return a dataframe containing the callback
@@ -31,13 +29,44 @@ arc_df$way_back_url
# 3 http://web.archive.org/web/20190128171134/https://github.com/ ...
```
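For orientation, a minimal end-to-end call might look like the sketch below (the urls are placeholders and the default `"wayback"` method is assumed):

```
library(archivr)
arc_df <- archiv(list("www.example.com", "https://github.com/"))
arc_df$way_back_url  # urls of the archived snapshots in the Wayback Machine
```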


Archivr can archive all the urls in a webpage. This feature is subject to the restrictions
imposed on accounts.

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archivr will also archive all the urls in a text file. It has been tested with docx,
pdf, and markdown, although other text-based formats should also work. Note that
text parsing can run into problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromText("path_to_file")
```

To allow for pre-processing of URLs before archiving, `archivr` also provides access to the functions used to extract URLs from a webpage (`extract_urls_from_webpage("URL")`), from a file (`extract_urls_from_text("filepath")`; tested for .docx, markdown, and pdf), and from any supported text file in a folder (`extract_urls_from_folder("filepath")`).
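A minimal sketch of that workflow (assuming `extract_urls_from_webpage` returns the urls as a character vector; the page is the same example used above):

```
urls <- extract_urls_from_webpage("https://qdr.syr.edu/")
urls <- urls[grepl("^https?://", urls)]  # example pre-processing: keep only absolute links
arc_df <- archiv(urls)
```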

### Checking archiving status

You can check whether URLs are archived by the Internet Archive's Wayback Machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```

### Using Perma.cc

If you wish to use perma.cc's archive, you will need to set your api key using:

```
set_api_key("YOUR_API_KEY")
```

If you wish to save the urls in a particular perma.cc folder, you will need to set the default
folder id using

```
@@ -50,6 +79,11 @@ using:
get_folder_ids()
```
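For example, once `get_folder_ids()` has listed your folders, the default can be set with a call along these lines (the id below is a placeholder):

```
set_folder_id(42)  # placeholder id; use one reported by get_folder_ids()
```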

You can check your current folder using
```
get_folder_id()
```

and then you can archive materials:
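For example (the urls below are placeholders):

```
arc_df <- archiv(list("www.example.com", "www.github.com"), "perma_cc")
```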

```
@@ -61,29 +95,6 @@ To check if a list of urls are archived in perma_cc's public api, use:
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```

or you may check the Wayback machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```

Archiv can archive all the urls in a webpage. This feature is subject to restrictions
imposed on accounts

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archiv will also archive all the urls in a text file. It has been tested for docx,
pdf and markdown, although other text-related files should also work. Note that
text parsing can be subject to problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromUrl("path_to_file")
```

### Archivr is a project developed by the Qualitative Data Repository at Syracuse
### University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.
**Archivr is a project developed by the Qualitative Data Repository at Syracuse
University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.**
