
Some code cleaning

Started running roxygen but Windows didn't like it?
adam3smith committed Feb 3, 2019
1 parent fd5c2d0 commit cf581a0016a4e410a7abf26c7952d2061ed590c4
Showing with 95 additions and 107 deletions.
  1. +1 −1 DESCRIPTION
  2. +1 −1 NAMESPACE
  3. +46 −69 R/archivr.R
  4. +47 −36 README.md
@@ -24,5 +24,5 @@ Imports:
textreadr
License: MIT
LazyData: true
RoxygenNote: 6.1.0.9000
RoxygenNote: 6.1.1
Encoding: UTF-8
@@ -3,7 +3,7 @@
export(archiv)
export(archiv.fromText)
export(archiv.fromUrl)
export(archiv_url)
export(archiv_perma)
export(archiv_wayback)
export(check_folder)
export(extract_urls_from_folder)
@@ -1,3 +1,13 @@
#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-run archiving of Qualitative data.
#' It takes a list of urls and uses either the perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' word or html documents for urls to be archived.
#' @docType package
#' @name archivr


#' Copyright <2019> <Qualitative Data Repository, Syracuse University>

#' Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -18,16 +28,6 @@
#' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#' THE SOFTWARE.


#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-run archiving of Qualitative data.
#' It takes a list of urls and uses either the perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' word or html documents for urls to be archived.
#' @docType package
#' @name archivr

library(readtext)
library(jsonlite)
library(xml2)
@@ -42,7 +42,7 @@ archiv_env <- new.env()
archiv_env$perma_cc_key <- ""
archiv_env$perma_cc_folder_id <- NULL

#' Get the folder id and name from all text files in a perma.cc folder
#' Get the folder id of the perma.cc default folder (usually "Personal Links")
#'
#' @importFrom jsonlite fromJSON
#' @export
@@ -51,6 +51,7 @@ archiv_env$perma_cc_folder_id <- NULL
get_default_folder <- function (default=1) {
perma_cc_key <- get('perma_cc_key', envir=archiv_env)
if (perma_cc_key == "") {
stop("Please input your perma.cc api key: Use 'set_api_key(API_KEY)'")
reply <- FALSE
} else {
envelop = paste0(.perma_cc_user_url, perma_cc_key)
@@ -79,25 +80,24 @@ get_default_folder <- function (default=1) {
return (paste0(url, id, key))
}

#' Archive a list of urls in perma_cc.
#' Archive a list of urls in Wayback or perma_cc.
#'
#' @param url_list A list of urls to archive.
#' @param method Either "wayback" or "perma_cc." Defaults to "wayback."
#' @export
#' @return A dataframe containing the original urls, the urls to the
#' archived website, the screenshot and a timestamp.
#' archived website. For Perma.cc, also the URL to the screenshot, the short URL, and a timestamp.
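#' @examples
#' \dontrun{
#' # Hedged usage sketch; the url below is only a placeholder.
#' arc_df <- archiv(list("www.example.com"), method = "wayback")
#' }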
archiv <- function (url_list, method="wayback") {
if (method == "perma_cc") {
fold <- get_folder_id()
if (is.null(fold)) {
if (is.null(fold) || fold == "") {
print("Setting folder based on api key.")
set_folder_id(get_folder_ids()[1,]$id)
set_folder_id(get_default_folder()[1,]$id)
fold <- toString(get_folder_id())
if (is.null(fold)) {
print ("Unable to get the correct folder. Please check that your")
print ("API key is set correctly.")
if (is.null(fold) || fold == "") {
stop("Unable to set perma.cc folder. Make sure you API key is set using 'set_api_key(API_KEY)'")
}}
newlst <- lapply(url_list, archiv_url)
newlst <- lapply(url_list, archiv_perma)
print(newlst)
df <- data.frame(matrix(unlist(newlst), nrow=length(newlst), byrow=T))
colnames(df) <- c("url", "GUID", "timestamp", "perma_cc_url", "perma_cc_screenshot", "perma_cc_short_url")
@@ -110,27 +110,6 @@ archiv <- function (url_list, method="wayback") {
}
}

#' Save a batch of urls to a folder - THIS CURRENTLY DOES NOT WORK.
#' @import curl
#' @param url_list A vector of urls to archive.
#' @param api (Optional api key)
#' @param folder (Mandatory, but defaults to .folder_id)
archiv_batch <- function (url_list, api="", folder="") {
api_url <- paste0(.perma_cc_post_batch_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, urls=list_string(url_list), target_folder=folder)
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
result <- "Please input your api key:\nUse 'set_api_key(API_KEY)'"
} else if ((!(is.null(reply$error)))) {
result <- "Received an error reply, likely because your limit has been exceeded."
} else {
result <- reply$id
return(result)
}
}

#' Creates a json string from a list of urls.
#'
@@ -143,41 +122,39 @@ list_string <- function (url_list) {
return (paste0("'[", string, "]'"))
}

#' Saves a single url in either perma.cc or the wayback machine.
#' Saves a single url in perma.cc.
#'
#' @param arc_url The url to archive.
#' @param method Either "perma_cc" or the default, "wayback."
#' @importFrom jsonlite fromJSON
#' @import curl
#' @export
#' @return A list or object representing the result.
archiv_url <- function (arc_url, method="perma_cc") {
archiv_perma <- function (arc_url, method="perma_cc") {
api <- get_api_key()
fold <- toString(get_folder_id())
if (method == "perma_cc") {
folder_url <- paste0()
api_url <- paste0(.perma_cc_post_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, url = arc_url, folder = fold)
result <- list(arc_url, "noguid", "unknown", "no url", "no screenshot", "no short url")
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
print("Please input your api key:\nUse 'set_api_key(API_KEY)'")
} else if ((!(is.null(reply$error)))) {
print(reply)
print("Received an error reply, likely because your limit has been exceeded.")
} else {
if (!(is.null(reply$url == "Not a valid URL."))) {
result <- c(reply$url, reply$guid, reply$archive_timestamp,
reply$captures[1,]$playback_url, reply$captures[2,]$playback_url,
paste0("https://perma.cc/", reply$guid))
}
return(result)
if (is.null(api) || api == "") {
stop("API key not set for perma.cc. Use 'set_api_key() to set your key before using method='perma_cc'")
}
folder_url <- paste0()
api_url <- paste0(.perma_cc_post_api_url, api)
setting <- new_handle()
handle_setopt(setting, customrequest = "POST")
handle_setform(setting, url = arc_url, folder = fold)
result <- list(arc_url, "noguid", "unknown", "no url", "no screenshot", "no short url")
r <- curl_fetch_memory(api_url, setting)
reply <- fromJSON(rawToChar(r$content))
if ((!(is.null(reply$detail))) && reply$detail == "Authentication credentials were not provided.") {
stop("Please input your api key:\nUse 'set_api_key(API_KEY)'")
} else if ((!(is.null(reply$error)))) {
print(reply)
stop("Received an error reply, likely because your limit has been exceeded.")
} else {
if (!(is.null(reply$url == "Not a valid URL."))) {
result <- c(reply$url, reply$guid, reply$archive_timestamp,
reply$captures[1,]$playback_url, reply$captures[2,]$playback_url,
paste0("https://perma.cc/", reply$guid))
}
} else if (method == "wayback") {
return (archiv_wayback(arc_url))
return(result)
}
}

@@ -490,8 +467,8 @@ get_folder_id <- function () {
get_folder_ids <- function () {
perma_cc_key <- get_api_key()
reply <- NULL
if (is.null(perma_cc_key)) {
print("Please input your api key:\nUse 'set_api_key(API_KEY)'")
if (is.null(perma_cc_key) || perma_cc_key == "") {
stop("Please input your perma.cc api key: Use 'set_api_key(API_KEY)'")
reply <- FALSE
} else {
envelop = paste0(.perma_cc_user_url, perma_cc_key)
@@ -500,7 +477,7 @@ get_folder_ids <- function () {
for (row in 1:nrow(data))
reply <- rbind(reply, check_folder(data[row,]))
} else {
print ("Error in extracting root folders in Perma.cc.")
print ("Error in extracting root folders in perma.cc.")
}
}
return (reply)
@@ -1,22 +1,20 @@
# Archivr

Archivr is a project by the [Qualitative Data Repository](https://qdr.syr.edu/)
that verifies the preservation of urls in Web Archives.
that automates the preservation of urls in Web Archives.

Basic usage (for now):

```
git clone 'https://github.com/QualitativeDataRepository/archivr.git'
cd archivr
```
Then launch R and run:
## Installation

The easiest way to install is directly from this GitHub repository using the `devtools` package:

```
source('archivr.R')
view_archiv(list("www.example.com"))
library(devtools)
install_github("QualitativeDataRepository/archivr")
library(archivr)
```

Examples:
## Usage

The basic function is `archiv`, which takes a list of urls and stores them in
the Wayback Machine. It will return a dataframe containing the callback
@@ -31,13 +29,44 @@ arc_df$way_back_url
# 3 http://web.archive.org/web/20190128171134/https://github.com/ ...
```
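For orientation, a minimal end-to-end call might look like the sketch below (the urls are placeholders and the default `"wayback"` method is assumed):

```
library(archivr)
arc_df <- archiv(list("www.example.com", "https://github.com/"))
arc_df$way_back_url  # urls of the archived snapshots in the Wayback Machine
```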


Archivr can archive all the urls in a webpage. This feature is subject to the restrictions
imposed on accounts.

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archivr will also archive all the urls in a text file. It has been tested with docx,
pdf, and markdown, although other text-based formats should also work. Note that
text parsing can run into problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromText("path_to_file")
```

To allow for pre-processing of URLs before archiving, `archivr` also provides access to the functions used to extract URLs from a webpage (`extract_urls_from_webpage("URL")`), from a file (`extract_urls_from_text("filepath")`; tested for .docx, markdown, and pdf), and from any supported text file in a folder (`extract_urls_from_folder("filepath")`).
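A minimal sketch of that workflow (assuming `extract_urls_from_webpage` returns the urls as a character vector; the page is the same example used above):

```
urls <- extract_urls_from_webpage("https://qdr.syr.edu/")
urls <- urls[grepl("^https?://", urls)]  # example pre-processing: keep only absolute links
arc_df <- archiv(urls)
```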

### Checking archiving status

You can check whether URLs are archived by the Internet Archive's Wayback Machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```

### Using Perma.cc

If you wish to use perma.cc's archive, you will need to set your api key using:

```
set_api_key("YOUR_API_KEY")
```

If you wish to save the urls in a particular perma.cc folder, you will need to set the default
folder id using

```
@@ -50,6 +79,11 @@ using:
get_folder_ids()
```
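For example, once `get_folder_ids()` has listed your folders, the default can be set with a call along these lines (the id below is a placeholder):

```
set_folder_id(42)  # placeholder id; use one reported by get_folder_ids()
```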

You can check your current folder using
```
get_folder_id()
```

and then you can archive materials:
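For example (the urls below are placeholders):

```
arc_df <- archiv(list("www.example.com", "www.github.com"), "perma_cc")
```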

```
@@ -61,29 +95,6 @@ To check if a list of urls are archived in perma_cc's public api, use:
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```

or you may check the Wayback machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```

Archiv can archive all the urls in a webpage. This feature is subject to restrictions
imposed on accounts

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archiv will also archive all the urls in a text file. It has been tested for docx,
pdf and markdown, although other text-related files should also work. Note that
text parsing can be subject to problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromUrl("path_to_file")
```

### Archivr is a project developed by the Qualitative Data Repository at Syracuse
### University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.
**Archivr is a project developed by the Qualitative Data Repository at Syracuse
University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.**
