
Develop Archivr into a package and add auto documentation.

Fix README.md
greebie committed Jan 28, 2019
1 parent d43e594 commit c9e627dc6beb4d392a03199b15f73af204a0ccba
DESCRIPTION
@@ -0,0 +1,25 @@
Package: archivr
Title: Archivr - Save Your Websites in Perma.cc or the Wayback Machine
Version: 0.0.1
Authors@R: c(person("Ryan", "Deschamps", email = "ryan.deschamps@gmail.com",
role = c("aut", "cre")),
person("Qualitative Data", "Repository", email = "qdr@syr.edu",
role = c("own", "pat")),
person("Sebastian", "Karcher", email = "skarcher@syr.edu",
role = c("edt", "exp")))
Description: Archivr is a toolkit for the long-term archiving of qualitative data.
    It takes a list of urls and uses either the Perma.cc or Wayback Machine
    archives to store the webpages for future reference. It will also parse
    Word or HTML documents for urls to be archived.
Maintainer: Ryan Deschamps <ryan.deschamps@gmail.com>
Depends: R (>= 3.1.0)
Imports:
rvest,
readtext,
xml2,
stringr,
curl
License: MIT + file LICENSE
LazyData: true
RoxygenNote: 6.1.0.9000
Encoding: UTF-8
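
With the package metadata above in place, archivr can be installed as a regular package rather than sourced as a script. A minimal sketch, assuming the source lives in a `greebie/archivr` repository on GitHub (the path is a guess from the committer's handle):

```
# install.packages("devtools")                 # if devtools is not installed
devtools::install_github("greebie/archivr")    # repository path is an assumption
library(archivr)
```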
NAMESPACE
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(archiv)
export(archiv.fromText)
export(archiv.fromUrl)
export(archiv_url)
export(archiv_wayback)
export(get_folder_ids)
export(set_api_key)
export(set_folder_id)
export(view_archiv)
export(view_archiv.fromText)
export(view_archiv.fromUrl)
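
These exports make up the public interface; once the package is loaded, each can be called directly. A quick sketch using the calls shown in the README below:

```
library(archivr)
arc_df <- archiv(list("www.example.com"))        # archive on the Wayback Machine
chk_df <- view_archiv(list("www.example.com"))   # check existing archives
```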
archivr.R
@@ -19,35 +19,16 @@
#' THE SOFTWARE.


# The following may not be necessary once packaging is set up.
needed_pkgs <- c("jsonlite", "xml2", "rvest", "stringr", "readtext",
                 "curl", "pander", "textreadr")
for (pkg in needed_pkgs) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg, repos = "http://cran.us.r-project.org")
  }
}
#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-term archiving of qualitative data.
#' It takes a list of urls and uses either the Perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' Word or HTML documents for urls to be archived.
#' @docType package
#' @name archivr
NULL

library(readtext)
library(textreadr)
library(jsonlite)
library(xml2)
library(rvest)
@@ -102,6 +83,7 @@ get_default_folder <- function (default=1) {
#'
#' @param url_list A list of urls to archive.
#' @param method Either "wayback" or "perma_cc." Defaults to "wayback."
#' @export
#' @return A dataframe containing the original urls, the urls to the
#' archived website, the screenshot and a timestamp.
archiv <- function (url_list, method="wayback") {
@@ -153,6 +135,7 @@ list_string <- function (url_list) {
#'
#' @param arc_url The url to archive.
#' @param method Either "perma_cc" or the default, "wayback."
#' @export
#' @return A list or object representing the result.
archiv_url <- function (arc_url, fold=.perma_cc_folder_id, api=.perma_cc_key, method="perma_cc") {
if (method == "perma_cc") {
@@ -184,6 +167,7 @@ archiv_url <- function (arc_url, fold=.perma_cc_folder_id, api=.perma_cc_key, me

#' Save a url on the wayback machine.
#' @param arc_url The url to archive.
#' @export
#' @return A list or object representing the result.
archiv_wayback <- function (arc_url) {
envelop <- paste0(.wb_save_url, arc_url)
@@ -202,6 +186,7 @@ archiv_wayback <- function (arc_url) {
#'
#' @param lst A list of urls to check.
#' @param method "wayback", "perma_cc" or "both".
#' @export
#' @return A dataframe containing the original urls, their http status,
#' availability, the archive url if it exists and a timestamp for the last
#' web crawl.
@@ -243,6 +228,7 @@ view_archiv <- function (lst, method="wayback") {
#'
#' @param url The url to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
view_archiv.fromUrl <- function (url, method="wayback") {
@@ -253,6 +239,7 @@ view_archiv.fromUrl <- function (url, method="wayback") {
#'
#' @param fp The filepath to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
view_archiv.fromText <- function (fp, method="wayback") {
@@ -263,6 +250,7 @@ view_archiv.fromText <- function (fp, method="wayback") {
#'
#' @param url The url to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
archiv.fromUrl <- function (url, method="wayback") {
@@ -273,6 +261,7 @@ archiv.fromUrl <- function (url, method="wayback") {
#'
#' @param fp The filepath to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
archiv.fromText <- function (fp, method="wayback") {
@@ -326,6 +315,7 @@ from_perma_cc <- function (url) {
#' Set the API key for the Perma.cc API, if required.
#'
#' @param key The API key.
#' @export
set_api_key <- function (key) {
.perma_cc_key <<- key
}
@@ -335,6 +325,7 @@ set_api_key <- function (key) {
#' @param id The folder id. This will be a string of numbers. If you do not
#'   know your folder id, get_folder_ids() will output a complete list of
#'   folders.
#' @export
#' @return TRUE
set_folder_id <- function (id) {
.perma_cc_folder_id <<- id
@@ -440,6 +431,7 @@ get_subfolders <- function (id) {
}

#' Get the folder ids starting from the default folder.
#' @export
#' @return A list of vectors with the top folder and all its children.
get_folder_ids <- function () {
reply <- NULL
README.md
@@ -13,50 +13,77 @@ Then launch R and then:

```
source('archivr.R')
archiv(list("www.example.com"))
view_archiv(list("www.example.com"))
```

Examples:

The basic function is `archiv`, which takes a list of urls and stores them in
the Wayback Machine. It will return a dataframe containing the callback
data for the service.

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"))
arc_df$perma_cc_url # [1] //perma-archives.org/warc/G5EH-JA9M/http://www.google.com
# [2] no url
# [3] //perma-archives.org/warc/QD3H-3FHP/http://www.example.org
# 3 Levels: //perma-archives.org/warc/G5EH-JA9M/http://www.google.com ...
arc_df$wayback_url
# wayback_url
# 1 http://web.archive.org/web/20190128171132/http://www.example.com
# 2 url not found
# 3 http://web.archive.org/web/20190128171134/https://github.com/ ...
```

If you wish to use Perma.cc's archive, you will need to set your API key using:

```
set_api_key("YOUR_API_KEY")
```

If you wish to save the urls in a particular Perma.cc folder, you will need to
set the default folder id using:

```
set_folder_id("FOLDER_ID")
```

If you do not remember the ids of your folders, you can retrieve these in a dataframe
using:
```
get_folder_ids()
```

and then you can archive materials:

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```
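
Putting the Perma.cc steps together, a typical session looks roughly like the sketch below; inspect the dataframe returned by `get_folder_ids()` before choosing an id, since its exact columns are not documented here:

```
set_api_key("YOUR_API_KEY")    # personal Perma.cc API key
folders <- get_folder_ids()    # the top folder and all its children
print(folders)                 # find the id of the target folder
set_folder_id("FOLDER_ID")     # id copied from the output above
arc_df <- archiv(list("www.example.com"), "perma_cc")
```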

To check whether a list of urls is archived in Perma.cc's public API, use:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```

or you may check the Wayback Machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```
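
Per the function documentation, the returned dataframe holds the original urls, their http status, availability, the archive url if one exists, and a timestamp for the last web crawl, so dead entries can be filtered out afterwards. A sketch; the column names here are assumptions, so check `names(arc_url_df)` first:

```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL"), "wayback")
names(arc_url_df)                            # confirm the actual column names
arc_url_df[arc_url_df$available == TRUE, ]   # 'available' is an assumed name
```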

Archivr can also archive all the urls in a webpage. This feature is subject to
restrictions imposed on accounts.

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archivr will also archive all the urls in a text file. It has been tested with docx,
pdf and markdown, although other text-related formats should also work. Note that
text parsing can run into problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromText("path_to_file")
```
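
To look up existing archives for the links in a document rather than create new ones, the exported `view_archiv.fromText` takes the same filepath argument. A minimal sketch:

```
chk_df <- view_archiv.fromText("path_to_file.docx", "both")
```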



## TESTING

Archivr includes a small set of unit tests for contributors. To run them, use
`r -f run_tests.R` inside the archivr folder.

### Archivr is a project developed by the Qualitative Data Repository at Syracuse University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.
