
Develop Archivr into a package and add auto documentation.

Fix README.md
greebie committed Jan 28, 2019
1 parent d43e594 commit c9e627dc6beb4d392a03199b15f73af204a0ccba
DESCRIPTION
@@ -0,0 +1,25 @@
Package: archivr
Title: Archivr - Save Your Websites in Perma.cc or the Wayback Machine
Version: 0.0.1
Authors@R: c(person("Ryan", "Deschamps", email = "ryan.deschamps@gmail.com",
role = c("aut", "cre")),
person("Qualitative Data", "Repository", email = "qdr@syr.edu",
role = c("own", "pat")),
person("Sebastian", "Karcher", email = "skarcher@syr.edu",
role = c("edt", "exp")))
Description: Archivr is a toolkit for the long-term archiving of qualitative data.
    It takes a list of urls and uses either the Perma.cc or Wayback Machine
    archives to store the webpages for future reference. It will also parse
    Word or HTML documents for urls to be archived.
Maintainer: Ryan Deschamps <ryan.deschamps@gmail.com>
Depends: R (>= 3.1.0)
Imports:
rvest,
readtext,
xml2,
stringr,
curl
License: MIT + file LICENSE
LazyData: true
RoxygenNote: 6.1.0.9000
Encoding: UTF-8
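
With the package metadata above in place, archivr can be installed as a regular package rather than sourced as a script. A minimal sketch, assuming the source lives in a `greebie/archivr` repository on GitHub (the path is a guess from the committer's handle):

```
# install.packages("devtools")                 # if devtools is not installed
devtools::install_github("greebie/archivr")    # repository path is an assumption
library(archivr)
```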
NAMESPACE
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(archiv)
export(archiv.fromText)
export(archiv.fromUrl)
export(archiv_url)
export(archiv_wayback)
export(get_folder_ids)
export(set_api_key)
export(set_folder_id)
export(view_archiv)
export(view_archiv.fromText)
export(view_archiv.fromUrl)
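
These exports make up the public interface; once the package is loaded, each can be called directly. A quick sketch using the calls shown in the README below:

```
library(archivr)
arc_df <- archiv(list("www.example.com"))        # archive on the Wayback Machine
chk_df <- view_archiv(list("www.example.com"))   # check existing archives
```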
archivr.R
@@ -19,35 +19,16 @@
#' THE SOFTWARE.


# The following may not be necessary once packaging is set up.
needed_pkgs <- c("jsonlite", "xml2", "rvest", "stringr", "readtext",
                 "curl", "pander", "textreadr")
for (pkg in needed_pkgs) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg, repos = "http://cran.us.r-project.org")
  }
}
#' Archivr: Save Your Websites in Perma.cc or the Wayback Machine
#'
#' Archivr is a toolkit for the long-term archiving of qualitative data.
#' It takes a list of urls and uses either the Perma.cc or Wayback Machine
#' archives to store the webpages for future reference. It will also parse
#' Word or HTML documents for urls to be archived.
#' @docType package
#' @name archivr
NULL

library(readtext)
library(textreadr)
library(jsonlite)
library(xml2)
library(rvest)
@@ -102,6 +83,7 @@ get_default_folder <- function (default=1) {
#'
#' @param url_list A list of urls to archive.
#' @param method Either "wayback" or "perma_cc." Defaults to "wayback."
#' @export
#' @return A dataframe containing the original urls, the urls to the
#' archived website, the screenshot and a timestamp.
archiv <- function (url_list, method="wayback") {
@@ -153,6 +135,7 @@ list_string <- function (url_list) {
#'
#' @param arc_url The url to archive.
#' @param method Either "perma_cc" or the default, "wayback."
#' @export
#' @return A list or object representing the result.
archiv_url <- function (arc_url, fold=.perma_cc_folder_id, api=.perma_cc_key, method="perma_cc") {
if (method == "perma_cc") {
@@ -184,6 +167,7 @@ archiv_url <- function (arc_url, fold=.perma_cc_folder_id, api=.perma_cc_key, me

#' Save a url on the wayback machine.
#' @param arc_url The url to archive.
#' @export
#' @return A list or object representing the result.
archiv_wayback <- function (arc_url) {
envelop <- paste0(.wb_save_url, arc_url)
@@ -202,6 +186,7 @@ archiv_wayback <- function (arc_url) {
#'
#' @param lst A list of urls to check.
#' @param method "wayback", "perma_cc" or "both".
#' @export
#' @return A dataframe containing the original urls, their http status,
#' availability, the archive url if it exists and a timestamp for the last
#' web crawl.
@@ -243,6 +228,7 @@ view_archiv <- function (lst, method="wayback") {
#'
#' @param url The url to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
view_archiv.fromUrl <- function (url, method="wayback") {
@@ -253,6 +239,7 @@ view_archiv.fromUrl <- function (url, method="wayback") {
#'
#' @param fp The filepath to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
view_archiv.fromText <- function (fp, method="wayback") {
@@ -263,6 +250,7 @@ view_archiv.fromText <- function (fp, method="wayback") {
#'
#' @param url The url to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
archiv.fromUrl <- function (url, method="wayback") {
@@ -273,6 +261,7 @@ archiv.fromUrl <- function (url, method="wayback") {
#'
#' @param fp The filepath to extract links from.
#' @param method Either "wayback," "perma_cc" or "both".
#' @export
#' @return a dataframe containing the url, status, availability,
#' archived url(s) and timestamp(s)
archiv.fromText <- function (fp, method="wayback") {
@@ -326,6 +315,7 @@ from_perma_cc <- function (url) {
#' Set the API key for the Perma.cc API, if required.
#'
#' @param key The API key.
#' @export
set_api_key <- function (key) {
.perma_cc_key <<- key
}
@@ -335,6 +325,7 @@ set_api_key <- function (key) {
#' @param id The folder id. This will be a string of numbers. If you do not
#'   know your folder id, get_folder_ids() will output a complete list of
#'   folders.
#' @export
#' @return TRUE
set_folder_id <- function (id) {
.perma_cc_folder_id <<- id
@@ -440,6 +431,7 @@ get_subfolders <- function (id) {
}

#' Get the folder ids starting from the default folder.
#' @export
#' @return A list of vectors with the top folder and all its children.
get_folder_ids <- function () {
reply <- NULL
README.md
@@ -13,50 +13,77 @@ Then launch R and then:

```
source('archivr.R')
archiv(list("www.example.com"))
view_archiv(list("www.example.com"))
```

Examples:

The basic function is `archiv`, which takes a list of urls and stores them in
the Wayback Machine. It will return a dataframe containing the callback
data for the service.

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"))
arc_df$perma_cc_url # [1] //perma-archives.org/warc/G5EH-JA9M/http://www.google.com
# [2] no url
# [3] //perma-archives.org/warc/QD3H-3FHP/http://www.example.org
# 3 Levels: //perma-archives.org/warc/G5EH-JA9M/http://www.google.com ...
arc_df$wayback_url
# wayback_url
# 1 http://web.archive.org/web/20190128171132/http://www.example.com
# 2 url not found
# 3 http://web.archive.org/web/20190128171134/https://github.com/ ...
```

If you wish to use Perma.cc's archive, you will need to set your API key using:

```
set_api_key("YOUR_API_KEY")
```

If you wish to save the urls in a particular Perma.cc folder, you will need to
set the default folder id using:

```
set_folder_id("FOLDER_ID")
```

If you do not remember the ids of your folders, you can retrieve these in a dataframe
using:
```
get_folder_ids()
```

and then you can archive materials:

```
arc_df <- archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```
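
Putting the Perma.cc steps together, a typical session looks roughly like the sketch below; inspect the dataframe returned by `get_folder_ids()` before choosing an id, since its exact columns are not documented here:

```
set_api_key("YOUR_API_KEY")    # personal Perma.cc API key
folders <- get_folder_ids()    # the top folder and all its children
print(folders)                 # find the id of the target folder
set_folder_id("FOLDER_ID")     # id copied from the output above
arc_df <- archiv(list("www.example.com"), "perma_cc")
```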

To check whether a list of urls is archived in Perma.cc's public API, use:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "perma_cc")
```

or you may check the Wayback Machine:
```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL", "www.github.com"), "wayback")
```
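
Per the function documentation, the returned dataframe holds the original urls, their http status, availability, the archive url if one exists, and a timestamp for the last web crawl, so dead entries can be filtered out afterwards. A sketch; the column names here are assumptions, so check `names(arc_url_df)` first:

```
arc_url_df <- view_archiv(list("www.example.com", "NOTAURL"), "wayback")
names(arc_url_df)                            # confirm the actual column names
arc_url_df[arc_url_df$available == TRUE, ]   # 'available' is an assumed name
```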

Archivr can also archive all the urls in a webpage. This feature is subject to
restrictions imposed on accounts.

```
arc_url_df <- archiv.fromUrl("https://qdr.syr.edu/")
df <- data.frame(arc_url_df$url, arc_url_df$wayback_url)[8,]
# arc_url_df.url arc_url_df.wayback_url
# 8 http://syr.edu http://web.archive.org/web/20170110050058/http://syr.edu/
```

Archivr will also archive all the urls in a text file. It has been tested with docx,
pdf and markdown, although other text-related formats should also work. Note that
text parsing can run into problems, especially if the document has rich features
such as tables or columns.
```
arc_url_df <- archiv.fromText("path_to_file")
```
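
To look up existing archives for the links in a document rather than create new ones, the exported `view_archiv.fromText` takes the same filepath argument. A minimal sketch:

```
chk_df <- view_archiv.fromText("path_to_file.docx", "both")
```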



## TESTING

Archivr includes a small set of unit tests for contributors. To run them, use
`r -f run_tests.R` inside the archivr folder.

### Archivr is a project developed by the Qualitative Data Repository at Syracuse University, authored by Ryan Deschamps (greebie on github.com) and Agile Humanities.
