Copyediting. (#35)

archivesunleashed · Mar 7, 2019 · 5e0db61b41190a165d14b7ccb03d6e56d8ff9f56 · 5e0db61
1 parent 3a873cc
commit 5e0db61b41190a165d14b7ccb03d6e56d8ff9f56
Unified Split

Showing with 37 additions and 39 deletions.

+11 −12 auk-notebook-example.ipynb

+26 −27 auk-notebook.ipynb
diff --git a/auk-notebook-example.ipynb b/auk-notebook-example.ipynb
@@ -18,7 +18,7 @@
    "\n",
    "We recommend that you use `File > Make a Copy` first before changing the code in the repository. That way, you can always return to the basic visualizations we have offered here. Of course, you can also just re-download the Jupyter Notebook file from your Archives Unleashed Cloud account.\n",
    "\n",
-  
-    "### How Jupyter Notebooks Work:\n",
+  
+    "### How Jupyter Notebooks Work\n",
    "\n",
    "If you have no previous experience with Jupyter Notebooks, the most important thing to understand is that `<Shift> + <Enter/Return>` will run the Python code inside a cell and display its output below the cell.\n",
    "    \n",
@@ -112,8 +112,7 @@
    "\n",
    "TOP_COUNT = 30 \n",
    "\n",
-  
-    "# Domain suffixes to check non-U.S. domains so that (e.g.) www.google.co.uk \n",
-  
-    "# will become \"google\".\n",
+  
+    "# Domain suffixes to check non-U.S. domains so that (e.g.) www.google.co.uk will become \"google\".\n",
    "\n",
    "STOP_DOMAINS = [\"co\", \"org\", \"net\", \"edu\"] # Domain suffixes to remove.\n",
    "\n",
@@ -141,7 +140,7 @@
    "\n",
    "# Change if you want a different filename.\n",
    "\n",
-  
-    "OUTPUT_FILENAME = \"./filtered_text.txt\" # filename if you want to output to another file.\n",
+  
+    "OUTPUT_FILENAME = \"./filtered_text.txt\" # Filename if you want to output to another file.\n",
    "\n",
    "# Characters to show per text file in output.\n",
    "# Larger numbers will result in more text showing in output.\n",
@@ -169,7 +168,7 @@
   "source": [
    "## Archives Unleashed Cloud Python Library\n",
    "\n",
-  
-    "The below cell now sets up the functions that drive the analysis throughout this notebook. If you don't run it, you won't be able to work with the data. "
+  
+    "The cell below now sets up the functions that drive the analysis throughout this notebook. If you don't run it, you won't be able to work with the data. "
   ]
  },
  {
@@ -389,11 +388,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-  
-    "DOMAIN_BY = 'name' # change to 'sub' if you want to include subdomains.\n",
-  
-    "DOMAIN_EXCLUDE = ['google', 'facebook', 'youtube', 'apple'] # add items to this list to exclude from the collection.\n",
-  
-    "DOMAIN_FIGURE_SIZE = [10, 4] # change the width and height of your graph plot ([wdth, hgt]).\n",
-  
-    "DOMAIN_RESULTS = 30 # the number of results to plot.\n",
-  
-    "DOMAIN_BAR_WIDTH = 0.35 # the width of the bars in the histogram.\n",
+  
+    "DOMAIN_BY = 'name' # Change to 'sub' if you want to include subdomains.\n",
+  
+    "DOMAIN_EXCLUDE = ['google', 'facebook', 'youtube', 'apple'] # Add items to this list to exclude from the collection.\n",
+  
+    "DOMAIN_FIGURE_SIZE = [10, 4] # Change the width and height of your graph plot ([wdth, hgt]).\n",
+  
+    "DOMAIN_RESULTS = 30 # The number of results to plot.\n",
+  
+    "DOMAIN_BAR_WIDTH = 0.35 # The width of the bars in the histogram.\n",
    "DOMAIN_Y_LABEL = \"Number of occurences.\" # The label for the y axis.\n",
    "DOMAIN_TITLE = \"Top domains by count.\" # The title of the graph."
   ]
@@ -449,7 +448,7 @@
   "source": [
    "# Text Analysis\n",
    "\n",
-  
-    "The following set of functions use the [Natural Language Toolkit](https://www.nltk.org) Python library to search for the top most used words in the collection, as well as facilitate breaking it down by name or domain. "
+  
+    "The following set of functions use the [Natural Language Toolkit](https://www.nltk.org) Python library to identify the most frequently used words in the collection, as well as facilitate breaking it down by name or domain. "
   ]
  },
  {
@@ -1107,7 +1106,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  
-   "version": "3.7.2"
+  
+   "version": "3.7.1"
  }
 },
 "nbformat": 4,

diff --git a/auk-notebook.ipynb b/auk-notebook.ipynb
@@ -18,7 +18,7 @@
    "\n",
    "We recommend that you use `File > Make a Copy` first before changing the code in the repository. That way, you can always return to the basic visualizations we have offered here. Of course, you can also just re-download the Jupyter Notebook file from your Archives Unleashed Cloud account.\n",
    "\n",
-  
-    "### How Jupyter Notebooks Work:\n",
+  
+    "### How Jupyter Notebooks Work\n",
    "\n",
    "If you have no previous experience with Jupyter Notebooks, the most important thing to understand is that `<Shift> + <Enter/Return>` will run the Python code inside a cell and display its output below the cell.\n",
    "    \n",
@@ -112,8 +112,7 @@
    "\n",
    "TOP_COUNT = 30 \n",
    "\n",
-  
-    "# Domain suffixes to check non-U.S. domains so that (e.g.) www.google.co.uk \n",
-  
-    "# will become \"google\".\n",
+  
+    "# Domain suffixes to check non-U.S. domains so that (e.g.) www.google.co.uk will become \"google\".\n",
    "\n",
    "STOP_DOMAINS = [\"co\", \"org\", \"net\", \"edu\"] # Domain suffixes to remove.\n",
    "\n",
@@ -169,7 +168,7 @@
   "source": [
    "## Archives Unleashed Cloud Python Library\n",
    "\n",
-  
-    "The below cell now sets up the functions that drive the analysis throughout this notebook. If you don't run it, you won't be able to work with the data. "
+  
+    "The cell below now sets up the functions that drive the analysis throughout this notebook. If you don't run it, you won't be able to work with the data. "
   ]
  },
  {
@@ -181,8 +180,8 @@
    "def clean_domain(s):\n",
    "    \"\"\"Extracts the name from the domain (e.g. 'www.google.com' becomes 'google').\n",
    "    \n",
-  
-    "    :param: s: the domain name to clean.\n",
-  
-    "    :return: the relevant name.\n",
+  
    "    :param s: The domain name to clean.\n",
+  
+    "    :return: The relevant name.\n",
    "    \"\"\"\n",
    "    \n",
    "    ret = \"\"\n",
@@ -200,7 +199,7 @@
    "    \n",
    "    :param split_method: Either \"full\", \"name\" or \"sub\". \"name\" provides just the domain name, \n",
    "         \"sub\" produces the name with subdomains. \"full\" provides the entire name. \n",
-  
-    "    :return: a list of tuples containing (urlname, count).\n",
+  
+    "    :return: A list of tuples containing (urlname, count).\n",
    "    \"\"\"\n",
    "    \n",
    "    ret = []\n",
@@ -232,7 +231,7 @@
    "    \"\"\"Get the text from the files (by domain or year if desired).\n",
    "    \n",
    "    :param by: The output to return: \"all\", \"domain\" or \"year\".\n",
-  
-    "    :param minline: the minimum size of a line to be included in the output.\n",
+  
+    "    :param minline: The minimum size of a line to be included in the output.\n",
    "    :return: [({year or domain}, textString)] if by is 'domain' or 'year', otherwise [textString].\n",
    "    \"\"\"\n",
    "    \n",
@@ -257,26 +256,26 @@
    "def get_text_tokens (minlen=MINIMUM_WORD_LENGTH) :\n",
    "    \"\"\"Get the data and tokenize the text.\n",
    "    \n",
-  
-    "    :param minlen: the minimum word size to be included in the list of words.\n",
-  
-    "    :return: a list of words included in the text file.\n",
+  
+    "    :param minlen: The minimum word size to be included in the list of words.\n",
+  
+    "    :return: A list of words included in the text file.\n",
    "    \"\"\"\n",
    "    \n",
    "    return [x.lower() for x in word_tokenize(' '.join(get_text())) if len(x) > minlen]\n",
    "\n",
    "def get_tokens_domains(minlen=MINIMUM_WORD_LENGTH):\n",
    "    \"\"\"Get tokens by domain.\n",
    "    \n",
-  
-    "    :param minlen: the minimum word size to be included in the list of words.\n",
-  
-    "    :return: a list of tuples with (domain, Counter).\n",
+  
+    "    :param minlen: The minimum word size to be included in the list of words.\n",
+  
+    "    :return: A list of tuples with (domain, Counter).\n",
    "    \"\"\"\n",
    "    \n",
    "    return [(x[0], Counter([y for y in word_tokenize(x[1]) if len(y) > minlen])) for x in get_text(\"domain\")]\n",
    "\n",
    "def get_tokens_years(minlen=MINIMUM_WORD_LENGTH):\n",
    "    \"\"\"Get tokens by year.\n",
    "    \n",
-  
-    "    :para minlen: the minimum word size to be included in the list of words.\n",
-  
-    "    :return: a list of tuples with (year, Counter).\n",
+  
    "    :param minlen: The minimum word size to be included in the list of words.\n",
+  
+    "    :return: A list of tuples with (year, Counter).\n",
    "    \"\"\"\n",
    "    \n",
    "    return [(x[0], Counter([y for y in word_tokenize(x[1]) if len(y) > minlen])) for x in get_text(\"year\")]\n",
@@ -300,7 +299,7 @@
    "    :param fun: A function that returns a list of (key, Counter([tokenized_list])).\n",
    "    :param total: The number of top tokens to return for each key.\n",
    "    :param minlen: The minimum word length.\n",
-  
-    "    :return: list of minlen tokens by fun.\n",
+  
+    "    :return: List of minlen tokens by fun.\n",
    "    \"\"\"\n",
    "    \n",
    "    sep = dict()\n",
@@ -354,8 +353,8 @@
    "def sentiment_scores(by=\"domain\"):\n",
    "    \"\"\" Calculates sentiment scores for a body of text.\n",
    "    \n",
-  
-    "    :param by: either \"year\" or \"domain\".\n",
-  
-    "    :return: a list of tuples with (year/domain, (\"neg\", score), (\"neu\", score) etc.).\n",
+  
+    "    :param by: Either \"year\" or \"domain\".\n",
+  
+    "    :return: A list of tuples with (year/domain, (\"neg\", score), (\"neu\", score) etc.).\n",
    "    \"\"\"\n",
    "    \n",
    "    sep = dict()\n",
@@ -389,11 +388,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-  
-    "DOMAIN_BY = 'name' # change to 'sub' if you want to include subdomains.\n",
-  
-    "DOMAIN_EXCLUDE = ['google', 'facebook', 'youtube', 'apple'] # add items to this list to exclude from the collection.\n",
-  
-    "DOMAIN_FIGURE_SIZE = [10, 4] # change the width and height of your graph plot ([wdth, hgt]).\n",
-  
-    "DOMAIN_RESULTS = 30 # the number of results to plot.\n",
-  
-    "DOMAIN_BAR_WIDTH = 0.35 # the width of the bars in the histogram.\n",
+  
+    "DOMAIN_BY = 'name' # Change to 'sub' if you want to include subdomains.\n",
+  
+    "DOMAIN_EXCLUDE = ['google', 'facebook', 'youtube', 'apple'] # Add items to this list to exclude from the collection.\n",
+  
+    "DOMAIN_FIGURE_SIZE = [10, 4] # Change the width and height of your graph plot ([wdth, hgt]).\n",
+  
+    "DOMAIN_RESULTS = 30 # The number of results to plot.\n",
+  
+    "DOMAIN_BAR_WIDTH = 0.35 # The width of the bars in the histogram.\n",
    "DOMAIN_Y_LABEL = \"Number of occurences.\" # The label for the y axis.\n",
    "DOMAIN_TITLE = \"Top domains by count.\" # The title of the graph."
   ]
@@ -436,7 +435,7 @@
   "source": [
    "# Text Analysis\n",
    "\n",
-  
-    "The following set of functions use the [Natural Language Toolkit](https://www.nltk.org) Python library to search for the top most used words in the collection, as well as facilitate breaking it down by name or domain. "
+  
+    "The following set of functions use the [Natural Language Toolkit](https://www.nltk.org) Python library to identify the most frequently used words in the collection, as well as facilitate breaking it down by name or domain. "
   ]
  },
  {
@@ -776,9 +775,9 @@
   "outputs": [],
   "source": [
    "NETWORK_GRAPH_FIGURE_SIZE = [25, 25] # Change the size of the plot.\n",
-  
-    "NETWORK_NODE_SIZE = 100 # increase or decrease the node size for the graph.\n",
-  
-    "NETWORK_FONT_SIZE = 10 # increase or decrease the font size for the graph.\n",
-  
-    "NETWORK_SHOW_LABELS = True # change to False if you do not want to see the labels."
+  
+    "NETWORK_NODE_SIZE = 100 # Increase or decrease the node size for the graph.\n",
+  
+    "NETWORK_FONT_SIZE = 10 # Increase or decrease the font size for the graph.\n",
+  
+    "NETWORK_SHOW_LABELS = True # Change to False if you do not want to see the labels."
   ]
  },
  {
@@ -903,7 +902,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  
-   "version": "3.7.2"
+  
+   "version": "3.7.1"
  }
 },
 "nbformat": 4,