going back to aws to revise about page collection

euphonic committed Jan 28, 2019
1 parent 9ee4ecc commit 5148aab5d264449b6bdf4430807b7a8ae574f64f
Showing 4,962 changed files with 529,845 additions and 522,710 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
BIN +0 Bytes (100%) code/.DS_Store
Binary file not shown.
@@ -2,10 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create simple web measures, including number of pages and mean words per page\n",
@@ -19,15 +17,15 @@
"import sys\n",
"import pprint\n",
"import pymongo\n",
"import csv"
"import csv\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Setting up database connection information\n",
@@ -42,10 +40,8 @@
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"measures = {} # key is firm\n",
@@ -54,38 +50,44 @@
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def get_pages ():\n",
" target_col = db[TARGET_COLLECTION]\n",
" pipeline = [ { \"$group\": {\"_id\":\"$firm_name\" , \"number\":{\"$sum\":1}} } ]\n",
" pipeline = [ { \"$match\": { \"firm_name\" : { \"$exists\": \"true\", \"$ne\": \"null\" }} },\n",
" { \"$group\": {\"_id\":\"$firm_name\" , \"number\":{\"$sum\":1}} } ]\n",
" pages_by_firm_name = list(target_col.aggregate(pipeline))\n",
" print ('Found ' + str(len(pages_by_firm_name)) + ' firm names with pages')\n",
" return pages_by_firm_name\n",
"\n",
"def print_measures():\n",
" f_out = open(OUTF, 'w')\n",
" csv_out = csv.writer(f_out)\n",
" for firm in measures:\n",
" firm_name = firm['firm_name']\n",
" pages = firm['firm_name']['pages']\n",
" csv_out.writerow([firm_name, pages])"
" csv_out.writerow(['firm_name', 'num_pages'])\n",
" for firm_name, m in measures.items():\n",
" pages = m['pages']\n",
" csv_out.writerow([firm_name, pages])\n",
" \n",
"# standard firm cleaning regex\n",
"def clean_firm_name (firm):\n",
" firm_clnd = re.sub('(\\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', '', firm, flags=re.IGNORECASE).rstrip()\n",
" return firm_clnd"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1187 firm names with pages\n"
"Found 1186 firm names with pages\n"
]
}
],
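
For reference, a minimal standalone sketch of the two pieces added in this cell: the aggregation pipeline used by get_pages(), whose new $match stage is meant to filter out documents lacking a firm_name before the $group stage counts one page per document, and the clean_firm_name() suffix-stripping helper. The sample firm names and the record shape shown in comments are illustrative, not taken from the project's database.

    import re

    # Same pipeline shape as in get_pages(): keep only documents carrying a
    # firm_name, then count one page per document, grouped by firm_name.
    pipeline = [
        {"$match": {"firm_name": {"$exists": "true", "$ne": "null"}}},
        {"$group": {"_id": "$firm_name", "number": {"$sum": 1}}},
    ]
    # With a live pymongo connection this would run as:
    #   pages_by_firm_name = list(db[TARGET_COLLECTION].aggregate(pipeline))
    # and each record looks roughly like {"_id": ["Acme Corporation"], "number": 42}
    # (firm_name appears to be stored as an array, hence rec['_id'][0] later on).

    # Suffix/punctuation stripping applied before firm names become dict keys.
    def clean_firm_name(firm):
        return re.sub(
            r'(\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)',
            '', firm, flags=re.IGNORECASE).rstrip()

    for name in ["Acme Corporation", "Foo, Inc.", "Bar GmbH"]:
        print(name, "->", clean_firm_name(name))
    # Acme Corporation -> Acme
    # Foo, Inc. -> Foo
    # Bar GmbH -> Bar
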
@@ -95,16 +97,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# pp.pprint(pages_by_firm_name)\n",
"for rec in pages_by_firm_name:\n",
" firm_name = clean_firm_name(rec['_id'][0])\n",
" measures[firm_name] = {}\n",
" measures[firm_name]['pages'] = int(rec['number'])\n",
"# pp.pprint(measures)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"pp.pprint(pages_by_firm_name)\n",
"for rec in grouped_by_firm_name:\n",
" firm_name = rec['_id'][0]\n",
" measures[firm_name]['pages'] = rec['number']"
"print_measures()"
]
}
],
@@ -117,14 +128,14 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
@@ -2,10 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create simple web measures, including number of pages and mean words per page\n",
@@ -19,15 +17,15 @@
"import sys\n",
"import pprint\n",
"import pymongo\n",
"import csv"
"import csv\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Setting up database connection information\n",
@@ -42,10 +40,8 @@
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"measures = {} # key is firm\n",
@@ -55,9 +51,7 @@
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def get_pages ():\n",
@@ -74,13 +68,20 @@
" csv_out.writerow(['firm_name', 'num_pages'])\n",
" for firm_name, m in measures.items():\n",
" pages = m['pages']\n",
" csv_out.writerow([firm_name, pages])"
" csv_out.writerow([firm_name, pages])\n",
" \n",
"# standard firm cleaning regex\n",
"def clean_firm_name (firm):\n",
" firm_clnd = re.sub('(\\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', '', firm, flags=re.IGNORECASE).rstrip()\n",
" return firm_clnd"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
@@ -96,21 +97,21 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# pp.pprint(pages_by_firm_name)\n",
"for rec in pages_by_firm_name:\n",
" firm_name = rec['_id'][0]\n",
" firm_name = clean_firm_name(rec['_id'][0])\n",
" measures[firm_name] = {}\n",
" measures[firm_name]['pages'] = int(rec['number'])\n",
"# pp.pprint(measures)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -127,14 +128,14 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
BIN +0 Bytes (100%) code/crawler/.DS_Store
Binary file not shown.
@@ -1,4 +1,4 @@
- # this script creates a matrix of firm level patent measures
+ # this script creates a matrix of firm level patent measures for use in MSM

library(dplyr)
library(extrafont)
@@ -6,11 +6,9 @@ library(extrafont)
# font_import()
loadfonts(device = "pdf")
library (ggplot2)
library("ggpubr")

# MacOS
setwd("/Users/sarora/dev/EAGER/data/patents/measures")
# Windows
# setwd("C:\\Users\\sarora\\Documents\\GitHub\\EAGER\\data\\patents\\measures")

# load data
in.ass_all <- read.csv("assignees_overall.csv", header = TRUE, stringsAsFactors = FALSE)
@@ -25,14 +23,14 @@ in.pat_all <- read.csv("patents_overall.csv", header = TRUE, stringsAsFactors =
in.ass_first_year <- read.csv("assignees_first-year.csv", header = TRUE, stringsAsFactors = FALSE)
in.lookup <- read.csv("assignee-2-patent-lookup.csv", header = TRUE, stringsAsFactors = FALSE)

in.eager_assignee <- read.csv("..\\eager_assignee.csv", header = TRUE, stringsAsFactors = FALSE)
in.eager_assignee <- read.csv("..//eager_assignee.csv", header = TRUE, stringsAsFactors = FALSE)
in.eager_assignee$employees <- as.numeric(gsub(",", "", in.eager_assignee$employees))
in.eager_assignee$sme <- in.eager_assignee$employees
in.eager_assignee[which(in.eager_assignee$employees > 500 & in.eager_assignee$thes_types=="Corporate"), 5] <- 0
in.eager_assignee[which(in.eager_assignee$employees < 500 & !is.na(in.eager_assignee$employees) & in.eager_assignee$thes_types=="Corporate"), 5] <- 1
View(in.eager_assignee)

in.web_pages <- read.csv("..\\..\\analysis\\measures\\simple_web_measures_v1.csv", header = TRUE, stringsAsFactors = FALSE)
in.web_pages <- read.csv("..//..//analysis//measures//simple_web_measures_v1.csv", header = TRUE, stringsAsFactors = FALSE)

# number of small vs large firms
head(in.pat_all)
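
The sme recoding above starts from the raw employee count and then flags Corporate assignees as large (sme = 0) above 500 employees and small (sme = 1) below 500; rows with missing counts, exactly 500 employees, or a non-Corporate type keep the raw value. A rough pandas equivalent with made-up rows (the real data come from eager_assignee.csv):

    import numpy as np
    import pandas as pd

    # Made-up rows standing in for the employees / thes_types columns of eager_assignee.csv.
    df = pd.DataFrame({
        "organization_clnd": ["Acme", "Globex", "Initech"],
        "employees": [12000, 250, np.nan],
        "thes_types": ["Corporate", "Corporate", "Academic"],
    })

    # Mirror the R recoding: sme starts as the raw employee count, then Corporate
    # firms are flagged 0 (large) above 500 employees or 1 (small) below 500.
    df["sme"] = df["employees"]
    corp = df["thes_types"] == "Corporate"
    df.loc[(df["employees"] > 500) & corp, "sme"] = 0
    df.loc[(df["employees"] < 500) & df["employees"].notna() & corp, "sme"] = 1
    print(df)
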
@@ -70,10 +68,10 @@ mean_assignees_all_by_size

# average number of assignees 3 industries
head (in.ass_3)
- mean_assignees_3_by_size <- in.ass_3 %>% inner_join(in.lookup, by = c("patent_id" = "id")) %>% group_by(organization_clnd) %>%
+ mean_assignees_all_by_size <- in.ass_3 %>% inner_join(in.lookup, by = c("patent_id" = "id")) %>% group_by(organization_clnd) %>%
summarize(mean = mean( count.pa.assignee_id., na.rm=TRUE)) %>% inner_join(in.eager_assignee, by = c("organization_clnd")) %>% group_by(sme) %>%
summarize(mean = mean(mean, na.rm=TRUE))
- mean_assignees_3_by_size
+ mean_assignees_all_by_size

# average number of inventors all
head (in.inv_all)
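
The dplyr chain a few lines above joins patent-level assignee counts to the assignee-to-patent lookup, averages them per cleaned organization, attaches the firm attributes, and then averages those means by SME status. A rough pandas sketch of the same shape, with made-up inputs (column names follow the R data frames; the real inputs are the CSVs loaded at the top of the script):

    import pandas as pd

    # Made-up stand-ins for in.ass_3, in.lookup, and in.eager_assignee.
    ass_3 = pd.DataFrame({"patent_id": [1, 2, 3], "count.pa.assignee_id.": [1, 2, 1]})
    lookup = pd.DataFrame({"id": [1, 2, 3], "organization_clnd": ["Acme", "Acme", "Globex"]})
    eager_assignee = pd.DataFrame({"organization_clnd": ["Acme", "Globex"], "sme": [0, 1]})

    # inner join on patent id, mean assignees per organization, join firm
    # attributes, then mean of those means by SME status.
    per_org = (ass_3.merge(lookup, left_on="patent_id", right_on="id")
                    .groupby("organization_clnd")["count.pa.assignee_id."]
                    .mean()
                    .reset_index(name="mean"))
    by_size = (per_org.merge(eager_assignee, on="organization_clnd")
                      .groupby("sme")["mean"]
                      .mean())
    print(by_size)
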
@@ -120,10 +118,31 @@ theme.eager_chart_SMALLM <- theme.eager_chart +

g1.df <- patents_web_emps %>% arrange(employees) %>% as.data.frame()
head (g1.df)
- ggplot(data=g1.df, aes(x=employees, y=num_pages)) +
- geom_point(alpha=.4, size=4, color="#880011") +
- labs(x="Employees", y="Number of pages") +
+ nrow (g1.df)
+ g1.a <- ggplot(data=g1.df, aes(x=employees, y=num_pages)) +
+ geom_point(alpha=.4, size=4, color="#0037ff") +
+ labs(x="Employees", y="Number of pages\nat depth of 1") +
scale_x_continuous(labels=comma, limits=c(0,10000)) +
scale_y_continuous(limits=c(0,400), breaks=seq(0,400,by=100)) +
geom_smooth(method = "lm") +
theme.eager_chart_SCATTER
g1.a
ggsave("../../analysis/emps_x_pages.png")

g1.b <- ggplot(data=g1.df, aes(x=count.p.id., y=num_pages)) +
geom_point(alpha=.4, size=4, color="#7b00ff") +
labs(x="Patents", y="Number ofpages\nat depth of 1") +
scale_x_continuous(labels=comma, limits=c(0,40000)) +
scale_y_continuous(limits=c(0,400), breaks=seq(0,400,by=100)) +
geom_smooth(method = "lm") +
theme.eager_chart_SCATTER
g1.df
g1.b
ggsave("../../analysis/patents_x_pages.png")

# variables are not normally distributed
shapiro.test(g1.df$employees)
shapiro.test(g1.df$num_pages)
shapiro.test(g1.df$count.p.id.)

cor.test(g1.df$employees, g1.df$num_pages, alternative="two.sided", method="kendall" )
cor.test(g1.df$count.p.id., g1.df$num_pages, alternative="two.sided", method="kendall" )
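
The Shapiro-Wilk tests above indicate that employees, num_pages, and the patent counts are not normally distributed, which is why the correlations use Kendall's tau (a rank-based measure) rather than Pearson's r. A small scipy sketch of the same check-then-correlate pattern on synthetic right-skewed data (not the EAGER measures):

    import numpy as np
    from scipy import stats

    # Synthetic right-skewed stand-ins for the employees and num_pages columns of g1.df.
    rng = np.random.default_rng(0)
    employees = rng.lognormal(mean=5.0, sigma=1.0, size=200)
    num_pages = 5 + 0.01 * employees + rng.lognormal(mean=2.0, sigma=0.8, size=200)

    # Shapiro-Wilk: small p-values cast doubt on the normality assumption behind
    # Pearson's r, motivating a non-parametric alternative.
    print(stats.shapiro(employees))
    print(stats.shapiro(num_pages))

    # Kendall's tau: rank-based, two-sided by default, no normality assumption.
    tau, p_value = stats.kendalltau(employees, num_pages)
    print(f"tau = {tau:.3f}, p = {p_value:.3g}")
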
