going back to aws to revise about page collection

euphonic committed Jan 28, 2019
1 parent 9ee4ecc commit 5148aab5d264449b6bdf4430807b7a8ae574f64f
Showing 4,962 changed files with 529,845 additions and 522,710 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
BIN +0 Bytes (100%) code/.DS_Store
Binary file not shown.
@@ -2,10 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create simple web measures, including number of pages and mean words per page\n",
@@ -19,15 +17,15 @@
"import sys\n",
"import pprint\n",
"import pymongo\n",
"import csv"
"import csv\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Setting up database connection information\n",
@@ -42,10 +40,8 @@
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"measures = {} # key is firm\n",
@@ -54,38 +50,44 @@
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def get_pages ():\n",
" target_col = db[TARGET_COLLECTION]\n",
" pipeline = [ { \"$group\": {\"_id\":\"$firm_name\" , \"number\":{\"$sum\":1}} } ]\n",
" pipeline = [ { \"$match\": { \"firm_name\" : { \"$exists\": \"true\", \"$ne\": \"null\" }} },\n",
" { \"$group\": {\"_id\":\"$firm_name\" , \"number\":{\"$sum\":1}} } ]\n",
" pages_by_firm_name = list(target_col.aggregate(pipeline))\n",
" print ('Found ' + str(len(pages_by_firm_name)) + ' firm names with pages')\n",
" return pages_by_firm_name\n",
"\n",
"def print_measures():\n",
" f_out = open(OUTF, 'w')\n",
" csv_out = csv.writer(f_out)\n",
" for firm in measures:\n",
" firm_name = firm['firm_name']\n",
" pages = firm['firm_name']['pages']\n",
" csv_out.writerow([firm_name, pages])"
" csv_out.writerow(['firm_name', 'num_pages'])\n",
" for firm_name, m in measures.items():\n",
" pages = m['pages']\n",
" csv_out.writerow([firm_name, pages])\n",
" \n",
"# standard firm cleaning regex\n",
"def clean_firm_name (firm):\n",
" firm_clnd = re.sub('(\\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', '', firm, flags=re.IGNORECASE).rstrip()\n",
" return firm_clnd"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1187 firm names with pages\n"
"Found 1186 firm names with pages\n"
]
}
],
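
For reference, a minimal standalone sketch of the two pieces added in this cell: the aggregation pipeline used by get_pages(), whose new $match stage is meant to filter out documents lacking a firm_name before the $group stage counts one page per document, and the clean_firm_name() suffix-stripping helper. The sample firm names and the record shape shown in comments are illustrative, not taken from the project's database.

    import re

    # Same pipeline shape as in get_pages(): keep only documents carrying a
    # firm_name, then count one page per document, grouped by firm_name.
    pipeline = [
        {"$match": {"firm_name": {"$exists": "true", "$ne": "null"}}},
        {"$group": {"_id": "$firm_name", "number": {"$sum": 1}}},
    ]
    # With a live pymongo connection this would run as:
    #   pages_by_firm_name = list(db[TARGET_COLLECTION].aggregate(pipeline))
    # and each record looks roughly like {"_id": ["Acme Corporation"], "number": 42}
    # (firm_name appears to be stored as an array, hence rec['_id'][0] later on).

    # Suffix/punctuation stripping applied before firm names become dict keys.
    def clean_firm_name(firm):
        return re.sub(
            r'(\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)',
            '', firm, flags=re.IGNORECASE).rstrip()

    for name in ["Acme Corporation", "Foo, Inc.", "Bar GmbH"]:
        print(name, "->", clean_firm_name(name))
    # Acme Corporation -> Acme
    # Foo, Inc. -> Foo
    # Bar GmbH -> Bar
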
@@ -95,16 +97,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# pp.pprint(pages_by_firm_name)\n",
"for rec in pages_by_firm_name:\n",
" firm_name = clean_firm_name(rec['_id'][0])\n",
" measures[firm_name] = {}\n",
" measures[firm_name]['pages'] = int(rec['number'])\n",
"# pp.pprint(measures)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"pp.pprint(pages_by_firm_name)\n",
"for rec in grouped_by_firm_name:\n",
" firm_name = rec['_id'][0]\n",
" measures[firm_name]['pages'] = rec['number']"
"print_measures()"
]
}
],
@@ -117,14 +128,14 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
@@ -2,10 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Create simple web measures, including number of pages and mean words per page\n",
@@ -19,15 +17,15 @@
"import sys\n",
"import pprint\n",
"import pymongo\n",
"import csv"
"import csv\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Setting up database connection information\n",
@@ -42,10 +40,8 @@
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"measures = {} # key is firm\n",
@@ -55,9 +51,7 @@
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def get_pages ():\n",
@@ -74,13 +68,20 @@
" csv_out.writerow(['firm_name', 'num_pages'])\n",
" for firm_name, m in measures.items():\n",
" pages = m['pages']\n",
" csv_out.writerow([firm_name, pages])"
" csv_out.writerow([firm_name, pages])\n",
" \n",
"# standard firm cleaning regex\n",
"def clean_firm_name (firm):\n",
" firm_clnd = re.sub('(\\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', '', firm, flags=re.IGNORECASE).rstrip()\n",
" return firm_clnd"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
@@ -96,21 +97,21 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# pp.pprint(pages_by_firm_name)\n",
"for rec in pages_by_firm_name:\n",
" firm_name = rec['_id'][0]\n",
" firm_name = clean_firm_name(rec['_id'][0])\n",
" measures[firm_name] = {}\n",
" measures[firm_name]['pages'] = int(rec['number'])\n",
"# pp.pprint(measures)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -127,14 +128,14 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
BIN +0 Bytes (100%) code/crawler/.DS_Store
Binary file not shown.
@@ -1,4 +1,4 @@
- # this script creates a matrix of firm level patent measures
+ # this script creates a matrix of firm level patent measures for use in MSM

library(dplyr)
library(extrafont)
@@ -6,11 +6,9 @@ library(extrafont)
# font_import()
loadfonts(device = "pdf")
library (ggplot2)
library("ggpubr")

# MacOS
setwd("/Users/sarora/dev/EAGER/data/patents/measures")
# Windows
# setwd("C:\\Users\\sarora\\Documents\\GitHub\\EAGER\\data\\patents\\measures")

# load data
in.ass_all <- read.csv("assignees_overall.csv", header = TRUE, stringsAsFactors = FALSE)
@@ -25,14 +23,14 @@ in.pat_all <- read.csv("patents_overall.csv", header = TRUE, stringsAsFactors =
in.ass_first_year <- read.csv("assignees_first-year.csv", header = TRUE, stringsAsFactors = FALSE)
in.lookup <- read.csv("assignee-2-patent-lookup.csv", header = TRUE, stringsAsFactors = FALSE)

in.eager_assignee <- read.csv("..\\eager_assignee.csv", header = TRUE, stringsAsFactors = FALSE)
in.eager_assignee <- read.csv("..//eager_assignee.csv", header = TRUE, stringsAsFactors = FALSE)
in.eager_assignee$employees <- as.numeric(gsub(",", "", in.eager_assignee$employees))
in.eager_assignee$sme <- in.eager_assignee$employees
in.eager_assignee[which(in.eager_assignee$employees > 500 & in.eager_assignee$thes_types=="Corporate"), 5] <- 0
in.eager_assignee[which(in.eager_assignee$employees < 500 & !is.na(in.eager_assignee$employees) & in.eager_assignee$thes_types=="Corporate"), 5] <- 1
View(in.eager_assignee)

in.web_pages <- read.csv("..\\..\\analysis\\measures\\simple_web_measures_v1.csv", header = TRUE, stringsAsFactors = FALSE)
in.web_pages <- read.csv("..//..//analysis//measures//simple_web_measures_v1.csv", header = TRUE, stringsAsFactors = FALSE)

# number of small vs large firms
head(in.pat_all)
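
The sme recoding above starts from the raw employee count and then flags Corporate assignees as large (sme = 0) above 500 employees and small (sme = 1) below 500; rows with missing counts, exactly 500 employees, or a non-Corporate type keep the raw value. A rough pandas equivalent with made-up rows (the real data come from eager_assignee.csv):

    import numpy as np
    import pandas as pd

    # Made-up rows standing in for the employees / thes_types columns of eager_assignee.csv.
    df = pd.DataFrame({
        "organization_clnd": ["Acme", "Globex", "Initech"],
        "employees": [12000, 250, np.nan],
        "thes_types": ["Corporate", "Corporate", "Academic"],
    })

    # Mirror the R recoding: sme starts as the raw employee count, then Corporate
    # firms are flagged 0 (large) above 500 employees or 1 (small) below 500.
    df["sme"] = df["employees"]
    corp = df["thes_types"] == "Corporate"
    df.loc[(df["employees"] > 500) & corp, "sme"] = 0
    df.loc[(df["employees"] < 500) & df["employees"].notna() & corp, "sme"] = 1
    print(df)
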
@@ -70,10 +68,10 @@ mean_assignees_all_by_size

# average number of assignees 3 industries
head (in.ass_3)
- mean_assignees_3_by_size <- in.ass_3 %>% inner_join(in.lookup, by = c("patent_id" = "id")) %>% group_by(organization_clnd) %>%
+ mean_assignees_all_by_size <- in.ass_3 %>% inner_join(in.lookup, by = c("patent_id" = "id")) %>% group_by(organization_clnd) %>%
summarize(mean = mean( count.pa.assignee_id., na.rm=TRUE)) %>% inner_join(in.eager_assignee, by = c("organization_clnd")) %>% group_by(sme) %>%
summarize(mean = mean(mean, na.rm=TRUE))
- mean_assignees_3_by_size
+ mean_assignees_all_by_size

# average number of inventors all
head (in.inv_all)
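
The dplyr chain a few lines above joins patent-level assignee counts to the assignee-to-patent lookup, averages them per cleaned organization, attaches the firm attributes, and then averages those means by SME status. A rough pandas sketch of the same shape, with made-up inputs (column names follow the R data frames; the real inputs are the CSVs loaded at the top of the script):

    import pandas as pd

    # Made-up stand-ins for in.ass_3, in.lookup, and in.eager_assignee.
    ass_3 = pd.DataFrame({"patent_id": [1, 2, 3], "count.pa.assignee_id.": [1, 2, 1]})
    lookup = pd.DataFrame({"id": [1, 2, 3], "organization_clnd": ["Acme", "Acme", "Globex"]})
    eager_assignee = pd.DataFrame({"organization_clnd": ["Acme", "Globex"], "sme": [0, 1]})

    # inner join on patent id, mean assignees per organization, join firm
    # attributes, then mean of those means by SME status.
    per_org = (ass_3.merge(lookup, left_on="patent_id", right_on="id")
                    .groupby("organization_clnd")["count.pa.assignee_id."]
                    .mean()
                    .reset_index(name="mean"))
    by_size = (per_org.merge(eager_assignee, on="organization_clnd")
                      .groupby("sme")["mean"]
                      .mean())
    print(by_size)
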
@@ -120,10 +118,31 @@ theme.eager_chart_SMALLM <- theme.eager_chart +

g1.df <- patents_web_emps %>% arrange(employees) %>% as.data.frame()
head (g1.df)
- ggplot(data=g1.df, aes(x=employees, y=num_pages)) +
- geom_point(alpha=.4, size=4, color="#880011") +
- labs(x="Employees", y="Number of pages") +
+ nrow (g1.df)
+ g1.a <- ggplot(data=g1.df, aes(x=employees, y=num_pages)) +
+ geom_point(alpha=.4, size=4, color="#0037ff") +
+ labs(x="Employees", y="Number of pages\nat depth of 1") +
scale_x_continuous(labels=comma, limits=c(0,10000)) +
scale_y_continuous(limits=c(0,400), breaks=seq(0,400,by=100)) +
geom_smooth(method = "lm") +
theme.eager_chart_SCATTER
g1.a
ggsave("../../analysis/emps_x_pages.png")

g1.b <- ggplot(data=g1.df, aes(x=count.p.id., y=num_pages)) +
geom_point(alpha=.4, size=4, color="#7b00ff") +
labs(x="Patents", y="Number ofpages\nat depth of 1") +
scale_x_continuous(labels=comma, limits=c(0,40000)) +
scale_y_continuous(limits=c(0,400), breaks=seq(0,400,by=100)) +
geom_smooth(method = "lm") +
theme.eager_chart_SCATTER
g1.df
g1.b
ggsave("../../analysis/patents_x_pages.png")

# variables are not normally distributed
shapiro.test(g1.df$employees)
shapiro.test(g1.df$num_pages)
shapiro.test(g1.df$count.p.id.)

cor.test(g1.df$employees, g1.df$num_pages, alternative="two.sided", method="kendall" )
cor.test(g1.df$count.p.id., g1.df$num_pages, alternative="two.sided", method="kendall" )
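
The Shapiro-Wilk tests above indicate that employees, num_pages, and the patent counts are not normally distributed, which is why the correlations use Kendall's tau (a rank-based measure) rather than Pearson's r. A small scipy sketch of the same check-then-correlate pattern on synthetic right-skewed data (not the EAGER measures):

    import numpy as np
    from scipy import stats

    # Synthetic right-skewed stand-ins for the employees and num_pages columns of g1.df.
    rng = np.random.default_rng(0)
    employees = rng.lognormal(mean=5.0, sigma=1.0, size=200)
    num_pages = 5 + 0.01 * employees + rng.lognormal(mean=2.0, sigma=0.8, size=200)

    # Shapiro-Wilk: small p-values cast doubt on the normality assumption behind
    # Pearson's r, motivating a non-parametric alternative.
    print(stats.shapiro(employees))
    print(stats.shapiro(num_pages))

    # Kendall's tau: rank-based, two-sided by default, no normality assumption.
    tau, p_value = stats.kendalltau(employees, num_pages)
    print(f"tau = {tau:.3f}, p = {p_value:.3g}")
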
