Skip to content
Permalink
Browse files

Sorts by date; calculating "churn"

  • Loading branch information...
ianmilligan1 committed May 6, 2019
1 parent 6170530 commit af55aee41714eaef4370b4dbcf16b3ccf54fc61a
Showing with 155 additions and 22 deletions.
  1. +155 −22 edit_distances.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -24,17 +24,19 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# read the file and set it up (using first four columns, no header)\n",
"df = pd.read_csv(file, sep = \",\", usecols=[0,1,2,3], header=None)"
"# then sort the rows by date (column 0)\n",
"df = pd.read_csv(file, sep = \",\", usecols=[0,1,2,3], header=None)\n",
"df = df.sort_values(0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -55,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 37,
"metadata": {},
"outputs": [
{
@@ -64,7 +66,7 @@
"'http://www.ndp.ca/contact/'"
]
},
"execution_count": 6,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
@@ -75,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@@ -86,7 +88,27 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(urls)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@@ -96,15 +118,15 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"391\n",
"430\n"
"276\n",
"391\n"
]
}
],
@@ -116,26 +138,33 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(20061020 (20080113\n",
"391\n",
"(20080113 (20060523\n",
"430\n"
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n",
"430\n",
"(20060523 (20061020\n",
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n",
"276\n",
"(20061020 (20080113\n",
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n",
"391\n"
]
}
],
"source": [
"# all together, just need to change the urls list value now\n",
"# test: calculate the edit distances for a single URL (urls[0])\n",
"data = dups[dups[2].str.contains(urls[0])][3].tolist()\n",
"url = dups[dups[2].str.contains(urls[0])][2].tolist()\n",
"dates = dups[dups[2].str.contains(urls[0])][0].tolist()\n",
"# start at 1 so each capture is compared with the one before it;\n",
"# with x = 0, x-1 is -1 and the last capture is compared with the first\n",
"for x in range(1,len(data)):\n",
"    print(dates[x-1],dates[x])\n",
"    print(url[x-1],url[x])\n",
"    print(nltk.edit_distance(data[x-1],data[x]))\n",
" \n",
"# shows edit distance between two dates\n",
@@ -144,10 +173,114 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(20060523 (20061020\n",
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n",
"276\n",
"(20061020 (20080113\n",
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n",
"391\n",
"(20060523 (20060920\n",
"http://www.ndp.ca/image/tid/146 http://www.ndp.ca/image/tid/146\n",
"578\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3498 http://www.ndp.ca/page/3498\n",
"663\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3508 http://www.ndp.ca/page/3508\n",
"0\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3551 http://www.ndp.ca/page/3551\n",
"378\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3565 http://www.ndp.ca/page/3565\n",
"378\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3567 http://www.ndp.ca/page/3567\n",
"696\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3568 http://www.ndp.ca/page/3568\n",
"378\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3582 http://www.ndp.ca/page/3582\n",
"378\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3586 http://www.ndp.ca/page/3586\n",
"414\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3610 http://www.ndp.ca/page/3610\n",
"388\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3672 http://www.ndp.ca/page/3672\n",
"518\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3680 http://www.ndp.ca/page/3680\n",
"518\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3689 http://www.ndp.ca/page/3689\n",
"378\n",
"(20070118 (20080611\n",
"http://www.ndp.ca/page/3725 http://www.ndp.ca/page/3725\n",
"378\n",
"(20060523 (20080512\n",
"http://www.ndp.ca/page/3751 http://www.ndp.ca/page/3751\n",
"809\n",
"(20060523 (20070118\n",
"http://www.ndp.ca/page/3753 http://www.ndp.ca/page/3753\n",
"324\n",
"(20060920 (20080512\n",
"http://www.ndp.ca/page/4014 http://www.ndp.ca/page/4014\n",
"669\n",
"(20080113 (20080611\n",
"http://www.ndp.ca/page/4711 http://www.ndp.ca/page/4711\n",
"157\n",
"(20070118 (20130505\n",
"http://www.ndp.ca/volunteer http://www.ndp.ca/volunteer\n",
"1251\n"
]
}
],
"source": [
"# Total 'churn': for each URL, add up the edit distance between every pair\n",
"# of consecutive captures. Pairs follow the row order of dups, so this\n",
"# assumes dups keeps the date ordering applied to df -- TODO confirm.\n",
"\n",
"churn = 0\n",
"for y, target in enumerate(urls):\n",
"    # Filter once per URL instead of three times. regex=False matches the\n",
"    # URL as a literal substring; by default str.contains treats it as a\n",
"    # regular expression, where '.', '?' and '+' are metacharacters.\n",
"    captures = dups[dups[2].str.contains(target, regex=False)]\n",
"    data = captures[3].tolist()\n",
"    url = captures[2].tolist()\n",
"    dates = captures[0].tolist()\n",
"    # start at 1 so each capture is compared with the one before it\n",
"    for x in range(1, len(data)):\n",
"        print(dates[x-1], dates[x])\n",
"        print(url[x-1], url[x])\n",
"        distance = nltk.edit_distance(data[x-1], data[x])\n",
"        print(distance)\n",
"        churn += distance"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9920"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"churn"
]
},
{
"cell_type": "code",

0 comments on commit af55aee

Please sign in to comment.
You can’t perform that action at this time.