|
@@ -2,7 +2,7 @@ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"execution_count": 32, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -12,7 +12,7 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"execution_count": 33, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -24,17 +24,19 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 3, |
|
|
|
"execution_count": 34, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# read the file and set it up (using first four columns, no header)\n", |
|
|
|
"df = pd.read_csv(file, sep = \",\", usecols=[0,1,2,3], header=None)" |
|
|
|
"# then sorts them by date\n", |
|
|
|
"df = pd.read_csv(file, sep = \",\", usecols=[0,1,2,3], header=None)\n", |
|
|
|
"df = df.sort_values(0)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 4, |
|
|
|
"execution_count": 35, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -45,7 +47,7 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 5, |
|
|
|
"execution_count": 36, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -55,7 +57,7 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 6, |
|
|
|
"execution_count": 37, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
@@ -64,7 +66,7 @@ |
|
|
|
"'http://www.ndp.ca/contact/'" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 6, |
|
|
|
"execution_count": 37, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
@@ -75,7 +77,7 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 7, |
|
|
|
"execution_count": 38, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -86,7 +88,27 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 8, |
|
|
|
"execution_count": 39, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"20" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 39, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"len(urls)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 40, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
@@ -96,15 +118,15 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 9, |
|
|
|
"execution_count": 41, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"391\n", |
|
|
|
"430\n" |
|
|
|
"276\n", |
|
|
|
"391\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
@@ -116,26 +138,33 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 10, |
|
|
|
"execution_count": 42, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"(20061020 (20080113\n", |
|
|
|
"391\n", |
|
|
|
"(20080113 (20060523\n", |
|
|
|
"430\n" |
|
|
|
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n", |
|
|
|
"430\n", |
|
|
|
"(20060523 (20061020\n", |
|
|
|
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n", |
|
|
|
"276\n", |
|
|
|
"(20061020 (20080113\n", |
|
|
|
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n", |
|
|
|
"391\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# all together, just need to change the urls list value now\n", |
|
|
|
"# this is a test of just calculating the distance on one\n", |
|
|
|
"data = dups[dups[2].str.contains(urls[0])][3].tolist()\n", |
|
|
|
"url = dups[dups[2].str.contains(urls[0])][2].tolist()\n", |
|
|
|
"dates = dups[dups[2].str.contains(urls[0])][0].tolist()\n", |
|
|
|
"for x in range(1,len(data)):\n", |
|
|
|
"for x in range(0,len(data)):\n", |
|
|
|
" print(dates[x-1],dates[x])\n", |
|
|
|
" print(url[x-1],url[x])\n", |
|
|
|
" print(nltk.edit_distance(data[x-1],data[x]))\n", |
|
|
|
" \n", |
|
|
|
"# shows edit distance between two dates\n", |
|
@@ -144,10 +173,114 @@ |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"execution_count": 43, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"(20060523 (20061020\n", |
|
|
|
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n", |
|
|
|
"276\n", |
|
|
|
"(20061020 (20080113\n", |
|
|
|
"http://www.ndp.ca/contact/ http://www.ndp.ca/contact/\n", |
|
|
|
"391\n", |
|
|
|
"(20060523 (20060920\n", |
|
|
|
"http://www.ndp.ca/image/tid/146 http://www.ndp.ca/image/tid/146\n", |
|
|
|
"578\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3498 http://www.ndp.ca/page/3498\n", |
|
|
|
"663\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3508 http://www.ndp.ca/page/3508\n", |
|
|
|
"0\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3551 http://www.ndp.ca/page/3551\n", |
|
|
|
"378\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3565 http://www.ndp.ca/page/3565\n", |
|
|
|
"378\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3567 http://www.ndp.ca/page/3567\n", |
|
|
|
"696\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3568 http://www.ndp.ca/page/3568\n", |
|
|
|
"378\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3582 http://www.ndp.ca/page/3582\n", |
|
|
|
"378\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3586 http://www.ndp.ca/page/3586\n", |
|
|
|
"414\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3610 http://www.ndp.ca/page/3610\n", |
|
|
|
"388\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3672 http://www.ndp.ca/page/3672\n", |
|
|
|
"518\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3680 http://www.ndp.ca/page/3680\n", |
|
|
|
"518\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3689 http://www.ndp.ca/page/3689\n", |
|
|
|
"378\n", |
|
|
|
"(20070118 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/3725 http://www.ndp.ca/page/3725\n", |
|
|
|
"378\n", |
|
|
|
"(20060523 (20080512\n", |
|
|
|
"http://www.ndp.ca/page/3751 http://www.ndp.ca/page/3751\n", |
|
|
|
"809\n", |
|
|
|
"(20060523 (20070118\n", |
|
|
|
"http://www.ndp.ca/page/3753 http://www.ndp.ca/page/3753\n", |
|
|
|
"324\n", |
|
|
|
"(20060920 (20080512\n", |
|
|
|
"http://www.ndp.ca/page/4014 http://www.ndp.ca/page/4014\n", |
|
|
|
"669\n", |
|
|
|
"(20080113 (20080611\n", |
|
|
|
"http://www.ndp.ca/page/4711 http://www.ndp.ca/page/4711\n", |
|
|
|
"157\n", |
|
|
|
"(20070118 (20130505\n", |
|
|
|
"http://www.ndp.ca/volunteer http://www.ndp.ca/volunteer\n", |
|
|
|
"1251\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# the mother of all loops\n", |
|
|
|
"\n", |
|
|
|
"churn=0\n", |
|
|
|
"for y in range(0,len(urls)): \n", |
|
|
|
" data = dups[dups[2].str.contains(urls[y])][3].tolist()\n", |
|
|
|
" url = dups[dups[2].str.contains(urls[y])][2].tolist()\n", |
|
|
|
" dates = dups[dups[2].str.contains(urls[y])][0].tolist()\n", |
|
|
|
" for x in range(1,len(data)):\n", |
|
|
|
" print(dates[x-1],dates[x])\n", |
|
|
|
" print(url[x-1],url[x])\n", |
|
|
|
" distance = nltk.edit_distance(data[x-1],data[x])\n", |
|
|
|
" print(distance)\n", |
|
|
|
" churn = churn+distance" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 45, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"9920" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 45, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"churn" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
0 comments on commit
af55aee