Showing 6 changed files with 324 additions and 0 deletions.

- +9 −0 .gitignore
- +21 −0 LICENSE
- +1 −0 NwalaTextUtils/__init__.py
- +257 −0 NwalaTextUtils/textutils.py
- +7 −0 README.md
- +29 −0 setup.py
@@ -0,0 +1,9 @@
# Compiled python modules.
.DS_Store
*.pyc

/build/
/dist/
/*.egg-info
/*.egg
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 ODU Web Science / Digital Libraries Research Group

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1 @@
__version__ = '0.0.1'
@@ -0,0 +1,257 @@
import os
import re
import requests
import sys
import time

from boilerpipe.extract import Extractor
from bs4 import BeautifulSoup
from multiprocessing import Pool

def genericErrorInfo():

    exc_type, exc_obj, exc_tb = sys.exc_info()
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]

    errorMessage = fname + ', ' + str(exc_tb.tb_lineno) + ', ' + str(sys.exc_info())
    print('\tERROR:', errorMessage)

    return sys.exc_info()

#html proc - start
def getCustomHeaderDict():

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }

    return headers

def isSizeLimitExceed(responseHeaders, sizeRestrict):

    if( 'Content-Length' in responseHeaders ):
        if( int(responseHeaders['Content-Length']) > sizeRestrict ):
            return True

    return False

def downloadSave(response, outfile):

    try:
        with open(outfile, 'wb') as dfile:
            for chunk in response.iter_content(chunk_size=1024):
                # write the response to the output file one chunk at a time
                if(chunk):
                    dfile.write(chunk)
    except:
        genericErrorInfo()

def mimicBrowser(uri, getRequestFlag=True, extraParams=None):

    uri = uri.strip()
    if( len(uri) == 0 ):
        return ''

    if( extraParams is None ):
        extraParams = {}

    extraParams.setdefault('timeout', 10)
    extraParams.setdefault('sizeRestrict', -1)
    extraParams.setdefault('headers', getCustomHeaderDict())
    extraParams.setdefault('addResponseHeader', False)

    try:
        response = ''
        responseText = ''
        if( getRequestFlag ):

            if( 'saveFilePath' in extraParams ):
                response = requests.get(uri, headers=extraParams['headers'], timeout=extraParams['timeout'], stream=True)#, verify=False
            else:
                response = requests.get(uri, headers=extraParams['headers'], timeout=extraParams['timeout'])#, verify=False

            if( extraParams['sizeRestrict'] != -1 ):
                if( isSizeLimitExceed(response.headers, extraParams['sizeRestrict']) ):
                    return 'Error: Exceeded size restriction: ' + str(extraParams['sizeRestrict'])

            if( 'saveFilePath' in extraParams ):
                downloadSave(response, extraParams['saveFilePath'])
            else:
                responseText = response.text

            if( extraParams['addResponseHeader'] ):
                return {'responseHeader': response.headers, 'text': responseText}

            return responseText
        else:
            response = requests.head(uri, headers=extraParams['headers'], timeout=extraParams['timeout'])#, verify=False
            response.headers['status-code'] = response.status_code
            return response.headers
    except:
        genericErrorInfo()
        print('\tquery is: ', uri)
        if( getRequestFlag == False ):
            return {}

    return ''

def dereferenceURI(URI, maxSleepInSeconds=5, extraParams=None):

    URI = URI.strip()
    if( len(URI) == 0 ):
        return ''

    if( extraParams is None ):
        extraParams = {}

    htmlPage = ''
    try:
        if( maxSleepInSeconds > 0 ):
            print('\tderef.URI(), sleep:', maxSleepInSeconds)
            time.sleep(maxSleepInSeconds)

        extraParams.setdefault('sizeRestrict', 4000000)
        htmlPage = mimicBrowser(URI, extraParams=extraParams)
    except:
        genericErrorInfo()

    return htmlPage

def extractPageTitleFromHTML(html):

    title = ''
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('title')

        if( title is None ):
            title = ''
        else:
            title = title.text.strip()
    except:
        genericErrorInfo()

    return title

def cleanHtml(html, method='python-boilerpipe'):

    if( len(html) == 0 ):
        return ''

    #parallelizing this has been problematic, possibly due to: https://stackoverflow.com/questions/8804830/python-multiprocessing-pickling-error
    if( method == 'python-boilerpipe' ):
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except:
            genericErrorInfo()
    elif( method == 'nltk' ):
        """
        Copied from the NLTK package.
        Remove HTML markup from the given string.
        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)

        #my addition to remove blank lines
        cleaned = re.sub(r"\n\s*\n*", "\n", cleaned)

        return cleaned.strip()

    return ''

def prlGetTxtFrmURIs(urisLst):

    size = len(urisLst)
    if( size == 0 ):
        return []

    docsLst = []
    jobsLst = []
    for i in range(size):

        printMsg = ''

        if( i % 10 == 0 ):
            printMsg = '\tderef uri i: ' + str(i) + ' of ' + str(size)

        keywords = {
            'URI': urisLst[i],
            'maxSleepInSeconds': 0
        }

        jobsLst.append({
            'func': dereferenceURI,
            'args': keywords,
            'misc': False,
            'print': printMsg
        })

    resLst = parallelTask(jobsLst)
    # enumerate so i tracks each result instead of reusing the stale index
    # left over from the loop above
    for i, res in enumerate(resLst):

        text = cleanHtml( res['output'] )

        docsLst.append({
            'text': text,
            'id': urisLst[i],
            'title': extractPageTitleFromHTML( res['output'] ),
            'uri': res['input']['args']['URI']
        })

    return docsLst
#html proc - end

#parallel proc - start
def parallelProxy(job):

    output = job['func'](**job['args'])

    if( 'print' in job ):
        if( len(job['print']) != 0 ):
            print(job['print'])

    return {'input': job, 'output': output, 'misc': job['misc']}

def parallelTask(jobsLst, threadCount=5):

    if( len(jobsLst) == 0 ):
        return []

    if( threadCount < 2 ):
        threadCount = 2

    try:
        workers = Pool(threadCount)
        resLst = workers.map(parallelProxy, jobsLst)

        workers.close()
        workers.join()
    except:
        genericErrorInfo()
        return []

    return resLst
#parallel proc - end
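
`parallelTask` consumes a list of job dictionaries: the callable goes under `'func'`, its keyword arguments under `'args'`, an opaque pass-through value under `'misc'`, and an optional progress message under `'print'`. A minimal sketch of driving it directly, assuming the package is importable; the URIs below are hypothetical placeholders:

```
from NwalaTextUtils.textutils import dereferenceURI, parallelTask

def buildJobs(uris):
    jobs = []
    for uri in uris:
        jobs.append({
            'func': dereferenceURI,                        # callable executed by a worker
            'args': {'URI': uri, 'maxSleepInSeconds': 0},  # kwargs passed to the callable
            'misc': None,                                  # echoed back untouched in the result
            'print': 'dereferencing: ' + uri               # optional progress message
        })
    return jobs

if __name__ == '__main__':
    # placeholder URIs for illustration
    jobs = buildJobs(['https://example.com/a', 'https://example.com/b'])
    for res in parallelTask(jobs, threadCount=2):
        # each result wraps the original job ('input') and its return value ('output')
        print(res['input']['args']['URI'], '->', len(res['output']), 'chars of HTML')
```

Because `parallelTask` uses `multiprocessing.Pool`, the job callables must be picklable module-level functions, and driver code belongs under an `if __name__ == '__main__':` guard.
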
@@ -0,0 +1,7 @@
# NwalaTextUtils

Collection of text processing Python functions.
## Installation
```
$ pip install NwalaTextUtils
```
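
A minimal usage sketch of the main entry point, `prlGetTxtFrmURIs`, which dereferences a list of URIs in parallel and returns their de-boilerplated text; the URIs below are placeholders:

```
from NwalaTextUtils.textutils import prlGetTxtFrmURIs

if __name__ == '__main__':
    # placeholder URIs; any list of web-page URIs works
    uris = ['https://example.com/story-one', 'https://example.com/story-two']

    docs = prlGetTxtFrmURIs(uris)
    for doc in docs:
        # each dict carries the extracted plaintext, the page title, and the source URI
        print(doc['uri'], '->', doc['title'], '(', len(doc['text']), 'chars )')
```
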
@@ -0,0 +1,29 @@
#!/usr/bin/env python

from setuptools import setup, find_packages
from NwalaTextUtils import __version__

desc = """Collection of functions for processing text"""

setup(
    name='NwalaTextUtils',
    version=__version__,
    description=desc,
    long_description='See: https://github.com/oduwsdl/NwalaTextUtils/',
    author='Alexander C. Nwala',
    author_email='anwala@cs.odu.edu',
    url='https://github.com/oduwsdl/NwalaTextUtils/',
    packages=find_packages(),
    license="MIT",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent"
    ],
    install_requires=[
        'requests',
        'beautifulsoup4',
        'python-boilerpipe'  # required so dependency_links below can resolve the boilerpipe egg
    ],
    dependency_links=['http://github.com/misja/python-boilerpipe/tarball/master#egg=python-boilerpipe']
)