Skip to content
Permalink
Browse files

first commit

  • Loading branch information...
anwala committed Aug 27, 2019
0 parents commit 64edaef58738f3b2f03cdb29e2f7f5a1d22cb581
Showing with 324 additions and 0 deletions.
  1. +9 −0 .gitignore
  2. +21 −0 LICENSE
  3. +1 −0 NwalaTextUtils/__init__.py
  4. +257 −0 NwalaTextUtils/textutils.py
  5. +7 −0 README.md
  6. +29 −0 setup.py
@@ -0,0 +1,9 @@
# Compiled python modules.
.DS_Store
*.pyc

/build/
/dist/
/*.egg-info
/*.egg

21 LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 ODU Web Science / Digital Libraries Research Group

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1 @@
# Package version; setup.py imports this value as the single source of truth.
__version__ = '0.0.1'
@@ -0,0 +1,257 @@
import os
import re
import sys
import time
from multiprocessing import Pool

import requests
from boilerpipe.extract import Extractor
from bs4 import BeautifulSoup

def genericErrorInfo():
    """Report the active exception (file, line, details) on stdout and return sys.exc_info().

    Intended to be called from inside an ``except`` block.
    """
    exc_details = sys.exc_info()
    traceback_obj = exc_details[2]
    src_file = os.path.split(traceback_obj.tb_frame.f_code.co_filename)[1]

    report = src_file + ', ' + str(traceback_obj.tb_lineno) + ', ' + str(sys.exc_info())
    print('\tERROR:', report)

    return sys.exc_info()

#html proc - start
def getCustomHeaderDict():
    """Return default HTTP request headers that imitate a desktop Chrome browser.

    Returns:
        dict: header name -> value, suitable for the ``headers`` kwarg of requests calls.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        # bug fix: header name was misspelled 'Connnection', so servers never
        # saw the intended keep-alive hint
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }

    return headers

def isSizeLimitExceed(responseHeaders, sizeRestrict):
    """Return True when the response's Content-Length header reports a payload
    strictly larger than sizeRestrict bytes; False when absent or within limit."""
    declared_size = responseHeaders.get('Content-Length')
    return declared_size is not None and int(declared_size) > sizeRestrict

def downloadSave(response, outfile):
    """Stream the body of an HTTP response to outfile in 1 KB chunks.

    Errors are reported via genericErrorInfo() rather than raised (best-effort
    download). Empty keep-alive chunks are skipped.
    """
    try:
        with open(outfile, 'wb') as sink:
            for block in response.iter_content(chunk_size=1024):
                if not block:
                    continue
                sink.write(block)
    except:
        genericErrorInfo()

def mimicBrowser(uri, getRequestFlag=True, extraParams=None):
    """Fetch uri with browser-like headers.

    Args:
        uri: URL to request; blank/whitespace-only input returns '' immediately.
        getRequestFlag: True issues a GET request, False issues a HEAD request.
        extraParams: optional dict of settings:
            'timeout' (seconds, default 10),
            'sizeRestrict' (max bytes per Content-Length; -1 disables the check),
            'headers' (default: getCustomHeaderDict()),
            'addResponseHeader' (GET returns {'responseHeader', 'text'} when True),
            'saveFilePath' (when set, stream the GET body to this file instead
            of returning text).

    Returns:
        GET: the response text (or the dict above when 'addResponseHeader');
        an 'Error: ...' string when sizeRestrict is exceeded; '' on failure.
        HEAD: the response headers with a 'status-code' entry added; {} on failure.
    """
    uri = uri.strip()
    if len(uri) == 0:
        return ''

    # bug fix: work on a copy so setdefault() below does not mutate the
    # caller's dict (the old code leaked defaults back into callers)
    extraParams = {} if extraParams is None else dict(extraParams)
    extraParams.setdefault('timeout', 10)
    extraParams.setdefault('sizeRestrict', -1)
    extraParams.setdefault('headers', getCustomHeaderDict())
    extraParams.setdefault('addResponseHeader', False)

    try:
        if getRequestFlag:
            # stream only when the body is going to be saved to disk
            saving = 'saveFilePath' in extraParams
            response = requests.get(uri, headers=extraParams['headers'], timeout=extraParams['timeout'], stream=saving)

            if extraParams['sizeRestrict'] != -1:
                if isSizeLimitExceed(response.headers, extraParams['sizeRestrict']):
                    return 'Error: Exceeded size restriction: ' + str(extraParams['sizeRestrict'])

            responseText = ''
            if saving:
                downloadSave(response, extraParams['saveFilePath'])
            else:
                responseText = response.text

            if extraParams['addResponseHeader']:
                return {'responseHeader': response.headers, 'text': responseText}

            return responseText
        else:
            response = requests.head(uri, headers=extraParams['headers'], timeout=extraParams['timeout'])
            response.headers['status-code'] = response.status_code
            return response.headers
    except Exception:
        # narrowed from bare except: so Ctrl-C / SystemExit still propagate
        genericErrorInfo()
        print('\tquery is: ', uri)
        if getRequestFlag == False:
            return {}

        return ''

def dereferenceURI(URI, maxSleepInSeconds=5, extraParams=None):
    """Download and return the HTML at URI, optionally sleeping first (politeness delay).

    Returns '' for a blank URI or on any error. extraParams is forwarded to
    mimicBrowser() with a default 'sizeRestrict' of 4,000,000 bytes.
    """
    target = URI.strip()
    if not target:
        return ''

    if extraParams is None:
        extraParams = {}

    html = ''
    try:
        if maxSleepInSeconds > 0:
            print('\tderef.URI(), sleep:', maxSleepInSeconds)
            time.sleep(maxSleepInSeconds)

        extraParams.setdefault('sizeRestrict', 4000000)
        html = mimicBrowser(target, extraParams=extraParams)
    except:
        genericErrorInfo()

    return html

def extractPageTitleFromHTML(html):
    """Return the stripped text of the page's <title> element, or '' when the
    title is missing or the HTML cannot be parsed."""
    page_title = ''
    try:
        tag = BeautifulSoup(html, 'html.parser').find('title')
        if tag is not None:
            page_title = tag.text.strip()
    except:
        genericErrorInfo()

    return page_title

def cleanHtml(html, method='python-boilerpipe'):
    """Extract plain text from an HTML string.

    Args:
        html: HTML markup; '' returns '' immediately.
        method: 'python-boilerpipe' (default) uses boilerpipe's ArticleExtractor;
            'nltk' uses regex-based markup stripping copied from the NLTK package.

    Returns:
        str: the extracted text; '' on error or for an unrecognized method.
    """
    if len(html) == 0:
        return ''

    # experience problem of parallelizing, maybe due to:
    # https://stackoverflow.com/questions/8804830/python-multiprocessing-pickling-error
    if method == 'python-boilerpipe':
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception:
            genericErrorInfo()
    elif method == 'nltk':
        # Copied from the NLTK package: remove HTML markup from the given string.
        # bug fix: this branch used `re` without importing it (NameError);
        # `import re` was added to the module imports.

        # First remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then remove HTML comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, deal with whitespace. The double-space collapse below follows
        # NLTK's original clean_html (the scraped source showed single spaces,
        # which made these substitutions no-ops).
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)

        # addition to remove blank lines
        cleaned = re.sub(r"\n\s*\n*", "\n", cleaned)

        return cleaned.strip()

    return ''

def prlGetTxtFrmURIs(urisLst):
    """Dereference a list of URIs in parallel and extract plaintext from each page.

    Args:
        urisLst: list of URI strings.

    Returns:
        list of dicts, one per input URI (input order preserved), each with keys:
        'text' (boilerplate-free page text), 'id' (the URI at the matching
        input index), 'title' (page <title> text), 'uri' (the dereferenced URI).
    """
    size = len(urisLst)
    if size == 0:
        return []

    jobsLst = []
    for i in range(size):
        printMsg = ''
        # progress message for every 10th URI
        if i % 10 == 0:
            printMsg = '\tderef uri i: ' + str(i) + ' of ' + str(size)

        keywords = {
            'URI': urisLst[i],
            'maxSleepInSeconds': 0
        }

        jobsLst.append({
            'func': dereferenceURI,
            'args': keywords,
            'misc': False,
            'print': printMsg
        })

    docsLst = []
    # Pool.map preserves input order, so result i corresponds to urisLst[i].
    # bug fix: the original read `urisLst[i]` with the stale loop index left
    # over from the job-building loop above, so every doc got the LAST uri as
    # its 'id'; enumerate restores the intended per-result index.
    for i, res in enumerate(parallelTask(jobsLst)):
        text = cleanHtml(res['output'])

        docsLst.append({
            'text': text,
            'id': urisLst[i],
            'title': extractPageTitleFromHTML(res['output']),
            'uri': res['input']['args']['URI']
        })

    return docsLst
#html proc - end

#parallel proc - start
def parallelProxy(job):
    """Pool.map target: call job['func'] with job['args'] as keyword arguments.

    Prints job['print'] (when present and non-empty) after the call, then
    returns {'input': job, 'output': result, 'misc': job['misc']}.
    """
    result = job['func'](**job['args'])

    progress_msg = job.get('print', '')
    if progress_msg:
        print(progress_msg)

    return {'input': job, 'output': result, 'misc': job['misc']}

def parallelTask(jobsLst, threadCount=5):
    """Run a list of job dicts (see parallelProxy) in a multiprocessing pool.

    Args:
        jobsLst: list of {'func', 'args', 'misc', optional 'print'} dicts.
        threadCount: worker process count; values below 2 are raised to 2.

    Returns:
        list of parallelProxy() result dicts in input order, or [] when
        jobsLst is empty or an error occurs.
    """
    if len(jobsLst) == 0:
        return []

    if threadCount < 2:
        threadCount = 2

    try:
        # bug fix: the context manager guarantees the pool is cleaned up even
        # when map() raises (the original leaked worker processes on error)
        with Pool(threadCount) as workers:
            resLst = workers.map(parallelProxy, jobsLst)
            workers.close()
            workers.join()
    except Exception:
        genericErrorInfo()
        return []

    return resLst
#parallel proc - end
@@ -0,0 +1,7 @@
# NwalaTextUtils

Collection of text processing Python functions.
## Installation
```
$ pip install NwalaTextUtils
```
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# Packaging script for NwalaTextUtils.

from setuptools import setup, find_packages
# single-source the version from the package's __init__.py
from NwalaTextUtils import __version__

desc = """Collection of functions for processing text"""


setup(
    name='NwalaTextUtils',
    version=__version__,
    description=desc,
    long_description='See: https://github.com/oduwsdl/NwalaTextUtils/',
    author='Alexander C. Nwala',
    author_email='anwala@cs.odu.edu',
    url='https://github.com/oduwsdl/NwalaTextUtils/',
    packages=find_packages(),
    license="MIT",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent"
    ],
    install_requires=[
        'requests',
        'beautifulsoup4'
    ],
    # python-boilerpipe is fetched from GitHub, not PyPI.
    # NOTE(review): dependency_links is deprecated and ignored by modern pip —
    # verify how this dependency is expected to be installed.
    dependency_links=['http://github.com/misja/python-boilerpipe/tarball/master#egg=python-boilerpipe']
)

0 comments on commit 64edaef

Please sign in to comment.
You can’t perform that action at this time.