Skip to content
Permalink
Browse files

a fix for Archive-It favicons, Docker improvements

  • Loading branch information...
shawnmjones committed Jun 4, 2019
1 parent 6d24eed commit 500683cedbb68699608b57694f48fcb839cda5b7
@@ -28,6 +28,10 @@ RUN npm install puppeteer
# COPY Pipfile /app/
# RUN pipenv install --system

# for Python environment dependencies
COPY requirements.txt /
RUN pip install -r /requirements.txt

# installing the MementoEmbed application
COPY . /app

@@ -110,7 +110,7 @@ def favicon(self):

self.logger.debug("attempting to acquire the archive favicon URI from HTML at {}".format(self.uri))

r = self.httpcache.get(self.uri)
r = self.httpcache.get(self.uri, use_referrer=False)

# self.logger.debug("searching through HTML: \n\n{}\n\n".format(r.text))

@@ -124,7 +124,7 @@ def favicon(self):

self.logger.debug("got an archive favicon of {}".format(self.archive_favicon_uri))

r = self.httpcache.get(self.archive_favicon_uri)
r = self.httpcache.get(self.archive_favicon_uri, use_referrer=False)

if not favicon_resource_test(r):
self.archive_favicon_uri = None
@@ -54,10 +54,12 @@ def purgeuri(self, uri):

def saveuri(self, uri, headers={}):

module_logger.debug("saving URI to cache: {}".format(uri))
module_logger.debug("saving URI {} to cache with request headers {}".format(uri, headers))

r = self.session.get(uri, headers=headers)

module_logger.debug("URI at end of chain was {}".format(r.url))

observation_datetime = datetime.datetime.utcnow()

self.conn.hset(uri, "request_headers", json.dumps(dict(r.request.headers)))
@@ -67,6 +69,10 @@ def saveuri(self, uri, headers={}):
self.conn.hset(uri, "response_elapsed", r.elapsed.microseconds)
self.conn.hset(uri, "response_headers", json.dumps(dict(r.headers)))

module_logger.debug("response headers for URI {} now stored in cache as: {}".format(
uri, self.conn.hget(uri, "response_headers")
))

# sometimes there is no encoding
if r.encoding is not None:
self.conn.hset(uri, "response_encoding", r.encoding)
@@ -100,7 +106,7 @@ def get(self, uri, headers={}, timeout=None):
req_headers = CaseInsensitiveDict(json.loads(self.conn.hget(uri, "request_headers")))
req_method = self.conn.hget(uri, "request_method")

module_logger.debug("issuing request to URI {} with headers {}".format(uri, req_headers))
module_logger.debug("generating request object for URI {} with headers {}".format(uri, req_headers))
request = requests.Request(req_method, uri, headers=req_headers)
request.prepare()

@@ -114,10 +120,14 @@ def get(self, uri, headers={}, timeout=None):
response.encoding = self.conn.hget(uri, "response_encoding").decode('utf-8')
else:
response.encoding = self.conn.hget(uri, "response_encoding")


module_logger.debug("encoding set to {} for URI {}".format(response.encoding, uri))
response.headers = CaseInsensitiveDict(json.loads(self.conn.hget(uri, "response_headers")))

module_logger.debug("response headers pulled from caceh for URI {}: {}".format(
uri, response.headers
))

response._content = self.conn.hget(uri, "response_content")
response.url = uri

@@ -79,14 +79,14 @@ def get(self, uri, headers={}, use_referrer=True):

req_headers['User-Agent'] = self.user_agent

module_logger.debug("setting user agent to {}".format(self.user_agent))
module_logger.debug("sending request with headers {}".format(req_headers))
module_logger.debug("setting user agent to {} for URI {}".format(self.user_agent, uri))
module_logger.debug("sending request with headers {} for URI {}".format(req_headers, uri))

response = self.uricache.get(uri, headers=req_headers, timeout=self.timeout)

module_logger.debug("request headers sent were {}".format(response.request.headers))
module_logger.debug("response status: {}".format(response.status_code))
module_logger.debug("response headers: {}".format(response.headers))
module_logger.debug("request headers sent were {} for URI {}".format(response.request.headers, uri))
module_logger.debug("response status is {} for URI {}".format(response.status_code, uri))
module_logger.debug("response headers are {} for URI {}".format(response.headers, uri))

if 'content-encoding' in response.headers:
if response.headers['content-encoding'] == 'br':
@@ -0,0 +1,98 @@
aiu==0.1.0a1
alabaster==0.7.12
appdirs==1.4.3
appnope==0.1.0
astroid==2.2.5
Babel==2.6.0
backcall==0.1.0
beautifulsoup4==4.7.1
Brotli==1.0.4
bs4==0.0.1
cairocffi==1.0.2
CairoSVG==2.3.0
certifi==2019.3.9
cffi==1.12.3
chardet==3.0.4
Click==7.0
cssselect==1.0.3
cssselect2==0.2.1
decorator==4.4.0
defusedxml==0.6.0
dicttoxml==1.7.4
docutils==0.14
feedfinder2==0.0.4
feedparser==5.2.1
Flask==1.0.2
html5lib==1.0.1
htmlmin==0.1.12
idna==2.8
imageio==2.5.0
imagesize==1.1.0
importlib-metadata==0.15
ipython==7.5.0
ipython-genutils==0.2.0
isort==4.3.20
itsdangerous==1.1.0
jedi==0.13.3
jieba3k==0.35.1
Jinja2==2.10.1
JPype1==0.6.3
jusText==2.2.0
lazy-object-proxy==1.4.1
lexrank==0.1.0
lxml==4.3.3
MarkupSafe==1.1.1
mccabe==0.6.1
newspaper3k==0.2.8
nltk==3.4.1
numpy==1.16.3
packaging==19.0
parso==0.4.0
path.py==12.0.1
pexpect==4.7.0
pickleshare==0.7.5
Pillow==5.2.0
prompt-toolkit==2.0.9
ptyprocess==0.6.0
pycparser==2.19
Pygments==2.4.0
pylint==2.3.1
pyparsing==2.4.0
pyrsistent==0.15.2
python-dateutil==2.8.0
python-magic==0.4.15
pytz==2019.1
PyYAML==5.1
readability-lxml==0.7
redis==3.0.1
redis-namespace==3.0.1.1
regex==2019.5.25
requests==2.22.0
requests-cache==0.4.13
requests-file==1.4.3
requests-futures==0.9.9
rope==0.14.0
scipy==1.3.0
six==1.12.0
snowballstemmer==1.2.1
soupsieve==1.9.1
Sphinx==1.8.4
sphinx-rtd-theme==0.4.3
sphinxcontrib-websupport==1.1.2
summa==1.2.0
text-summarizer==0.0.4
tinycss2==1.0.2
tinysegmenter==0.3
tldextract==2.2.0
traitlets==4.3.2
typed-ast==1.3.5
uritools==2.2.0
urlextract==0.10
urllib3==1.25.2
waitress==1.3.0
warcio==1.6.3
wcwidth==0.1.7
webencodings==0.5.1
Werkzeug==0.15.4
wrapt==1.11.1
zipp==0.5.1
@@ -39,7 +39,7 @@ APPLICATION_LOGFILE = '/app/logs/mementoembed-application.log'
# the log level to use, given as the name of a Python logging level
# values are: "DEBUG", "INFO", "WARNING", "ERROR"
# for more information, see https://docs.python.org/3/library/logging.html
APPLICATION_LOGLEVEL = "INFO"
APPLICATION_LOGLEVEL = "DEBUG"

# --- ACCESS LOG FILE ---
# These settings apply to the log file documenting visitors to this MementoEmbed instance
@@ -2,7 +2,7 @@ urim,title,snippet,memento-datetime,best-image-uri,archive-uri,archive-name,arch
https://www.webarchive.org.uk/wayback/archive/20090522221251/http://blasttheory.co.uk/,Blast Theory,Sam Pearson and Clara Garcia Fraile are in residence for one month Sam Pearson and Clara Garcia Fraile are in residence for one month working on a new project called In My Shoes. They are developin,2009-05-22T22:12:51Z,https://www.webarchive.org.uk/wayback/archive/20090522221251im_/http:/blasttheory.co.uk/bt/i/dotf/Untitled-1.jpg,https://www.webarchive.org.uk,WEBARCHIVE.ORG.UK,https://www.webarchive.org.uk/favicon.ico,,,,http://blasttheory.co.uk/,blasttheory.co.uk,https://www.blasttheory.co.uk/wp-content/themes/blasttheory/images/bt_icon.ico,Live
https://web.archive.org/web/20180515130056/http://www.cs.odu.edu/~mln/,Michael L. Nelson,"About Me... I joined the Computer Science department at Old Dominion University in 2002. I worked at NASA Langley Research Center from 1991-2002. Through a NASA fellowship, I spent the 2000-2001 ac",2018-05-15T13:00:56Z,http://www.cs.odu.edu/~mln/images/mln-ad-100x130.jpg,https://archive.org,ARCHIVE.ORG,https://web.archive.org/_static/images/archive.ico,,,,http://www.cs.odu.edu/~mln/,www.cs.odu.edu,https://web.archive.org/web/http://www.cs.odu.edu/favicon.ico,Live
https://webarchive.nrscotland.gov.uk/20170805210615/http://livingonwater.co.uk/index.php/homepage/show/home/home,Living on Water - Home,"Thanks to the Living on Water initiative a choice of new, high-quality residential and commercial moorings are being developed at key places along Scotland\u2019s beautiful canal network. This means tha",2017-08-05T21:06:21Z,https://webarchive.nrscotland.gov.uk/20170805210621im_/http://www.livingonwater.co.uk/system/uploads/images/homeslider/1.jpg,https://webarchive.nrscotland.gov.uk,NRSCOTLAND.GOV.UK,https://webarchive.nrscotland.gov.uk/wb-static/imr/images/favicon.png,,,,http://www.livingonwater.co.uk/index.php/homepage/show/home/home,www.livingonwater.co.uk,https://www.google.com/s2/favicons?domain=livingonwater.co.uk,Live
http://arquivo.pt/wayback/19980205082901/http://www.caleida.pt/saramago/,José Saramago - Home Page,Este Site foi distinguido com a insignia de Top 5% Portugal \u00a9 1996: Caleida Comunica\u00e7\u00e3o Global Lda Page designed for Netscape 1.2 - 256c 800x600,1998-02-05T08:29:01Z,http://arquivo.pt/wayback/19980205082901im_/http://www.caleida.pt/saramago/imagens/foto_saramago.gif,http://arquivo.pt,ARQUIVO.PT,https://arquivo.pt/wayback/http://www.caleida.pt/favicon.ico,,,,http://www.caleida.pt/saramago/,www.caleida.pt,https://arquivo.pt/wayback/http://www.caleida.pt/favicon.ico,Live
http://arquivo.pt/wayback/19980205082901/http://www.caleida.pt/saramago/,José Saramago - Home Page,Este Site foi distinguido com a insignia de Top 5% Portugal \u00a9 1996: Caleida Comunica\u00e7\u00e3o Global Lda Page designed for Netscape 1.2 - 256c 800x600,1998-02-05T08:29:01Z,http://arquivo.pt/wayback/19980205082901im_/http://www.caleida.pt/saramago/imagens/foto_saramago.gif,http://arquivo.pt,ARQUIVO.PT,http://arquivo.pt/img/logo-16.png,,,,http://www.caleida.pt/saramago/,www.caleida.pt,http://www.caleida.pt/wp-content/uploads/2019/01/cropped-caleida-32x32.png,Live
http://wayback.archive-it.org/2950/20120508033201/http://www.salon.com/2012/05/02/did_may_day_succeed/singleton/,Did May Day succeed?,"Yesterday’s Occupy reboot mobilized a diverse group of people, but reverted to familiar tactics in the end",2012-05-08T03:32:01Z,http://wayback.archive-it.org/2950/20120508033201im_/http://media.salon.com/2012/05/occupy_la-460x307.jpg,https://archive-it.org,ARCHIVE-IT.ORG,http://wayback.archive-it.org/favicon.ico,2950,Occupy Movement 2011/2012,https://archive-it.org/collections/2950,http://www.salon.com/2012/05/02/did_may_day_succeed/singleton/,www.salon.com,http://wayback.archive-it.org/2950/20120508033201im_/http://www.salon.com/favicon.ico,Rotten
http://webarchive.nationalarchives.gov.uk/20081208222543/http://www.nacell.org.uk/,NACELL - National Advisory Centre on Early Language Learning,"ELL Forum Email forum for news exchange, discussion and peer support. More",2008-12-08T22:25:43Z,http://webarchive.nationalarchives.gov.uk/20081208222543im_/http:/www.nacell.org.uk/images/yellow_top_bar.gif,http://webarchive.nationalarchives.gov.uk,NATIONALARCHIVES.GOV.UK,http://webarchive.nationalarchives.gov.uk/search/img/favicon.ico,,,,http://www.nacell.org.uk/,www.nacell.org.uk,https://www.google.com/s2/favicons?domain=nacell.org.uk,Live
http://webarchive.parliament.uk/20100426094738/http://www.publications.parliament.uk/pa/cm199899/cmselect/cmagric/141/9020402.htm,House of Commons - Agriculture - Minutes of Evidence,"Examination of witnesses (Questions 373 - 379) THURSDAY 4 FEBRUARY 1999 MR ROBERT GEORGE, MR MICHAEL HOSKING, MR MERVYN MOUNTJOY, MR DONALD TURTLE Chairman 373. Gentlemen, welcome to this first s",2010-04-26T09:47:38Z,http://webarchive.parliament.uk/20100426094738im_/http://www.publications.parliament.uk/server-side/images/parliament_logo.gif,http://webarchive.parliament.uk,PARLIAMENT.UK,https://www.google.com/s2/favicons?domain=parliament.uk,,,,http://www.publications.parliament.uk/pa/cm199899/cmselect/cmagric/141/9020402.htm,www.publications.parliament.uk,http://webarchive.parliament.uk/20100426094738oe_/http://www.publications.parliament.uk/favicon.ico,Live
@@ -37,9 +37,9 @@ def test_service(endpoint, datarow):
if field not in ['urim', 'generation-time', 'snippet']:

if datarow[field] == '':
self.assertEqual(data[field], None)
self.assertEqual(data[field], None, msg="failed for field {}".format(field))
else:
self.assertEqual(data[field], datarow[field])
self.assertEqual(data[field], datarow[field], msg="failed for field {}".format(field))


with open(batteryfilename) as f:

0 comments on commit 500683c

Please sign in to comment.
You can’t perform that action at this time.