Permalink
Browse files

updates based on testing

  • Loading branch information...
shawnmjones committed Jul 20, 2018
1 parent 3afeb64 commit c5f2425e8523a8149a7e4212e24ee189411da8fe
Showing with 24 additions and 7 deletions.
  1. +1 −1 aiu/timemap.py
  2. +2 −2 aiu/version.py
  3. +21 −4 bin/seeds2warc
View
@@ -187,4 +187,4 @@ def process_local_dict(local_dict, working_dict):
process_local_dict(local_dict, dict_timemap)
return dict_timemap
return dict_timemap
View
@@ -1,7 +1,7 @@
__name__ = "aiu"
__version__ = "0.1.0a1"
__version__ = "0.1.1a1"
name = __name__
version = __version__
user_agent_string = "{}/{} - See: https://github.com/shawnmjones/archiveit_utilities".format(name, version)
user_agent_string = "{}/{} - See: https://github.com/shawnmjones/archiveit_utilities".format(name, version)
View
@@ -19,6 +19,10 @@ from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
from requests_futures.sessions import FuturesSession
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import aiu
from aiu import ArchiveItCollection
from aiu import convert_LinkTimeMap_to_dict
@@ -102,8 +106,20 @@ def fetch_mementos_and_write_warcs(timemap_data, working_directory, collection_i
raw_urims.append(raw_urim)
logger.info("Issuing requests for {} raw mementos".format(len(raw_urims)))
with FuturesSession(max_workers=cpu_count) as session:
retry_session = requests.Session()
retry = Retry(
total=10,
read=10,
connect=10,
backoff_factor=0.3,
status_forcelist=(500, 502, 504)
)
adapter = HTTPAdapter(max_retries=retry)
retry_session.mount('http://', adapter)
retry_session.mount('https://', adapter)
with FuturesSession(max_workers=cpu_count, session=retry_session) as session:
futures = get_uri_responses(session, raw_urims)
warcinfo = {
@@ -152,9 +168,10 @@ def fetch_mementos_and_write_warcs(timemap_data, working_directory, collection_i
response = futures[raw_urim].result()
try:
# TODO: if the original URI used a Link header, it will be overridden
linkdict = convert_LinkTimeMap_to_dict(response.headers["link"])
urir = linkdict["original_uri"]
except KeyError as e:
except (KeyError, aiu.timemap.MalformedLinkFormatTimeMap) as e:
logger.warn("no original relation in the Link header for raw memento at {}".format(raw_urim))
sample_urim = invert_raw_urimdata_mapping[raw_urim][0]
@@ -281,4 +298,4 @@ if __name__ == '__main__':
logger.info("Data has been written out to {}".format(output_directory))
logger.info("Finished run")
logger.info("Finished run")

0 comments on commit c5f2425

Please sign in to comment.