Skip to content
Permalink
Browse files

Log closing of ARC and WARC files, resolves #156 (#301)

* Log opening and closing of archive files as per #156
* Remove redundant log message. Spark already logs the file that is to be read when an executor computes an RDD.
  • Loading branch information...
jrwiebe authored and ruebot committed Jan 31, 2019
1 parent 1e69040 commit fc0178d7588871681da75a478195c7b6724f3487
Showing with 15 additions and 1 deletion.
  1. +15 −1 src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java
@@ -18,6 +18,7 @@
package io.archivesunleashed.data;

import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat;
import org.apache.log4j.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
@@ -47,6 +48,12 @@
*/
public class ArchiveRecordInputFormat extends FileInputFormat<LongWritable,
ArchiveRecordWritable> {
/**
* Logger for this class.
*/
private static final Logger LOG =
Logger.getLogger(ArchiveRecordInputFormat.class);

@Override
public final RecordReader<LongWritable,
ArchiveRecordWritable> createRecordReader(final InputSplit split,
@@ -102,6 +109,11 @@ protected final boolean isSplitable(final JobContext context,
*/
private ArchiveRecordWritable value = null;

/**
* Archive file name.
*/
private String fileName;

/**
* Seekable file position.
*/
@@ -124,8 +136,9 @@ public final void initialize(final InputSplit archiveRecordSplit,

FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
fileName = split.getPath().toString();

reader = ArchiveReaderFactory.get(split.getPath().toString(),
reader = ArchiveReaderFactory.get(fileName,
new BufferedInputStream(fileIn), true);

if (reader instanceof ARCReader) {
@@ -223,6 +236,7 @@ public final float getProgress() throws IOException {
/**
 * Closes the underlying archive reader and logs which file was closed.
 *
 * <p>If no reader was ever assigned (for example because opening the file
 * failed before the reader was created), there is nothing to release, so
 * the method returns quietly instead of failing with a
 * {@code NullPointerException}.
 *
 * @throws IOException if closing the underlying reader fails
 */
@Override
public final synchronized void close() throws IOException {
  if (reader == null) {
    // Nothing was opened, so there is nothing to close or report.
    return;
  }
  reader.close();
  // Logged only after a successful close so the message is never misleading.
  LOG.info("Closed archive file " + fileName);
}
}
}

0 comments on commit fc0178d

Please sign in to comment.
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.