Permalink
Browse files
More flexible duplicate detection
- Loading branch information...
Showing 1 changed file with 6 additions and 1 deletion (+6 −1).
src/org/netpreserve/logtrix/CrawlLogIterator.java
|
@@ -32,6 +32,8 @@ |
|
|
|
private static final Logger log = LoggerFactory.getLogger(CrawlLogIterator.class); |
|
|
|
|
|
|
|
private final ObjectMapper objectMapper = new ObjectMapper(); |
|
|
|
|
|
|
|
private String duplicateMarker = "duplicate"; |
|
|
|
|
|
|
|
/** |
|
|
|
* A reader for the crawl.log file being processed |
|
@@ -55,6 +57,9 @@ public CrawlLogIterator(Path path) throws IOException { |
|
|
|
} |
|
|
|
|
|
|
|
public CrawlLogIterator(Reader reader) { |
|
|
|
if (System.getProperty("duplicate-marker") != null) { |
|
|
|
duplicateMarker = System.getProperty("duplicate-marker"); |
|
|
|
} |
|
|
|
if (reader instanceof BufferedReader) { |
|
|
|
in = (BufferedReader) reader; |
|
|
|
} else { |
|
@@ -215,7 +220,7 @@ protected CrawlDataItem parseLine(String line) throws IOException { |
|
|
|
|
|
|
|
// Index 11: Annotations (may be missing) |
|
|
|
boolean revisit = false; |
|
|
|
if (lineParts[11].contains("Revisit")) { |
|
|
|
if (lineParts[11].contains(duplicateMarker)) { |
|
|
|
revisit=true; |
|
|
|
} |
|
|
|
cdi.setDuplicate(revisit); |
|
|
0 comments on commit
897596f