Skip to content
Permalink
Browse files

More flexible duplicate detection

  • Loading branch information...
kris-sigur committed Apr 5, 2019
1 parent f1294db commit 897596f9ead52f2cac745a08919a8941973735df
Showing with 6 additions and 1 deletion.
  1. +6 −1 src/org/netpreserve/logtrix/CrawlLogIterator.java
@@ -32,6 +32,8 @@
/** Logger shared by all instances of this class. */
private static final Logger log = LoggerFactory.getLogger(CrawlLogIterator.class);

/**
 * Mapper instance owned by this iterator.
 * NOTE(review): its use is not visible in this excerpt; presumably it parses
 * JSON content found in crawl log lines -- confirm against parseLine.
 */
private final ObjectMapper objectMapper = new ObjectMapper();

/**
 * Text looked for in the annotations field (index 11) of a crawl log line;
 * when the field contains it, the parsed item is flagged as a duplicate.
 * Defaults to "duplicate". The Reader-based constructor replaces it with the
 * value of the "duplicate-marker" system property when that property is set.
 */
private String duplicateMarker = "duplicate";

/**
* A reader for the crawl.log file being processed
@@ -55,6 +57,9 @@ public CrawlLogIterator(Path path) throws IOException {
}

public CrawlLogIterator(Reader reader) {
if (System.getProperty("duplicate-marker") != null) {
duplicateMarker = System.getProperty("duplicate-marker");
}
if (reader instanceof BufferedReader) {
in = (BufferedReader) reader;
} else {
@@ -215,7 +220,7 @@ protected CrawlDataItem parseLine(String line) throws IOException {

// Index 11: Annotations (may be missing)
boolean revisit = false;
if (lineParts[11].contains("Revisit")) {
if (lineParts[11].contains(duplicateMarker)) {
revisit=true;
}
cdi.setDuplicate(revisit);

0 comments on commit 897596f

Please sign in to comment.
You can’t perform that action at this time.