Permalink
Browse files
Filter blank src/dest out of webgraph. (#400)
- Loading branch information
Showing 1 changed file with 1 addition and 0 deletions.
-
+1
−0
src/main/scala/io/archivesunleashed/package.scala
|
@@ -275,6 +275,7 @@ package object archivesunleashed { |
|
|
.keepValidPages() |
|
|
.flatMap(r => ExtractLinksRDD(r.getUrl, r.getContentString) |
|
|
.map(t => (r.getCrawlDate, t._1, t._2, t._3))) |
|
|
.filter(t => t._2 != "" && t._3 != "") |
|
|
.map(t => Row(t._1, t._2, t._3, t._4)) |
|
|
|
|
|
val schema = new StructType() |
|
|
0 comments on commit 3dc1545