Skip to content
Permalink
Browse files

Move data frame field names to snake_case. (#327)

- Resolves #229
  • Loading branch information...
ruebot authored and ianmilligan1 committed Jul 18, 2019
1 parent 0e701b2 commit f35d54ed760949d7f2a07f7342bacb552b6c2719
@@ -45,10 +45,10 @@ object DomainFrequencyExtractor {
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
import spark.implicits._
// scalastyle:on

d.select(df.ExtractBaseDomain($"Url").as("Domain"))
.groupBy("Domain").count().orderBy(desc("count"))
d.select(df.ExtractBaseDomain($"url").as("domain"))
.groupBy("domain").count().orderBy(desc("count"))
}
}
@@ -55,10 +55,10 @@ object DomainGraphExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"CrawlDate",
df.RemovePrefixWWW(df.ExtractBaseDomain($"Src")).as("SrcDomain"),
df.RemovePrefixWWW(df.ExtractBaseDomain($"Dest")).as("DestDomain"))
.filter("SrcDomain != ''").filter("DestDomain != ''")
.groupBy($"CrawlDate", $"SrcDomain", $"DestDomain").count().orderBy(desc("count"))
d.select($"crawl_date",
df.RemovePrefixWWW(df.ExtractBaseDomain($"src")).as("src_domain"),
df.RemovePrefixWWW(df.ExtractBaseDomain($"dest")).as("dest_domain"))
.filter("src_domain != ''").filter("dest_domain != ''")
.groupBy($"crawl_date", $"src_domain", $"dest_domain").count().orderBy(desc("count"))
}
}
@@ -45,7 +45,7 @@ object PlainTextExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"CrawlDate", df.ExtractBaseDomain($"Url").as("Domain"),
$"Url", df.RemoveHTML($"Content").as("Text"))
d.select($"crawl_date", df.ExtractBaseDomain($"url").as("domain"),
$"url", df.RemoveHTML($"content").as("Text"))
}
}
@@ -99,10 +99,10 @@ package object archivesunleashed {
.map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, r.getContentString))

val schema = new StructType()
.add(StructField("CrawlDate", StringType, true))
.add(StructField("Url", StringType, true))
.add(StructField("MimeType", StringType, true))
.add(StructField("Content", StringType, true))
.add(StructField("crawl_date", StringType, true))
.add(StructField("url", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("content", StringType, true))

val sqlContext = SparkSession.builder()
sqlContext.getOrCreate().createDataFrame(records, schema)
@@ -115,10 +115,10 @@ package object archivesunleashed {
.map(t => Row(t._1, t._2, t._3, t._4))

val schema = new StructType()
.add(StructField("CrawlDate", StringType, true))
.add(StructField("Src", StringType, true))
.add(StructField("Dest", StringType, true))
.add(StructField("Anchor", StringType, true))
.add(StructField("crawl_date", StringType, true))
.add(StructField("src", StringType, true))
.add(StructField("dest", StringType, true))
.add(StructField("anchor", StringType, true))

val sqlContext = SparkSession.builder();
sqlContext.getOrCreate().createDataFrame(records, schema)

0 comments on commit f35d54e

Please sign in to comment.
You can’t perform that action at this time.