Permalink
Browse files

Set up mycila plugin, and normalize all license headers; Resolves #4. (#20)

One Travis CI check failed during a GitHub outage; the others passed. Overriding defaults.
  • Loading branch information...
ruebot authored and ianmilligan1 committed Jul 31, 2017
1 parent bb2e665 commit eb0e8a9046f6dbfc20c2ab8609424e40cd652db6
Showing with 426 additions and 132 deletions.
  1. +14 −0 LICENSE_HEADER.txt
  2. +38 −0 pom.xml
  3. +2 −2 src/main/java/io/archivesunleashed/data/ArcRecordUtils.java
  4. +2 −1 src/main/java/io/archivesunleashed/data/WarcRecordUtils.java
  5. +2 −1 src/main/java/io/archivesunleashed/data/package-info.java
  6. +2 −2 src/main/java/io/archivesunleashed/io/ArcRecordWritable.java
  7. +2 −2 src/main/java/io/archivesunleashed/io/GenericArchiveRecordWritable.java
  8. +2 −2 src/main/java/io/archivesunleashed/io/WarcRecordWritable.java
  9. +2 −1 src/main/java/io/archivesunleashed/io/package-info.java
  10. +2 −2 src/main/java/io/archivesunleashed/mapreduce/WacArcInputFormat.java
  11. +2 −2 src/main/java/io/archivesunleashed/mapreduce/WacGenericInputFormat.java
  12. +2 −2 src/main/java/io/archivesunleashed/mapreduce/WacWarcInputFormat.java
  13. +2 −1 src/main/java/io/archivesunleashed/mapreduce/package-info.java
  14. +16 −0 src/main/scala/io/archivesunleashed/spark/archive/io/ArcRecord.scala
  15. +16 −0 src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala
  16. +16 −0 src/main/scala/io/archivesunleashed/spark/archive/io/GenericArchiveRecord.scala
  17. +16 −0 src/main/scala/io/archivesunleashed/spark/archive/io/WarcRecord.scala
  18. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/ComputeImageSize.scala
  19. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/ComputeMD5.scala
  20. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/DetectLanguage.scala
  21. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/DetectMimeTypeTika.scala
  22. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentions.scala
  23. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerpipeText.scala
  24. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDate.scala
  25. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDomain.scala
  26. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractEntities.scala
  27. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractGraph.scala
  28. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractHashtags.scala
  29. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinks.scala
  30. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractLinks.scala
  31. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImages.scala
  32. +2 −24 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFs.scala
  33. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/ExtractUrls.scala
  34. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/NER3Classifier.scala
  35. +23 −7 src/main/scala/io/archivesunleashed/spark/matchbox/NERCombinedJson.scala
  36. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala
  37. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHTML.scala
  38. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeader.scala
  39. +2 −2 src/main/scala/io/archivesunleashed/spark/matchbox/StringUtils.scala
  40. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/TupleFormatter.scala
  41. +16 −0 src/main/scala/io/archivesunleashed/spark/matchbox/TweetUtils.scala
  42. +3 −3 src/main/scala/io/archivesunleashed/spark/matchbox/WriteGDF.scala
  43. +2 −2 src/main/scala/io/archivesunleashed/spark/pythonconverters/ArcRecordConverter.scala
  44. +2 −2 src/main/scala/io/archivesunleashed/spark/rdd/RecordRDD.scala
  45. +16 −0 src/main/scala/io/archivesunleashed/spark/scripts/CrawlStatistics.scala
  46. +2 −2 src/main/scala/io/archivesunleashed/spark/scripts/Filter.scala
  47. +2 −2 src/main/scala/io/archivesunleashed/spark/scripts/SocialMediaLinks.scala
  48. +16 −0 src/main/scala/io/archivesunleashed/spark/utils/JsonUtil.scala
  49. +2 −2 src/test/java/io/archivesunleashed/ingest/WacArcLoaderTest.java
  50. +2 −1 src/test/java/io/archivesunleashed/ingest/WacWarcLoaderTest.java
  51. +2 −1 src/test/java/io/archivesunleashed/ingest/package-info.java
  52. +2 −2 src/test/java/io/archivesunleashed/io/ArcRecordWritableTest.java
  53. +2 −2 src/test/java/io/archivesunleashed/io/GenericArchiveRecordWritableTest.java
  54. +2 −2 src/test/java/io/archivesunleashed/io/WarcRecordWritableTest.java
  55. +2 −1 src/test/java/io/archivesunleashed/io/package-info.java
  56. +2 −2 src/test/java/io/archivesunleashed/mapreduce/WacArcInputFormatTest.java
  57. +2 −2 src/test/java/io/archivesunleashed/mapreduce/WacGenericInputFormatTest.java
  58. +2 −2 src/test/java/io/archivesunleashed/mapreduce/WacWarcInputFormatTest.java
  59. +2 −1 src/test/java/io/archivesunleashed/mapreduce/package-info.java
  60. +2 −2 src/test/scala/io/archivesunleashed/spark/ArcTest.scala
  61. +2 −2 src/test/scala/io/archivesunleashed/spark/GenericArchiveRecordTest.scala
  62. +2 −2 src/test/scala/io/archivesunleashed/spark/WarcTest.scala
  63. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentionsTest.scala
  64. +16 −0 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDateTest.scala
  65. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDomainTest.scala
  66. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractEntitiesTest.scala
  67. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractHashtagsTest.scala
  68. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinksTest.scala
  69. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractLinksTest.scala
  70. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/ExtractUrlsTest.scala
  71. +2 −2 src/test/scala/io/archivesunleashed/spark/matchbox/StringUtilsTest.scala
  72. +16 −0 src/test/scala/io/archivesunleashed/spark/matchbox/TupleFormatterTest.scala
  73. +2 −2 src/test/scala/io/archivesunleashed/spark/rdd/CountableRDDTest.scala
View
@@ -0,0 +1,14 @@
Archives Unleashed Toolkit (AUT):
An open-source platform for analyzing web archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
View
38 pom.xml
@@ -18,11 +18,14 @@
<!-- Build-wide properties: encodings, dependency versions, and plugin versions. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<!-- NOTE(review): project_name and project_organization are not referenced by the license plugin configuration visible in this POM, which reads ${project.artifactId} and ${project.organization.name} instead. Confirm they are used elsewhere. -->
<project_name>${project.artifactId}</project_name>
<project_organization>The Archives Unleashed Project</project_organization>
<scala.version>2.10.5</scala.version>
<hadoop.version>2.6.0-cdh5.7.1</hadoop.version>
<spark.version>1.6.0-cdh5.7.1</spark.version>
<github.global.server>github</github.global.server>
<checkstyle.plugin.version>2.15</checkstyle.plugin.version>
<!-- Version used by the com.mycila:license-maven-plugin declaration in the build plugins. -->
<license.plugin.version>2.11</license.plugin.version>
</properties>
<licenses>
@@ -179,6 +182,41 @@
<check />
</configuration>
</plugin>
<!-- mycila license-maven-plugin: runs the "check" goal against the header template in LICENSE_HEADER.txt. -->
<plugin>
<groupId>com.mycila</groupId>
<artifactId>license-maven-plugin</artifactId>
<version>${license.plugin.version}</version>
<configuration>
<!-- Header template file, given relative to the project base directory. -->
<header>LICENSE_HEADER.txt</header>
<mapping>
<!-- Files with the .java extension carry the header as a /* ... */ block comment. -->
<java>SLASHSTAR_STYLE</java>
</mapping>
<!-- NOTE(review): only the Java source trees are included. src/main/scala and src/test/scala are not matched by these patterns, and no scala mapping is declared above. Confirm that leaving Scala files unchecked is intended. -->
<includes>
<include>**/src/main/java/**</include>
<include>**/src/test/java/**</include>
</includes>
<excludes>
<exclude>target/**</exclude>
<exclude>**/src/test/resources/**</exclude>
<exclude>**/src/main/resources/**</exclude>
<exclude>**/*.properties</exclude>
</excludes>
<!-- NOTE(review): these values are made available as ${name} and ${holder} placeholders for the header template, but LICENSE_HEADER.txt as committed contains no placeholders, so they look unused. ${project.organization.name} also requires an organization element in this POM. Confirm both. -->
<properties>
<name>${project.artifactId}</name>
<holder>${project.organization.name}</holder>
</properties>
<encoding>UTF-8</encoding>
<!-- Presumably makes header comparison strict, so near matches fail the check. TODO confirm against the plugin documentation. -->
<strictCheck>true</strictCheck>
</configuration>
<executions>
<execution>
<goals>
<!-- check only reports missing or mismatched headers; it does not rewrite files. -->
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.data;
import java.io.BufferedInputStream;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.io;
import io.archivesunleashed.data.ArcRecordUtils;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.io;
import io.archivesunleashed.data.ArcRecordUtils;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.io;
import io.archivesunleashed.data.WarcRecordUtils;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.mapreduce;
import io.archivesunleashed.io.ArcRecordWritable;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.mapreduce;
import io.archivesunleashed.io.GenericArchiveRecordWritable.ArchiveFormat;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.mapreduce;
import io.archivesunleashed.io.WarcRecordWritable;
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.archive.io
import org.apache.spark.SerializableWritable
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.archive.io
trait ArchiveRecord extends Serializable {
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.archive.io
import java.text.SimpleDateFormat
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.archive.io
import java.text.SimpleDateFormat
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
import java.io.ByteArrayInputStream
@@ -1,3 +1,19 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
import java.security.MessageDigest
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
import org.apache.tika.language.LanguageIdentifier
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
import java.io.ByteArrayInputStream
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
object ExtractAtMentions {
@@ -1,5 +1,6 @@
/*
* Warcbase: an open-source platform for managing web archives
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
import java.io.IOException
Oops, something went wrong.

0 comments on commit eb0e8a9

Please sign in to comment.