
News and WAT/WET compatibilities #32

Open
wants to merge 10 commits into base: main
27 changes: 27 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,27 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/java
{
"name": "Java",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/java:1-11-bookworm",

"features": {
"ghcr.io/devcontainers/features/java:1": {
"version": "none",
"installMaven": "true",
"installGradle": "false"
}
}

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "java -version",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
12 changes: 12 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,12 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for more information:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
# https://containers.dev/guide/dependabot

version: 2
updates:
- package-ecosystem: "devcontainers"
directory: "/"
schedule:
interval: weekly
21 changes: 21 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,21 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "java",
"name": "Current File",
"request": "launch",
"mainClass": "${file}"
},
{
"type": "java",
"name": "Attach",
"request": "attach",
"hostName": "localhost",
"port": 8000
}
]
}
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"java.compile.nullAnalysis.mode": "automatic"
}
13 changes: 13 additions & 0 deletions Dockerfile
@@ -0,0 +1,13 @@
FROM maven:3.6.3-jdk-11 AS build
WORKDIR /app
COPY pom.xml .
COPY src ./src
RUN mvn package

FROM spark:3.5.1
WORKDIR /app
COPY --from=build /app/target/*.jar ./target/
COPY --from=build /app/src/script/convert_url_index.sh ./src/script/convert_url_index.sh
VOLUME /app/data
ENV SPARK_ON_YARN="--master local"
ENTRYPOINT ["/app/src/script/convert_url_index.sh"]
2 changes: 1 addition & 1 deletion pom.xml
@@ -12,7 +12,7 @@

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
<java.version>11</java.version>

<spark.version>3.5.1</spark.version>
<spark.core.version>2.12</spark.core.version>
29 changes: 17 additions & 12 deletions src/main/java/org/commoncrawl/spark/CCIndex2Table.java
@@ -17,18 +17,18 @@
package org.commoncrawl.spark;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.commoncrawl.spark.util.CCIndex2FilenameParser;
import org.commoncrawl.spark.util.CCIndex2FilenameParser.FilenameParts;
import org.commoncrawl.spark.util.CCIndex2FilenameParser.FilenameParseError;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* Convert Common Crawl's URL index into a tabular format.
*/
@@ -39,9 +39,6 @@ public class CCIndex2Table extends IndexTable {

protected static boolean useBuiltinNestedSchema = false;

protected static final Pattern filenameAnalyzer = Pattern
.compile("^(?:common-crawl/)?crawl-data/([^/]+)/segments/([^/]+)/(crawldiagnostics|robotstxt|warc)/");

protected static class CdxLine extends IndexTable.CdxLine {
String redirect;
String digest;
@@ -68,12 +65,16 @@ public CdxLine(String line) throws IOException {
length = getInt("length");
status = getHttpStatus("status");

Matcher m = filenameAnalyzer.matcher(filename);
if (m.find()) {
crawl = m.group(1);
segment = m.group(2);
subset = m.group(3);
} else {
crawl = "unknown";
Contributor:

If we need to support data sets which are not partitioned by crawl, subset and segments, this could also be done by providing a further schema and an adapted table converter, cf. the EOT schema and EOT converter.

Contributor Author:

I will try removing this - it was done for testing, and may not be needed now that I mirror the structure of our crawl data during testing.

segment = "unknown";
subset = "unknown";

try{
final FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
crawl = parts.crawl;
segment = parts.segment;
subset = parts.subset;
} catch (FilenameParseError e) {
LOG.error("Filename not parseable: {}", filename);
}

@@ -109,6 +110,10 @@ public static Row convertCdxLine(String line) {
cdx.crawl, cdx.subset);
} else {
Row h = cdx.uri.getHostName().asRow();
if( h.get(0) == null ) {
Contributor:

What is the objective of first accepting CDX input records with a URI without a hostname ...

Contributor Author (@jt55401, Sep 26, 2024):

Jason will re-test this as well - we should probably remove this. (It's better to let the job fail - because we had bad data, etc)

LOG.error("Failed to parse hostname: " + cdx.uri.getHostName() + " from line:\n\t" + line);
return null;
}
return RowFactory.create(
// SURT and complete URL
cdx.urlkey,
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.commoncrawl.spark.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CCIndex2FilenameParser {
protected static final Pattern filenameAnalyzer = Pattern
.compile("^(?:common-crawl/)?crawl-data/([^/]+)/segments/([^/]+)/(crawldiagnostics|robotstxt|warc|wat|wet)/");

// crawl-data/CC-NEWS/2019/01/CC-NEWS-20190101042830-00057.warc.gz
protected static final Pattern newsFilenameAnalyzer = Pattern
.compile("^(?:common-crawl/)?crawl-data/CC-NEWS/(\\d+)/(\\d+)/CC-NEWS-(.+)\\.warc\\.gz");

// Class to encapsulate the extracted crawl, segment, and subset.
public static class FilenameParts {
public String crawl;
public String segment;
public String subset;
}

// Error class if we can't find the crawl, segment, and subset.
public static class FilenameParseError extends Exception {
public FilenameParseError(String message) {
super(message);
}
}

public static FilenameParts getParts(String filename) throws FilenameParseError {
FilenameParts parts = new FilenameParts();
Matcher m = filenameAnalyzer.matcher(filename);
if(m.find()){
Contributor:

Nit-picking: this project adds a space before opening parentheses or brackets. The default Eclipse code style is used. Shall we add a code formatter to the build?

Contributor Author:

default Eclipse code style is used

Ah - sure - do you know (or can you provide a public URL of) what the default Eclipse formatter settings are?
Is it the Google one, or something different?
https://raw.githubusercontent.com/google/styleguide/gh-pages/eclipse-java-google-style.xml

Contributor Author:

Or, food for thought/broader discussion - can we use EditorConfig so any IDE can work well in every language?
https://editorconfig.org/ Probably not a topic to discuss here though.

Contributor:

Thanks for the pointer. I'll have a look. But, yes, let's discuss it separately.

parts.crawl = m.group(1);
parts.segment = m.group(2);
parts.subset = m.group(3);
} else {
Matcher newsParts = newsFilenameAnalyzer.matcher(filename);
if(!newsParts.find()){
throw new FilenameParseError("Filename not parseable (tried normal and news): " + filename);
}
parts.crawl = String.format("CC-NEWS-%s-%s", newsParts.group(1), newsParts.group(2));
parts.segment = newsParts.group(3);
Contributor:

There is (now) no equivalent to the segment in the news crawl. I wouldn't use the WARC timestamp (time when the WARC was started) and its serial number as an equivalent: timestamps and serial number of the segments in the main crawl relate to time of fetch list generation. In the news crawl it's about the fetch time.

If segment is left out, we would need to modify the schema for the news crawl because "segment" is defined as not nullable. Maybe it's better to provide a different schema than to try to force every data set into the same schema independent of its scope, collection method and tools. This was one insight of adapting the CDX-to-Parquet converter for the end-of-term archive. See eot-index-schema.json and EOTIndexTable.java.

Contributor Author:

Yes, I thought much about this as well.

I did it this way for the following reasons:

  • the redaction tools (and IMHO, all our tools) are going to be simpler if we're dealing with same index format.
  • I think while news may have no natural "segment", I don't really see the harm in populating it with some potentially useful data. Perhaps I need to be made aware of what the harm might be. (What do people even use that field for?)
  • I also considered a single value, i.e. "0000000", but, per the previous point, I think the way I did it is potentially a little more useful.

Perhaps it boils down to these options:

  • Leave my solution (probably not)
  • Populate only with serial number (seems a bit better based on what I think you're saying above)
  • Populate with static value (I'm not a fan of useless data, but, I like it better than changing schema)
  • New index format (makes many of our tools & future processes more complex "forever")

I'm going to ask a question below about news as well - that may be a part of the answer. See below.

Contributor:

See the discussion below about the subset "news-warc".

(what do people even use that field for?)

Only very few users use it, for example Henry Thompson in "Improved methodology ...". In order to use it you need to know what it stands for: one part (one 100th) of the fetch list with all data of one segment fetched together in the same time frame (as of now: 3 hours). We might explain it to our users, for example in the schema.
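
To make this concrete, here is a minimal sketch (not part of this PR) of how a user might lean on the segment column with Spark's Java API. The S3 path and column names (warc_segment, crawl, subset) follow the public cc-index table layout discussed further down, but they should be treated as illustrative here, not as the project's documented usage.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SegmentUsageExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SegmentUsageExample")
                .getOrCreate();

        // Illustrative path; see the table location mentioned below.
        Dataset<Row> index = spark.read()
                .parquet("s3a://commoncrawl/cc-index/table/cc-main/warc/");

        // Each segment is one part (one 100th) of a crawl's fetch list,
        // fetched within the same time frame, so grouping by it gives a
        // rough per-time-window breakdown of captures.
        index.filter("crawl = 'CC-MAIN-2018-47' AND subset = 'warc'")
                .groupBy("warc_segment")
                .count()
                .orderBy("warc_segment")
                .show(100, false);

        spark.stop();
    }
}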

parts.subset = "news-warc";
Contributor:

Why shouldn't this be just "warc"? The value in the "crawl" partition column already makes it clear that it's the news crawl and not the main crawl.

Contributor Author:

Yes - I went back and forth on this a few times.
At the end of the day - it came down to a few comments Greg has made - in that we want to be extra careful about how users of the data may be sorting/filtering data, and not making changes that might upset their existing processes.

I'm not sure philosophically if we want to consider the news and main crawls "one thing" or if they should be separate. So, I chose a middle ground, where news is in the same index/schema, but it wouldn't likely get filtered into scope by anyone looking for the "warc" subset. (That's also why I chose to prefix with "news" rather than suffix, in case people might be doing "startswith warc" or "warc%" or something strange.)

I realize people COULD filter news in/out based on the crawl value - but, again, I erred on the side of making the default for existing users "out".

Thoughts?
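
For illustration, a small sketch (not part of this PR) of the kind of existing subset filter this choice is meant to protect. The table path is an assumption, and the queries are hypothetical user code rather than anything shipped with cc-index-table.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SubsetFilterExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SubsetFilterExample")
                .getOrCreate();

        // Illustrative table location.
        Dataset<Row> index = spark.read()
                .parquet("s3a://commoncrawl/cc-index/table/cc-main/warc/");

        // An existing query on the subset partition: rows written with
        // subset = 'news-warc' stay out of scope, so results are unchanged.
        long mainWarc = index.filter("subset = 'warc'").count();

        // Even a prefix match keeps news records out, because 'news-warc'
        // does not start with 'warc'.
        long prefixWarc = index.filter("subset LIKE 'warc%'").count();

        System.out.println("subset = 'warc': " + mainWarc);
        System.out.println("subset LIKE 'warc%': " + prefixWarc);

        spark.stop();
    }
}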

Contributor:

we want to be extra careful about how users of the data may be sorting/filtering data, and not making changes that might upset their existing processes.

I fully agree. There shouldn't be any changes which cannot be handled by schema merging. That is, extending the table is possible, but not changing the semantics of existing columns.

My assumption was that any new table (for CC-NEWS, the old crawls ARC files, but also the WAT/WET files) would break the semantics of the schema. That's why the location is:

s3://commoncrawl/cc-index/table/cc-main/warc/crawl=.../subset=.../

Consequently, a table for CC-NEWS would be on a different path. It would be a separate table which might use a different schema. Given the continuous release of the news crawl the path pattern could be:

s3://commoncrawl/cc-index/table/cc-news/warc/year=.../month=.../

The news crawl has a different collection method: it's sampled by time, not on a global level, with no revisits.

In addition, the cc-main table is already quite big. Simply for practical reasons, I would keep the news crawl and especially the WAT/WET records in a separate table. It's also more practical to announce separate new tables to the users than explain the extension of the existing one by new partitions and how to rephrase queries so that the results do not change.

Over the long term, an independent schema also allows for schema upgrades which focus on the given dataset format or scope: text metrics for WET or news-related metadata (publication time, etc.).
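
As an aside on the schema-merging point above, a minimal sketch of what Spark's Parquet schema merging does and does not cover; the path is illustrative and the code is not part of this PR.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SchemaMergeExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SchemaMergeExample")
                .getOrCreate();

        // With mergeSchema enabled, columns added in newer partitions are
        // unioned into one schema; older partitions return null for them.
        Dataset<Row> merged = spark.read()
                .option("mergeSchema", "true")
                .parquet("s3a://commoncrawl/cc-index/table/cc-main/warc/");

        merged.printSchema();

        // Merging cannot reconcile a changed type or changed meaning of an
        // existing column - that case calls for a separate table/schema.
        spark.stop();
    }
}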

Contributor Author:

OK - I understand what you are saying.

Still not sure I agree though - isn't it better for most users to have a comprehensive index of everything in one place?

@wumpus - I guess you've mentioned a few times these new indexes need not even be public - so, perhaps this is all a moot point if they're for internal use only.

I'm of the mind that a single index for all data is most useful, and there's little harm (and likely benefit) in making them public as well, but, if that is considered ill-advised, just let me know which account the CDX files should end up in - our private/commoncrawl one, or the public dataset one.

Furthermore - if anyone has preferences on the final bucket/location/path within S3, please do let me know.

I'm going to try to do a few small test runs of the cdx job early this week, with a goal of getting a larger job running later in the week.

}
return parts;
}
}
2 changes: 2 additions & 0 deletions src/script/convert_url_index.sh
@@ -60,6 +60,8 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.sql.hive.metastorePartitionPruning=true \
--conf spark.hadoop.parquet.enable.summary-metadata=false \
--conf spark.sql.parquet.outputTimestampType=TIMESTAMP_MILLIS \
--conf "spark.driver.userClassPathFirst=true" \
--conf "spark.executor.userClassPathFirst=true" \
Contributor:

Hmhh. Might be safe, because cc-index-table ships with very few dependencies given that Spark is "provided" and the AWS SDK shades all transitive dependencies. In general, I'd be very cautious with this configuration option, which is still marked as "experimental": conflicts between transitive dependencies from the user jar and those used/required by Spark may cause errors (typically related to serialization or inter-process communication) which are difficult to trace.

Contributor Author (@jt55401, May 31, 2024):

I could not get the job to run to completion without these lines - the Spark environment (as seen in the Dockerfile) was overriding our libraries, and it was causing this error:

Caused by: java.lang.NoSuchMethodError: 'com.google.gson.JsonElement com.google.gson.JsonParser.parseReader(java.io.Reader)'

If you have documentation or hints you can give on how the real environment is set up, I'd appreciate a pointer; I'm happy to adjust.

Contributor:

I know, I've also run into the gson issue. The version used must be the same as (or API-compatible with) that provided by Spark. As said, I'm mostly optimistic that there will be no regressions with user class path first. If not (to be tested), we can roll this back later.

Alternatively, the script already allows adding additional Spark options via the environment variable SPARK_EXTRA_OPTS:

SPARK_EXTRA_OPTS="--conf spark.driver.userClassPathFirst=true --conf spark.executor.userClassPathFirst=true" ./src/script/convert_url_index.sh ...

Contributor:

The setup of Spark and the job configuration is in the crawl-tools repository. Ping me if more information is needed.

--class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
--outputCompression=$COMPRS \
--outputFormat=$FORMAT $NESTED \
@@ -0,0 +1,56 @@
package org.commoncrawl.spark;

import static org.junit.jupiter.api.Assertions.assertEquals;
import org.commoncrawl.spark.util.CCIndex2FilenameParser;
import org.commoncrawl.spark.util.CCIndex2FilenameParser.FilenameParts;
import org.commoncrawl.spark.util.CCIndex2FilenameParser.FilenameParseError;
import org.junit.jupiter.api.Test;

public class TestCCIndex2FilenameParser {

@Test
public void testMainWarcFilename() throws FilenameParseError {
String filename = "crawl-data/CC-MAIN-2018-47/segments/1542039741324.15/warc/CC-MAIN-20181113153141-20181113174452-00011.warc.gz";
FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
assertEquals("CC-MAIN-2018-47", parts.crawl);
assertEquals("1542039741324.15", parts.segment);
assertEquals("warc", parts.subset);
}

@Test
public void testMainWat() throws FilenameParseError {
String filename = "crawl-data/CC-MAIN-2018-47/segments/1542039741324.15/wat/CC-MAIN-20181113153141-20181113174452-00011.warc.wat.gz";
FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
assertEquals("CC-MAIN-2018-47", parts.crawl);
assertEquals("1542039741324.15", parts.segment);
assertEquals("wat", parts.subset);
}

@Test
public void testMainWet() throws FilenameParseError {
String filename = "crawl-data/CC-MAIN-2018-47/segments/1542039741016.16/wet/CC-MAIN-20181112172845-20181112194415-00012.warc.wet.gz";
FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
assertEquals("CC-MAIN-2018-47", parts.crawl);
assertEquals("1542039741016.16", parts.segment);
assertEquals("wet", parts.subset);
}

@Test
public void testMainCrawldiagnostics() throws FilenameParseError {
String filename = "crawl-data/CC-MAIN-2018-47/segments/1542039741016.16/crawldiagnostics/CC-MAIN-20181112172845-20181112194415-00012.warc.gz";
FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
assertEquals("CC-MAIN-2018-47", parts.crawl);
assertEquals("1542039741016.16", parts.segment);
assertEquals("crawldiagnostics", parts.subset);
}

@Test
public void testNewsWarcFilename() throws FilenameParseError {
String filename = "crawl-data/CC-NEWS/2019/01/CC-NEWS-20190101042830-00057.warc.gz";
FilenameParts parts = CCIndex2FilenameParser.getParts(filename);
assertEquals("CC-NEWS-2019-01", parts.crawl);
assertEquals("20190101042830-00057", parts.segment);
assertEquals("news-warc", parts.subset);
}

}