Merge pull request #34 from commoncrawl/eot-archive-converter

Integrate end-of-term archive table conversion tool
commoncrawl · Nov 19, 2024 · 08d441c · 08d441c
2 parents fcbed8b + 16741c0
commit 08d441c
Show file tree

Hide file tree

Showing 4 changed files with 453 additions and 0 deletions.
diff --git a/src/main/java/org/commoncrawl/spark/EOTIndexTable.java b/src/main/java/org/commoncrawl/spark/EOTIndexTable.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.spark;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.cli.CommandLine;
+	import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Convert End of Term Web Archive's CDX index into a tabular format.
+ */
+public class EOTIndexTable extends IndexTable {
+
+	private static final Logger LOG = LoggerFactory.getLogger(EOTIndexTable.class);
+	protected String name = EOTIndexTable.class.getCanonicalName();
+
+	protected static final Pattern filenameAnalyzer = Pattern
+			.compile("^crawl-data/([^/]+)/segments/([^/]+)/(warc)/");
+
+	protected static class CdxLine extends IndexTable.CdxLine {
+		String digest;
+		String mime;
+		String filename;
+		long offset, length;
+		short status;
+		String crawl, segment, subset;
+
+		public CdxLine(String line) throws IOException {
+			super(line);
+
+			uri = getWarcUri("url");
+
+			digest = getString("digest");
+			mime = getString("mime");
+
+			filename =  getString("filename");
+			offset = getLong("offset");
+			length = getLong("length");
+			status = getHttpStatus("status");
+
+			Matcher m = filenameAnalyzer.matcher(filename);
+			if (m.find()) {
+				crawl = m.group(1);
+				segment = m.group(2);
+				subset = m.group(3);
+			} else {
+				LOG.error("Filename not parseable: {}", filename);
+			}
+		}
+	}
+
+	public static Row convertCdxLine(String line) {
+		CdxLine cdx;
+		try {
+			cdx = new CdxLine(line);
+		} catch (Exception e) {
+			LOG.error("Failed to read CDX line: {}", line, e);
+			return null;
+		}
+		Row h = cdx.uri.getHostName().asRow();
+		return RowFactory.create(
+				// SURT and complete URL
+				cdx.urlkey,
+				cdx.uri.getUrlString(),
+				// host
+				h.get(0), h.get(1),
+				h.get(2), h.get(3),
+				h.get(4), h.get(5),
+				h.get(6), h.get(7),
+				h.get(8), h.get(9),
+				h.get(10),
+				// URL components
+				cdx.uri.getProtocol(),
+				cdx.uri.getPort(),
+				cdx.uri.getPath(),
+				cdx.uri.getQuery(),
+				// fetch info
+				cdx.timestamp, cdx.status,
+				// content-related
+				cdx.digest, cdx.mime,
+				// WARC record location
+				cdx.filename, cdx.offset, cdx.length, cdx.segment,
+				// partition fields
+				cdx.crawl, cdx.subset);
+	}
+
+	@Override
+	protected CommandLine applyCommandLineOptions(CommandLine cli) {
+		super.applyCommandLineOptions(cli);
+		mapIndexEntries = EOTIndexTable::convertCdxLine;
+		String schemaDefinition = "/schema/eot-index-schema.json";
+		try {
+			schema = readJsonSchemaResource(schemaDefinition);
+		} catch (IOException e) {
+			throw new RuntimeException("Failed to read output table schema " + schemaDefinition, e);
+		}
+		return cli;
+	}
+
+	public static void main(String[] args) throws IOException {
+		EOTIndexTable job = new EOTIndexTable();
+		job.run(args);
+	}
+}
diff --git a/src/main/resources/schema/eot-index-schema.json b/src/main/resources/schema/eot-index-schema.json
@@ -0,0 +1,257 @@
+{
+  "type": "struct",
+  "fields": [
+    {
+      "name": "url_surtkey",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "SURT URL key",
+        "example": "com,example)/path/index.html"
+      }
+    },
+    {
+      "name": "url",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "URL string",
+        "example": "https://www.example.com/path/index.html",
+        "fromCDX": "url"
+      }
+    },
+    {
+      "name": "url_host_name",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "Hostname, including IP addresses",
+        "example": "www.example.com"
+      }
+    },
+    {
+      "name": "url_host_tld",
+      "nullable": true,
+      "type": "string",
+      "metadata": {
+        "description": "Top-level domain or last part of the hostname",
+        "example": "com for the hostname www.example.com"
+      }
+    },
+    {
+      "name": "url_host_2nd_last_part",
+      "nullable": true,
+      "type": "string",
+      "metadata": {
+        "description": "Second last part of the hostname",
+        "example": "example for the hostname www.example.com, co for bbc.co.uk"
+      }
+    },
+    {
+      "name": "url_host_3rd_last_part",
+      "nullable": true,
+      "type": "string",
+      "metadata": {
+        "description": "Third last part of the hostname",
+        "example": "www for the hostname www.example.com"
+      }
+    },
+    {
+      "name": "url_host_4th_last_part",
+      "nullable": true,
+      "type": "string",
+      "metadata": {
+        "description": "4th last part of the hostname",
+        "example": "host1 for host1.subdomain.example.com"
+      }
+    },
+    {
+      "name": "url_host_5th_last_part",
+      "nullable": true,
+      "type": "string",
+      "metadata": {
+        "description": "5th last part of the hostname",
+        "example": "host1 for host1.sub2.subdomain.example.com"
+      }
+    },
+    {
+      "name": "url_host_registry_suffix",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Domain registry suffix",
+        "example": "com, co.uk"
+      }
+    },
+    {
+      "name": "url_host_registered_domain",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Domain name of the host (one level below the registry suffix)",
+        "example": "example.com, bbc.co.uk"
+      }
+    },
+    {
+      "name": "url_host_private_suffix",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Suffix of domain registries including private registrars, see https://publicsuffix.org/",
+        "example": "com, co.uk, but also s3.amazonaws.com or blogspot.com"
+      }
+    },
+    {
+      "name": "url_host_private_domain",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Domain name of the host (one level below the private suffix)",
+        "example": "mypublicbucket.s3.amazonaws.com or myblog.blogspot.com"
+      }
+    },
+    {
+      "name": "url_host_name_reversed",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Hostname, excluding IP addresses, in reverse domain name notation",
+        "example": "com.example.www"
+      }
+    },
+    {
+      "name": "url_protocol",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "Protocol of the URL",
+        "example": "https"
+      }
+    },
+    {
+      "name": "url_port",
+      "type": "integer",
+      "nullable": true,
+      "metadata": {
+        "description": "Port of the URL (null if not explicitly specified in the URL)",
+        "example": "8443"
+      }
+    },
+    {
+      "name": "url_path",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "File path of the URL",
+        "example": "/path/index.html"
+      }
+    },
+    {
+      "name": "url_query",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Query part of the URL",
+        "example": "q=abc&lang=en for .../search?q=abc&lang=en"
+      }
+    },
+    {
+      "name": "fetch_time",
+      "type": "timestamp",
+      "nullable": false,
+      "metadata": {
+        "description": "Fetch time (capture time stamp)",
+        "example": "2017-10-24T00:14:32Z"
+      }
+    },
+    {
+      "name": "fetch_status",
+      "type": "short",
+      "nullable": false,
+      "metadata": {
+        "description": "HTTP response status code (-1 if absent, e.g. for revisit records)",
+        "example": "200",
+        "fromCDX": "status"
+      }
+    },
+    {
+      "name": "content_digest",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "SHA-1 content digest (WARC-Payload-Digest)",
+        "example": "CH7IV3XAD3M7A42JARKRLJ3T5PGGCGXD",
+        "fromCDX": "digest"
+      }
+    },
+    {
+      "name": "content_mime_type",
+      "type": "string",
+      "nullable": true,
+      "metadata": {
+        "description": "Content-Type sent in HTTP response header",
+        "example": "text/html",
+        "fromCDX": "mime"
+      }
+    },
+    {
+      "name": "warc_filename",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "WARC filename/path below s3://eotarchive/ or https://eotarchive.s3.amazonaws.com/",
+        "example": "crawl-data/EOT-2008/segments/IA-001/warc/DOTGOV-2008-01-20080923002742-04410-crawling14.us.archive.org.arc.gz",
+        "fromCDX": "filename"
+      }
+    },
+    {
+      "name": "warc_record_offset",
+      "type": "long",
+      "nullable": false,
+      "metadata": {
+        "description": "Offset of the WARC record",
+        "example": "397346194",
+        "fromCDX": "offset"
+      }
+    },
+    {
+      "name": "warc_record_length",
+      "type": "long",
+      "nullable": false,
+      "metadata": {
+        "description": "Length of the WARC record",
+        "example": "24662",
+        "fromCDX": "length"
+      }
+    },
+    {
+      "name": "warc_segment",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "Segment the WARC file belongs to",
+        "example": "IA-001"
+      }
+    },
+    {
+      "name": "crawl",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "Crawl the capture/record is part of",
+        "example": "EOT-2008"
+      }
+    },
+    {
+      "name": "subset",
+      "type": "string",
+      "nullable": false,
+      "metadata": {
+        "description": "Subset of responses (organized as subfolder of segments)",
+        "enumeration": {
+          "warc": "WARC captures"
+        }
+      }
+    }
+  ]
+}