Skip to content

Commit

Permalink
Merge pull request #34 from commoncrawl/eot-archive-converter
Browse files Browse the repository at this point in the history
Integrate end-of-term archive table conversion tool
  • Loading branch information
sebastian-nagel authored Nov 19, 2024
2 parents fcbed8b + 16741c0 commit 08d441c
Show file tree
Hide file tree
Showing 4 changed files with 453 additions and 0 deletions.
125 changes: 125 additions & 0 deletions src/main/java/org/commoncrawl/spark/EOTIndexTable.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.commoncrawl.spark;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Convert End of Term Web Archive's CDX index into a tabular format.
 *
 * <p>
 * Every CDX line is parsed into a {@link CdxLine} and flattened into a
 * {@link Row}. The order of the row values follows the field order of the
 * schema resource <code>/schema/eot-index-schema.json</code> which is loaded in
 * {@link #applyCommandLineOptions(CommandLine)}.
 * </p>
 */
public class EOTIndexTable extends IndexTable {

    private static final Logger LOG = LoggerFactory.getLogger(EOTIndexTable.class);

    // NOTE(review): if IndexTable declares a field with the same name, this
    // declaration hides it instead of overriding it - confirm this is intended.
    protected String name = EOTIndexTable.class.getCanonicalName();

    /**
     * Extracts crawl (group 1), segment (group 2) and subset (group 3, always
     * "warc") from the leading part of a WARC file path, e.g.
     * <code>crawl-data/EOT-2008/segments/IA-001/warc/...</code>
     */
    protected static final Pattern filenameAnalyzer = Pattern
            .compile("^crawl-data/([^/]+)/segments/([^/]+)/(warc)/");

    /**
     * A single CDX line with the fields required for the End of Term index
     * table. The fields <code>uri</code>, <code>urlkey</code> and
     * <code>timestamp</code> used by
     * {@link EOTIndexTable#convertCdxLine(String)} are not declared here but
     * provided by the super class.
     */
    protected static class CdxLine extends IndexTable.CdxLine {
        String digest;
        String mime;
        String filename;
        long offset, length;
        short status;
        String crawl, segment, subset;

        /**
         * Parse a CDX line and extract the WARC record location and the
         * partition fields.
         *
         * @param line
         *            the CDX line
         * @throws IOException
         *             if the line cannot be parsed or if the WARC filename
         *             does not match {@link EOTIndexTable#filenameAnalyzer}
         */
        public CdxLine(String line) throws IOException {
            super(line);

            uri = getWarcUri("url");

            digest = getString("digest");
            mime = getString("mime");

            filename = getString("filename");
            offset = getLong("offset");
            length = getLong("length");
            status = getHttpStatus("status");

            Matcher m = filenameAnalyzer.matcher(filename);
            if (!m.find()) {
                // The output schema declares warc_segment, crawl and subset as
                // non-nullable. Reject the line instead of passing null values
                // downstream: the caller logs the line and skips it.
                throw new IOException("Filename not parseable: " + filename);
            }
            crawl = m.group(1);
            segment = m.group(2);
            subset = m.group(3);
        }
    }

    /**
     * Convert a CDX line into a table row.
     *
     * @param line
     *            the CDX line
     * @return the row, or <code>null</code> if the line could not be parsed
     *         (the failure is logged together with the offending line)
     */
    public static Row convertCdxLine(String line) {
        CdxLine cdx;
        try {
            cdx = new CdxLine(line);
        } catch (Exception e) {
            // broad catch on purpose: a single malformed line must not stop
            // the conversion, it is logged and skipped
            LOG.error("Failed to read CDX line: {}", line, e);
            return null;
        }
        // the eleven host name columns (url_host_name ... url_host_name_reversed)
        Row h = cdx.uri.getHostName().asRow();
        return RowFactory.create(
                // SURT and complete URL
                cdx.urlkey,
                cdx.uri.getUrlString(),
                // host
                h.get(0), h.get(1),
                h.get(2), h.get(3),
                h.get(4), h.get(5),
                h.get(6), h.get(7),
                h.get(8), h.get(9),
                h.get(10),
                // URL components
                cdx.uri.getProtocol(),
                cdx.uri.getPort(),
                cdx.uri.getPath(),
                cdx.uri.getQuery(),
                // fetch info
                cdx.timestamp, cdx.status,
                // content-related
                cdx.digest, cdx.mime,
                // WARC record location
                cdx.filename, cdx.offset, cdx.length, cdx.segment,
                // partition fields
                cdx.crawl, cdx.subset);
    }

    /**
     * Apply the command-line options of the super class, then register
     * {@link #convertCdxLine(String)} as converter of index entries and load
     * the output table schema from the class path.
     *
     * @param cli
     *            the parsed command line
     * @return the command line passed as argument
     * @throws RuntimeException
     *             if the schema resource cannot be read
     */
    @Override
    protected CommandLine applyCommandLineOptions(CommandLine cli) {
        super.applyCommandLineOptions(cli);
        mapIndexEntries = EOTIndexTable::convertCdxLine;
        String schemaDefinition = "/schema/eot-index-schema.json";
        try {
            schema = readJsonSchemaResource(schemaDefinition);
        } catch (IOException e) {
            throw new RuntimeException("Failed to read output table schema " + schemaDefinition, e);
        }
        return cli;
    }

    /**
     * Command-line entry point: run the conversion job with the given
     * arguments.
     *
     * @param args
     *            command-line arguments, passed on to <code>run(args)</code>
     * @throws IOException
     *             if thrown by <code>run(args)</code>
     */
    public static void main(String[] args) throws IOException {
        EOTIndexTable job = new EOTIndexTable();
        job.run(args);
    }
}
257 changes: 257 additions & 0 deletions src/main/resources/schema/eot-index-schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
{
"type": "struct",
"fields": [
{
"name": "url_surtkey",
"type": "string",
"nullable": false,
"metadata": {
"description": "SURT URL key",
"example": "com,example)/path/index.html"
}
},
{
"name": "url",
"type": "string",
"nullable": false,
"metadata": {
"description": "URL string",
"example": "https://www.example.com/path/index.html",
"fromCDX": "url"
}
},
{
"name": "url_host_name",
"type": "string",
"nullable": false,
"metadata": {
"description": "Hostname, including IP addresses",
"example": "www.example.com"
}
},
{
"name": "url_host_tld",
"nullable": true,
"type": "string",
"metadata": {
"description": "Top-level domain or last part of the hostname",
"example": "com for the hostname www.example.com"
}
},
{
"name": "url_host_2nd_last_part",
"nullable": true,
"type": "string",
"metadata": {
"description": "Second last part of the hostname",
"example": "example for the hostname www.example.com, co for bbc.co.uk"
}
},
{
"name": "url_host_3rd_last_part",
"nullable": true,
"type": "string",
"metadata": {
"description": "Third last part of the hostname",
"example": "www for the hostname www.example.com"
}
},
{
"name": "url_host_4th_last_part",
"nullable": true,
"type": "string",
"metadata": {
"description": "4th last part of the hostname",
"example": "host1 for host1.subdomain.example.com"
}
},
{
"name": "url_host_5th_last_part",
"nullable": true,
"type": "string",
"metadata": {
"description": "5th last part of the hostname",
"example": "host1 for host1.sub2.subdomain.example.com"
}
},
{
"name": "url_host_registry_suffix",
"type": "string",
"nullable": true,
"metadata": {
"description": "Domain registry suffix",
"example": "com, co.uk"
}
},
{
"name": "url_host_registered_domain",
"type": "string",
"nullable": true,
"metadata": {
"description": "Domain name of the host (one level below the registry suffix)",
"example": "example.com, bbc.co.uk"
}
},
{
"name": "url_host_private_suffix",
"type": "string",
"nullable": true,
"metadata": {
"description": "Suffix of domain registries including private registrars, see https://publicsuffix.org/",
"example": "com, co.uk, but also s3.amazonaws.com or blogspot.com"
}
},
{
"name": "url_host_private_domain",
"type": "string",
"nullable": true,
"metadata": {
"description": "Domain name of the host (one level below the private suffix)",
"example": "mypublicbucket.s3.amazonaws.com or myblog.blogspot.com"
}
},
{
"name": "url_host_name_reversed",
"type": "string",
"nullable": true,
"metadata": {
"description": "Hostname, excluding IP addresses, in reverse domain name notation",
"example": "com.example.www"
}
},
{
"name": "url_protocol",
"type": "string",
"nullable": false,
"metadata": {
"description": "Protocol of the URL",
"example": "https"
}
},
{
"name": "url_port",
"type": "integer",
"nullable": true,
"metadata": {
"description": "Port of the URL (null if not explicitly specified in the URL)",
"example": "8443"
}
},
{
"name": "url_path",
"type": "string",
"nullable": true,
"metadata": {
"description": "File path of the URL",
"example": "/path/index.html"
}
},
{
"name": "url_query",
"type": "string",
"nullable": true,
"metadata": {
"description": "Query part of the URL",
"example": "q=abc&lang=en for .../search?q=abc&lang=en"
}
},
{
"name": "fetch_time",
"type": "timestamp",
"nullable": false,
"metadata": {
"description": "Fetch time (capture time stamp)",
"example": "2017-10-24T00:14:32Z"
}
},
{
"name": "fetch_status",
"type": "short",
"nullable": false,
"metadata": {
"description": "HTTP response status code (-1 if absent, e.g. for revisit records)",
"example": "200",
"fromCDX": "status"
}
},
{
"name": "content_digest",
"type": "string",
"nullable": true,
"metadata": {
"description": "SHA-1 content digest (WARC-Payload-Digest)",
"example": "CH7IV3XAD3M7A42JARKRLJ3T5PGGCGXD",
"fromCDX": "digest"
}
},
{
"name": "content_mime_type",
"type": "string",
"nullable": true,
"metadata": {
"description": "Content-Type sent in HTTP response header",
"example": "text/html",
"fromCDX": "mime"
}
},
{
"name": "warc_filename",
"type": "string",
"nullable": false,
"metadata": {
"description": "WARC filename/path below s3://eotarchive/ or https://eotarchive.s3.amazonaws.com/",
"example": "crawl-data/EOT-2008/segments/IA-001/warc/DOTGOV-2008-01-20080923002742-04410-crawling14.us.archive.org.arc.gz",
"fromCDX": "filename"
}
},
{
"name": "warc_record_offset",
"type": "long",
"nullable": false,
"metadata": {
"description": "Offset of the WARC record",
"example": "397346194",
"fromCDX": "offset"
}
},
{
"name": "warc_record_length",
"type": "long",
"nullable": false,
"metadata": {
"description": "Length of the WARC record",
"example": "24662",
"fromCDX": "length"
}
},
{
"name": "warc_segment",
"type": "string",
"nullable": false,
"metadata": {
"description": "Segment the WARC file belongs to",
"example": "IA-001"
}
},
{
"name": "crawl",
"type": "string",
"nullable": false,
"metadata": {
"description": "Crawl the capture/record is part of",
"example": "EOT-2008"
}
},
{
"name": "subset",
"type": "string",
"nullable": false,
"metadata": {
"description": "Subset of responses (organized as subfolder of segments)",
"enumeration": {
"warc": "WARC captures"
}
}
}
]
}
Loading

0 comments on commit 08d441c

Please sign in to comment.