Merge pull request #11 from Coleridge-Initiative/fix_pdf

working toward #6
Coleridge-Initiative · Dec 24, 2019 · f6648af · f6648af
2 parents fff1824 + 3acb7d9
commit f6648af
Show file tree

Hide file tree

Showing 4 changed files with 50,593 additions and 43,616 deletions.
diff --git a/DOWNLOAD.md b/DOWNLOAD.md
@@ -21,7 +21,7 @@ from the public S3 bucket.
 Download the corpus PDFs and other resource files:
 
 ```
-python bin/download_resources.py
+python bin/download_resources.py --logger errors.txt
 ```
 
 The PDF files get stored in the `resources/pub/pdf` subdirectory.
@@ -47,6 +47,8 @@ java -jar $SPJAR -o ./resources/pub/json ./resources/pub/pdf
 That command will download multiple resources from the Allen AI public
 datastore, which may take several minutes.
 
+TODO: replace this step with use of a containerized `SPv2` server.
+
 
 ## Upload PDF and JSON files
 

diff --git a/bin/download_resources.py b/bin/download_resources.py
@@ -171,13 +171,14 @@ def enum_dat_resources (corpus: dict, output_path: Path, force_download: bool) -
         downloaded_before = e_id in downloaded_dat_id
 
         if force_download or not downloaded_before:
-            res_url = entity["foaf:page"]["@value"]
+            if "foaf:page" in entity:
+                res_url = entity["foaf:page"]["@value"]
 
-            if res_url.startswith("http://example.com"):
-                # ignore these placeholder URLs
-                continue
-            else:
-                todo.append(["unknown", e_id, res_url, dat_path])
+                if res_url.startswith("http://example.com"):
+                    # ignore these placeholder URLs
+                    continue
+                else:
+                    todo.append(["unknown", e_id, res_url, dat_path])
 
     return dat_path, todo