Merge pull request #1 from adelavega/new_ext
RF: Major update to Python 3
adelavega authored Mar 30, 2023
2 parents 55ffe2f + b04ce31 commit 6a0525b
Showing 19 changed files with 4,765 additions and 253 deletions.
2 changes: 1 addition & 1 deletion ace/__init__.py
@@ -2,7 +2,7 @@
# ex: set sts=4 ts=4 sw=4 et:
"""ACE -- Automated Coordinate Extraction.
"""
__all__ = ["config", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"]
__all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"]

import logging
import sys
94 changes: 22 additions & 72 deletions ace/database.py
@@ -1,22 +1,20 @@
# Database stuff and models

from sqlalchemy import (TypeDecorator, Table, Column, Integer, Float, String,
-                        ForeignKey, Boolean, DateTime, Text)
+                        ForeignKey, DateTime, Text)
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.sql import exists
-from datetime import datetime
-from ace import config
import simplejson as json
import logging
import sys
from os import path
import datetime

from . import sources
+from . import config
+from . import extract

@@ -47,7 +45,7 @@ def __init__(self, adapter=None, db_name=None, user=None, password=None):
        else:
            raise ValueError("Value of SQL_ADAPTER in settings must be either 'sqlite' or 'mysql'")

-        engine = create_engine(db_uri, echo=False)
+        engine = create_engine(db_uri, echo=False, connect_args={'timeout': 15})

        if adapter == 'mysql': engine.execute("SET sql_mode=''")

@@ -62,54 +60,8 @@ def add(self, record):
    def save(self):
        ''' Commit all stored records to file. '''
        self.session.commit()

-    def add_articles(self, files, commit=True, table_dir=None, limit=None,
-                     pmid_filenames=False, metadata_dir=None):
-        ''' Process articles and add their data to the DB.
-        Args:
-            files: The path to the article(s) to process. Can be a single
-                filename (string), a list of filenames, or a path to pass
-                to glob (e.g., "article_dir/NIMG*html")
-            commit: Whether or not to save records to DB file after adding them.
-            table_dir: Directory to store downloaded tables in (if None, tables
-                will not be saved.)
-            limit: Optional integer indicating max number of articles to add
-                (selected randomly from all available). When None, will add all
-                available articles.
-            pmid_filenames: When True, assume that the file basename is a PMID.
-                This saves us from having to retrieve metadata from PubMed When
-                checking if a file is already in the DB, and greatly speeds up
-                batch processing when overwrite is off.
-            metadata_dir: Location to read/write PubMed metadata for articles.
-                When None (default), retrieves new metadata each time. If a
-                path is provided, will check there first before querying PubMed,
-                and will save the result of the query if it doesn't already
-                exist.
-        '''
-
-        manager = sources.SourceManager(self, table_dir)
-
-        if isinstance(files, str):
-            from glob import glob
-            files = glob(files)
-        if limit is not None:
-            from random import shuffle
-            shuffle(files)
-            files = files[:limit]
-
-        for i, f in enumerate(files):
-            logger.info("Processing article %s..." % f)
-            html = open(f).read()
-            source = manager.identify_source(html)
-            try:
-                pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
-                article = source.parse_article(html, pmid, metadata_dir=metadata_dir)
-                if article and (config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables):
-                    self.add(article)
-                if commit and (i % 100 == 0 or i == len(files) - 1):
-                    self.save()
-            except Exception as err:
-                print(err)
+        # except Exception as err:
+        #     print(err)

    def delete_article(self, pmid):
        article = self.session.query(Article).filter_by(id=pmid).first()
@@ -119,9 +71,14 @@ def delete_article(self, pmid):
    def print_stats(self):
        ''' Summarize the current state of the DB. '''
        n_articles = self.session.query(Article).count()
+        n_articles_with_coordinates = self.session.query(Table).filter(Table.n_activations>0).distinct('article_id').count()
        n_tables = self.session.query(Table).count()
        n_activations = self.session.query(Activation).count()
-        print("The database currently contains:\n\t%d articles\n\t%d tables\n\t%d activations" % n_articles, n_tables, n_activations)
+        n_links = self.session.query(NeurovaultLink).count()
+        n_articles_with_links = self.session.query(NeurovaultLink).distinct('article_id').count()
+        print(f"The database currently contains: {n_articles} articles.\n"
+              f"{n_articles_with_coordinates} have coordinates, and {n_articles_with_links} have NeuroVault links.\n"
+              f"Total of {n_tables} tables, {n_activations} activations and {n_links} NeuroVault links.")

    def article_exists(self, pmid):
        ''' Check if an article already exists in the database. '''
@@ -170,8 +127,10 @@ class Article(Base):

    tables = relationship('Table', cascade="all, delete-orphan",
                          backref='article')
-    activations = relationship('Activation', cascade="all, delete-orphan",
+
+    neurovault_links = relationship('NeurovaultLink', cascade="all, delete-orphan",
                          backref='article')

    features = association_proxy('tags', 'feature')

    def __init__(self, text, pmid=None, doi=None, metadata=None):
@@ -192,6 +151,7 @@ def update_from_metadata(self):
        self.authors = pmd['authors']
        self.abstract = pmd['abstract']
        self.citation = pmd['citation']
+        self.doi = pmd['doi']


class Table(Base):
@@ -291,23 +251,13 @@ def validate(self):

        return True

+class NeurovaultLink(Base):
+
+    __tablename__ = 'Neurovaultlinks'

-# class Feature(Base):
-
-#     __tablename__ = 'features'
-
-#     id = Column(String, primary_key=True)
-#     name = Column(String)

+    id = Column(Integer, primary_key=True, autoincrement=True)
+    neurovault_id = Column(Integer)
+    url = Column(String(100))
+    type = Column(String(100))

-# class Tag(Base):
-
-#     __tablename__ = 'tags'
-
-#     feature_id = Column(Integer, ForeignKey('features.id'), primary_key=True)
-#     article_id = Column(Integer, ForeignKey('articles.id'), primary_key=True)
-#     weight = Column(Float)
-
-#     article = relationship(Article, backref=backref(
-#         "tags", cascade="all, delete-orphan"))
-#     feature = relationship("Feature")

+    article_id = Column(Integer, ForeignKey('articles.id'))
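
As a usage sketch only (not part of the diff): once this change lands, the new NeurovaultLink model can be queried through the session that Database already exposes. The Database(adapter='sqlite') call and the .distinct('article_id') idiom come from the code above; the surrounding script is hypothetical.

from ace.database import Database, NeurovaultLink

# Hypothetical sketch: open the configured SQLite DB (note the new
# connect_args={'timeout': 15} in create_engine above) and count links.
db = Database(adapter='sqlite')
n_links = db.session.query(NeurovaultLink).count()
n_linked = db.session.query(NeurovaultLink).distinct('article_id').count()
print(f"{n_links} NeuroVault links across {n_linked} articles")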
4 changes: 0 additions & 4 deletions ace/datatable.py
@@ -1,7 +1,5 @@
import logging
logger = logging.getLogger(__name__)
-import math
-# logging.basicConfig()


class DataTable:
@@ -48,8 +46,6 @@ def add_val(self, val, rows=1, cols=1):
        open_pos = flat.index(None)
        ri = open_pos / self.n_cols
        if (ri + rows) > self.n_rows:
-            logging.error("Error: DataTable row has more columns than labels: [%d, %d, %d]" % (
-                ri, rows, self.n_rows))
            for i in range(round((ri + rows)) - self.n_rows):
                self.data.append([None] * self.n_cols)

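
An aside on the round() call kept in add_val above: it reflects the Python 3 port named in the commit message, since / is now true division and ri becomes a float where Python 2 returned an int. A minimal illustration with hypothetical values, not taken from the diff:

# Python 2: 7 / 3 == 2 (floor); Python 3: 7 / 3 == 2.333... (true division).
open_pos, n_cols, rows, n_rows = 7, 3, 1, 2
ri = open_pos / n_cols             # 2.333... under Python 3
extra = round(ri + rows) - n_rows  # mirrors round((ri + rows)) - self.n_rows
print(extra)                       # 1 -> one padding row gets appended
# Using open_pos // n_cols would keep ri an integer and avoid the round().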
2 changes: 1 addition & 1 deletion ace/export.py
@@ -1,4 +1,4 @@
-from .database import Database, Article, Table, Activation
+from .database import Article
from sqlalchemy import func
import logging

55 changes: 55 additions & 0 deletions ace/ingest.py
@@ -0,0 +1,55 @@
from os import path
import logging
from . import sources, config

logger = logging.getLogger(__name__)

def add_articles(db, files, commit=True, table_dir=None, limit=None,
                 pmid_filenames=False, metadata_dir=None, **kwargs):
    ''' Process articles and add their data to the DB.
    Args:
        db: The Database instance to add articles to.
        files: The path to the article(s) to process. Can be a single
            filename (string), a list of filenames, or a path to pass
            to glob (e.g., "article_dir/NIMG*html")
        commit: Whether or not to save records to the DB file after adding them.
        table_dir: Directory to store downloaded tables in (if None, tables
            will not be saved).
        limit: Optional integer indicating max number of articles to add
            (selected randomly from all available). When None, will add all
            available articles.
        pmid_filenames: When True, assume that the file basename is a PMID.
            This saves us from having to retrieve metadata from PubMed when
            checking if a file is already in the DB, and greatly speeds up
            batch processing when overwrite is off.
        metadata_dir: Location to read/write PubMed metadata for articles.
            When None (default), retrieves new metadata each time. If a
            path is provided, will check there first before querying PubMed,
            and will save the result of the query if it doesn't already
            exist.
        kwargs: Additional keyword arguments to pass to parse_article.
    '''

    manager = sources.SourceManager(db, table_dir)

    if isinstance(files, str):
        from glob import glob
        files = glob(files)
    if limit is not None:
        from random import shuffle
        shuffle(files)
        files = files[:limit]

    for i, f in enumerate(files):
        logger.info("Processing article %s..." % f)
        html = open(f).read()
        source = manager.identify_source(html)
        if source is None:
            logger.warning("Could not identify source for %s" % f)
            continue
        pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
        article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
        if article and (config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables):
            db.add(article)
        if commit and (i % 100 == 0 or i == len(files) - 1):
            db.save()
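
For reference, a hypothetical end-to-end invocation of the relocated function, following the docstring above; the paths are placeholders, while Database(adapter='sqlite') and print_stats() come from database.py as shown earlier in this diff.

from ace.database import Database
from ace.ingest import add_articles

db = Database(adapter='sqlite')          # signature shown in database.py above
add_articles(db, 'articles/*.html',      # glob string, per the docstring
             table_dir='tables/',
             pmid_filenames=True,        # file basenames are PMIDs
             metadata_dir='metadata/')
db.print_stats()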