
Tests are halfway passing
adelavega committed Feb 24, 2023
1 parent 2b857de; commit 55ffe2f
Showing 3 changed files with 50 additions and 45 deletions.
ace/scrape.py (2 changes: 1 addition & 1 deletion)
@@ -284,7 +284,7 @@ def retrieve_journal_articles(self, journal, delay=None, mode='browser', search=
             outf = open(filename, 'w')
             # Still having encoding issues with some journals (e.g.,
             # PLoS ONE). Why???
-            outf.write(doc.encode('utf-8'))
+            outf.write(doc)
             outf.close()
             articles_found += 1

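Aside on the scrape.py change: open(filename, 'w') returns a text-mode handle, and in Python 3 a text-mode write() accepts str, not bytes, so the old outf.write(doc.encode('utf-8')) raises TypeError. Dropping the encode() call writes the already-decoded string directly. A minimal sketch of the two working options, with a hypothetical filename (to pin the on-disk encoding explicitly, pass it to open() or use binary mode):

    # Text mode: write() takes str; pass the encoding to open() instead.
    doc = 'coördinate activation'           # already-decoded text
    with open('article.html', 'w', encoding='utf-8') as outf:
        outf.write(doc)                     # OK: str into a text-mode file
        # outf.write(doc.encode('utf-8'))   # TypeError: write() argument must be str, not bytes

    # Binary mode: write() takes bytes, so encoding explicitly is correct here.
    with open('article.html', 'wb') as outf:
        outf.write(doc.encode('utf-8'))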
ace/sources.py (91 changes: 48 additions & 43 deletions)
@@ -239,21 +239,22 @@ def parse_article(self, html, pmid=None, **kwargs):
             url = '%s/T%d.expansion.html' % (content_url, t_num)
             table_soup = self._download_table(url)
             tc = table_soup.find(class_='table-expansion')
-            t = tc.find('table', {'id': 'table-%d' % (t_num)})
-            t = self.parse_table(t)
-            if t:
-                t.position = t_num
-                t.label = tc.find(class_='table-label').text
-                t.number = t.label.split(' ')[-1].strip()
-                try:
-                    t.caption = tc.find(class_='table-caption').get_text()
-                except:
-                    pass
-                try:
-                    t.notes = tc.find(class_='table-footnotes').get_text()
-                except:
-                    pass
-                tables.append(t)
+            if tc:
+                t = tc.find('table', {'id': 'table-%d' % (t_num)})
+                t = self.parse_table(t)
+                if t:
+                    t.position = t_num
+                    t.label = tc.find(class_='table-label').text
+                    t.number = t.label.split(' ')[-1].strip()
+                    try:
+                        t.caption = tc.find(class_='table-caption').get_text()
+                    except:
+                        pass
+                    try:
+                        t.notes = tc.find(class_='table-footnotes').get_text()
+                    except:
+                        pass
+                    tables.append(t)

         self.article.tables = tables
         return self.article
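The guard added here matters because BeautifulSoup's find() returns None when nothing matches, and attribute access on that None raises AttributeError before the inner try/except blocks (which only wrap the caption and footnote lookups) ever run. A minimal sketch of the failure mode and the fix, assuming bs4 and a page with no table-expansion element:

    from bs4 import BeautifulSoup

    html = '<html><body><p>no expansion markup</p></body></html>'
    soup = BeautifulSoup(html, 'html.parser')

    tc = soup.find(class_='table-expansion')  # None: no match on this page
    # tc.find('table')                        # AttributeError: 'NoneType' object has no attribute 'find'

    if tc:                                    # guard before dereferencing, as in the commit
        t = tc.find('table', {'id': 'table-1'})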
@@ -413,19 +414,22 @@ def parse_article(self, html, pmid=None, **kwargs):
             url = 'http://www.mitpressjournals.org/action/showPopup?citid=citart1&id=T%d&doi=%s' % (
                 num, doi)
             table_soup = self._download_table(url)
-            tc = table_soup.find('table').find('table')  # JCogNeuro nests tables 2-deep
-            t = self.parse_table(tc)
-            if t:
-                t.position = num
-                t.number = num
-                cap = tc.caption.find('span', class_='title')
-                t.label = cap.b.get_text()
-                t.caption = cap.get_text()
-                try:
-                    t.notes = table_soup.find('div', class_="footnote").p.get_text()
-                except:
-                    pass
-                tables.append(t)
+            tc = table_soup.find('table')  # JCogNeuro nests tables 2-deep
+            if tc:
+                tc = tc.find('table')
+                if tc:
+                    t = self.parse_table(tc)
+                    if t:
+                        t.position = num
+                        t.number = num
+                        cap = tc.caption.find('span', class_='title')
+                        t.label = cap.b.get_text()
+                        t.caption = cap.get_text()
+                        try:
+                            t.notes = table_soup.find('div', class_="footnote").p.get_text()
+                        except:
+                            pass
+                        tables.append(t)

         self.article.tables = tables
         return self.article
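The MIT Press branch needs two guards because the tables are nested two deep and either find('table') call can come back None. If this pattern keeps spreading, the nesting could be flattened with a small helper; a hypothetical sketch (find_chain is not part of the ace codebase):

    def find_chain(node, *names):
        """Apply successive .find() calls, bailing out as soon as one misses."""
        for name in names:
            if node is None:
                return None
            node = node.find(name)
        return node

    # Equivalent to the commit's nested guards:
    # tc = find_chain(table_soup, 'table', 'table')
    # if tc: ...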
@@ -511,21 +515,22 @@ def parse_article(self, html, pmid=None, **kwargs):
             url = '%s/T%d.expansion.html' % (content_url, t_num)
             table_soup = self._download_table(url)
             tc = table_soup.find(class_='table-expansion')
-            t = tc.find('table', {'id': 'table-%d' % (t_num)})
-            t = self.parse_table(t)
-            if t:
-                t.position = t_num
-                t.label = tc.find(class_='table-label').text
-                t.number = t.label.split(' ')[-1].strip()
-                try:
-                    t.caption = tc.find(class_='table-caption').get_text()
-                except:
-                    pass
-                try:
-                    t.notes = tc.find(class_='table-footnotes').get_text()
-                except:
-                    pass
-                tables.append(t)
+            if tc:
+                t = tc.find('table', {'id': 'table-%d' % (t_num)})
+                t = self.parse_table(t)
+                if t:
+                    t.position = t_num
+                    t.label = tc.find(class_='table-label').text
+                    t.number = t.label.split(' ')[-1].strip()
+                    try:
+                        t.caption = tc.find(class_='table-caption').get_text()
+                    except:
+                        pass
+                    try:
+                        t.notes = tc.find(class_='table-footnotes').get_text()
+                    except:
+                        pass
+                    tables.append(t)

         self.article.tables = tables
         return self.article
ace/tests/test_ace.py (2 changes: 1 addition & 1 deletion)
@@ -73,7 +73,7 @@ def testDatabaseProcessingStream(self):

     def testJournalScraping(self):
         scrape_path = join(get_test_data_path(), 'scrape_test')
-        os.mkdir(scrape_path, exists_ok=True)
+        os.makedirs(scrape_path, exist_ok=True)
         # Test with PLoS ONE because it's OA
         scraper = scrape.Scraper(scrape_path)
         scraper.retrieve_journal_articles('PLoS ONE', delay=5.0, mode='direct',
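The test fix corrects two bugs in one line: os.mkdir() takes no exists_ok keyword at all (the old call raised TypeError on every run), and the standard-library spelling is exist_ok, which os.makedirs() supports. A minimal sketch with a hypothetical path:

    import os
    from os.path import join

    scrape_path = join('/tmp', 'scrape_test')   # hypothetical path

    # os.mkdir(scrape_path, exists_ok=True)     # TypeError: unexpected keyword argument
    os.makedirs(scrape_path, exist_ok=True)     # idempotent: no error if the directory exists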
