Skip to content
This repository has been archived by the owner on Nov 20, 2020. It is now read-only.

Commit

Permalink
making this work on meta (hopefully)
Browse files Browse the repository at this point in the history
  • Loading branch information
gautamh committed Aug 4, 2017
1 parent 5f42a9c commit f7a2f54
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 17 deletions.
14 changes: 9 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,19 @@ WORKDIR /usr/src/app

RUN apt-get remove python-pip

COPY requirements.txt ./
RUN pip3 install -r requirements.txt
RUN apt-get update && \
apt-get install libxml2-dev libxslt-dev python-dev lib32z1-dev -y

COPY . /usr/src/app
COPY requirements.txt /usr/src/app/
RUN pip3 install -r /usr/src/app/requirements.txt
COPY . /usr/src/app/

RUN python3 /usr/src/app/setup.py install
ENV PYTHONPATH=/usr/src/app

RUN mkdir -p /var/www/data

EXPOSE 3000 3001 5901

CMD xvfb-run --server-args="-screen 0 1024x768x24" python3 ./njcampfin/__init__.py Phil Murphy 2017 GOVERNOR
CMD xvfb-run --server-args="-screen 0 1024x768x24" /usr/bin/python3 /usr/src/app/njcampfin/__init__.py "" "" 2017 GOVERNOR /var/www/data/nj_campfin_governor.json True True > /var/www/data/nj_campfin_governor_output.txt


26 changes: 15 additions & 11 deletions njcampfin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,11 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
print("Began scraping with params {} {} {} {} {}".format(first_name, last_name, year, office, outfile))

results = []
reader = csv.reader(logfile)
for row in reader:
results.append(convert_csv_row_to_result(row))
print(len(results))
if logfile is not None and logfile != '':
reader = csv.reader(logfile)
for row in reader:
results.append(convert_csv_row_to_result(row))
print(len(results))

previously_scraped = []
if outfile is not None and outfile != '':
Expand Down Expand Up @@ -183,7 +184,8 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u

if len(results) > 25:
advance_to_page(browser, page_controls_xpath, (len(results) // 25) + 1)
writer = csv.writer(logfile)
if logfile is not None and logfile != '':
writer = csv.writer(logfile)

while True:
wait = WebDriverWait(browser, int(os.environ['WAIT_TIME']))
Expand Down Expand Up @@ -286,7 +288,7 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
else:
details['url'] = ''

if details not in results:
if details not in results and logfile is not None and logfile != '':
writer.writerow([
details['name'],
details['summary_link'],
Expand All @@ -302,6 +304,7 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
details['url']
])
logfile.flush()
if details not in results:
results.append(details)
else:
details = {
Expand All @@ -314,7 +317,7 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
'year': year,
}

if details not in results:
if details not in results and logfile is not None and logfile != '':
writer.writerow([
details['name'],
details['summary_link'],
Expand All @@ -325,6 +328,7 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
details['year'],
])
logfile.flush()
if details not in results:
results.append(details)

if check_exists_by_xpath(page_controls_xpath, browser):
Expand All @@ -344,12 +348,12 @@ def get_filing_list(first_name, last_name, year, office, outfile, get_filings, u
sys.stdout.write(json.dumps(results))

def main():
with open('log.csv', 'r+') as logfile:
get_filing_list(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6].lower() == 'true', sys.argv[7].lower() == 'true', logfile)
'''schedule.every(6).hours.do(get_filing_list, sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[7].lower() == 'true', sys.argv[7].lower() == 'true', logfile)
'''with open('log.csv', 'r+') as logfile:
get_filing_list(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6].lower() == 'true', sys.argv[7].lower() == 'true', None)'''
schedule.every(6).hours.do(get_filing_list, sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[7].lower() == 'true', sys.argv[7].lower() == 'true', None)
while True:
schedule.run_pending()
time.sleep(1)'''
time.sleep(1)

if __name__=='__main__':
main()
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ chardet==3.0.4
docutils==0.13.1
idna==2.5
jmespath==0.9.3
nyt-nj-campfin==0.0.1
packaging==16.8
pyparsing==2.1.10
python-dateutil==2.6.0
Expand Down

0 comments on commit f7a2f54

Please sign in to comment.