Address change in lobbyist structure, fix bug preventing download of all filings #36

Merged
5 commits merged on Oct 31, 2024
Changes from 3 commits
8 changes: 5 additions & 3 deletions .github/workflows/tmp.yml
@@ -12,6 +12,9 @@ jobs:
build:
# The type of runner that the job will run on
runs-on: ubuntu-latest
strategy:
matrix:
Member Author:
Parallelizes building of these files.

target: [lobbyist.xlsx, lobbyist_employer.xlsx]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
@@ -20,12 +23,11 @@
- name: setup requirements
run: pip install -r requirements.txt
- name: scrape lobbyist data
run: make data/processed/lobbyist.xlsx data/processed/lobbyist_employer.xlsx
run: make data/processed/${{ matrix.target }}
- name: upload to s3
env:
S3BUCKET: ${{ secrets.S3BUCKET }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
aws s3 cp data/processed/lobbyist.xlsx $S3BUCKET --acl public-read
aws s3 cp data/processed/lobbyist_employer.xlsx $S3BUCKET --acl public-read
aws s3 cp data/processed/${{ matrix.target }} $S3BUCKET --acl public-read
28 changes: 6 additions & 22 deletions lobbyist_employer.mk
@@ -5,41 +5,25 @@ LOBBYIST_EMPLOYER_DATA_DIR=data/lobbyist_employer
$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_contributions.csv \
$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_expenditures.csv

data/processed/lobbyist_employer.xlsx : $(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer.csv \
data/processed/lobbyist_employer.xlsx : $(LOBBYIST_EMPLOYER_DATA_DIR)/raw/lobbyist_employer.csv \
$(LOBBYIST_EMPLOYER_DATA_DIR)/processed/lobbyist_employer_contributions.csv \
$(LOBBYIST_EMPLOYER_DATA_DIR)/processed/lobbyist_employer_expenditures.csv
python scripts/to_excel.py $^ $@

$(LOBBYIST_EMPLOYER_DATA_DIR)/processed/lobbyist_employer_%.csv : $(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_%.csv \
$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_filings.csv \
$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer.csv
$(LOBBYIST_EMPLOYER_DATA_DIR)/raw/lobbyist_employer.csv
csvjoin --left -c Source,ReportFileName $< $(word 2, $^) | \
csvjoin --left -c MemberID,LobbyMemberID - $(word 3, $^) > $@

$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_%.csv : lobbyist_employer_filings
python -m scrapers.lobbyist.extract_transactions $* $(LOBBYIST_EMPLOYER_DATA_DIR) > $@
python scrapers/lobbyist/cli.py extract-transactions -t $* -d $(LOBBYIST_EMPLOYER_DATA_DIR)/assets > $@

lobbyist_employer_filings : $(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_filings.csv
python -m scrapers.lobbyist.download_filings $(LOBBYIST_EMPLOYER_DATA_DIR) < $<
python scrapers/lobbyist/cli.py download-filings -d $(LOBBYIST_EMPLOYER_DATA_DIR)/assets < $<

$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer_filings.csv : $(LOBBYIST_EMPLOYER_DATA_DIR)/raw/lobbyist_employer.csv
csvsql --query "SELECT DISTINCT LobbyMemberID AS id, LobbyMemberversionid AS version FROM STDIN" < $< | \
python -m scrapers.lobbyist.scrape_filings --employer | \
csvsql --query 'select ReportFileName, ReportTypeCode, MAX(MemberID) as MemberID from STDIN group by ReportFileName, ReportTypeCode' > $@

$(LOBBYIST_EMPLOYER_DATA_DIR)/intermediate/lobbyist_employer.csv : $(LOBBYIST_EMPLOYER_DATA_DIR)/raw/lobbyist_employer.csv
csvsql --query "SELECT DISTINCT \
LobbyMemberID, \
Name \
FROM ( \
SELECT \
LobbyMemberID, \
MAX(LobbyMemberversionid) AS LobbyMemberversionid \
FROM STDIN \
GROUP BY LobbyMemberID \
) AS lobbyists \
JOIN STDIN \
USING (LobbyMemberID, LobbyMemberversionid)" < $< > $@
python scrapers/lobbyist/cli.py scrape-filings --employer < $< > $@

$(LOBBYIST_EMPLOYER_DATA_DIR)/raw/lobbyist_employer.csv : lobbyist_employer_data_dirs
python -m scrapers.lobbyist.scrape_employers > $@
python scrapers/lobbyist/cli.py scrape-employers > $@
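
As an aside for readers less familiar with csvkit: the two chained csvjoin calls in the processed-CSV rule amount to a pair of left joins. A rough pandas equivalent, shown for the expenditures target (pandas is not a project dependency; this sketch is only illustrative):

```python
# Illustrative pandas equivalent of the two chained csvjoin calls;
# pandas is an assumption here, not something this PR introduces.
import pandas as pd

transactions = pd.read_csv(
    "data/lobbyist_employer/intermediate/lobbyist_employer_expenditures.csv"
)
filings = pd.read_csv(
    "data/lobbyist_employer/intermediate/lobbyist_employer_filings.csv"
)
employers = pd.read_csv("data/lobbyist_employer/raw/lobbyist_employer.csv")

# csvjoin --left -c Source,ReportFileName: match each transaction to its filing
joined = transactions.merge(
    filings, how="left", left_on="Source", right_on="ReportFileName"
)
# csvjoin --left -c MemberID,LobbyMemberID: attach the employer record
joined = joined.merge(
    employers, how="left", left_on="MemberID", right_on="LobbyMemberID"
)
joined.to_csv(
    "data/lobbyist_employer/processed/lobbyist_employer_expenditures.csv",
    index=False,
)
```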
80 changes: 16 additions & 64 deletions lobbyists.mk
@@ -1,82 +1,34 @@
# Lobbyist expenditures and contributions
LOBBYIST_DATA_DIR=data/lobbyist

.PRECIOUS : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv \
$(LOBBYIST_DATA_DIR)/raw/filings.csv \
$(LOBBYIST_DATA_DIR)/raw/lobbyist_client.csv \
$(LOBBYIST_DATA_DIR)/intermediate/lobbyist_contributions.csv \
$(LOBBYIST_DATA_DIR)/intermediate/lobbyist_expenditures.csv

data/processed/lobbyist.xlsx : $(LOBBYIST_DATA_DIR)/processed/lobbyist_employer.csv \
data/processed/lobbyist.xlsx : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv \
$(LOBBYIST_DATA_DIR)/raw/lobbyist_client.csv \
$(LOBBYIST_DATA_DIR)/processed/lobbyist_contributions.csv \
$(LOBBYIST_DATA_DIR)/processed/lobbyist_expenditures.csv
python scripts/to_excel.py $^ $@

$(LOBBYIST_DATA_DIR)/processed/lobbyist_employer.csv : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv \
$(LOBBYIST_DATA_DIR)/intermediate/client.csv
csvsql --query "SELECT \
ClientID, \
MemberID, \
Phone, \
LobbyistName, \
LobbyistAddress, \
Email, \
StartYear, \
EndYear \
FROM ( \
SELECT \
ClientID, \
MemberID, \
MAX(MemberVersionID) AS MemberVersionID, \
MAX(Year) AS Year, \
MIN(Year) AS StartYear, \
MAX(Year) AS EndYear \
FROM STDIN \
GROUP BY ClientID, MemberID \
) AS lobbyists \
JOIN STDIN \
USING (ClientID, MemberID, MemberVersionID, Year)" < $< | \
csvjoin -c ClientID - $(word 2, $^) > $@
$(LOBBYIST_DATA_DIR)/raw/lobbyist_client.csv : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv
python scrapers/lobbyist/cli.py scrape-lobbyist-clients < $< > $@

$(LOBBYIST_DATA_DIR)/processed/lobbyist_%.csv : $(LOBBYIST_DATA_DIR)/intermediate/lobbyist_%.csv \
$(LOBBYIST_DATA_DIR)/intermediate/filings.csv \
$(LOBBYIST_DATA_DIR)/intermediate/lobbyist.csv
$(LOBBYIST_DATA_DIR)/raw/filings.csv \
$(LOBBYIST_DATA_DIR)/raw/lobbyist.csv
csvjoin --left -c Source,ReportFileName $< $(word 2, $^) | \
csvjoin --left -c MemberID - $(word 3, $^) > $@
csvjoin --left -c ID - $(word 3, $^) > $@

$(LOBBYIST_DATA_DIR)/intermediate/lobbyist_%.csv : lobbyist_filings
python -m scrapers.lobbyist.extract_transactions $* $(LOBBYIST_DATA_DIR) > $@
python scrapers/lobbyist/cli.py extract-transactions -t $* -d $(LOBBYIST_DATA_DIR)/assets > $@

lobbyist_filings : $(LOBBYIST_DATA_DIR)/intermediate/filings.csv
python -m scrapers.lobbyist.download_filings $(LOBBYIST_DATA_DIR) < $<
lobbyist_filings : $(LOBBYIST_DATA_DIR)/raw/filings.csv
python scrapers/lobbyist/cli.py download-filings -d $(LOBBYIST_DATA_DIR)/assets < $<

$(LOBBYIST_DATA_DIR)/intermediate/lobbyist.csv : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv
csvsql --query "SELECT \
MemberID, \
Phone, \
LobbyistName, \
LobbyistAddress, \
Email \
FROM ( \
SELECT \
MemberID, \
MAX(MemberVersionID) AS MemberVersionID, \
MAX(Year) AS Year, \
MAX(ClientID) AS ClientID \
FROM STDIN \
GROUP BY MemberID \
) AS lobbyists \
JOIN STDIN \
USING (MemberID, MemberVersionID, Year, ClientID)" < $< > $@
$(LOBBYIST_DATA_DIR)/raw/filings.csv : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv
python scrapers/lobbyist/cli.py scrape-filings < $< > $@

$(LOBBYIST_DATA_DIR)/intermediate/filings.csv : $(LOBBYIST_DATA_DIR)/raw/lobbyist.csv
csvsql --query "SELECT DISTINCT MemberID AS id, MemberVersionID AS version FROM STDIN" < $< | \
python -m scrapers.lobbyist.scrape_filings | \
csvsql --query 'select ReportFileName, ReportTypeCode, MAX(MemberID) as MemberID from STDIN group by ReportFileName, ReportTypeCode' > $@

$(LOBBYIST_DATA_DIR)/raw/lobbyist.csv : $(LOBBYIST_DATA_DIR)/intermediate/client.csv
python -m scrapers.lobbyist.scrape_lobbyists < $< > $@

$(LOBBYIST_DATA_DIR)/intermediate/client.csv : $(LOBBYIST_DATA_DIR)/raw/client.csv
csvsql --query "SELECT ClientID, ClientVersionID, MAX(ClientName) AS ClientName FROM STDIN GROUP BY ClientID" < $< > $@

$(LOBBYIST_DATA_DIR)/raw/client.csv : lobbyist_data_dirs
python -m scrapers.lobbyist.scrape_clients > $@
$(LOBBYIST_DATA_DIR)/raw/lobbyist.csv : lobbyist_data_dirs
python scrapers/lobbyist/cli.py scrape-lobbyists > $@
Empty file added scrapers/__init__.py
Empty file.
Empty file added scrapers/lobbyist/__init__.py
Empty file.
75 changes: 75 additions & 0 deletions scrapers/lobbyist/cli.py
@@ -0,0 +1,75 @@
import functools

import click


def scrapelib_opts(f):
@click.option("--rpm", default=180, show_default=True)
@click.option("--retries", default=3, show_default=True)
@click.option("--verify/--no-verify", default=False, show_default=True)
@functools.wraps(f)
def wrapped_func(*args, **kwargs):
return f(*args, **kwargs)

return wrapped_func
Comment on lines +6 to +14
Member Author:
This was the main reason for pulling this into a CLI: that way, we can run scrapes faster locally by passing an --rpm value larger than 3.
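
For concreteness, these flags map onto scrapelib roughly like this (the helper below is hypothetical, not part of this diff):

```python
# Hypothetical helper showing how --rpm, --retries, and --verify would
# configure a scrapelib session; not part of this diff.
import scrapelib


def build_scraper(rpm, retries, verify):
    scraper = scrapelib.Scraper(requests_per_minute=rpm, retry_attempts=retries)
    scraper.verify = verify  # Scraper subclasses requests.Session
    return scraper
```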



@click.group()
def scrape():
...


@scrape.command()
@scrapelib_opts
def scrape_lobbyist_clients(rpm, retries, verify):
import scrape_lobbyist_clients

scrape_lobbyist_clients.main(rpm=rpm, retries=retries, verify=verify)


@scrape.command()
@scrapelib_opts
def scrape_lobbyists(rpm, retries, verify):
import scrape_lobbyists

scrape_lobbyists.main(rpm=rpm, retries=retries, verify=verify)


@scrape.command()
@scrapelib_opts
def scrape_employers(rpm, retries, verify):
import scrape_employers

scrape_employers.main(rpm=rpm, retries=retries, verify=verify)


@scrape.command()
@scrapelib_opts
@click.option("--employer", "is_employer_scrape", is_flag=True)
def scrape_filings(rpm, retries, verify, is_employer_scrape):
import scrape_filings

scrape_filings.main(
rpm=rpm, retries=retries, verify=verify, is_employer_scrape=is_employer_scrape
)


@scrape.command()
@click.option("-d", "--asset-directory", "asset_directory")
def download_filings(asset_directory):
import download_filings

download_filings.main(asset_directory)


@scrape.command()
@click.option("-t", "--transaction-type", "transaction_type")
@click.option("-d", "--asset-directory", "asset_directory")
def extract_transactions(transaction_type, asset_directory):
import extract_transactions

extract_transactions.main(transaction_type, asset_directory)


if __name__ == "__main__":
scrape()
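
Note that click derives each command name from its function name, replacing underscores with dashes, which is why the Makefiles invoke scrape-filings, download-filings, and extract-transactions. A quick smoke test of the wiring, using click's built-in test runner (hypothetical, not included in the PR):

```python
# Hypothetical smoke test for the CLI wiring; not part of this diff.
from click.testing import CliRunner

from scrapers.lobbyist.cli import scrape

runner = CliRunner()
result = runner.invoke(scrape, ["download-filings", "--help"])
assert result.exit_code == 0
assert "--asset-directory" in result.output
```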
57 changes: 30 additions & 27 deletions scrapers/lobbyist/download_filings.py
@@ -6,39 +6,42 @@
from tqdm import tqdm


_, subdir = sys.argv
def main(asset_directory):
def get_file_name(report_type_code, member_id, report_file_name):
report_path = os.path.join(asset_directory, report_type_code, member_id)
os.makedirs(report_path, exist_ok=True)
return os.path.join(report_path, report_file_name)

reader = csv.DictReader(sys.stdin)

def get_file_name(report_type_code, member_id, report_file_name):
report_path = os.path.join(subdir, "assets", report_type_code, member_id)
os.makedirs(report_path, exist_ok=True)
return os.path.join(report_path, report_file_name)
for row in tqdm(reader):
outfile = get_file_name(
row["ReportTypeCode"],
row["MemberID"],
row["ReportFileName"],
)

if os.path.exists(outfile):
continue

reader = csv.DictReader(sys.stdin)
filing_url = f"https://login.cfis.sos.state.nm.us//ReportsOutput//{row['ReportTypeCode']}/{row['ReportFileName']}"

for row in tqdm(reader):
outfile = get_file_name(
row["ReportTypeCode"],
row["MemberID"],
row["ReportFileName"],
)
try:
response = requests.get(filing_url, verify=False)
except Exception as e:
print(f"Could not retrieve {filing_url}: {e}")
continue

if os.path.exists(outfile):
continue
if response.ok:
# LAR - Quarterly, LCD - 48-hour, LNA - No expenditures
with open(outfile, "wb") as f:
f.write(response.content)

filing_url = f"https://login.cfis.sos.state.nm.us//ReportsOutput//{row['ReportTypeCode']}/{row['ReportFileName']}"
else:
print(
f"Could not retrieve {filing_url}:\n{response.content.decode('utf-8')}"
)

try:
response = requests.get(filing_url, verify=False)
except Exception as e:
print(f"Could not retrieve {filing_url}: {e}")
continue

if response.ok:
# LAR - Quarterly, LCD - 48-hour, LNA - No expenditures
with open(outfile, "wb") as f:
f.write(response.content)

else:
print(f"Could not retrieve {filing_url}:\n{response.content.decode('utf-8')}")
if __name__ == "__main__":
main("assets")
14 changes: 8 additions & 6 deletions scrapers/lobbyist/extract_transactions.py
@@ -35,9 +35,7 @@ def extract_transactions(pdf_obj, table_start_signature, table_end_signature):
return transactions


if __name__ == "__main__":
_, transaction_type, subdir = sys.argv

def main(transaction_type, asset_directory):
if transaction_type == "expenditures":
column_names = [
"Date",
@@ -75,9 +73,9 @@

for root, _, files in tqdm(
itertools.chain(
os.walk(os.path.join(subdir, "assets/LAR")),
os.walk(os.path.join(subdir, "assets/LCD")),
os.walk(os.path.join(subdir, "assets/LNA")),
os.walk(os.path.join(asset_directory, "LAR")),
os.walk(os.path.join(asset_directory, "LCD")),
os.walk(os.path.join(asset_directory, "LNA")),
)
):
for file in files:
@@ -100,3 +98,7 @@ def extract_transactions(pdf_obj, table_start_signature, table_end_signature):

finally:
null_file.close()


if __name__ == "__main__":
main("expenditures", "assets")