Skip to content

Commit

Permalink
Python examples (#1)
Browse files Browse the repository at this point in the history
* solution python

* python scripts ra
  • Loading branch information
jgarciab authored Apr 18, 2024
1 parent d89ba8c commit 2c6ce2a
Show file tree
Hide file tree
Showing 4 changed files with 8,497 additions and 0 deletions.
92 changes: 92 additions & 0 deletions python_scripts/efficient_data_transformation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HnsoC_-olsXJ"
},
"outputs": [],
"source": [
"## READING SPSS FILES\n",
"import pyreadstat\n",
"df, meta = pyreadstat.read_sav(filename, row_limit=10)\n",
"# print codebook\n",
"print(meta.variable_value_labels)\n",
"# you can then read specific variables with usecols=[\"var1\",\"var2\"]\n",
"# you can disable the conversion to categorical with formats_as_category=False. It's much faster but then you'll have -9 for missing values (and others). Be careful using it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3U46RzMblxFQ"
},
"outputs": [],
"source": [
"## WORKING WITH POLARS instead of pandas (10x speed-up)\n",
"import polars as pl\n",
"# Read a CSV file (arguments similar to pandas but with different names, e.g. n_rows, delimiter)\n",
"csv_df = pl.read_csv('filename.csv')\n",
"# Read a Feather file\n",
"feather_df = pl.read_feather('filename.feather')\n",
"\n",
"# Converting from a Pandas file (e.g. after reading it with pyreadstat)\n",
"pdf = pl.from_pandas(df)\n",
"# you can also convert back to pandas\n",
"pandas_df = polars_df.to_pandas()\n",
"\n",
"## Select variables\n",
"#In pandas: selected_df_pandas = df[[\"var1\", \"var2\"]]\n",
"subset_pdf = pdf.select([\"var1\", \"var2\"])\n",
"\n",
"## Select rows\n",
"#In pandas: subset_df_pandas = df.loc[df[\"var1\"] == 3]\n",
"subset_df_polars = pdf.filter(pdf[\"var1\"] == 3)\n",
"\n",
"## Merge\n",
"#In pandas (e.g. left join): merged_df_pandas = pd.merge(df1, df2, on=[\"var1\", \"var2\"], how=\"left\")\n",
"merged_pdf_polars = pdf1.join(pdf2, on=[\"var1\", \"var2\"], how='left')\n",
"\n",
"\n",
"## Lazy read to filter\n",
"pdf = (\n",
"pl.scan_csv(\"my_long_file.csv\") # lazy, doesn't do a thing\n",
" .select(\n",
" [\"a\", \"c\"]\n",
" ) # select only 2 columns (other columns will not be read)\n",
" .filter(\n",
" pl.col(\"a\") > 10\n",
" ) # the filter is pushed down the scan, so less data is read into memory\n",
" .collect() # collect the data\n",
")"
]
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyMFOeiqZbYzvppLaHYx56H2",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 2c6ce2a

Please sign in to comment.