Python examples (#1)

* solution python * python scripts ra
sodascience · Apr 18, 2024 · 2c6ce2a · 2c6ce2a
1 parent d89ba8c
commit 2c6ce2a
Show file tree

Hide file tree

Showing 4 changed files with 8,497 additions and 0 deletions.
diff --git a/python_scripts/efficient_data_transformation.ipynb b/python_scripts/efficient_data_transformation.ipynb
@@ -0,0 +1,92 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HnsoC_-olsXJ"
+   },
+   "outputs": [],
+   "source": [
+    "## READING SPSS FILES\n",
+    "import pyreadstat\n",
+    "df, meta = pyreadstat.read_sav(filename, row_limit=10)\n",
+    "# print codebook\n",
+    "print(meta.variable_value_labels)\n",
+    "# you can then read specific variables with usecols=[\"var1\",\"var2\"]\n",
+    "# you can disable the conversion to categorical with formats_as_category=False. It's much faster but then you'll have -9 for missing values (and others). Be careful using it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3U46RzMblxFQ"
+   },
+   "outputs": [],
+   "source": [
+    "## WORKING WITH POLARS instead of pandas (10x speed-up)\n",
+    "import polars as pl\n",
+    "# Read a CSV file (arguments similar to pandas but with different names, e.g. n_rows, delimiter)\n",
+    "csv_df = pl.read_csv('filename.csv')\n",
+    "# Read a Feather file\n",
+    "feather_df = pl.read_feather('filename.feather')\n",
+    "\n",
+    "# Converting from a Pandas file (e.g. after reading it with pyreadstat)\n",
+    "pdf = pl.from_pandas(df)\n",
+    "# you can also convert back to pandas\n",
+    "pandas_df = polars_df.to_pandas()\n",
+    "\n",
+    "## Select variables\n",
+    "#In pandas: selected_df_pandas = df[[\"var1\", \"var2\"]]\n",
+    "subset_pdf = pdf.select([\"var1\", \"var2\"])\n",
+    "\n",
+    "## Select rows\n",
+    "#In pandas: subset_df_pandas = df.loc[df[\"var1\"] == 3]\n",
+    "subset_df_polars = pdf.filter(pdf[\"var1\"] == 3)\n",
+    "\n",
+    "## Merge\n",
+    "#In pandas (e.g. left join): merged_df_pandas = pd.merge(df1, df2, on=[\"var1\", \"var2\"], how=\"left\")\n",
+    "merged_pdf_polars = pdf1.join(pdf2, on=[\"var1\", \"var2\"], how='left')\n",
+    "\n",
+    "\n",
+    "## Lazy read to filter\n",
+    "pdf = (\n",
+    "pl.scan_csv(\"my_long_file.csv\")  # lazy, doesn't do a thing\n",
+    "    .select(\n",
+    "        [\"a\", \"c\"]\n",
+    "    )  # select only 2 columns (other columns will not be read)\n",
+    "    .filter(\n",
+    "        pl.col(\"a\") > 10\n",
+    "    )  # the filter is pushed down the scan, so less data is read into memory\n",
+    "    .collect()  # collect the data\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyMFOeiqZbYzvppLaHYx56H2",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}