-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Add first version of mapply for DataFrame and Series
- Loading branch information
Showing
12 changed files
with
367 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
{ | ||
"custom_plugin_paths": [], | ||
"exclude": { | ||
"files": null, | ||
"lines": null | ||
}, | ||
"generated_at": "2020-10-26T21:50:19Z", | ||
"plugins_used": [ | ||
{ | ||
"name": "AWSKeyDetector" | ||
}, | ||
{ | ||
"name": "ArtifactoryDetector" | ||
}, | ||
{ | ||
"base64_limit": 4.5, | ||
"name": "Base64HighEntropyString" | ||
}, | ||
{ | ||
"name": "BasicAuthDetector" | ||
}, | ||
{ | ||
"name": "CloudantDetector" | ||
}, | ||
{ | ||
"hex_limit": 3, | ||
"name": "HexHighEntropyString" | ||
}, | ||
{ | ||
"name": "IbmCloudIamDetector" | ||
}, | ||
{ | ||
"name": "IbmCosHmacDetector" | ||
}, | ||
{ | ||
"name": "JwtTokenDetector" | ||
}, | ||
{ | ||
"keyword_exclude": null, | ||
"name": "KeywordDetector" | ||
}, | ||
{ | ||
"name": "MailchimpDetector" | ||
}, | ||
{ | ||
"name": "PrivateKeyDetector" | ||
}, | ||
{ | ||
"name": "SlackDetector" | ||
}, | ||
{ | ||
"name": "SoftlayerDetector" | ||
}, | ||
{ | ||
"name": "StripeDetector" | ||
}, | ||
{ | ||
"name": "TwilioKeyDetector" | ||
} | ||
], | ||
"results": {}, | ||
"version": "0.14.3", | ||
"word_list": { | ||
"file": null, | ||
"hash": null | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pathos | ||
psutil | ||
tqdm>=4.27 # from tqdm.auto import tqdm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from os import path | ||
from setuptools import setup | ||
|
||
from setuptools import find_packages, setup | ||
|
||
here = path.abspath(path.dirname(__file__)) | ||
|
||
|
@@ -31,19 +32,18 @@ def read_readme(path): | |
install_requires=read_requirements(requirements_path), | ||
use_scm_version={"write_to": "src/mapply/_version.py"}, | ||
package_dir={"": "src"}, | ||
packages=find_packages(where="src"), | ||
author="ddelange", | ||
author_email="[email protected]", | ||
url="https://github.com/ddelange/mapply", | ||
python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*", | ||
python_requires=">=3.6", | ||
classifiers=[ | ||
"Development Status :: 5 - Production/Stable", | ||
"Intended Audience :: Developers", | ||
"Operating System :: OS Independent", | ||
"License :: OSI Approved :: MIT License", | ||
"Programming Language :: Python", | ||
"Programming Language :: Python :: 2", | ||
"Programming Language :: Python :: 2.7", | ||
"Programming Language :: Python :: 3", | ||
"Programming Language :: Python :: 3.5", | ||
"Programming Language :: Python :: 3.6", | ||
"Programming Language :: Python :: 3.7", | ||
"Programming Language :: Python :: 3.8", | ||
|
@@ -52,4 +52,5 @@ def read_readme(path): | |
"Topic :: Utilities", | ||
], | ||
keywords="pandas parallel apply map applymap multicore multiprocessing", | ||
license="MIT", | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,41 @@ | ||
# flake8: noqa:F401 | ||
from mapply._version import version as __version__ | ||
from functools import partialmethod | ||
|
||
from mapply._version import version as __version__ # noqa:F401 | ||
from mapply.mapply import mapply as _mapply | ||
|
||
|
||
def init(
    *,
    n_workers: int = -1,
    chunk_size: int = 100,
    max_chunks_per_worker: int = 20,
    progressbar: bool = True,
    apply_name: str = "mapply",
    map_name: str = "mmap",
    applymap_name: str = "mapplymap",
):
    """Patch pandas so every DataFrame/Series gains a parallel apply method.

    Installs a :func:`functools.partialmethod` wrapper around ``mapply`` onto
    ``pandas.core.base.PandasObject`` under the attribute name ``apply_name``.

    Args:
        n_workers: Amount of workers (processes) to spawn.
        chunk_size: Minimum amount of items per chunk. Determines upper limit for n_chunks.
        max_chunks_per_worker: Upper limit on amount of chunks per worker. Will lower
            n_chunks determined by chunk_size if necessary. Set to 0 to skip this check.
        progressbar: Whether to wrap the chunks in a tqdm.auto.tqdm.
        apply_name: Attribute name for the patched apply function.
        map_name: Attribute name for the patched map function.
        applymap_name: Attribute name for the patched applymap function.

    NOTE(review): ``map_name`` and ``applymap_name`` are accepted but not used
    yet in this first version — only the apply patch is installed. Presumably
    placeholders for upcoming mmap/mapplymap support; confirm before relying
    on them.
    """
    # Imported lazily so importing this package does not require pandas.
    from pandas.core.base import PandasObject

    patched_apply = partialmethod(
        _mapply,
        n_workers=n_workers,
        chunk_size=chunk_size,
        max_chunks_per_worker=max_chunks_per_worker,
        progressbar=progressbar,
    )
    setattr(PandasObject, apply_name, patched_apply)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
from functools import partial | ||
from typing import Any, Callable, Union | ||
|
||
from mapply.parallel import N_CORES, multiprocessing_imap | ||
|
||
|
||
def _choose_n_chunks(
    df_or_series: Any,
    n_workers: int,
    chunk_size: int,
    max_chunks_per_worker: int,
) -> int:
    """Pick how many chunks to send to the ProcessingPool.

    Args:
        df_or_series: Object whose ``len`` drives the raw chunk count.
        n_workers: Requested worker count; a value of 1 disables parallelism.
        chunk_size: Minimum amount of items per chunk.
        max_chunks_per_worker: Cap on chunks per worker; 0 disables the cap.

    Returns:
        Amount of chunks to split the data into (always at least 1).
    """
    # Largest count that still leaves each chunk with >= chunk_size items;
    # small data therefore yields 0 here and falls back to a single chunk.
    candidate = int(len(df_or_series) / chunk_size)

    # Avoid flooding workers with many tiny chunks (0 skips this cap).
    if max_chunks_per_worker:
        candidate = min(candidate, max_chunks_per_worker * N_CORES)

    # Parallelism cannot help: data too small, or only one worker/core.
    if candidate < 1 or n_workers == 1 or N_CORES == 1:
        return 1
    return candidate
|
||
|
||
def mapply(
    df_or_series: Any,
    function: Callable,
    axis: Union[int, str] = 0,
    *,
    n_workers: int = -1,
    chunk_size: int = 100,
    max_chunks_per_worker: int = 20,
    progressbar: bool = True,
    args=(),
    **kwargs
) -> Any:
    """Run apply on n_workers. Split in chunks, gather results, and concat them.

    Args:
        df_or_series: Argument reserved to the class instance, a.k.a. 'self'.
        function: Function to apply to each column or row.
        axis: Axis along which the function is applied.
        n_workers: Amount of workers (processes) to spawn.
        chunk_size: Minimum amount of items per chunk. Determines upper limit for n_chunks.
        max_chunks_per_worker: Upper limit on amount of chunks per worker. Will lower
            n_chunks determined by chunk_size if necessary. Set to 0 to skip this check.
        progressbar: Whether to wrap the chunks in a tqdm.auto.tqdm.
        args: Additional positional arguments to pass to function.
        kwargs: Additional keyword arguments to pass to function.

    Returns:
        Series or DataFrame resulting from applying function along given axis.
    """
    # Imported lazily so importing this module does not require numpy/pandas.
    from numpy import array_split
    from pandas import Series, concat

    n_chunks = _choose_n_chunks(
        df_or_series,
        n_workers,
        chunk_size,
        max_chunks_per_worker,
    )

    # Normalize the pandas string axis spelling to 0/1.
    if isinstance(axis, str):
        axis = ["index", "columns"].index(axis)

    if axis == 1:
        # axis argument pre-processing: transpose so that the row-wise apply
        # becomes a column-wise (axis=0) apply on each chunk in the workers.
        df_or_series = df_or_series.T

    # After the transpose above, splitting along `axis` partitions the
    # original rows (axis=1) or index (axis=0) into n_chunks pieces.
    dfs = array_split(df_or_series, n_chunks, axis=axis)

    def run_apply(function, df, args=(), **kwargs):
        # axis argument is handled such that always axis=0 here
        return df.apply(function, args=args, **kwargs)  # pragma: no cover

    # Fan the chunks out to the worker pool; `function` and the extra apply
    # arguments are frozen into the task so only chunks travel per item.
    results = multiprocessing_imap(
        partial(run_apply, function, args=args, **kwargs),
        dfs,
        n_workers=n_workers,
        progressbar=progressbar,
    )

    if (
        len(results) > 1
        and isinstance(results[0], Series)
        and results[0].index.equals(results[1].index)
    ):
        # one more aggregation needed for final df, e.g. df.parallel_apply(sum):
        # each chunk reduced to a Series over the same index, so combine the
        # partials column-wise and apply `function` once more across them.
        return concat(results, axis=1).apply(function, axis=1, args=args, **kwargs)

    if axis == 1:
        # Undo the earlier transpose so output orientation matches the input.
        results = (df.T for df in results)  # type: ignore
    return concat(results)
Oops, something went wrong.