pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',200)
df.loc[[223,415]]
df=pd.DataFrame({'A':['a',2]}).applymap(lambda x:x-1 if isinstance(x,int) else x)
print(df)
   A
0  a
1  1
df[ df['column'].apply(lambda l: [d.get('key')==value for d in l]
if l else [False]).apply(lambda l: True in l) ]
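For example, a minimal sketch (the column name 'column' and the variable value are placeholders):
import pandas as pd

df = pd.DataFrame({'column': [[{'key': 1}, {'key': 2}],
                              [{'key': 3}],
                              None]})
value = 2
mask = df['column'].apply(lambda l: [d.get('key') == value for d in l]
                          if l else [False]).apply(lambda l: True in l)
print(df[mask])   # keeps only the first row, which contains {'key': 2}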
Based on https://stackoverflow.com/a/16354730. General usage:
df.apply(lambda row: row['a'] % row['c'],axis=1)
Details...
def my_test2(row):
    return row['a'] % row['c']
df = pd.DataFrame({'a': np.random.randn(6),
                   'b': ['foo', 'bar'] * 3,
                   'c': np.random.randn(6)})
df['Value'] = df.apply(my_test2, axis=1)
df
Out[..]:
a b c Value
0 -1.674308 foo 0.343801 0.044698
...
To apply a function to all entries use df.applymap.
apply can also be used to combine two columns:
df.apply(lambda row: row['A'] if row['A'] else row['B'],axis=1 )
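A small sketch of the column-combining idiom (column names are illustrative):
import pandas as pd

df = pd.DataFrame({'A': ['x', '', 'z'], 'B': ['p', 'q', 'r']})
print(df.apply(lambda row: row['A'] if row['A'] else row['B'], axis=1))
# 0    x
# 1    q
# 2    z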
def func(row):
    for i in range(len(row['col'])):
        row['col'][i]['key']='NEW VALUE'
        ...
    return row['col']
>>> df.apply(func,axis='columns')
- Apply a function to a DataFrame elementwise.
df.applymap(func)
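A tiny sketch; note that in recent pandas versions (>= 2.1) DataFrame.map is the preferred name for elementwise application and applymap is deprecated:
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
print(df.applymap(lambda x: x**2))   # squares every entry elementwise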
df.drop_duplicates(subset=['column_name']).reset_index(drop=True)
pd.options.mode.chained_assignment = None # default='warn'
df.drop(['col_1','col_2'],axis='columns')
df.dropna(axis=1)
mi is the list of positional indices to drop:
df.drop(df.index[list(mi)])
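A quick sketch with a toy DataFrame (mi holds positional indices, not labels):
import pandas as pd

df = pd.DataFrame({'x': list('abcde')})
mi = {1, 3}                           # positions to drop
print(df.drop(df.index[list(mi)]))    # keeps the rows at positions 0, 2 and 4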
if df1.shape[0]==df2.shape[0]: # df1.index!=df2.index
    df1.index=df2.index.values
df.rename({'OLD':'NEW'}, axis = 'columns')
df.rename(index={0:'hello',1:'world'})
df[df.duplicated()]
or
df[df.duplicated('column')]
If a df is passed as a function argument, a column change inside the function could propagate outside of it. To avoid problems,
pass a hard copy of the df to the function with df.copy()
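A minimal sketch of the difference (the function and column names are only illustrative):
import pandas as pd

def add_flag(d):
    d['flag'] = True          # modifies the DataFrame it receives in place
    return d

df = pd.DataFrame({'A': [1, 2]})
add_flag(df)                  # 'flag' now also appears in the caller's df
add_flag(df.copy())           # the caller's df is left untouched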
df_all=df[df.duplicated('KEY',keep=False)]
df_unique=df.drop_duplicates('KEY')
for i in df_unique.index:
    mtch=df[df['KEY']==df_unique.loc[i,'KEY']]
    if mtch.shape[0]:
        .....
pd.concat([df1, df2], axis=1)
from numpy.random import randn
from pandas import DataFrame
df = DataFrame(randn(10, 2), columns=list('ab'))
df.query('a > b')
df[df.a > df.b] # same result as the previous expression
import pandas as pd
l=pd.DataFrame()
l=l.append(pd.Series({'T':'A','W':10}),ignore_index=True)
l=l.append(pd.Series({'T':'B','W':1}),ignore_index=True)
r=pd.DataFrame()
r=r.append(pd.Series({'T':'A','S':2}),ignore_index=True)
r=r.append(pd.Series({'T':'C','S':1}),ignore_index=True)
print(l.merge(r,on='T',how='outer').fillna(0))
print('='*15)
print(l.merge(r,on='T',how='left').fillna(0))
print('='*15)
print(l.merge(r,on='T',how='right').fillna(0))
print('='*15)
print(l.merge(r,on='T',how='inner').fillna(0))
T W S
0 A 10.0 2.0
1 B 1.0 0.0
2 C 0.0 1.0
===============
T W S
0 A 10.0 2.0
1 B 1.0 0.0
===============
T W S
0 A 10.0 2.0
1 C 0.0 1.0
===============
T W S
0 A 10.0 2.0
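Note that DataFrame.append was removed in pandas 2.0; a sketch of the same left DataFrame l built with pd.concat instead:
import pandas as pd

l = pd.concat([pd.DataFrame([{'T': 'A', 'W': 10}]),
               pd.DataFrame([{'T': 'B', 'W': 1}])],
              ignore_index=True)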
df=pd.DataFrame()
df.loc[10,'hello']='world'
See stackoverflow
df[['test1','test2','test3']].max(axis=1)
import csv
df.to_csv('file.csv',sep=' ',
quoting=csv.QUOTE_NONNUMERIC,header=False,index=False)
See also here
df['FULL NAME']=df['NAME']+' '+df['SURNAME']
df = df[~df.datecolumn.isin(a)]
df.groupby('dptos')['dptos'].count().sort_values(ascending=False)
dptos
Instituto de Física 862
Instituto de Biología 656
Instituto de Química 645
Departamento de Matemáticas 130
# For AND
df[ ( df['col1']>0 ) & ( df['col2']<10 ) ]
# For OR use `|`
df.loc[ ( df['col1']>0 ) & ( df['col2']<10 ) ,'column_name']
To change the values of all of them:
df.loc[ ( df['col1']>0 ) & ( df['col2']<10 ) ,'column_name'] = new_value
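A runnable sketch of the conditional assignment (column names and values are illustrative):
import pandas as pd

df = pd.DataFrame({'col1': [1, -1, 2], 'col2': [5, 3, 20], 'column_name': ['a', 'b', 'c']})
df.loc[(df['col1'] > 0) & (df['col2'] < 10), 'column_name'] = 'NEW'
print(df)   # only the first row satisfies both conditions and gets 'NEW'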
See: https://stackoverflow.com/a/14163209/2268280
df=df.where((pd.notnull(df)), None)
df.to_dict('records')
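A quick sketch of the round trip; depending on the pandas version, df.replace({np.nan: None}) may be needed instead of the where call to actually obtain None:
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, np.nan], 'B': ['x', 'y']})
df = df.where(pd.notnull(df), None)
print(df.to_dict('records'))   # the NaN in 'A' is meant to come out as None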
See https://stackoverflow.com/a/47548471/2268280
df['B'] = df['B'].astype('object')
df.at[1, 'B'] = [1, 2, 3]
df[df['column'].str.contains('query',na=False)]
See: https://stackoverflow.com/a/47010105/2268280
import numpy as np
mask = np.column_stack([df[col].astype(str).str.contains('El TLC con la Unión Europea', na=False) for col in df])
df.loc[mask.any(axis=1)]
Use HDF, which allows several DataFrames to be saved to the same file:
df.to_hdf('file.hdf5','FreeKey')
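A sketch saving two DataFrames under different keys and reading one back (requires the pytables package; file and key names are illustrative):
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': [3, 4]})
df1.to_hdf('file.hdf5', key='df1')
df2.to_hdf('file.hdf5', key='df2')
print(pd.read_hdf('file.hdf5', key='df2'))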
When the objects are complex numbers or numpy arrays:
import pandas as pd
import numpy as np
def read_csv_objects(file,columns,**kwargs):
    '''
    Read a pandas DataFrame with object columns from a csv (or excel) file.
    columns is the column or list of columns which contain the objects.
    '''
    df=pd.read_csv(file,**kwargs)
    if isinstance(columns,str):
        columns=[columns]
    for c in columns:
        df[c]=df[c].str.replace('\n',',').apply(lambda x: eval(x))
        if df[c].apply(lambda x: isinstance(x,list)).all():
            df[c]=df[c].apply(lambda x: np.array(x))
    return df
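A possible round trip with the helper above (file and column names are illustrative; it assumes the lists were written by to_csv so that eval can parse them back):
import pandas as pd

df = pd.DataFrame({'x': [1, 2], 'objs': [[1, 2], [3, 4]]})
df.to_csv('objs.csv', index=False)
back = read_csv_objects('objs.csv', 'objs')
print(back['objs'][0])   # a numpy array rather than the string '[1, 2]'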
Use low_memory=False. For example here:
cv=pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv',low_memory=False)
$ cat kk.json
{"A":1,"B":3}
{"A":5,"B":6}
Read with the following options:
pd.read_json('kk.json',orient='records',lines=True)
Use orient='index'. See for example here:
a=pd.read_json(url, orient='index').T
Integer dictionary keys are wrongly loaded as strings:
>>> df.loc[0,'d']
{'1':'A'}
See https://stackoverflow.com/a/34346202/2268280
def jsonKeys2int(x):
    if isinstance(x, dict):
        return {int(k):v for k,v in x.items()}
    return x
>>> df['d']=df['d'].apply(jsonKeys2int)
>>> df.loc[0,'d']
{1:'A'}
Also, to enforce UTF-8 encoding and properly write characters like 'π' without escaping:
df.to_json('file.json',orient='records',force_ascii=False)
This can be read either as
pd.read_json('file.json')
or directly as a full json list of dictionaries:
- Full file
import json
with open(r"file.json", "r") as read_file:
    data = json.load(read_file)
- For a compressed json with UTF-8 encoding (see: https://stackoverflow.com/a/39451012/2268280):
import gzip
import json
with gzip.GzipFile('file.json.gz', 'r') as read_file:
    data = json.loads(read_file.read().decode('utf-8'))
- Line by line to avoid encoding problems (see: https://stackoverflow.com/a/29312618/2268280)
import json
data = []
for line in open('file.json', 'r'):
    data.append(json.loads(line))
https://stackoverflow.com/a/69226551/2268280
writer = pd.ExcelWriter('file.xlsx',
engine='xlsxwriter',
engine_kwargs={'options': {'strings_to_urls': False}})
df.to_excel(writer,index=False)
writer.close()