First replace() infs with NaN:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
and then drop NaNs via dropna():
df.dropna(subset=["col1", "col2"], how="all", inplace=True)
For example:
>>> df = pd.DataFrame({"col1": [1, np.inf, -np.inf], "col2": [2, 3, np.nan]})
>>> df
col1 col2
0 1.0 2.0
1 inf 3.0
2 -inf NaN
>>> df.replace([np.inf, -np.inf], np.nan, inplace=True)
>>> df
col1 col2
0 1.0 2.0
1 NaN 3.0
2 NaN NaN
>>> df.dropna(subset=["col1", "col2"], how="all", inplace=True)
>>> df
col1 col2
0 1.0 2.0
1 NaN 3.0
The same method also works for Series.
First replace() infs with NaN:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
and then drop NaNs via dropna():
df.dropna(subset=["col1", "col2"], how="all", inplace=True)
For example:
>>> df = pd.DataFrame({"col1": [1, np.inf, -np.inf], "col2": [2, 3, np.nan]})
>>> df
col1 col2
0 1.0 2.0
1 inf 3.0
2 -inf NaN
>>> df.replace([np.inf, -np.inf], np.nan, inplace=True)
>>> df
col1 col2
0 1.0 2.0
1 NaN 3.0
2 NaN NaN
>>> df.dropna(subset=["col1", "col2"], how="all", inplace=True)
>>> df
col1 col2
0 1.0 2.0
1 NaN 3.0
The same method also works for Series.
DEPRECATED: the `use_inf_as_na` option used below has since been deprecated (and removed in recent pandas versions), so prefer the replace()/dropna() approach above.
With option context, this is possible without permanently setting use_inf_as_na. For example:
with pd.option_context('mode.use_inf_as_na', True):
df = df.dropna(subset=['col1', 'col2'], how='all')
Of course it can be set to treat inf as NaN permanently with
pd.set_option('use_inf_as_na', True)
For older versions, replace use_inf_as_na with use_inf_as_null.
All you have to do is wrap pd.isnull so that, when it receives an iterable, it is forced to check the elements one by one. This way you always get a scalar boolean as output.
# `Iterable` moved to collections.abc in Python 3.3 and was removed from
# `collections` in Python 3.10 -- the old import breaks on modern Python.
from collections.abc import Iterable

import numpy as np
import pandas as pd


def is_scalar_null(value):
    """Return a single boolean: True when *value* contains no nulls.

    NOTE(review): despite the name, this returns True for non-null input
    (see the asserts below) -- it answers "is this value free of nulls?".

    Scalars are checked directly; iterables (lists, Series, arrays) are
    checked element by element, so pd.isnull never produces a boolean
    array here and the result is always a plain scalar bool.
    """
    if isinstance(value, Iterable):
        # Element-wise check forces a scalar result instead of an array.
        return all(not pd.isnull(v) for v in value)
    return not pd.isnull(value)


assert is_scalar_null(3)
assert is_scalar_null([1, 2])
assert is_scalar_null(pd.Series([1]))
assert not is_scalar_null(None)
assert not is_scalar_null(np.nan)
assert not is_scalar_null([np.nan, 1])
assert not is_scalar_null(pd.Series([np.nan, 1]))
You can then patch the actual pd.isnull, but I can not say that I suggest doing so.
# `Iterable` moved to collections.abc in Python 3.3 and was removed from
# `collections` in Python 3.10 -- the old import breaks on modern Python.
from collections.abc import Iterable

import numpy as np
import pandas as pd

# Keep a reference to the real function so the patched version can delegate
# to it without recursing into itself.
orig_pd_is_null = pd.isnull


def is_scalar_null(value):
    """Scalar-only stand-in for pd.isnull: True when *value* has no nulls."""
    if isinstance(value, Iterable):
        return all(not orig_pd_is_null(v) for v in value)
    return not orig_pd_is_null(value)


# WARNING: monkey-patching a public pandas API changes behavior for every
# caller in the process (potentially including pandas' own code paths).
# As the surrounding text says: not recommended -- shown for completeness.
pd.isnull = is_scalar_null

assert pd.isnull(3)
assert pd.isnull([1, 2])
assert pd.isnull(pd.Series([1]))
assert not pd.isnull(None)
assert not pd.isnull(np.nan)
assert not pd.isnull([np.nan, 1])
assert not pd.isnull(pd.Series([np.nan, 1]))
This approach will probably break in case of nested iterables, but that can be fixed by using recursion in is_scalar_null.
This is an extension to @DeepSpace's solution. For NumPy arrays and, by extension, numeric Pandas series, you can utilize numba for JIT-compiling your loop. all / any with a generator expression is generally less efficient, and often prohibitively expensive when your NaN value is near the end of your array.
For example, in an extreme case we see a ~240x performance differential:
# `Iterable` moved to collections.abc in Python 3.3 and was removed from
# `collections` in Python 3.10 -- the old import breaks on modern Python.
from collections.abc import Iterable

import numpy as np
import pandas as pd

try:
    from numba import njit
except ImportError:
    # Graceful fallback: without numba, run the pure-Python loop un-jitted.
    def njit(func):
        return func


def any_null(arr):
    """Return True as soon as a NaN is found in *arr* (early exit)."""
    for i in range(len(arr)):
        if np.isnan(arr[i]):
            return True
    return False


# Compile once at module level. The original re-invoked njit(any_null) on
# every call, which creates a fresh dispatcher each time and discards
# numba's compilation cache.
_jitted_any_null = njit(any_null)


def is_scalar_null(value, jit_flag=True):
    """Return True when *value* contains no nulls (same contract as above).

    Fast path: Series/ndarray inputs are scanned by the (optionally
    numba-jitted) early-exit loop; other iterables and scalars fall back
    to element-wise pd.isnull.
    """
    checker = _jitted_any_null if jit_flag else any_null
    if isinstance(value, pd.Series):
        # BUG FIX: the original returned checker(...) directly, i.e. True
        # when a NaN *is* present -- the negation of every other branch's
        # contract. Negate so all branches agree.
        return not checker(value.values)
    elif isinstance(value, np.ndarray):
        return not checker(value)
    elif isinstance(value, Iterable):
        return all(not pd.isnull(v) for v in value)
    return not pd.isnull(value)
np.random.seed(0)
A = np.random.random(10**7)
A[-1] = np.nan
%timeit is_scalar_null(A, jit_flag=True) # 74.3 ms per loop
%timeit is_scalar_null(A, jit_flag=False) # 17.6 s per loop