I have a df and I want to calculate mean of the 3rd quintile for each group. The way do is to write a self defined function and to apply for each group; but there are some issues. The code:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': pd.Series(np.array(range(20))), 'B': ['a','a','a','a','a','a','a','a','a','a','b','b','b','b','b','b','b','b','b','b']})
def func_mean_quintile(df):
# Make sure data is in DataFrame
df = pd.DataFrame(df)
df['pct'] = pd.to_numeric(pd.cut(df.iloc[:,0], 5, labels=np.r_[1:6]))
avg = df[df['pct'] == 3].iloc[:,0].mean()
return np.full((len(df)), avg)
df['C'] = df.groupby('B')['A'].apply(func_mean_quintile)
The result is NaN for all column C
I don't know where is it wrong?
Plus if you know how to make self defined function perform better, please help
Thank you
CodePudding user response:
Proposed solution without function
You do not need a function; this should do the calc:
q_lo = 0.4 # start of 3d quintile
q_hi = 0.6 # end of 3d quintile
(df.groupby('B')
.apply(lambda g:g.assign(C = g.loc[(g['A'] >= g['A'].quantile(q_lo)) & (g['A'] < g['A'].quantile(q_hi)), 'A' ].mean()))
.reset_index(drop = True)
)
output:
A B C
0 0 a 4.5
1 1 a 4.5
2 2 a 4.5
3 3 a 4.5
4 4 a 4.5
5 5 a 4.5
6 6 a 4.5
7 7 a 4.5
8 8 a 4.5
9 9 a 4.5
10 10 b 14.5
11 11 b 14.5
12 12 b 14.5
13 13 b 14.5
14 14 b 14.5
15 15 b 14.5
16 16 b 14.5
17 17 b 14.5
18 18 b 14.5
19 19 b 14.5
Your original solution
Also works if you replace the line df['C'] = ... with
df['C'] = df.groupby('B')['A'].transform(func_mean_quintile)
CodePudding user response:
Do it like this:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': pd.Series(np.array(range(20))), 'B':['a','a','a','a','a','a','a','a','a','a','b','b','b','b','b','b','b','b','b' ,'b']})
def func_mean_quintile(df):
# Make sure data is in DataFrame
df = pd.DataFrame(df)
df['pct'] = pd.to_numeric(pd.cut(df.iloc[:,0], 5, labels=np.r_[1:6]))
avg = df[df['pct'] == 3].iloc[:,0].mean()
return np.full((len(df)), avg)
means = df.groupby('B').apply(func_mean_quintile)
df['C'][df["B"]=='a'] = means["a"]
df['C'][df["B"]=='b'] = means["b"]
This will give you the required output.
CodePudding user response:
Think its easier if you split it in two different steps. First label each datapoint with which quantile it is in. Secondly just an aggregation per quantile.
import pandas as pd
import numpy as np
df = pd.DataFrame(
{
"a": pd.Series(np.array(range(20))),
"b": ["a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b"],
}
)
df["a_quantile"] = pd.cut(df.a, bins=4, labels=["q1", "q2", "q3", "q4"])
df_agg = df.groupby("a_quantile").agg({"a": ["mean"]})
df_agg.head()
With the aggregation results shown below:
Out[9]:
a
mean
a_quantile
q1 2
q2 7
q3 12
q4 17
