Practicing Coding Interview Questions in Python
Kirill Smirnov
Data Science Consultant, Altran
retinol = pd.read_csv('retinol.csv')
retinol = retinol[['age','gender','smoking','bmi','vitamin use','plasma B-carotene','plasma retinol']]
print(retinol.head())
age gender smoking bmi vitamin use plasma B-carotene plasma retinol
0 64 Female Former 21.48380 Yes_fairly_often 200 915
1 76 Female Never 23.87631 Yes_fairly_often 124 727
2 38 Female Former 20.01080 Yes_not_often 328 721
3 40 Female Former 25.14062 No 153 615
4 72 Female Never 20.98504 Yes_fairly_often 92 799
background factors $\rightarrow$ plasma B-carotene, plasma retinol
.groupby() groups the data according to some criteria, allowing an operation to be performed on each group.
df.groupby(column_name(s))
gens = retinol.groupby('gender')
print(gens)
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000262DB5E2780>
gensmoks = retinol.groupby(['gender', 'smoking'])
print(gensmoks)
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000262DB5F57B8>
gens = retinol.groupby('gender')
for group in gens:
    # Iterating over a GroupBy object yields one tuple per group
    # First element is the grouping factor (here: the gender value)
    print(group[0])
    # Second element is a DataFrame with the rows belonging to that group
    print(group[1].head(3))
len(gens)
2
Female
age gender smoking bmi ...
0 64 Female Former 21.48380 ...
1 76 Female Never 23.87631 ...
2 38 Female Former 20.01080 ...
Male
age gender smoking bmi ...
12 57 Male Never 31.73039 ...
14 66 Male Never 27.31916 ...
15 64 Male Former 31.44674 ...
gensmoks = retinol.groupby(['gender', 'smoking'])
for group in gensmoks:
    # Iterating over a GroupBy object yields one tuple per group
    # First element is the grouping factor (here: a (gender, smoking) tuple)
    print(group[0])
    # Second element is a DataFrame with the rows belonging to that group
    print(group[1].head(3))
len(gensmoks)
6
('Female', 'Current_Smoker')
age gender smoking bmi ...
32 74 Female Current_Smoker 16.33114 ...
35 44 Female Current_Smoker 25.87867 ...
43 31 Female Current_Smoker 23.34593 ...
('Female', 'Former')
age gender smoking bmi ...
0 64 Female Former 21.48380 ...
2 38 Female Former 20.01080 ...
3 40 Female Former 25.14062 ...
('Female', 'Never')
age gender smoking bmi ...
1 76 Female Never 23.87631 ...
4 72 Female Never 20.98504 ...
...
gens = retinol.groupby('gender')
retinol['plasma retinol'].mean()
602.790476
retinol['vitamin use'].count()
315
gens['plasma retinol'].mean()
plasma retinol
gender
Female 587.721612
Male 700.738095
gens['vitamin use'].count()
vitamin use
gender
Female 273
Male 42
.agg(function, axis= , args= )
- almost identical to the .apply() method
import numpy as np
retinol['plasma retinol'].agg(np.mean)
602.790476
.agg(function, axis= , args= )
- almost identical to the .apply() method
import numpy as np
retinol[['plasma B-carotene', 'plasma retinol']].agg(np.mean)
plasma B-carotene 189.892063
plasma retinol 602.790476
dtype: float64
.agg(function, axis= , args= )
- almost identical to the .apply() method
import numpy as np
retinol[['plasma B-carotene', 'plasma retinol']].agg([np.mean, np.std])
plasma B-carotene plasma retinol
mean 189.892063 602.790476
std 183.000803 208.895474
gens = retinol.groupby('gender')
gens['plasma retinol'].agg([np.mean, np.std])
plasma retinol
mean std
gender
Female 587.721612 185.430687
Male 700.738095 307.808783
gensmoks = retinol.groupby(['gender', 'smoking'])
gensmoks['plasma retinol'].agg([np.mean, np.std])
plasma retinol
mean std
gender smoking
Female Current_Smoker 556.111111 191.112649
Former 607.752688 187.983733
Never 582.687500 182.182398
Male Current_Smoker 598.857143 289.618961
Former 798.500000 323.196203
Never 590.153846 249.307991
gens = retinol.groupby('gender')
def n_more_than_mean(series):
    """Count the values in *series* that are strictly greater than its mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values of a single column (one group's values when the
        function is passed to ``.groupby(...).agg``).

    Returns
    -------
    int
        Number of elements larger than the mean of ``series``.
    """
    # Boolean-mask selection keeps only the above-average values
    result = series[series > np.mean(series)]
    return len(result)
# Per gender, count the above-average values in each compound column
gens[['plasma B-carotene', 'plasma retinol']].agg(n_more_than_mean)
plasma B-carotene plasma retinol
gender
Female 87 119
Male 13 19
gens = retinol.groupby('gender')
def n_more_than_mean(series):
    """Count the values in *series* that are strictly greater than its mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values of a single column (one group's values when the
        function is passed to ``.groupby(...).agg``).

    Returns
    -------
    int
        Number of elements larger than the mean of ``series``.
    """
    # Boolean-mask selection keeps only the above-average values
    result = series[series > np.mean(series)]
    return len(result)
gens[['plasma B-carotene', 'plasma retinol']].agg([n_more_than_mean, lambda x: len(x)])
plasma B-carotene plasma retinol
n_more_than_mean <lambda> n_more_than_mean <lambda>
gender
Female 87 273 119 273
Male 13 42 19 42
gens = retinol.groupby('gender')
def n_more_than_mean(series):
    """Count the values in *series* that are strictly greater than its mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values of a single column (one group's values when the
        function is passed to ``.groupby(...).agg``).

    Returns
    -------
    int
        Number of elements larger than the mean of ``series``.
    """
    # Boolean-mask selection keeps only the above-average values
    result = series[series > np.mean(series)]
    return len(result)
# NOTE(review): the dict keys 'count' and 'len' are output labels, not column
# names. Renaming via a dict in .agg() was deprecated in pandas 0.20 and
# removed in pandas 1.0 (it raises an error there) - confirm the pandas
# version this example targets.
gens[['plasma B-carotene', 'plasma retinol']].agg({'count': n_more_than_mean, 'len': lambda x: len(x)})
count len
plasma B-carotene plasma retinol plasma B-carotene plasma retinol
gender
Female 87 119 273 273
Male 13 19 42 42
.transform(function, axis= , args= )
- almost identical to the .apply() method
import numpy as np
def center_scale(series):
    """Standardize *series* by centering on its mean and scaling by its std.

    Parameters
    ----------
    series : pandas.Series
        Numeric values of a single column (one group's values when the
        function is passed to ``.groupby(...).transform``).

    Returns
    -------
    pandas.Series
        ``(series - mean) / std`` with the same index as the input.
    """
    # np.std uses ddof=0 by default, i.e. the population standard deviation
    return (series - np.mean(series)) / np.std(series)
# Column names must match the DataFrame exactly
compounds = ['plasma B-carotene', 'plasma retinol']
df = retinol[compounds].transform(center_scale)
print(df)
plasma B-carotene plasma retinol
0 0.055322 1.496951
1 -0.360637 0.595547
2 0.755886 0.566779
3 -0.201916 0.058541
4 -0.535778 0.940766
5 -0.229282 0.245534
6 0.372765 1.108580
...
309 -0.251174 0.715415
310 -0.141711 -1.854544
311 -0.601456 -1.317538
312 0.602637 -0.483260
313 -0.377057 0.389375
314 0.235936 1.070223
gensmoks = retinol.groupby(['gender', 'smoking'])
# Column names must match the DataFrame exactly
compounds = ['plasma B-carotene', 'plasma retinol']
df = gensmoks[compounds].transform(center_scale)
print(df)
plasma B-carotene plasma retinol
0 -0.018568 1.643294
1 -0.436191 0.794897
2 0.629616 0.605697
3 -0.256573 0.038762
4 -0.597427 1.191485
5 -0.281892 0.247351
6 0.238985 1.384270
...
309 -0.302148 0.771498
310 -0.200869 -2.095267
311 -0.657891 -1.402860
312 0.450607 -0.444440
313 -0.418619 0.407804
314 0.113019 1.340205
gensmoks = retinol.groupby(['gender', 'smoking'])
# Column names must match the DataFrame exactly
compounds = ['plasma B-carotene', 'plasma retinol']
df = gensmoks[compounds].transform(
lambda x: (x - np.mean(x))/np.std(x)
)
print(df)
plasma B-carotene plasma retinol
0 -0.018568 1.643294
1 -0.436191 0.794897
2 0.629616 0.605697
3 -0.256573 0.038762
4 -0.597427 1.191485
5 -0.281892 0.247351
6 0.238985 1.384270
...
309 -0.302148 0.771498
310 -0.200869 -2.095267
311 -0.657891 -1.402860
312 0.450607 -0.444440
313 -0.418619 0.407804
314 0.113019 1.340205
.filter(function)
function $\rightarrow$ True - group stays
function $\rightarrow$ False - group is dropped
function(pd.DataFrame) - the function acts on the whole DataFrame in each group.
gensmoks = retinol.groupby(['gender', 'smoking'])
len(gensmoks)
6
def check_bmi(dataframe):
    """Tell whether the mean of the ``'bmi'`` column exceeds 26.

    Intended as a predicate for ``GroupBy.filter``: a group is kept when
    this returns True.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of one group; must contain a ``'bmi'`` column.

    Returns
    -------
    bool
        True if the mean BMI is strictly greater than 26, False otherwise.
    """
    return np.mean(dataframe['bmi']) > 26
retinol_filtered = gensmoks.filter(check_bmi)
print(retinol_filtered)
age gender smoking bmi ...
1 76 Female Never 23.87631 ...
4 72 Female Never 20.98504 ...
6 65 Female Never 22.01154 ...
7 58 Female Never 28.75702 ...
8 35 Female Never 23.07662 ...
11 40 Female Never 36.43161 ...
13 66 Female Never 21.78854 ...
...
299 47 Female Never 37.27761 ...
302 41 Female Never 34.61493 ...
306 66 Female Never 33.10759 ...
311 45 Female Never 23.82703 ...
312 49 Female Never 24.26126 ...
314 45 Female Never 26.50808 ...
gensmoks = retinol.groupby(['gender', 'smoking'])
len(gensmoks)
6
def check_bmi(dataframe):
    """Tell whether the mean of the ``'bmi'`` column exceeds 26.

    Intended as a predicate for ``GroupBy.filter``: a group is kept when
    this returns True.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of one group; must contain a ``'bmi'`` column.

    Returns
    -------
    bool
        True if the mean BMI is strictly greater than 26, False otherwise.
    """
    return np.mean(dataframe['bmi']) > 26
retinol_filtered = gensmoks.filter(check_bmi)
len(retinol_filtered.groupby(['gender', 'smoking']))
3
Practicing Coding Interview Questions in Python