How to use the .apply() method on a DataFrame?

Practicing Coding Interview Questions in Python

Kirill Smirnov

Data Science Consultant, Altran

Dataset

import pandas as pd

# Read the exam results and keep only the three score columns.
scores = pd.read_csv('exams.csv')[
    ['math score', 'reading score', 'writing score']
]
print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

Default .apply()

df.apply(function)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# np.sqrt returns one value per input element, so the result is a DataFrame
# with the same shape as `scores`.
scores_new = scores.apply(np.sqrt)
print(scores_new)

   math score  reading score  writing score
0    8.602325       9.273618       9.055385
1    6.633250       7.000000       7.280110
2    7.348469       6.782330       6.557439
3    9.380832       9.746794       9.591663
4    9.219544       9.000000       9.000000
...

Default .apply()

df.apply(function)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# np.mean reduces each column to a single value, so the result is a Series.
scores_new = scores.apply(np.mean)
print(scores_new.head())

math score       65.18
reading score    69.28
writing score    67.96
dtype: float64

type(scores_new)

pandas.core.series.Series

Default .apply()

df.apply(function)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

function(pd.Series)

input size $n$
$\rightarrow$ np.sqrt(pd.Series)
$\rightarrow$ output size $n$

input size $n$
$\rightarrow$ np.mean(pd.Series)
$\rightarrow$ single value

Default .apply(): own functions

df.apply(function)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

def divide_scores(x):
    """Return *x* divided by two using true division."""
    halved = x / 2
    return halved

scores_new = scores.apply(divide_scores)
print(scores_new)

    math score  reading score  writing score
0         37.0           43.0           41.0
1         22.0           24.5           26.5
2         27.0           23.0           21.5
3         44.0           47.5           46.0
4         42.5           40.5           40.5
...

Default .apply(): own functions

df.apply(function)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

def perfect_score(x):
    """Return the constant 100, ignoring the input *x*."""
    top_mark = 100
    return top_mark

scores_new = scores.apply(perfect_score)
print(scores_new)

math score       100
reading score    100
writing score    100
dtype: int64

Lambda expressions

def divide_scores(x):
    """Return *x* divided by two using true division."""
    halved = x / 2
    return halved

scores_new = scores.apply(divide_scores)
print(scores_new)

    math score  reading score  writing score
0         37.0           43.0           41.0
1         22.0           24.5           26.5
2         27.0           23.0           21.5
3         44.0           47.5           46.0
4         42.5           40.5           40.5
...

def perfect_score(x):
    """Return the constant 100, ignoring the input *x*."""
    top_mark = 100
    return top_mark

scores_new = scores.apply(perfect_score) 
print(scores_new)

math score       100
reading score    100
writing score    100
dtype: int64

Lambda expressions

scores_new = scores.apply(lambda x: x / 2)
print(scores_new)

    math score  reading score  writing score
0         37.0           43.0           41.0
1         22.0           24.5           26.5
2         27.0           23.0           21.5
3         44.0           47.5           46.0
4         42.5           40.5           40.5
...

scores_new = scores.apply(lambda x: 100)
print(scores_new)

math score       100
reading score    100
writing score    100
dtype: int64

Additional arguments: axis

df.apply(function, axis= )

Additional arguments: axis

df.apply(function, axis=0)

Additional arguments: axis

df.apply(function, axis=1)

Additional arguments: axis

df.apply(function, axis= )

axis=0 - function is applied to each column

axis=1 - function is applied to each row

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# No axis is passed here; the output below shows one mean per column.
scores_new = scores.apply(np.mean)
print(scores_new.head())

math score       65.18
reading score    69.28
writing score    67.96
dtype: float64

Additional arguments: axis

df.apply(function, axis= )

axis=0 - function is applied to each column

axis=1 - function is applied to each row

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# axis=0: the function receives each column, giving one mean per column.
scores_new = scores.apply(np.mean, axis=0)
print(scores_new.head())

math score       65.18
reading score    69.28
writing score    67.96
dtype: float64

Additional arguments: axis

df.apply(function, axis= )

axis=0 - function is applied to each column

axis=1 - function is applied to each row

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# axis=1: the function receives each row, giving one mean per row.
scores_new = scores.apply(np.mean, axis=1)
print(scores_new.head())

0     80.666667
1     48.666667
2     47.666667
3     91.666667
4     82.333333
5     84.000000
6     75.000000
7     70.666667
...

Additional arguments: result_type

df.apply(function, result_type= )

result_type='expand'

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

def span(x):
    """Return the minimum and maximum of *x* as a two-element list."""
    return [np.min(x), np.max(x)]

scores_new = scores.apply(span)
print(scores_new)

math score       [27, 100]
reading score    [33, 100]
writing score    [30, 100]
dtype: object

Additional arguments: result_type

df.apply(function, result_type= )

result_type='expand'

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

def span(x):
    """Return the minimum and maximum of *x* as a two-element list."""
    return [np.min(x), np.max(x)]

scores.apply(span, result_type='expand')

   math score  reading score  writing score
0          27             33             30
1         100            100            100

Additional arguments: result_type

df.apply(function, result_type= )

result_type='expand'

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

def span(x):
    """Return the minimum and maximum of *x* as a two-element list."""
    return [np.min(x), np.max(x)]

scores.apply(span, result_type='expand', axis=1)

     0    1
0   74   86
1   44   53
2   43   54
3   88   95
4   81   85
...

Additional arguments: result_type

df.apply(function, result_type= )

result_type='broadcast'

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# Without result_type, np.mean yields one value per column (a Series).
scores_new = scores.apply(np.mean)
print(scores_new.head())

math score       65.18
reading score    69.28
writing score    67.96
dtype: float64

Additional arguments: result_type

df.apply(function, result_type= )

result_type='broadcast'

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

scores.apply(np.mean, result_type='broadcast')

    math score  reading score  writing score
0           65             69             67
1           65             69             67
2           65             69             67
3           65             69             67
4           65             69             67
5           65             69             67
6           65             69             67
7           65             69             67
...

More than one argument in a function

function(pd.Series)

More than one argument in a function

function(pd.Series, arg1, arg2, ..., kwarg1=val1, kwarg2=val2, ...)

def check_mean(x, a, b, inside=True):
    """Check where the mean of *x* falls relative to the bounds *a* and *b*.

    With ``inside=True`` the result is truthy when the mean is strictly
    greater than *a* and strictly less than *b*.  With ``inside=False`` it is
    truthy when the mean is strictly less than *a* or strictly greater
    than *b*.
    """
    avg = np.mean(x)
    # Handle the "outside" case first so the default path reads straight through.
    if not inside:
        return avg < a or avg > b
    return avg > a and avg < b

Applying the function

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

scores.apply(check_mean)

TypeError: check_mean() missing 2 required positional arguments: 'a' and 'b'

Additional arguments: args

df.apply(function, args= )

args - (arg1, arg2, ...)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# `args` is documented as a tuple of extra positional arguments for the function.
scores.apply(check_mean, args=(67, 70))

math score       False
reading score     True
writing score     True
dtype: bool

Additional arguments: args

df.apply(function, args= )

args - (arg1, arg2, ...)

print(scores.head())

   math score  reading score  writing score
0          74             86             82
1          44             49             53
2          54             46             43
3          88             95             92
4          85             81             81

import numpy as np

# Extra positional arguments go in the `args` tuple; `inside` is passed on to
# check_mean as a keyword argument.
scores.apply(
    check_mean, args=(67, 70), inside=False
)

math score        True
reading score    False
writing score    False
dtype: bool

Let's practice!

Practicing Coding Interview Questions in Python