Web Scraping in R
Timo Grossenbacher
Instructor
<html>
<body>
<table id="cast">
<tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
<tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
<tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
</table>
</body>
</html>
# Select every role cell of the cast table with a CSS selector and return the
# full text of each cell, including the text of its nested <em> element.
html_text(html_elements(html, css = "#cast td.role"))
[1] "1 (Voice)" "2 (Choreo)" "3 (Voice)"
<html>
<body>
<table id="cast">
<tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
<tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
<tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
</table>
</body>
</html>
# Extract only the text that sits directly inside each role cell.
html %>%
  # selects the same cells as the CSS selector "#cast td.role" in this document
  html_elements(xpath = '//*[@id = "cast"]//td[@class = "role"]') %>%
  # keep only the text nodes that are direct children of each cell; the text
  # of the nested <em> elements is left out
  html_elements(xpath = "./text()") %>%
  # spell out TRUE: T is an ordinary variable that can be reassigned
  html_text(trim = TRUE)
[1] "(Voice)" "(Choreo)" "(Voice)"
<html>
<body>
<table id="cast">
<tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
<tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
<tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
</table>
</body>
</html>
# Keep only the role cells of the cast table that have a text node equal to
# " (Voice)" (note the leading space).
html_elements(
  html,
  xpath = '//*[@id = "cast"]//td[@class = "role" and text() = " (Voice)"]'
)
{xml_nodeset (2)}
[1] <td class="role">\n<em>1</em> (Voice)</td>
[2] <td class="role">\n<em>3</em> (Voice)</td>
<html>
<body>
<table id="cast">
<tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
<tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
<tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
</table>
</body>
</html>
# Find the " (Voice)" role cells, then step up to the table row of each one.
html_elements(
  # inner query: role cells of the cast table with a " (Voice)" text node
  html_elements(
    html,
    xpath = '//*[@id = "cast"]//td[@class = "role" and text() = " (Voice)"]'
  ),
  # ".." moves from each selected td element to its parent (tr)
  xpath = ".."
)
{xml_nodeset (2)}
[1] <tr>\n<td class="actor">Arnold S.</td>\n<td class="role">\n<em>1</em> (Voice)</td>\n ...
[2] <tr>\n<td class="actor">Charlize T.</td>\n<td class="role">\n<em>3</em> (Voice)</td>\n ...
Web Scraping in R