The XPath text() function

Web Scraping in R

Timo Grossenbacher

Instructor

<html>
<body>
  <table id="cast">
    <tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
    <tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
    <tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
  </table>
</body>
</html>
# Read the complete text of every role cell in the cast table; the content of
# the nested <em> element is part of the result
html_text(html_elements(html, css = "#cast td.role"))
[1] "1 (Voice)"  "2 (Choreo)" "3 (Voice)"
Web Scraping in R
<html>
<body>
  <table id="cast">
    <tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
    <tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
    <tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
  </table>
</body>
</html>
html %>%
  # XPath equivalent of the CSS selector "#cast td.role"
  html_elements(xpath = '//*[@id = "cast"]//td[@class = "role"]') %>%
  # text() selects only the text nodes that are direct children of each td,
  # so the number wrapped in the nested <em> element is left out
  html_elements(xpath = "./text()") %>%
  # spell out TRUE: the shorthand T is an ordinary variable and can be reassigned
  html_text(trim = TRUE)
[1] "(Voice)"  "(Choreo)" "(Voice)"
Web Scraping in R
<html>
<body>
  <table id="cast">
    <tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
    <tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
    <tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
  </table>
</body>
</html>
# Keep only the role cells that have a direct text node equal to " (Voice)";
# the leading space is part of the text that follows the closing </em> tag
html_elements(
  html,
  xpath = '//*[@id = "cast"]//td[@class = "role" and text() = " (Voice)"]'
)
{xml_nodeset (2)}
[1] <td class="role">\n<em>1</em> (Voice)</td>
[2] <td class="role">\n<em>3</em> (Voice)</td>
Web Scraping in R
<html>
<body>
  <table id="cast">
    <tr><td class = "actor">Arnold S.</td><td class = "role"><em>1</em> (Voice)</td></tr>
    <tr><td class = "actor">Burt R.</td><td class = "role"><em>2</em> (Choreo)</td></tr>
    <tr><td class = "actor">Charlize T.</td><td class = "role"><em>3</em> (Voice)</td></tr>
  </table>
</body>
</html>
# First step (same as before): role cells with a direct text node " (Voice)"
html_elements(
  html,
  xpath = '//*[@id = "cast"]//td[@class = "role" and text() = " (Voice)"]'
) %>%
  # Second step: ".." moves from each matched td up to its parent (tr)
  html_elements(xpath = "..")
{xml_nodeset (2)}
[1] <tr>\n<td class="actor">Arnold S.</td>\n<td class="role">\n<em>1</em> (Voice)</td>\n ...
[2] <tr>\n<td class="actor">Charlize T.</td>\n<td class="role">\n<em>3</em> (Voice)</td>\n ...
Web Scraping in R

Let's practice!

Web Scraping in R

Preparing Video For Download...