Carson Sievert
June 12, 2015
library(rvest) src <- html("http://en.wikipedia.org/wiki/Table_(information)") node <- html_node(src, css = ".wikitable")
".wikitable"
is a CSS selector which says: "grab nodes (aka elements) with a class of wikitable".html_table()
converts a single <table>
node to a data frame.html_table(node)
## First name Last name Age ## 1 Bielat Adamczak 24 ## 2 Blaszczyk Kostrzewski 25 ## 3 Olatunkboh Chijiaku 22 ## 4 Adrienne Anthoula 22 ## 5 Axelia Athanasios 22 ## 6 Jon-Kabat Zinn 22
html("http://en.wikipedia.org/wiki/Table_(information)") %>% html_node(".wikitable") %>% html_table()
## First name Last name Age ## 1 Bielat Adamczak 24 ## 2 Blaszczyk Kostrzewski 25 ## 3 Olatunkboh Chijiaku 22 ## 4 Adrienne Anthoula 22 ## 5 Axelia Athanasios 22 ## 6 Jon-Kabat Zinn 22
Navigate this page and try the following:
Easy: Grab the table at the bottom of the page (hint: instead of grabbing a node by class with html_node(".class")
, you can grab by id with html_node("#id")
)
Medium: Grab the actual mean, max, and min temperature.
Hard: Grab the weather history graph and write the figure to disk (download.file()
may be helpful here).
See here for a solution (thanks Hadley Wickham for the example)
<table>
data?domain <- "http://www.sec.gov" susp <- paste0(domain, "/litigation/suspensions.shtml") hrefs <- html(susp) %>% html_nodes("p+ table a") %>% html_attr(name = "href") tail(hrefs)
## [1] "/litigation/suspensions/2015/34-74326.pdf" ## [2] "/litigation/suspensions/2015/34-74326-o.pdf" ## [3] "/litigation/suspensions/2015/34-74324.pdf" ## [4] "/litigation/suspensions/2015/34-74324-o.pdf" ## [5] "/litigation/suspensions/2015/34-74218.pdf" ## [6] "/litigation/suspensions/2015/34-74218-o.pdf"
# download all the pdfs! hrefs <- hrefs[!is.na(hrefs)] pdfs <- paste0(domain, hrefs) mapply(download.file, pdfs, basename(pdfs))
Nativigate to Wikipedia's list of data structures use SelectorGadget + rvest to do the following:
See here for a solution.
# returns the HTML page source, which is _static_ (no table!) rvest::html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/")
## <!DOCTYPE html> ## <html><body> ## A Simple Table made with JavaScript ## <p></p> ## <script> ## function tableCreate(){ ## var body = document.body, ## tbl = document.createElement('table'); ## ## for(var i = 0; i < 3; i++){ ## var tr = tbl.insertRow(); ## for(var j = 0; j < 3; j++){ ## var td = tr.insertCell(); ## td.appendChild(document.createTextNode("Cell")); ## } ## } ## body.appendChild(tbl); ## } ## tableCreate(); ## </script> ## </body></html> ##
rdom can construct the DOM:
library(rdom) rdom("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>% html_node("table") %>% html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
You can give rdom()
CSS Selectors directly to avoid sending the entire DOM from phantomjs to R
rdom("http://www.techstars.com/companies/stats/", "table") %>% html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
GET
.library(httr) hadley <- GET("https://api.github.com/users/hadley") content(hadley)[c("name", "company")]
## $name ## [1] "Hadley Wickham" ## ## $company ## [1] "RStudio"
POST
, PUT
, DELETE
, etc…
hadley <- httr::GET("https://api.github.com/users/hadley") hadley$header[1:3]
## $server ## [1] "GitHub.com" ## ## $date ## [1] "Fri, 12 Jun 2015 16:10:34 GMT" ## ## $`content-type` ## [1] "application/json; charset=utf-8"
XML is a markup language that looks very similar to HTML.
<mariokart> <driver name="Bowser" occupation="Koopa"> <vehicle speed="55" weight="25"> Wario Bike </vehicle> <vehicle speed="40" weight="67"> Piranha Prowler </vehicle> </driver> <driver name="Peach" occupation="Princess"> <vehicle speed="54" weight="29"> Royal Racer </vehicle> <vehicle speed="50" weight="34"> Wild Wing </vehicle> </driver> </mariokart>
XML2R is a framework to simplify acquistion of tabular/relational XML.
library(XML2R) obs <- XML2Obs("http://bit.ly/mario-xml") table(names(obs))
## ## mariokart//driver mariokart//driver//vehicle ## 2 4
obs # named list of observations
## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "40" "67" " Piranha Prowler " ## ## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "54" "29" " Royal Racer " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "50" "34" " Wild Wing " ## ## $`mariokart//driver` ## name occupation ## [1,] "Peach" "Princess"
collapse_obs(obs) # group into table(s) by observational name/unit
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## [2,] "40" "67" " Piranha Prowler " ## [3,] "54" "29" " Royal Racer " ## [4,] "50" "34" " Wild Wing "
obs <- add_key(obs, parent = "mariokart//driver", recycle = "name") collapse_obs(obs)
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value name ## [1,] "55" "25" " Wario Bike " "Bowser" ## [2,] "40" "67" " Piranha Prowler " "Bowser" ## [3,] "54" "29" " Royal Racer " "Peach" ## [4,] "50" "34" " Wild Wing " "Peach"
Now (if I want) I can merge the tables into a single table…
tabs <- collapse_obs(obs) merge(tabs[[1]], tabs[[2]], by = "name")
## name occupation speed weight XML_value ## 1 Bowser Koopa 55 25 Wario Bike ## 2 Bowser Koopa 40 67 Piranha Prowler ## 3 Peach Princess 54 29 Royal Racer ## 4 Peach Princess 50 34 Wild Wing
[ { "driver": "Bowser", "occupation": "Koopa", "vehicles": [ { "model": "Wario Bike", "speed": 55, "weight": 25 }, { "model": "Piranha Prowler", "speed": 40, "weight": 67 } ] }, { "driver": "Peach", "occupation": "Princess", "vehicles": [ { "model": "Royal Racer", "speed": 54, "weight": 29 }, { "model": "Wild Wing", "speed": 50, "weight": 34 } ] } ]
library(jsonlite) mario <- fromJSON("http://bit.ly/mario-json") str(mario) # nested data.frames?!?
## 'data.frame': 2 obs. of 3 variables: ## $ driver : chr "Bowser" "Peach" ## $ occupation: chr "Koopa" "Princess" ## $ vehicles :List of 2 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Wario Bike" "Piranha Prowler" ## .. ..$ speed : int 55 40 ## .. ..$ weight: int 25 67 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Royal Racer" "Wild Wing" ## .. ..$ speed : int 54 50 ## .. ..$ weight: int 29 34
mario$driver
## [1] "Bowser" "Peach"
mario$vehicles
## [[1]] ## model speed weight ## 1 Wario Bike 55 25 ## 2 Piranha Prowler 40 67 ## ## [[2]] ## model speed weight ## 1 Royal Racer 54 29 ## 2 Wild Wing 50 34
How do we get two tables (with a common id) like the XML example?
# this mapply statement is essentially equivalent to add_key vehicles <- mapply(function(x, y) cbind(x, driver = y), mario$vehicles, mario$driver, SIMPLIFY = FALSE) Reduce(rbind, vehicles)
## model speed weight driver ## 1 Wario Bike 55 25 Bowser ## 2 Piranha Prowler 40 67 Bowser ## 3 Royal Racer 54 29 Peach ## 4 Wild Wing 50 34 Peach
mario[!grepl("vehicle", names(mario))]
## driver occupation ## 1 Bowser Koopa ## 2 Peach Princess