- Adapted from R for data science by Garrett Grolemund and Hadley Wickham.
2016-03-02
library(rvest) src <- html("http://en.wikipedia.org/wiki/Table_(information)") node <- html_node(src, css = ".wikitable")
".wikitable"
is a CSS selector which says: "grab nodes (aka elements) with a class of wikitable".html_table()
converts a single <table>
node to a data frame.html_table(node) #> First name Last name Age #> 1 Tinu Elejogun 14 #> 2 Blaszczyk Kostrzewski 25 #> 3 Lily McGarrett 16 #> 4 Olatunkboh Chijiaku 22 #> 5 Adrienne Anthoula 22 #> 6 Axelia Athanasios 22 #> 7 Jon-Kabat Zinn 22
html("http://en.wikipedia.org/wiki/Table_(information)") %>% html_node(".wikitable") %>% html_table() #> First name Last name Age #> 1 Tinu Elejogun 14 #> 2 Blaszczyk Kostrzewski 25 #> 3 Lily McGarrett 16 #> 4 Olatunkboh Chijiaku 22 #> 5 Adrienne Anthoula 22 #> 6 Axelia Athanasios 22 #> 7 Jon-Kabat Zinn 22
Navigate this page and try the following:
Easy: Grab the table at the bottom of the page (hint: instead of grabbing a node by class with html_node(".class")
, you can grab by id with html_node("#id")
)
Medium: Grab the actual mean, max, and min temperature.
Hard: Grab the weather history graph and write the figure to disk (download.file()
may be helpful here).
See here for a solution (thanks Hadley Wickham for the example)
<table>
data?domain <- "http://www.sec.gov" susp <- paste0(domain, "/litigation/suspensions.shtml") hrefs <- html(susp) %>% html_nodes("p+ table a") %>% html_attr(name = "href") tail(hrefs) #> [1] "/litigation/suspensions/2016/34-76961.pdf" #> [2] "/litigation/suspensions/2016/34-76961-o.pdf" #> [3] "/litigation/suspensions/2016/34-76939.pdf" #> [4] "/litigation/suspensions/2016/34-76939-o.pdf" #> [5] "/litigation/suspensions/2016/34-76860.pdf" #> [6] "/litigation/suspensions/2016/34-76860-o.pdf"
# download all the pdfs! hrefs <- hrefs[!is.na(hrefs)] pdfs <- paste0(domain, hrefs) mapply(download.file, pdfs, basename(pdfs))
Nativigate to Wikipedia's list of data structures use SelectorGadget + rvest to do the following:
See here for a solution.
html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>% html_node("table") #> Error in eval(expr, envir, enclos): No matches
<table>
?html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>% html_node("body") %>% as.character() %>% cat() #> <body> #> A Simple Table made with JavaScript #> <p/> #> <script><![CDATA[ #> function tableCreate(){ #> var body = document.body, #> tbl = document.createElement('table'); #> #> for(var i = 0; i < 3; i++){ #> var tr = tbl.insertRow(); #> for(var j = 0; j < 3; j++){ #> var td = tr.insertCell(); #> td.appendChild(document.createTextNode("Cell")); #> } #> } #> body.appendChild(tbl); #> } #> tableCreate(); #> ]]></script> #> </body>
rdom can construct the DOM:
library(rdom) rdom("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>% html_node("table") %>% html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
You can give rdom()
CSS Selectors directly to avoid sending the entire DOM from phantomjs to R
rdom("http://www.techstars.com/companies/stats/", "table") %>% html_table()
GET
.library(httr) response <- GET("https://api.github.com/users/hadley") content(response)[c("name", "company")] #> $name #> [1] "Hadley Wickham" #> #> $company #> [1] "RStudio"
POST
, PUT
, DELETE
, etc…
response$header[1:3] #> $server #> [1] "GitHub.com" #> #> $date #> [1] "Wed, 02 Mar 2016 04:12:22 GMT" #> #> $`content-type` #> [1] "application/json; charset=utf-8"
XML is a markup language that looks very similar to HTML.
<mariokart> <driver name="Bowser" occupation="Koopa"> <vehicle speed="55" weight="25"> Wario Bike </vehicle> <vehicle speed="40" weight="67"> Piranha Prowler </vehicle> </driver> <driver name="Peach" occupation="Princess"> <vehicle speed="54" weight="29"> Royal Racer </vehicle> <vehicle speed="50" weight="34"> Wild Wing </vehicle> </driver> </mariokart>
XML2R is a framework to simplify acquistion of tabular/relational XML.
library(XML2R) obs <- XML2Obs("https://gist.githubusercontent.com/cpsievert/85e340814cb855a60dc4/raw/651b7626e34751c7485cff2d7ea3ea66413609b8/mariokart.xml") table(names(obs))
#> #> mariokart//driver mariokart//driver//vehicle #> 2 4
obs #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "55" "25" " Wario Bike " #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "40" "67" " Piranha Prowler " #> #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "54" "29" " Royal Racer " #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "50" "34" " Wild Wing " #> #> $`mariokart//driver` #> name occupation #> [1,] "Peach" "Princess"
collapse_obs(obs) # group into table(s) by observational name/unit #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> [2,] "Peach" "Princess" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "55" "25" " Wario Bike " #> [2,] "40" "67" " Piranha Prowler " #> [3,] "54" "29" " Royal Racer " #> [4,] "50" "34" " Wild Wing "
obs <- add_key(obs, parent = "mariokart//driver", recycle = "name") collapse_obs(obs) #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> [2,] "Peach" "Princess" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value name #> [1,] "55" "25" " Wario Bike " "Bowser" #> [2,] "40" "67" " Piranha Prowler " "Bowser" #> [3,] "54" "29" " Royal Racer " "Peach" #> [4,] "50" "34" " Wild Wing " "Peach"
Now (if I want) I can merge the tables into a single table…
tabs <- collapse_obs(obs) merge(tabs[[1]], tabs[[2]], by = "name") #> name occupation speed weight XML_value #> 1 Bowser Koopa 55 25 Wario Bike #> 2 Bowser Koopa 40 67 Piranha Prowler #> 3 Peach Princess 54 29 Royal Racer #> 4 Peach Princess 50 34 Wild Wing
[ { "driver": "Bowser", "occupation": "Koopa", "vehicles": [ { "model": "Wario Bike", "speed": 55, "weight": 25 }, { "model": "Piranha Prowler", "speed": 40, "weight": 67 } ] }, { "driver": "Peach", "occupation": "Princess", "vehicles": [ { "model": "Royal Racer", "speed": 54, "weight": 29 }, { "model": "Wild Wing", "speed": 50, "weight": 34 } ] } ]
library(jsonlite) mario <- fromJSON("http://bit.ly/mario-json") str(mario) # nested data.frames?!? #> 'data.frame': 2 obs. of 3 variables: #> $ driver : chr "Bowser" "Peach" #> $ occupation: chr "Koopa" "Princess" #> $ vehicles :List of 2 #> ..$ :'data.frame': 2 obs. of 3 variables: #> .. ..$ model : chr "Wario Bike" "Piranha Prowler" #> .. ..$ speed : int 55 40 #> .. ..$ weight: int 25 67 #> ..$ :'data.frame': 2 obs. of 3 variables: #> .. ..$ model : chr "Royal Racer" "Wild Wing" #> .. ..$ speed : int 54 50 #> .. ..$ weight: int 29 34
mario$driver #> [1] "Bowser" "Peach" mario$vehicles #> [[1]] #> model speed weight #> 1 Wario Bike 55 25 #> 2 Piranha Prowler 40 67 #> #> [[2]] #> model speed weight #> 1 Royal Racer 54 29 #> 2 Wild Wing 50 34
How do we get two tables (with a common id) like the XML example?
# this mapply statement is essentially equivalent to add_key vehicles <- Map(function(x, y) cbind(x, driver = y), mario$vehicles, mario$driver) Reduce(rbind, vehicles) #> model speed weight driver #> 1 Wario Bike 55 25 Bowser #> 2 Piranha Prowler 40 67 Bowser #> 3 Royal Racer 54 29 Peach #> 4 Wild Wing 50 34 Peach mario[!grepl("vehicle", names(mario))] #> driver occupation #> 1 Bowser Koopa #> 2 Peach Princess
# install dependencies and run first example (press ESC to quit) if (!require("shiny")) install.packages("shiny") if (!require("leaflet")) install.packages("leaflet") runGitHub("rstudio/shiny-examples", subdir = "063-superzip-example")
library(shiny) library(ggplot2) ui <- fluidPage( numericInput( inputId = "size", label = "Choose a point size", value = 3, min = 1, max = 10 ), plotOutput("plotId") ) server <- function(input, output) { output$plotId <- renderPlot({ ggplot(mtcars, aes(wt, mpg)) + geom_point(size = input$size) }) } shinyApp(ui, server)
ui <- fluidPage( sidebarPanel( selectInput( inputId = "x", label = "Choose an x variable", choices = names(mtcars) ), selectInput( inputId = "y", label = "Choose an y variable", choices = names(mtcars) ) ), mainPanel( plotOutput("plotId") ) ) server <- function(input, output) { output$plotId <- renderPlot({ ggplot(mtcars, aes_string(input$x, input$y)) + geom_point() }) } shinyApp(ui, server)
colour
.