library(knitr) library(tibble) library(dplyr)
1/03/2020
library(knitr) library(tibble) library(dplyr)
set.seed(1024)
my.dataset <- data.frame(site = c('A', 'B', 'A', 'A', 'B', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall', 'Spring'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9, 8.4)) kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
my.dataset[3, 2]
## [1] "Summer"
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
Note that the kable()
function is used to prettify the table output. Normally, you DON’T need to use it to print tables.
my.dataset
## site season pH ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9 ## 6 B Spring 8.4
my.dataset$site
## [1] "A" "B" "A" "A" "B" "B"
my.dataset$pH[3]
## [1] 8.6
kable(my.dataset[2,])
site | season | pH | |
---|---|---|---|
2 | B | Summer | 6.3 |
my.dataset[,3]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
Be careful!
my.dataset[,3]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
my.dataset$pH
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
my.dataset[,"pH"]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
These return vectors.
my.dataset[, 3, drop = F]
## pH ## 1 7.4 ## 2 6.3 ## 3 8.6 ## 4 7.2 ## 5 8.9 ## 6 8.4
my.dataset[, "pH", drop = F]
## pH ## 1 7.4 ## 2 6.3 ## 3 8.6 ## 4 7.2 ## 5 8.9 ## 6 8.4
These return data frames!
kable(my.dataset[my.dataset$pH > 7, ])
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
3 | A | Summer | 8.6 |
4 | A | Spring | 7.2 |
5 | B | Fall | 8.9 |
6 | B | Spring | 8.4 |
kable(my.dataset[my.dataset$pH > 7 & my.dataset$site == "A", ])
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
3 | A | Summer | 8.6 |
4 | A | Spring | 7.2 |
my.dataset[my.dataset$pH > 7 & my.dataset$site == "A", "season"]
## [1] "Winter" "Summer" "Spring"
my.dataset[pH > 7,]
## Error in `[.data.frame`(my.dataset, pH > 7, ): object 'pH' not found
attach(my.dataset) my.dataset[pH > 7,]
## site season pH ## 1 A Winter 7.4 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9 ## 6 B Spring 8.4
detach(my.dataset) my.dataset[pH > 7,]
## Error in `[.data.frame`(my.dataset, pH > 7, ): object 'pH' not found
my.dataset$newColumn <- c(T, F, F, T, T, F) kable(my.dataset)
site | season | pH | newColumn |
---|---|---|---|
A | Winter | 7.4 | TRUE |
B | Summer | 6.3 | FALSE |
A | Summer | 8.6 | FALSE |
A | Spring | 7.2 | TRUE |
B | Fall | 8.9 | TRUE |
B | Spring | 8.4 | FALSE |
my.dataset$newColumn <- NULL kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
This is a bit messy!
my.dataset[6, ] <- data.frame( site = "B", season = "Winter", pH = 6.3) kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
my.dataset[7, "site"] <- "A" my.dataset[7, "season"] <- "Fall" my.dataset[7, "pH"] <- 6.4 kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
my.dataset <- my.dataset[-(2:3),] kable(my.dataset)
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
4 | A | Spring | 7.2 |
5 | B | Fall | 8.9 |
6 | B | Winter | 6.3 |
7 | A | Fall | 6.4 |
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
nrow(my.dataset)
## [1] 7
ncol(my.dataset)
## [1] 3
dim(my.dataset)
## [1] 7 3
You can edit small data frames in R through a GUI:
my.dataset <- edit(my.dataset)
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
names(my.dataset)
## [1] "site" "season" "pH"
colnames(my.dataset)
## [1] "site" "season" "pH"
names(my.dataset) <- c("Site", "Season", "pH_val") kable(my.dataset)
Site | Season | pH_val |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
tibble
packagelibrary(tibble)
my.dataset <- data.frame( site = c('A', 'B', 'A', 'A', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9)) my.dataset
## site season pH ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
my.dataset <- tibble( site = c('A', 'B', 'A', 'A', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9)) my.dataset
## # A tibble: 5 × 3 ## site season pH ## <chr> <chr> <dbl> ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
dat <- tibble(TempCels = sample(-10:40, size = 100, replace = TRUE), TempFahr = TempCels * 9 / 5 + 32, Location = rep(letters[1:2], each = 50)) dat
## # A tibble: 100 × 3 ## TempCels TempFahr Location ## <int> <dbl> <chr> ## 1 10 50 a ## 2 11 51.8 a ## 3 2 35.6 a ## 4 21 69.8 a ## 5 36 96.8 a ## 6 34 93.2 a ## 7 5 41 a ## 8 -4 24.8 a ## 9 -3 26.6 a ## 10 32 89.6 a ## # … with 90 more rows
data(iris) iris
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3.0 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5.0 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5.0 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## 11 5.4 3.7 1.5 0.2 setosa ## 12 4.8 3.4 1.6 0.2 setosa ## 13 4.8 3.0 1.4 0.1 setosa ## 14 4.3 3.0 1.1 0.1 setosa ## 15 5.8 4.0 1.2 0.2 setosa ## 16 5.7 4.4 1.5 0.4 setosa ## 17 5.4 3.9 1.3 0.4 setosa ## 18 5.1 3.5 1.4 0.3 setosa ## 19 5.7 3.8 1.7 0.3 setosa ## 20 5.1 3.8 1.5 0.3 setosa ## 21 5.4 3.4 1.7 0.2 setosa ## 22 5.1 3.7 1.5 0.4 setosa ## 23 4.6 3.6 1.0 0.2 setosa ## 24 5.1 3.3 1.7 0.5 setosa ## 25 4.8 3.4 1.9 0.2 setosa ## 26 5.0 3.0 1.6 0.2 setosa ## 27 5.0 3.4 1.6 0.4 setosa ## 28 5.2 3.5 1.5 0.2 setosa ## 29 5.2 3.4 1.4 0.2 setosa ## 30 4.7 3.2 1.6 0.2 setosa ## 31 4.8 3.1 1.6 0.2 setosa ## 32 5.4 3.4 1.5 0.4 setosa ## 33 5.2 4.1 1.5 0.1 setosa ## 34 5.5 4.2 1.4 0.2 setosa ## 35 4.9 3.1 1.5 0.2 setosa ## 36 5.0 3.2 1.2 0.2 setosa ## 37 5.5 3.5 1.3 0.2 setosa ## 38 4.9 3.6 1.4 0.1 setosa ## 39 4.4 3.0 1.3 0.2 setosa ## 40 5.1 3.4 1.5 0.2 setosa ## 41 5.0 3.5 1.3 0.3 setosa ## 42 4.5 2.3 1.3 0.3 setosa ## 43 4.4 3.2 1.3 0.2 setosa ## 44 5.0 3.5 1.6 0.6 setosa ## 45 5.1 3.8 1.9 0.4 setosa ## 46 4.8 3.0 1.4 0.3 setosa ## 47 5.1 3.8 1.6 0.2 setosa ## 48 4.6 3.2 1.4 0.2 setosa ## 49 5.3 3.7 1.5 0.2 setosa ## 50 5.0 3.3 1.4 0.2 setosa ## 51 7.0 3.2 4.7 1.4 versicolor ## 52 6.4 3.2 4.5 1.5 versicolor ## 53 6.9 3.1 4.9 1.5 versicolor ## 54 5.5 2.3 4.0 1.3 versicolor ## 55 6.5 2.8 4.6 1.5 versicolor ## 56 5.7 2.8 4.5 1.3 versicolor ## 57 6.3 3.3 4.7 1.6 versicolor ## 58 4.9 2.4 3.3 1.0 versicolor ## 59 6.6 2.9 4.6 1.3 versicolor ## 60 5.2 2.7 3.9 1.4 versicolor ## 61 5.0 2.0 3.5 1.0 versicolor ## 62 5.9 3.0 4.2 1.5 versicolor ## 63 6.0 2.2 4.0 1.0 versicolor ## 64 6.1 2.9 4.7 1.4 versicolor ## 65 5.6 2.9 3.6 1.3 versicolor ## 66 6.7 3.1 4.4 1.4 versicolor ## 67 5.6 3.0 4.5 1.5 versicolor ## 68 5.8 2.7 4.1 1.0 versicolor ## 69 6.2 2.2 4.5 1.5 versicolor ## 70 5.6 2.5 3.9 1.1 versicolor ## 71 5.9 3.2 4.8 1.8 versicolor ## 72 6.1 2.8 4.0 1.3 versicolor ## 73 6.3 2.5 4.9 1.5 versicolor ## 74 6.1 2.8 4.7 1.2 versicolor ## 75 6.4 2.9 4.3 1.3 versicolor ## 76 6.6 3.0 4.4 1.4 versicolor ## 77 6.8 2.8 4.8 1.4 versicolor ## 78 6.7 3.0 5.0 1.7 versicolor ## 79 6.0 2.9 4.5 1.5 versicolor ## 80 5.7 2.6 3.5 1.0 versicolor ## 81 5.5 2.4 3.8 1.1 versicolor ## 82 5.5 2.4 3.7 1.0 versicolor ## 83 5.8 2.7 3.9 1.2 versicolor ## 84 6.0 2.7 5.1 1.6 versicolor ## 85 5.4 3.0 4.5 1.5 versicolor ## 86 6.0 3.4 4.5 1.6 versicolor ## 87 6.7 3.1 4.7 1.5 versicolor ## 88 6.3 2.3 4.4 1.3 versicolor ## 89 5.6 3.0 4.1 1.3 versicolor ## 90 5.5 2.5 4.0 1.3 versicolor ## 91 5.5 2.6 4.4 1.2 versicolor ## 92 6.1 3.0 4.6 1.4 versicolor ## 93 5.8 2.6 4.0 1.2 versicolor ## 94 5.0 2.3 3.3 1.0 versicolor ## 95 5.6 2.7 4.2 1.3 versicolor ## 96 5.7 3.0 4.2 1.2 versicolor ## 97 5.7 2.9 4.2 1.3 versicolor ## 98 6.2 2.9 4.3 1.3 versicolor ## 99 5.1 2.5 3.0 1.1 versicolor ## 100 5.7 2.8 4.1 1.3 versicolor ## 101 6.3 3.3 6.0 2.5 virginica ## 102 5.8 2.7 5.1 1.9 virginica ## 103 7.1 3.0 5.9 2.1 virginica ## 104 6.3 2.9 5.6 1.8 virginica ## 105 6.5 3.0 5.8 2.2 virginica ## 106 7.6 3.0 6.6 2.1 virginica ## 107 4.9 2.5 4.5 1.7 virginica ## 108 7.3 2.9 6.3 1.8 virginica ## 109 6.7 2.5 5.8 1.8 virginica ## 110 7.2 3.6 6.1 2.5 virginica ## 111 6.5 3.2 5.1 2.0 virginica ## 112 6.4 2.7 5.3 1.9 virginica ## 113 6.8 3.0 5.5 2.1 virginica ## 114 5.7 2.5 5.0 2.0 virginica ## 115 5.8 2.8 5.1 2.4 virginica ## 116 6.4 3.2 5.3 2.3 virginica ## 117 6.5 3.0 5.5 1.8 virginica ## 118 7.7 3.8 6.7 2.2 virginica ## 119 7.7 2.6 6.9 2.3 virginica ## 120 6.0 2.2 5.0 1.5 virginica ## 121 6.9 3.2 5.7 2.3 virginica ## 122 5.6 2.8 4.9 2.0 virginica ## 123 7.7 2.8 6.7 2.0 virginica ## 124 6.3 2.7 4.9 1.8 virginica ## 125 6.7 3.3 5.7 2.1 virginica ## 126 7.2 3.2 6.0 1.8 virginica ## 127 6.2 2.8 4.8 1.8 virginica ## 128 6.1 3.0 4.9 1.8 virginica ## 129 6.4 2.8 5.6 2.1 virginica ## 130 7.2 3.0 5.8 1.6 virginica ## 131 7.4 2.8 6.1 1.9 virginica ## 132 7.9 3.8 6.4 2.0 virginica ## 133 6.4 2.8 5.6 2.2 virginica ## 134 6.3 2.8 5.1 1.5 virginica ## 135 6.1 2.6 5.6 1.4 virginica ## 136 7.7 3.0 6.1 2.3 virginica ## 137 6.3 3.4 5.6 2.4 virginica ## 138 6.4 3.1 5.5 1.8 virginica ## 139 6.0 3.0 4.8 1.8 virginica ## 140 6.9 3.1 5.4 2.1 virginica ## 141 6.7 3.1 5.6 2.4 virginica ## 142 6.9 3.1 5.1 2.3 virginica ## 143 5.8 2.7 5.1 1.9 virginica ## 144 6.8 3.2 5.9 2.3 virginica ## 145 6.7 3.3 5.7 2.5 virginica ## 146 6.7 3.0 5.2 2.3 virginica ## 147 6.3 2.5 5.0 1.9 virginica ## 148 6.5 3.0 5.2 2.0 virginica ## 149 6.2 3.4 5.4 2.3 virginica ## 150 5.9 3.0 5.1 1.8 virginica
iris_tibble <- as_tibble(iris) iris_tibble
## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <fct> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows
print(iris_tibble, n = 3, width = 50)
## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length ## <dbl> <dbl> <dbl> ## 1 5.1 3.5 1.4 ## 2 4.9 3 1.4 ## 3 4.7 3.2 1.3 ## # … with 147 more rows, and 2 more variables: ## # Petal.Width <dbl>, Species <fct>
iris[1:15, "Petal.Width"]
## [1] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.2 0.1 0.1 0.2
iris_tibble[1:15, "Petal.Width"]
## # A tibble: 15 × 1 ## Petal.Width ## <dbl> ## 1 0.2 ## 2 0.2 ## 3 0.2 ## 4 0.2 ## 5 0.2 ## 6 0.4 ## 7 0.3 ## 8 0.2 ## 9 0.2 ## 10 0.1 ## 11 0.2 ## 12 0.2 ## 13 0.1 ## 14 0.1 ## 15 0.2
drop = F
dplyr
is a very useful package for data manipulationselect(filter(iris_tibble, Species == "setosa"), Petal.Width, Petal.Length)
## # A tibble: 50 × 2 ## Petal.Width Petal.Length ## <dbl> <dbl> ## 1 0.2 1.4 ## 2 0.2 1.4 ## 3 0.2 1.3 ## 4 0.2 1.5 ## 5 0.2 1.4 ## 6 0.4 1.7 ## 7 0.3 1.4 ## 8 0.2 1.5 ## 9 0.2 1.4 ## 10 0.1 1.5 ## # … with 40 more rows
select
selects the desired columns from the datasetfilter
selects the matching rows from the datasetmutate
allows creating new columns from existing onessummarise
provides summary statisticsgroup_by
allows grouping rows and obtaining summary statisticsdplyr
also declares the pipe operator %>%
T %>% foo(a, b)
is the same as foo(T, a, b)
iris_tibble %>% filter(Species == "setosa") %>% select(Petal.Width, Petal.Length)
## # A tibble: 50 × 2 ## Petal.Width Petal.Length ## <dbl> <dbl> ## 1 0.2 1.4 ## 2 0.2 1.4 ## 3 0.2 1.3 ## 4 0.2 1.5 ## 5 0.2 1.4 ## 6 0.4 1.7 ## 7 0.3 1.4 ## 8 0.2 1.5 ## 9 0.2 1.4 ## 10 0.1 1.5 ## # … with 40 more rows
foo_d(foo_c(foo_b(foo_a(x))))
vs.
x %>% foo_a() %>% foo_b() %>% foo_c() %>% foo_d()
vs.
x %>% foo_a() %>% foo_b() %>% foo_c() %>% foo_d()
It is useful to save and load very large R objects
This is much faster than reading from other data types
Save objects f
and my.dataset
into mysession.RData
save(f, my.dataset, file='mysession.RData')
mysession.RData
load('mysession.RData')
x.csv
ID, Name, Age 23424, Ana, 45 11234, Charles, 23 77654, Susanne, 76
readr
packagelibrary(readr) dat <- read_csv("x.csv")
## Rows: 3 Columns: 3
## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "," ## chr (1): Name ## dbl (2): ID, Age
## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat
## # A tibble: 3 × 3 ## ID Name Age ## <dbl> <chr> <dbl> ## 1 23424 Ana 45 ## 2 11234 Charles 23 ## 3 77654 Susanne 76
x.txt
ID Name Age Phone 23424 Ana 45 ??? 11234 Charles 23 3456789 77654 Susanne 76 2345678
read_delim()
dat <- read_delim("x.txt", delim = " ", na = "???")
## Rows: 3 Columns: 4
## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: " " ## chr (1): Name ## dbl (3): ID, Age, Phone
## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat
## # A tibble: 3 × 4 ## ID Name Age Phone ## <dbl> <chr> <dbl> <dbl> ## 1 23424 Ana 45 NA ## 2 11234 Charles 23 3456789 ## 3 77654 Susanne 76 2345678
na
library(DBI) library(RMySQL) # The DBMS-specific code starts here drv <- dbDriver("MySQL") # Loading the MySQL driver con <- dbConnect(drv,dbname="transDB", # connecting to the DBMS username="myuser",password="mypasswd", host="localhost") # The DBMS-specific code ends here # getting the results of an SQL query as a data frame data <- dbGetQuery(con,"SELECT * FROM clients") # closing up stuff dbDisconnect(con) dbUnloadDriver(drv)
library(RMySQL) library(dplyr) dbConn <- src_mysql("sonae", host = "localhost", user = "prodUser", password = "myPassword") sensors <- tbl(dbConn,"sensor_values") sensors %>% filter(sid==274,value > 100) %>% select(time,value)
Source: query [?? x 2] Database: mysql 5.7.14 [prodUser@localhost:/sonae] time value <chr> <dbl> 1 2009-04-01 06:31:56 100.60 2 2009-04-01 06:32:04 103.11 3 2009-04-01 06:38:21 104.05 4 2009-04-01 06:38:29 103.87 5 2009-04-01 06:44:46 101.29 6 2009-04-01 06:44:54 100.16 7 2009-04-01 08:00:01 100.25 8 2009-04-01 08:55:52 101.64 9 2009-04-01 09:00:14 100.44 10 2009-04-01 09:11:50 102.33 # ... with more rows
d <- read.table("clipboard", header = TRUE)
library(readxl) fc <- "c:\\Documents and Settings\\xpto\\My Documents\\calc.xls" dat <- read_excel(fc, sheet = 1)
foreign
packageWrite a script to do the following things in a batch
csv_object.dat
xls_object.dat
.dat
files