# For creating these slides library(knitr) # For nicely printed data frames library(tibble) # For many data wrangling operations library(dplyr)
2024-03-04
# For creating these slides library(knitr) # For nicely printed data frames library(tibble) # For many data wrangling operations library(dplyr)
# For result reproducibility set.seed(1024)
Name | Surname | Age |
---|---|---|
Burkay | Genç | 45 |
Hasan | Kaya | 55 |
Ayşe | Çam | 40 |
Fatma | Gürgen | 23 |
my.dataset <- data.frame(site = c('A', 'B', 'A', 'A', 'B', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall', 'Spring'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9, 8.4)) kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
my.dataset[3, 2]
## [1] "Summer"
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
Note that the kable()
function is used to prettify the table output. Normally, you DON’T need to use it to print tables.
my.dataset
## site season pH ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9 ## 6 B Spring 8.4
my.dataset$site
## [1] "A" "B" "A" "A" "B" "B"
my.dataset$pH[3]
## [1] 8.6
# Rows are data frames my.dataset[2,]
## site season pH ## 2 B Summer 6.3
# Columns are vectors my.dataset[,3]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
This returns a vector.
my.dataset[,3]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
This returns a data frame!
my.dataset[3]
## pH ## 1 7.4 ## 2 6.3 ## 3 8.6 ## 4 7.2 ## 5 8.9 ## 6 8.4
head
and tail
You can print first or last few rows of a data frame:
head(my.dataset, 3)
## site season pH ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6
tail(my.dataset, 3)
## site season pH ## 4 A Spring 7.2 ## 5 B Fall 8.9 ## 6 B Spring 8.4
This returns a vector.
my.dataset$pH
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
This returns a data frame!
my.dataset["pH"]
## pH ## 1 7.4 ## 2 6.3 ## 3 8.6 ## 4 7.2 ## 5 8.9 ## 6 8.4
This returns a vector.
my.dataset[, "pH"]
## [1] 7.4 6.3 8.6 7.2 8.9 8.4
This returns a data frame!
my.dataset[, "pH", drop = F]
## pH ## 1 7.4 ## 2 6.3 ## 3 8.6 ## 4 7.2 ## 5 8.9 ## 6 8.4
kable(my.dataset[my.dataset$pH > 7, ])
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
3 | A | Summer | 8.6 |
4 | A | Spring | 7.2 |
5 | B | Fall | 8.9 |
6 | B | Spring | 8.4 |
kable(my.dataset[my.dataset$pH > 7 & my.dataset$site == "A", ])
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
3 | A | Summer | 8.6 |
4 | A | Spring | 7.2 |
my.dataset[my.dataset$pH > 7 & my.dataset$site == "A", "season"]
## [1] "Winter" "Summer" "Spring"
my.dataset[pH > 7,]
## Error in eval(expr, envir, enclos): object 'pH' not found
attach(my.dataset) my.dataset[pH > 7,]
## site season pH ## 1 A Winter 7.4 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9 ## 6 B Spring 8.4
detach(my.dataset) my.dataset[pH > 7,]
## Error in eval(expr, envir, enclos): object 'pH' not found
my.dataset$newColumn <- c(T, F, F, T, T, F) kable(my.dataset)
site | season | pH | newColumn |
---|---|---|---|
A | Winter | 7.4 | TRUE |
B | Summer | 6.3 | FALSE |
A | Summer | 8.6 | FALSE |
A | Spring | 7.2 | TRUE |
B | Fall | 8.9 | TRUE |
B | Spring | 8.4 | FALSE |
my.dataset$newColumn <- NULL kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Spring | 8.4 |
This is a bit messy!
my.dataset[6, ] <- data.frame( site = "B", season = "Winter", pH = 6.3) kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
You could also rbind
the above.
my.dataset[7, "site"] <- "A" my.dataset[7, "season"] <- "Fall" my.dataset[7, "pH"] <- 6.4 kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
my.dataset <- my.dataset[-(2:3),] kable(my.dataset)
site | season | pH | |
---|---|---|---|
1 | A | Winter | 7.4 |
4 | A | Spring | 7.2 |
5 | B | Fall | 8.9 |
6 | B | Winter | 6.3 |
7 | A | Fall | 6.4 |
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
nrow(my.dataset)
## [1] 7
ncol(my.dataset)
## [1] 3
dim(my.dataset)
## [1] 7 3
You can edit small data frames in R through a GUI:
my.dataset <- edit(my.dataset)
I strongly recommend you to avoid this!
kable(my.dataset)
site | season | pH |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
names(my.dataset)
## [1] "site" "season" "pH"
colnames(my.dataset)
## [1] "site" "season" "pH"
names(my.dataset) <- c("Site", "Season", "pH_val") kable(my.dataset)
Site | Season | pH_val |
---|---|---|
A | Winter | 7.4 |
B | Summer | 6.3 |
A | Summer | 8.6 |
A | Spring | 7.2 |
B | Fall | 8.9 |
B | Winter | 6.3 |
A | Fall | 6.4 |
Data frames automatically convert strings into factors:
Since 4.0, data frame do not convert strings into factors anymore:
df <- data.frame(name = c("burkay", "ali", "burkay", "veli")) df$name
## [1] "burkay" "ali" "burkay" "veli"
But, you can force it to do so:
df <- data.frame(name = c("burkay", "ali", "burkay", "veli"), stringsAsFactors = T) df$name
## [1] burkay ali burkay veli ## Levels: ali burkay veli
tibble
packagelibrary(tibble)
my.dataset <- data.frame( site = c('A', 'B', 'A', 'A', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9)) my.dataset
## site season pH ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
my.tibble <- tibble( site = c('A', 'B', 'A', 'A', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9)) my.tibble
## # A tibble: 5 × 3 ## site season pH ## <chr> <chr> <dbl> ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
You can construct a tibble from a data frame:
my.dataset <- data.frame( site = c('A', 'B', 'A', 'A', 'B'), season = c('Winter', 'Summer', 'Summer', 'Spring', 'Fall'), pH = c(7.4, 6.3, 8.6, 7.2, 8.9)) my.tibble <- tibble(my.dataset) my.tibble
## # A tibble: 5 × 3 ## site season pH ## <chr> <chr> <dbl> ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
You cannot use shorthand to access a column with tibbles:
my.tibble
## # A tibble: 5 × 3 ## site season pH ## <chr> <chr> <dbl> ## 1 A Winter 7.4 ## 2 B Summer 6.3 ## 3 A Summer 8.6 ## 4 A Spring 7.2 ## 5 B Fall 8.9
my.tibble$sea
## Warning: Unknown or uninitialised column: `sea`.
## NULL
my.dataset$sea
## [1] "Winter" "Summer" "Summer" "Spring" "Fall"
dat <- tibble(TempCels = sample(-10:40, size = 100, replace = TRUE), TempFahr = TempCels * 9 / 5 + 32, # <<<<<<< Location = rep(letters[1:2], each = 50)) dat
## # A tibble: 100 × 3 ## TempCels TempFahr Location ## <int> <dbl> <chr> ## 1 10 50 a ## 2 11 51.8 a ## 3 2 35.6 a ## 4 21 69.8 a ## 5 36 96.8 a ## 6 34 93.2 a ## 7 5 41 a ## 8 -4 24.8 a ## 9 -3 26.6 a ## 10 32 89.6 a ## # ℹ 90 more rows
Data frames try to print everything on the screen
data(iris) iris
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3.0 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5.0 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5.0 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## 11 5.4 3.7 1.5 0.2 setosa ## 12 4.8 3.4 1.6 0.2 setosa ## 13 4.8 3.0 1.4 0.1 setosa ## 14 4.3 3.0 1.1 0.1 setosa ## 15 5.8 4.0 1.2 0.2 setosa ## 16 5.7 4.4 1.5 0.4 setosa ## 17 5.4 3.9 1.3 0.4 setosa ## 18 5.1 3.5 1.4 0.3 setosa ## 19 5.7 3.8 1.7 0.3 setosa ## 20 5.1 3.8 1.5 0.3 setosa ## 21 5.4 3.4 1.7 0.2 setosa ## 22 5.1 3.7 1.5 0.4 setosa ## 23 4.6 3.6 1.0 0.2 setosa ## 24 5.1 3.3 1.7 0.5 setosa ## 25 4.8 3.4 1.9 0.2 setosa ## 26 5.0 3.0 1.6 0.2 setosa ## 27 5.0 3.4 1.6 0.4 setosa ## 28 5.2 3.5 1.5 0.2 setosa ## 29 5.2 3.4 1.4 0.2 setosa ## 30 4.7 3.2 1.6 0.2 setosa ## 31 4.8 3.1 1.6 0.2 setosa ## 32 5.4 3.4 1.5 0.4 setosa ## 33 5.2 4.1 1.5 0.1 setosa ## 34 5.5 4.2 1.4 0.2 setosa ## 35 4.9 3.1 1.5 0.2 setosa ## 36 5.0 3.2 1.2 0.2 setosa ## 37 5.5 3.5 1.3 0.2 setosa ## 38 4.9 3.6 1.4 0.1 setosa ## 39 4.4 3.0 1.3 0.2 setosa ## 40 5.1 3.4 1.5 0.2 setosa ## 41 5.0 3.5 1.3 0.3 setosa ## 42 4.5 2.3 1.3 0.3 setosa ## 43 4.4 3.2 1.3 0.2 setosa ## 44 5.0 3.5 1.6 0.6 setosa ## 45 5.1 3.8 1.9 0.4 setosa ## 46 4.8 3.0 1.4 0.3 setosa ## 47 5.1 3.8 1.6 0.2 setosa ## 48 4.6 3.2 1.4 0.2 setosa ## 49 5.3 3.7 1.5 0.2 setosa ## 50 5.0 3.3 1.4 0.2 setosa ## 51 7.0 3.2 4.7 1.4 versicolor ## 52 6.4 3.2 4.5 1.5 versicolor ## 53 6.9 3.1 4.9 1.5 versicolor ## 54 5.5 2.3 4.0 1.3 versicolor ## 55 6.5 2.8 4.6 1.5 versicolor ## 56 5.7 2.8 4.5 1.3 versicolor ## 57 6.3 3.3 4.7 1.6 versicolor ## 58 4.9 2.4 3.3 1.0 versicolor ## 59 6.6 2.9 4.6 1.3 versicolor ## 60 5.2 2.7 3.9 1.4 versicolor ## 61 5.0 2.0 3.5 1.0 versicolor ## 62 5.9 3.0 4.2 1.5 versicolor ## 63 6.0 2.2 4.0 1.0 versicolor ## 64 6.1 2.9 4.7 1.4 versicolor ## 65 5.6 2.9 3.6 1.3 versicolor ## 66 6.7 3.1 4.4 1.4 versicolor ## 67 5.6 3.0 4.5 1.5 versicolor ## 68 5.8 2.7 4.1 1.0 versicolor ## 69 6.2 2.2 4.5 1.5 versicolor ## 70 5.6 2.5 3.9 1.1 versicolor ## 71 5.9 3.2 4.8 1.8 versicolor ## 72 6.1 2.8 4.0 1.3 versicolor ## 73 6.3 2.5 4.9 1.5 versicolor ## 74 6.1 2.8 4.7 1.2 versicolor ## 75 6.4 2.9 4.3 1.3 versicolor ## 76 6.6 3.0 4.4 1.4 versicolor ## 77 6.8 2.8 4.8 1.4 versicolor ## 78 6.7 3.0 5.0 1.7 versicolor ## 79 6.0 2.9 4.5 1.5 versicolor ## 80 5.7 2.6 3.5 1.0 versicolor ## 81 5.5 2.4 3.8 1.1 versicolor ## 82 5.5 2.4 3.7 1.0 versicolor ## 83 5.8 2.7 3.9 1.2 versicolor ## 84 6.0 2.7 5.1 1.6 versicolor ## 85 5.4 3.0 4.5 1.5 versicolor ## 86 6.0 3.4 4.5 1.6 versicolor ## 87 6.7 3.1 4.7 1.5 versicolor ## 88 6.3 2.3 4.4 1.3 versicolor ## 89 5.6 3.0 4.1 1.3 versicolor ## 90 5.5 2.5 4.0 1.3 versicolor ## 91 5.5 2.6 4.4 1.2 versicolor ## 92 6.1 3.0 4.6 1.4 versicolor ## 93 5.8 2.6 4.0 1.2 versicolor ## 94 5.0 2.3 3.3 1.0 versicolor ## 95 5.6 2.7 4.2 1.3 versicolor ## 96 5.7 3.0 4.2 1.2 versicolor ## 97 5.7 2.9 4.2 1.3 versicolor ## 98 6.2 2.9 4.3 1.3 versicolor ## 99 5.1 2.5 3.0 1.1 versicolor ## 100 5.7 2.8 4.1 1.3 versicolor ## 101 6.3 3.3 6.0 2.5 virginica ## 102 5.8 2.7 5.1 1.9 virginica ## 103 7.1 3.0 5.9 2.1 virginica ## 104 6.3 2.9 5.6 1.8 virginica ## 105 6.5 3.0 5.8 2.2 virginica ## 106 7.6 3.0 6.6 2.1 virginica ## 107 4.9 2.5 4.5 1.7 virginica ## 108 7.3 2.9 6.3 1.8 virginica ## 109 6.7 2.5 5.8 1.8 virginica ## 110 7.2 3.6 6.1 2.5 virginica ## 111 6.5 3.2 5.1 2.0 virginica ## 112 6.4 2.7 5.3 1.9 virginica ## 113 6.8 3.0 5.5 2.1 virginica ## 114 5.7 2.5 5.0 2.0 virginica ## 115 5.8 2.8 5.1 2.4 virginica ## 116 6.4 3.2 5.3 2.3 virginica ## 117 6.5 3.0 5.5 1.8 virginica ## 118 7.7 3.8 6.7 2.2 virginica ## 119 7.7 2.6 6.9 2.3 virginica ## 120 6.0 2.2 5.0 1.5 virginica ## 121 6.9 3.2 5.7 2.3 virginica ## 122 5.6 2.8 4.9 2.0 virginica ## 123 7.7 2.8 6.7 2.0 virginica ## 124 6.3 2.7 4.9 1.8 virginica ## 125 6.7 3.3 5.7 2.1 virginica ## 126 7.2 3.2 6.0 1.8 virginica ## 127 6.2 2.8 4.8 1.8 virginica ## 128 6.1 3.0 4.9 1.8 virginica ## 129 6.4 2.8 5.6 2.1 virginica ## 130 7.2 3.0 5.8 1.6 virginica ## 131 7.4 2.8 6.1 1.9 virginica ## 132 7.9 3.8 6.4 2.0 virginica ## 133 6.4 2.8 5.6 2.2 virginica ## 134 6.3 2.8 5.1 1.5 virginica ## 135 6.1 2.6 5.6 1.4 virginica ## 136 7.7 3.0 6.1 2.3 virginica ## 137 6.3 3.4 5.6 2.4 virginica ## 138 6.4 3.1 5.5 1.8 virginica ## 139 6.0 3.0 4.8 1.8 virginica ## 140 6.9 3.1 5.4 2.1 virginica ## 141 6.7 3.1 5.6 2.4 virginica ## 142 6.9 3.1 5.1 2.3 virginica ## 143 5.8 2.7 5.1 1.9 virginica ## 144 6.8 3.2 5.9 2.3 virginica ## 145 6.7 3.3 5.7 2.5 virginica ## 146 6.7 3.0 5.2 2.3 virginica ## 147 6.3 2.5 5.0 1.9 virginica ## 148 6.5 3.0 5.2 2.0 virginica ## 149 6.2 3.4 5.4 2.3 virginica ## 150 5.9 3.0 5.1 1.8 virginica
Tibbles are much smarter and only print what you can see:
iris_tibble <- tibble(iris) iris_tibble
## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <fct> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # ℹ 140 more rows
You can force limits on the print output:
print(iris_tibble, n = 5, width = 50)
## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length ## <dbl> <dbl> <dbl> ## 1 5.1 3.5 1.4 ## 2 4.9 3 1.4 ## 3 4.7 3.2 1.3 ## 4 4.6 3.1 1.5 ## 5 5 3.6 1.4 ## # ℹ 145 more rows ## # ℹ 2 more variables: Petal.Width <dbl>, ## # Species <fct>
print(iris_tibble, n = 3, width = 30)
## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width ## <dbl> <dbl> ## 1 5.1 3.5 ## 2 4.9 3 ## 3 4.7 3.2 ## # ℹ 147 more rows ## # ℹ 3 more variables: ## # Petal.Length <dbl>, ## # Petal.Width <dbl>, ## # Species <fct>
Subsetting works differently in data frames and tibbles:
iris[1:15, "Petal.Width"]
## [1] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.2 0.1 0.1 0.2
iris_tibble[1:15, "Petal.Width"]
## # A tibble: 15 × 1 ## Petal.Width ## <dbl> ## 1 0.2 ## 2 0.2 ## 3 0.2 ## 4 0.2 ## 5 0.2 ## 6 0.4 ## 7 0.3 ## 8 0.2 ## 9 0.2 ## 10 0.1 ## 11 0.2 ## 12 0.2 ## 13 0.1 ## 14 0.1 ## 15 0.2
drop = F
iris$Species[1:10]
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris_tibble$Species[1:10]
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris[1:10,]$Species
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris_tibble[1:10,]$Species
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris[1:10, "Species"]
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris_tibble[1:10, "Species"]
## # A tibble: 10 × 1 ## Species ## <fct> ## 1 setosa ## 2 setosa ## 3 setosa ## 4 setosa ## 5 setosa ## 6 setosa ## 7 setosa ## 8 setosa ## 9 setosa ## 10 setosa
iris[1:10, 5]
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa ## Levels: setosa versicolor virginica
iris_tibble[1:10, 5]
## # A tibble: 10 × 1 ## Species ## <fct> ## 1 setosa ## 2 setosa ## 3 setosa ## 4 setosa ## 5 setosa ## 6 setosa ## 7 setosa ## 8 setosa ## 9 setosa ## 10 setosa
drop=F
$
acts like a forced conversion to vectordplyr
is a very useful package for data manipulation## Petal widths and lengths of setosas select(filter(iris_tibble, Species == "setosa"), Petal.Width, Petal.Length)
## # A tibble: 50 × 2 ## Petal.Width Petal.Length ## <dbl> <dbl> ## 1 0.2 1.4 ## 2 0.2 1.4 ## 3 0.2 1.3 ## 4 0.2 1.5 ## 5 0.2 1.4 ## 6 0.4 1.7 ## 7 0.3 1.4 ## 8 0.2 1.5 ## 9 0.2 1.4 ## 10 0.1 1.5 ## # ℹ 40 more rows
select
selects the desired columns from the datasetfilter
selects the matching rows from the datasetmutate
allows creating new columns from existing onessummarise
provides summary statisticsgroup_by
allows grouping rows and obtaining summary statisticsdplyr
also declares the pipe operator %>%
T %>% foo(a, b)
is the same as foo(T, a, b)
iris_tibble %>% filter(Species == "setosa") %>% select(Petal.Width, Petal.Length)
## # A tibble: 50 × 2 ## Petal.Width Petal.Length ## <dbl> <dbl> ## 1 0.2 1.4 ## 2 0.2 1.4 ## 3 0.2 1.3 ## 4 0.2 1.5 ## 5 0.2 1.4 ## 6 0.4 1.7 ## 7 0.3 1.4 ## 8 0.2 1.5 ## 9 0.2 1.4 ## 10 0.1 1.5 ## # ℹ 40 more rows
foo_d(foo_c(foo_b(foo_a(x))))
vs.
x %>% foo_a() %>% foo_b() %>% foo_c() %>% foo_d()
vs.
x %>% foo_a() %>% foo_b() %>% foo_c() %>% foo_d()
x <- tibble(a=1:100, b=100:1, c=sample(1:100, 100)) x %>% filter(a > 60) %>% filter(b < 20)
## # A tibble: 19 × 3 ## a b c ## <int> <int> <int> ## 1 82 19 59 ## 2 83 18 37 ## 3 84 17 67 ## 4 85 16 69 ## 5 86 15 71 ## 6 87 14 90 ## 7 88 13 39 ## 8 89 12 99 ## 9 90 11 10 ## 10 91 10 22 ## 11 92 9 34 ## 12 93 8 61 ## 13 94 7 14 ## 14 95 6 13 ## 15 96 5 33 ## 16 97 4 73 ## 17 98 3 27 ## 18 99 2 82 ## 19 100 1 60
x %>% filter(a * b > 1000) %>% select(c)
## # A tibble: 78 × 1 ## c ## <int> ## 1 17 ## 2 66 ## 3 44 ## 4 51 ## 5 55 ## 6 86 ## 7 31 ## 8 15 ## 9 41 ## 10 65 ## # ℹ 68 more rows
x %>% filter(a * b > 1000) %>% select(c) %>% mean()
## Warning in mean.default(.): argument is not numeric or logical: returning NA
## [1] NA
mean
requires a vector but received a tibble.x %>% filter(a * b > 1000) %>% pull(c) %>% mean()
## [1] 52.08974
pull
gets a vector out of a tibbleIt is useful to save and load very large R objects
This is much faster than reading from other data types
Save objects f
and my.dataset
into mysession.RData
save(f, my.dataset, file='mysession.RData')
mysession.RData
load('mysession.RData')
x.csv
ID, Name, Age 23424, Ana, 45 11234, Charles, 23 77654, Susanne, 76
readr
packagelibrary(readr) dat <- read_csv("x.csv")
## Rows: 3 Columns: 3 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "," ## chr (1): Name ## dbl (2): ID, Age ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat
## # A tibble: 3 × 3 ## ID Name Age ## <dbl> <chr> <dbl> ## 1 23424 Ana 45 ## 2 11234 Charles 23 ## 3 77654 Susanne 76
x.txt
ID Name Age Phone 23424 Ana 45 ??? 11234 Charles 23 3456789 77654 Susanne 76 2345678
read_delim()
dat <- read_delim("x.txt", delim = " ", na = "???")
## Rows: 3 Columns: 4 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: " " ## chr (1): Name ## dbl (3): ID, Age, Phone ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat
## # A tibble: 3 × 4 ## ID Name Age Phone ## <dbl> <chr> <dbl> <dbl> ## 1 23424 Ana 45 NA ## 2 11234 Charles 23 3456789 ## 3 77654 Susanne 76 2345678
na
library(DBI) library(RMySQL) # The DBMS-specific code starts here drv <- dbDriver("MySQL") # Loading the MySQL driver con <- dbConnect(drv,dbname="transDB", # connecting to the DBMS username="myuser",password="mypasswd", host="localhost") # The DBMS-specific code ends here # getting the results of an SQL query as a data frame data <- dbGetQuery(con,"SELECT * FROM clients") # closing up stuff dbDisconnect(con) dbUnloadDriver(drv)
library(RMySQL) library(dplyr) dbConn <- src_mysql("sonae", host = "localhost", user = "prodUser", password = "myPassword") sensors <- tbl(dbConn,"sensor_values") sensors %>% filter(sid==274,value > 100) %>% select(time,value)
Source: query [?? x 2] Database: mysql 5.7.14 [prodUser@localhost:/sonae] time value <chr> <dbl> 1 2009-04-01 06:31:56 100.60 2 2009-04-01 06:32:04 103.11 3 2009-04-01 06:38:21 104.05 4 2009-04-01 06:38:29 103.87 5 2009-04-01 06:44:46 101.29 6 2009-04-01 06:44:54 100.16 7 2009-04-01 08:00:01 100.25 8 2009-04-01 08:55:52 101.64 9 2009-04-01 09:00:14 100.44 10 2009-04-01 09:11:50 102.33 # ... with more rows
d <- read.table("clipboard", header = TRUE)
library(readxl) fc <- "c:\\Documents and Settings\\xpto\\My Documents\\calc.xls" dat <- read_excel(fc, sheet = 1)
foreign
packageWrite a script to do the following things in a batch
csv_object.dat
xls_object.dat
.dat
files