# Packages used throughout this document ----
library(knitr)
library(tidyr)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(Hmisc)
23/03/2020
# NOTE(review): these library() calls duplicate the ones at the top of the
# file; re-attaching an already-loaded package is a harmless no-op.
library(knitr)
library(tidyr)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(Hmisc)
# Fix the RNG seed so any random operations below are reproducible.
set.seed(1024)
Raw data -> Clean -> Transform -> Create -> Reduce
Wickham (2014) -> Presented the notion of “tidy data”
Tidy Data:
Name | Math | English |
---|---|---|
Anna | 86 | 90 |
John | 43 | 75 |
Catherine | 80 | 82 |
Name | Subject | Grade |
---|---|---|
Anna | Math | 86 |
Anna | English | 90 |
John | Math | 43 |
John | English | 75 |
Catherine | Math | 80 |
Catherine | English | 82 |
Contents of stud.txt (space-delimited, one header row):

    Math English
    Anna 86 90
    John 43 75
    Catherine 80 82
# readr is already attached above; re-attaching is a no-op.
library(readr)
# Read the space-delimited student file, skipping its header row and
# supplying our own column names.
std <- read_delim("stud.txt", delim = " ", skip = 1, col_names = c("StudentName", "Math", "English"))
## Parsed with column specification: ## cols( ## StudentName = col_character(), ## Math = col_double(), ## English = col_double() ## )
# Print the tibble just read from stud.txt.
std
## # A tibble: 3 x 3 ## StudentName Math English ## <chr> <dbl> <dbl> ## 1 Anna 86 90 ## 2 John 43 75 ## 3 Catherine 80 82
gather
is used to bring together multiple columns of data
# The wide-format data: one column per subject.
std
## # A tibble: 3 x 3 ## StudentName Math English ## <chr> <dbl> <dbl> ## 1 Anna 86 90 ## 2 John 43 75 ## 3 Catherine 80 82
# Reshape wide -> long: collapse the Math and English columns into a
# Subject/Grade key-value pair (one row per student per subject).
# NOTE(review): gather() is superseded; new code should prefer
# tidyr::pivot_longer() (row order of the result differs slightly).
stdL <- gather(std, Subject, Grade, Math:English)
stdL
## # A tibble: 6 x 3 ## StudentName Subject Grade ## <chr> <chr> <dbl> ## 1 Anna Math 86 ## 2 John Math 43 ## 3 Catherine Math 80 ## 4 Anna English 90 ## 5 John English 75 ## 6 Catherine English 82
spread
is the opposite of gather
# Reshape long -> wide: spread Subject back into one column per subject.
# NOTE(review): spread() is superseded by tidyr::pivot_wider().
spread(stdL, Subject, Grade)
## # A tibble: 3 x 3 ## StudentName English Math ## <chr> <dbl> <dbl> ## 1 Anna 90 86 ## 2 Catherine 82 80 ## 3 John 75 43
Contents of stud2.txt (space-delimited, one header row):

    Math English Degree_Year
    Anna 86 90 Bio_2014
    John 43 75 Math_2013
    Catherine 80 82 Bio_2012
# Read the second student file; the last column packs degree and year
# into a single "Degree_Year" string (e.g. "Bio_2014").
std2 <- read_delim("stud2.txt", delim = " ", skip = 1, col_names = c("StudentName", "Math", "English", "Degree_Year"))
## Parsed with column specification: ## cols( ## StudentName = col_character(), ## Math = col_double(), ## English = col_double(), ## Degree_Year = col_character() ## )
# The wide data with the combined Degree_Year column.
std2
## # A tibble: 3 x 4 ## StudentName Math English Degree_Year ## <chr> <dbl> <dbl> <chr> ## 1 Anna 86 90 Bio_2014 ## 2 John 43 75 Math_2013 ## 3 Catherine 80 82 Bio_2012
separate
# separate() splits a character column into multiple columns.
# First reshape to long format, then split Degree_Year (e.g. "Bio_2014")
# into Degree and Year columns (default sep: any non-alphanumeric run).
std2L <- gather(std2, "Subject", "Grade", Math:English)
std2L <- separate(std2L, Degree_Year, c("Degree", "Year"))
std2L
## # A tibble: 6 x 5 ## StudentName Degree Year Subject Grade ## <chr> <chr> <chr> <chr> <dbl> ## 1 Anna Bio 2014 Math 86 ## 2 John Math 2013 Math 43 ## 3 Catherine Bio 2012 Math 80 ## 4 Anna Bio 2014 English 90 ## 5 John Math 2013 English 75 ## 6 Catherine Bio 2012 English 82
The `sep` argument can be used to fine-tune how separate() splits the column.
unite()
reverses the effect of separate()
lubridate
# The lubridate package provides useful functions for working with dates.
# ymd() parses year-month-day strings in many formats.
ymd("20151021")
## [1] "2015-10-21"
# ymd() also accepts "/" separators.
ymd("2019/10/31")
## [1] "2019-10-31"
# Two-digit years and "." separators are handled too (-> 2019-02-04).
ymd("19.2.4")
## [1] "2019-02-04"
# dmy() parses day-month-year order, including month names.
dmy("4 July 2018")
## [1] "2018-07-04"
# dmy_hms() parses a full date-time; tz sets its timezone.
dmy_hms("4 July 2018, 14:05:01", tz = "Europe/Istanbul")
## [1] "2018-07-04 14:05:01 +03"
# ymd() copes with heterogeneous, messy inputs: numeric dates, varying
# separators, and even surrounding text ("Measured on ...", "2013-7+ 25").
dates <- c(20120521, "2010-12-12", "2007/01/5", "2015-2-04", "Measured on 2014-12-6", "2013-7+ 25")
dates <- ymd(dates)
dates
## [1] "2012-05-21" "2010-12-12" "2007-01-05" "2015-02-04" "2014-12-06" ## [6] "2013-07-25"
You can extract components from date objects
# Build a date-time, then extract components from it.
myDate <- dmy_hms("31 October 2019, 14:05:01", tz = "Europe/Istanbul")
# wday(): day of the week (default week starts on Sunday, so 5 = Thursday).
wday(myDate)
## [1] 5
# label = TRUE returns an ordered factor of weekday names
# (locale-dependent; here Turkish abbreviations).
wday(myDate, label = TRUE)
## [1] Prş ## Levels: Paz < Pzt < Sal < Çrş < Prş < Cum < Cts
# week_start = 1 counts from Monday, so Thursday becomes 4.
wday(myDate, week_start = 1)
## [1] 4
# Month number of the date (October = 10).
month(myDate)
## [1] 10
# Hour component (24-hour clock).
hour(myDate)
## [1] 14
# Print the full date-time, including its timezone offset.
myDate
## [1] "2019-10-31 14:05:01 +03"
# with_tz(): the SAME instant in time, displayed in another timezone.
with_tz(myDate, "Pacific/Auckland")
## [1] "2019-11-01 00:05:01 NZDT"
# Same instant shown in New York local time.
with_tz(myDate, "America/New_York")
## [1] "2019-10-31 07:05:01 EDT"
# force_tz(): keeps the clock time but reinterprets it in the new
# timezone, so it refers to a DIFFERENT instant than myDate.
force_tz(myDate, "America/New_York")
## [1] "2019-10-31 14:05:01 EDT"
# OlsonNames() returns all timezone names known to the system.
OlsonNames()
head(OlsonNames(), 50)
## [1] "Africa/Abidjan" "Africa/Accra" "Africa/Addis_Ababa" ## [4] "Africa/Algiers" "Africa/Asmara" "Africa/Asmera" ## [7] "Africa/Bamako" "Africa/Bangui" "Africa/Banjul" ## [10] "Africa/Bissau" "Africa/Blantyre" "Africa/Brazzaville" ## [13] "Africa/Bujumbura" "Africa/Cairo" "Africa/Casablanca" ## [16] "Africa/Ceuta" "Africa/Conakry" "Africa/Dakar" ## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti" "Africa/Douala" ## [22] "Africa/El_Aaiun" "Africa/Freetown" "Africa/Gaborone" ## [25] "Africa/Harare" "Africa/Johannesburg" "Africa/Juba" ## [28] "Africa/Kampala" "Africa/Khartoum" "Africa/Kigali" ## [31] "Africa/Kinshasa" "Africa/Lagos" "Africa/Libreville" ## [34] "Africa/Lome" "Africa/Luanda" "Africa/Lubumbashi" ## [37] "Africa/Lusaka" "Africa/Malabo" "Africa/Maputo" ## [40] "Africa/Maseru" "Africa/Mbabane" "Africa/Mogadishu" ## [43] "Africa/Monrovia" "Africa/Nairobi" "Africa/Ndjamena" ## [46] "Africa/Niamey" "Africa/Nouakchott" "Africa/Ouagadougou" ## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
stringr
provides useful string-manipulation functions; the stringi package
# (stringi provides more complex string functions than stringr.)
# Build the URLs for the UCI audiology data and names files with str_c().
uci.repo <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset <- "audiology/audiology.standardized"
dataF <- str_c(uci.repo, dataset, ".data")
namesF <- str_c(uci.repo, dataset, ".names")
# Download the data; columns are unnamed (X1..X71) and "?" means NA.
data <- read_csv(url(dataF), col_names = FALSE, na = "?")
print(data, n = 5)
## # A tibble: 200 x 71 ## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 ## <lgl> <chr> <lgl> <chr> <chr> <chr> <lgl> <chr> <lgl> <lgl> <lgl> <lgl> ## 1 FALSE mild FALSE norm… norm… <NA> TRUE <NA> FALSE FALSE FALSE FALSE ## 2 FALSE modera… FALSE norm… norm… <NA> TRUE <NA> FALSE FALSE FALSE FALSE ## 3 TRUE mild TRUE <NA> abse… mild TRUE <NA> FALSE FALSE FALSE FALSE ## 4 TRUE mild TRUE <NA> abse… mild FALSE <NA> FALSE FALSE FALSE FALSE ## 5 TRUE mild FALSE norm… norm… mild TRUE <NA> FALSE FALSE FALSE FALSE ## # … with 195 more rows, and 59 more variables: X13 <lgl>, X14 <lgl>, ## # X15 <lgl>, X16 <lgl>, X17 <lgl>, X18 <lgl>, X19 <lgl>, X20 <lgl>, ## # X21 <lgl>, X22 <lgl>, X23 <lgl>, X24 <lgl>, X25 <lgl>, X26 <lgl>, ## # X27 <lgl>, X28 <lgl>, X29 <lgl>, X30 <lgl>, X31 <lgl>, X32 <lgl>, ## # X33 <lgl>, X34 <lgl>, X35 <lgl>, X36 <lgl>, X37 <lgl>, X38 <lgl>, ## # X39 <lgl>, X40 <lgl>, X41 <lgl>, X42 <lgl>, X43 <lgl>, X44 <lgl>, ## # X45 <lgl>, X46 <lgl>, X47 <lgl>, X48 <lgl>, X49 <lgl>, X50 <lgl>, ## # X51 <lgl>, X52 <lgl>, X53 <lgl>, X54 <lgl>, X55 <lgl>, X56 <lgl>, ## # X57 <lgl>, X58 <lgl>, X59 <chr>, X60 <chr>, X61 <lgl>, X62 <lgl>, ## # X63 <lgl>, X64 <chr>, X65 <lgl>, X66 <chr>, X67 <lgl>, X68 <lgl>, ## # X69 <lgl>, X70 <chr>, X71 <chr>
# Read the .names documentation file as raw text lines.
text <- read_lines(url(namesF))
text[1:3]
## [1] "WARNING: This database should be credited to the original owner whenever" ## [2] " used for any publication whatsoever." ## [3] ""
# The URL of the .names file built earlier with str_c().
namesF
## [1] "https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/audiology.standardized.names"
# Lines 67-135 of the names file hold "name: description" pairs;
# split each on ":" and keep only the attribute name (first piece).
nms <- str_split_fixed(text[67:135], ":", n = 2)[,1]
nms[1:3]
## [1] " age_gt_60" " air()" " airBoneGap"
# Trim the leading/trailing whitespace left over from the split.
nms <- str_trim(nms)
nms[1:3]
## [1] "age_gt_60" "air()" "airBoneGap"
# Drop literal parentheses from names such as "air()".
nms <- str_replace_all(nms, "\\(|\\)", "")
nms[1:3]
## [1] "age_gt_60" "air" "airBoneGap"
# Assign the cleaned attribute names to the first 69 data columns.
colnames(data)[1:69] <- nms
data[1:3, 1:8]
## # A tibble: 3 x 8 ## age_gt_60 air airBoneGap ar_c ar_u bone boneAbnormal bser ## <lgl> <chr> <lgl> <chr> <chr> <chr> <lgl> <chr> ## 1 FALSE mild FALSE normal normal <NA> TRUE <NA> ## 2 FALSE moderate FALSE normal normal <NA> TRUE <NA> ## 3 TRUE mild TRUE <NA> absent mild TRUE <NA>
# Toy data frame where "?" marks a missing value in Y.
# NOTE(review): the factor coercion shown below relies on
# stringsAsFactors = TRUE, which is the default only in R < 4.0;
# in R >= 4.0 Y would come out as character instead.
dat <- data.frame(X = c("green", "blue", "green", "red"), Y = c(56, "?", 100, -10))
dat
## X Y ## 1 green 56 ## 2 blue ? ## 3 green 100 ## 4 red -10
# Mixing numbers with "?" coerced the whole column to text -> factor.
class(dat$Y)
## [1] "factor"
Column Y is designated as a factor because the "?" placeholder forces the whole column to be read as text (with stringsAsFactors = TRUE, the default in R < 4.0)
parse_integer
from readr
# parse_integer() from readr converts to integer and takes care of the
# NA values: the string given via `na` ("?") becomes a proper NA.
dat$Y <- parse_integer(as.character(dat$Y), na = "?")
class(dat$Y)
## [1] "integer"
# The "?" entry is now a proper NA.
dat
## X Y ## 1 green 56 ## 2 blue NA ## 3 green 100 ## 4 red -10
# Base-R alternative to parse_integer(): go via character, replace the
# "?" marker with NA, then coerce to integer.
dat$Y <- as.character(dat$Y)
dat$Y[dat$Y == "?"] <- NA
dat$Y <- as.integer(dat$Y)
\(Y = \frac{X - \bar x}{s_X}\)
\(Y = \frac{X - \min(X)}{\max(X) - \min(X)}\)
\(Y = \log(X)\)
scale()
from base R
# scale() (base R) performs these standardization transformations.
data(iris)
# Column means of the four numeric measurements.
# colMeans() is the idiomatic (and faster) equivalent of
# apply(x, 2, mean), and avoids coercing the data frame to a matrix.
colMeans(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 5.843333 3.057333 3.758000 1.199333
# Per-column ranges (min in row 1, max in row 2) of the numeric columns,
# computed column-wise with a type-stable vapply over the data frame.
vapply(iris[, 1:4], range, numeric(2))
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## [1,] 4.3 2.0 1.0 0.1 ## [2,] 7.9 4.4 6.9 2.5
# Z-score the numeric columns with scale(), then re-attach Species.
iris_scaled <- cbind(
  iris %>% select(-Species) %>% scale(),
  iris %>% select(Species))
# Column means are now (numerically) zero.
apply(iris_scaled[,1:4], 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## -4.484318e-16 2.034094e-16 -2.895326e-17 -3.663049e-17
# After z-scoring the ranges are centered around 0, not bounded to [0, 1].
apply(iris_scaled[,1:4], 2, range)
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## [1,] -1.863780 -2.425820 -1.562342 -1.442245 ## [2,] 2.483699 3.080455 1.779869 1.706379
# Min-max normalization to [0, 1]: use scale() with center = the column
# minima (row 1 of rng) and scale = the column widths (max - min, row 3).
rng <- apply(iris[,1:4], 2, range)
rng <- rbind(rng, rng[2,] - rng[1,])
iris_scaled <- cbind(
  iris %>% select(-Species) %>% scale(center = rng[1,], scale = rng[3,]),
  iris %>% select(Species))
# Every numeric column now spans exactly [0, 1].
apply(iris_scaled[,1:4], 2, range)
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## [1,] 0 0 0 0 ## [2,] 1 1 1 1
cut2
from Hmisc
cut
from base
Two main methods:
MASS
# Load the Boston housing data from the MASS package and look at the
# distribution of the "age" variable.
data(Boston, package = "MASS")
hist(Boston$age)
# cut() splits the numeric range into 5 equal-WIDTH intervals
# (bin counts may be very uneven, as the barplot shows).
Boston$newAge <- cut(Boston$age, 5)
barplot(table(Boston$newAge))
# Tabulate the bin counts.
# Renamed the local from `t` to `age_counts`: `t` masks base::t()
# (matrix transpose), a classic shadowing footgun.
age_counts <- table(Boston$newAge)
tibble(Range = names(age_counts), Count = age_counts)
## # A tibble: 5 x 2 ## Range Count ## <chr> <table> ## 1 (2.8,22.3] 45 ## 2 (22.3,41.7] 71 ## 3 (41.7,61.2] 70 ## 4 (61.2,80.6] 81 ## 5 (80.6,100] 239
# Same equal-width binning, but with human-friendly labels A1..A5.
Boston$newAge <- cut(
  Boston$age, 5,
  labels = c("A1", "A2", "A3", "A4", "A5"))
barplot(table(Boston$newAge))
# Tabulate the labelled bins.
# `age_counts` instead of `t`, which would mask base::t().
age_counts <- table(Boston$newAge)
tibble(Range = names(age_counts), Count = age_counts)
## # A tibble: 5 x 2 ## Range Count ## <chr> <table> ## 1 A1 45 ## 2 A2 71 ## 3 A3 70 ## 4 A4 81 ## 5 A5 239
# cut2() from Hmisc with g = 5 builds 5 equal-FREQUENCY (quantile) bins,
# so each bin holds roughly the same number of observations.
Boston$newAge <- cut2(Boston$age, g = 5)
barplot(table(Boston$newAge))
# Tabulate the quantile bins (~101 observations each).
# `age_counts` instead of `t`, which would mask base::t().
age_counts <- table(Boston$newAge)
tibble(Range = names(age_counts), Count = age_counts)
## # A tibble: 5 x 2 ## Range Count ## <chr> <table> ## 1 [ 2.9, 38.1) 102 ## 2 [38.1, 66.1) 101 ## 3 [66.1, 86.1) 101 ## 4 [86.1, 95.7) 101 ## 5 [95.7,100.0] 101
merge
function, merge the two tables into one
merged
`merged` has 4 columns: Year, Type, Area, Tonnes. Add a new column `efficiency` to `merged`; `efficiency` is the amount of production per decare. Rescale `efficiency` using normalization, then discretize `efficiency` into 3 bins.
## ## low medium high ## Barley 18 0 0 ## Cotton 4 14 0 ## Maize 0 11 7 ## SugarBeets 0 0 18 ## Sunflower 18 0 0 ## Wheat 18 0 0