library(rmarkdown) # used for syntax highlighting library(knitr) library(dplyr) library(readr) library(stringr) library(lubridate) library(xts) library(sp) library(CORElearn)
28/03/2020
library(rmarkdown) # used for syntax highlighting library(knitr) library(dplyr) library(readr) library(stringr) library(lubridate) library(xts) library(sp) library(CORElearn)
set.seed(1024)
The xts package is used for working with time series data.
sp500 <- xts(c(1102.94, 1104.49, 1115.71, 1118.31), ymd(c("2010-02-25", "2010-02-26", "2010-03-01", "2010-03-02")))
sp500
## [,1] ## 2010-02-25 1102.94 ## 2010-02-26 1104.49 ## 2010-03-01 1115.71 ## 2010-03-02 1118.31
sp500["2010-03-02"]
## [,1] ## 2010-03-02 1118.31
sp500["2010-03"]
## [,1] ## 2010-03-01 1115.71 ## 2010-03-02 1118.31
sp500["2010-03-01/"]
## [,1] ## 2010-03-01 1115.71 ## 2010-03-02 1118.31
sp500["2010-02-26/2010-03-01"]
## [,1] ## 2010-02-26 1104.49 ## 2010-03-01 1115.71
The xts package also allows nice plots of time series data.
plot(sp500)
data(AirPassengers) AirPassengers
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ## 1949 112 118 132 129 121 135 148 148 136 119 104 118 ## 1950 115 126 141 135 125 149 170 170 158 133 114 140 ## 1951 145 150 178 163 172 178 199 199 184 162 146 166 ## 1952 171 180 193 181 183 218 230 242 209 191 172 194 ## 1953 196 196 236 235 229 243 264 272 237 211 180 201 ## 1954 204 188 235 227 234 264 302 293 259 229 203 229 ## 1955 242 233 267 269 270 315 364 347 312 274 237 278 ## 1956 284 277 317 313 318 374 413 405 355 306 271 306 ## 1957 315 301 356 348 355 422 465 467 404 347 305 336 ## 1958 340 318 362 348 363 435 491 505 404 359 310 337 ## 1959 360 342 406 396 420 472 548 559 463 407 362 405 ## 1960 417 391 419 461 472 535 622 606 508 461 390 432
ap <- as.xts(AirPassengers) head(ap)
## [,1] ## Oca 1949 112 ## Şub 1949 118 ## Mar 1949 132 ## Nis 1949 129 ## May 1949 121 ## Haz 1949 135
head(diff(ap))
## [,1] ## Oca 1949 NA ## Şub 1949 6 ## Mar 1949 14 ## Nis 1949 -3 ## May 1949 -8 ## Haz 1949 14
tail(diff(ap))
## [,1] ## Tem 1960 87 ## Ağu 1960 -16 ## Eyl 1960 -98 ## Eki 1960 -47 ## Kas 1960 -71 ## Ara 1960 42
apRel <- diff(ap) / ap head(apRel)
## [,1] ## Oca 1949 NA ## Şub 1949 0.05084746 ## Mar 1949 0.10606061 ## Nis 1949 -0.02325581 ## May 1949 -0.06611570 ## Haz 1949 0.10370370
tail(apRel)
## [,1] ## Tem 1960 0.13987138 ## Ağu 1960 -0.02640264 ## Eyl 1960 -0.19291339 ## Eki 1960 -0.10195228 ## Kas 1960 -0.18205128 ## Ara 1960 0.09722222
plot(ap)
plot(apRel)
The embed function
head(apRel, 10)
## [,1] ## Oca 1949 NA ## Şub 1949 0.05084746 ## Mar 1949 0.10606061 ## Nis 1949 -0.02325581 ## May 1949 -0.06611570 ## Haz 1949 0.10370370 ## Tem 1949 0.08783784 ## Ağu 1949 0.00000000 ## Eyl 1949 -0.08823529 ## Eki 1949 -0.14285714
head(embed(apRel[-1], 5))
## [,1] [,2] [,3] [,4] [,5] ## [1,] 0.10370370 -0.06611570 -0.02325581 0.10606061 0.05084746 ## [2,] 0.08783784 0.10370370 -0.06611570 -0.02325581 0.10606061 ## [3,] 0.00000000 0.08783784 0.10370370 -0.06611570 -0.02325581 ## [4,] -0.08823529 0.00000000 0.08783784 0.10370370 -0.06611570 ## [5,] -0.14285714 -0.08823529 0.00000000 0.08783784 0.10370370 ## [6,] -0.14423077 -0.14285714 -0.08823529 0.00000000 0.08783784
“Everything is related to everything else, but near things are more related than distant things.”
First law of geography (Tobler, 1970)
The sp package is useful in spatial analysis.
# https://web.cs.dal.ca/~ltorgo/AuxFiles/forestFires.txt
ff <- read_csv("forestFires.txt")
print(ff, width=70)
## # A tibble: 25,000 x 14 ## FID_ CID ano1991 ano1992 ano1993 ano1994 ano1995 ano1996 ano1997 ## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NA 1 0 0 0 0 0 0 0 ## 2 NA 2 0 0 0 0 0 0 0 ## 3 NA 3 0 0 0 0 0 0 0 ## 4 NA 4 0 0 0 0 0 0 0 ## 5 NA 5 0 0 0 0 0 0 0 ## 6 NA 6 0 0 0 0 0 0 0 ## 7 NA 7 0 0 0 0 0 0 0 ## 8 NA 8 0 0 0 0 0 0 0 ## 9 NA 9 0 0 0 0 0 0 0 ## 10 NA 10 0 0 0 0 0 0 0 ## # … with 24,990 more rows, and 5 more variables: ano1998 <dbl>, ## # ano1999 <dbl>, ano2000 <dbl>, x <dbl>, y <dbl>
anoX: Fire happened in year X
spatialCoord <- ff %>% select(long = x, lat = y)
spatialData <- ff %>% select(Year2000 = ano2000)
coordRefSys <- CRS("+proj=longlat +ellps=WGS84")
fires <- SpatialPointsDataFrame(spatialCoord, spatialData, proj4string = coordRefSys)
head(fires)
## coordinates Year2000 ## 1 (-7.31924, 38.5406) 0 ## 2 (-7.63557, 40.5022) 0 ## 3 (-7.90273, 40.3418) 0 ## 4 (-7.25657, 39.2572) 0 ## 5 (-8.50379, 37.3445) 0 ## 6 (-8.05975, 41.562) 0
bbox(fires)
## min max ## long -9.49174 -6.20743 ## lat 36.98050 42.14360
head(coordinates(fires))
## long lat ## [1,] -7.31924 38.5406 ## [2,] -7.63557 40.5022 ## [3,] -7.90273 40.3418 ## [4,] -7.25657 39.2572 ## [5,] -8.50379 37.3445 ## [6,] -8.05975 41.5620
tm provides useful functions.
Columns
Rows
sample()
data(iris) sampleRate <- 0.7 sampledRows <- sample(1:nrow(iris), nrow(iris) * sampleRate) iris.sample <- iris[sampledRows,]
sample.int()
data(iris) sampleRate <- 0.7 sampledRows <- sample.int(nrow(iris), nrow(iris) * sampleRate) iris.sample <- iris[sampledRows,]
Pseudocode for sampling a very large dataset
Potential problems:
ncol() > nrow()
Correlation - simple
Information theoretic metrics
\(H(Y)=-\sum_{c_i\in\mathcal{Y}}{P(Y=c_i)\times\log{P(Y=c_i)}}\)
\(H(Y|X) = -\sum_{v_i\in\mathcal{X},\, c_j\in\mathcal{Y}} P(X=v_i,Y=c_j)\log{\frac{P(X=v_i,Y=c_j)}{P(X=v_i)}}\)
\(IG(X) = H(Y) - H(Y|X)\)
\(GR(X) = \frac{IG(X)} {H(X)}\)
FSelector and CORElearn
attrEval is from CORElearn
data(iris) attrEval(Species ~ ., iris, estimator = "GainRatio")
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 0.5919339 0.3512938 1.0000000 1.0000000
attrEval(Species ~ ., iris, estimator = "InfGain")
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 0.5572327 0.2831260 0.9182958 0.9182958
attrEval(Species ~ ., iris, estimator = "Gini")
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 0.2277603 0.1269234 0.3333333 0.3333333
attrEval(Species ~ ., iris, estimator = "Relief")
## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 0.1974074 0.1874074 0.7267797 0.7088889
infoCore(what = "attrEval")
## [1] "ReliefFequalK" "ReliefFexpRank" "ReliefFbestK" ## [4] "Relief" "InfGain" "GainRatio" ## [7] "MDL" "Gini" "MyopicReliefF" ## [10] "Accuracy" "ReliefFmerit" "ReliefFdistance" ## [13] "ReliefFsqrDistance" "DKM" "ReliefFexpC" ## [16] "ReliefFavgC" "ReliefFpe" "ReliefFpa" ## [19] "ReliefFsmp" "GainRatioCost" "DKMcost" ## [22] "ReliefKukar" "MDLsmp" "ImpurityEuclid" ## [25] "ImpurityHellinger" "UniformDKM" "UniformGini" ## [28] "UniformInf" "UniformAccuracy" "EqualDKM" ## [31] "EqualGini" "EqualInf" "EqualHellinger" ## [34] "DistHellinger" "DistAUC" "DistAngle" ## [37] "DistEuclid"
infoCore(what = "attrEvalReg")
## [1] "RReliefFequalK" "RReliefFexpRank" "RReliefFbestK" ## [4] "RReliefFwithMSE" "MSEofMean" "MSEofModel" ## [7] "MAEofModel" "RReliefFdistance" "RReliefFsqrDistance"
data(iris) pca <- princomp(iris[,1:4]) loadings(pca)
## ## Loadings: ## Comp.1 Comp.2 Comp.3 Comp.4 ## Sepal.Length 0.361 0.657 0.582 0.315 ## Sepal.Width 0.730 -0.598 -0.320 ## Petal.Length 0.857 -0.173 -0.480 ## Petal.Width 0.358 -0.546 0.754 ## ## Comp.1 Comp.2 Comp.3 Comp.4 ## SS loadings 1.00 1.00 1.00 1.00 ## Proportion Var 0.25 0.25 0.25 0.25 ## Cumulative Var 0.25 0.50 0.75 1.00
new.iris <- data.frame(pca$scores[, 1:2], Species = iris$Species) head(new.iris) %>% kable()
Comp.1 | Comp.2 | Species |
---|---|---|
-2.684126 | 0.3193972 | setosa |
-2.714142 | -0.1770012 | setosa |
-2.888991 | -0.1449494 | setosa |
-2.745343 | -0.3182990 | setosa |
-2.728716 | 0.3267545 | setosa |
-2.280860 | 0.7413304 | setosa |