Class07 Answer:

Create a heatmap from GSPC pctlead, pctlag1, and date-features

After much time with Google, I did this lab.

The script looks like this:


# hm17.r

# This script should show a nice demo of a heatmap in R.

# The heatmap helps me see how pctlead depends on three independent features:
# Day-of-Week
# Month-of-Year
# pctlag1

# Demo:
# R -f hm17.r

# ref:
# http://stackoverflow.com/questions/24621070/heatmap-2-with-color-key-on-top

# I should do this once (similar to pip install):
# install.packages("gplots", repos="http://cran.us.r-project.org")

# # I should get GSPC dates and prices:
# gspc0_df = read.csv('http://tkrprice.herokuapp.com/static/gspc.csv')
# 
# # I should identify each row by Date instead of an integer:
# # row.names(gspc0_df) = gspc0_df$Date
# 
# # I should order by Date:
# gspc1_df = gspc0_df[order(gspc0_df$Date),]
# 
# head(gspc1_df)
# 
# # I should write the df to a csv:
# write.csv(gspc1_df,'gspc1_df.csv', row.names=FALSE)

# I should load the date-ordered prices and keep only two columns,
# renamed to cdate (the Date column) and cp (the Close column):
gspc2_df = read.csv('gspc1_df.csv')
gspc3_df = data.frame(cdate = gspc2_df$Date, cp = gspc2_df$Close)

# I should compute pctlead, pctlag1 from cp.
len_i   = nrow(gspc3_df)
close_v = gspc3_df$cp

# next_v[i] is the close of row i+1; the last row has no successor,
# so its own close is reused, which makes its pctlead equal to 0:
next_v = c(close_v[-1], close_v[len_i])

# pctlead: percent change from this row's close to the next row's close:
gspc3_df$pctlead = 100 * (next_v - close_v) / close_v

# pctlag1: the pctlead of the previous row; the first row gets 0:
gspc3_df$pctlag1 = c(0, gspc3_df$pctlead[-len_i])

# I should parse cdate once and reuse the parsed dates below:
cdate_v = as.Date(gspc3_df$cdate)

# I should build a Month-of-Year / Day-of-Week key from cdate.
# "%m" is the 2-digit month and "%w" is the weekday number (0 = Sunday),
# so a key like 01_2 corresponds to January_Tuesday.
gspc3_df$moydow = format(cdate_v, "%m_%w")

# I should keep only the rows dated strictly after 1990-01-01:
pred_v   = (cdate_v > as.Date('1990-01-01'))
gspc4_df = gspc3_df[ pred_v , ]

# I should split the rows by the sign of pctlag1.
# Rows where pctlag1 < 0 (the previous pctlead was negative):
down_v = (gspc4_df$pctlag1 < 0)

# Rows where pctlag1 >= 0 (the previous pctlead was zero or positive):
up_v = (gspc4_df$pctlag1 >= 0)

# I should use aggregate() to sum(pctlead) groupby Month-of-Year, Day-of-Week,
# once for each of the two subsets:
moydow0_d_df = aggregate(pctlead ~ moydow, gspc4_df[down_v,], sum)
moydow0_u_df = aggregate(pctlead ~ moydow, gspc4_df[up_v,  ], sum)

# I should give each sum a distinct column name before joining:
colnames(moydow0_d_df) = c('moydow', 'pctlead_after_down_pctlag')
colnames(moydow0_u_df) = c('moydow', 'pctlead_after_up_pctlag')

# I should join the two results on moydow rather than stacking them by position.
# A positional hstack is only correct when both subsets contain exactly the same
# moydow keys; if a key is missing on one side the columns would misalign (or
# data.frame() would fail). With all=TRUE a missing combination becomes NA.
moydow1_df = merge(moydow0_d_df, moydow0_u_df, by='moydow', all=TRUE)

# I should order by moydow ascending (01_1, 01_2, ..., 12_5):
moydow1_df = moydow1_df[order(moydow1_df$moydow),]

# I should prepare df for feeding it to heatmap():
# keep only the two numeric columns, label each row with its moydow key,
# and convert to a numeric matrix.
moydow_df = moydow1_df[, c('pctlead_after_down_pctlag', 'pctlead_after_up_pctlag')]
row.names(moydow_df) = moydow1_df$moydow
moydow_x             = data.matrix(moydow_df)

# I should load gplots, which provides heatmap.2():
library(gplots)

# I should remember the number of rows in the matrix; it is used later to
# draw one separator per row (60 rows when all 12 x 5 keys are present):
row_i = nrow(moydow_x)

# I should create vectors full of color strings (hues from red to green).
# NOTE(review): neither vector is passed to heatmap.2() in this script.
col60_v = rainbow(row_i, start=0, end=2/6)
col5_v  = rainbow(5, start=0, end=2/6)

# I should prep the layout for the heatmap.

# heatmap.2() draws 4 components and identifies each by an integer
# in the layout matrix (the default layout uses 4,3,2,1):
heatmap_i   = 1 # Heatmap,           which I always want
rowDendro_i = 2 # Row Dendrogram,    which I usually dont want
colDendro_i = 3 # Column Dendrogram, which I usually dont want
colorKey_i  = 4 # Color Key, which is like a legend

# Two extra integers reserve empty cells that act as a left margin:
margin_left5_i = 5
margin_left6_i = 6

# The layout I want, as a 2 x 3 grid:
# -----------------------------------------------
# | margin_left5 | colorKey | colDendro(unused) |
# | margin_left6 | heatmap  | rowDendro(unused) |
# -----------------------------------------------
lmat_x = matrix(c(margin_left5_i, colorKey_i, colDendro_i,
                  margin_left6_i, heatmap_i,  rowDendro_i),
                nrow=2, byrow=TRUE)

lmat_x

# Relative height of each layout row (color-key row, heatmap row):
lhei_v = c(0.8, 5.0)

# Relative width of each layout column (left margin, main column, right column):
lwid_v = c(1.0, 10.0, 1.0)

# I should specify available colors and how they sort:
# 30 hues from red (0/6) to blue (4/6), reversed so the vector starts at blue.
rainbow30_v = rainbow(30, start = 0/6, end = 4/6)
color_v     = rev(rainbow30_v)
# I should write the heatmap to png file.
# The device is tall (800 x 2500 pixels) because there is one heatmap row per moydow key.
png('hm17.png', width=800, height=2500)

heatmap.2(
  x = moydow_x,

  # Keep rows and columns in matrix order; draw no dendrograms:
  Rowv       = NULL,
  Colv       = NULL,
  dendrogram = 'none',

  # Color the raw sums (no scaling), with breaks that are not forced
  # to be symmetric around 0:
  col       = color_v,
  scale     = "none",
  symkey    = FALSE,
  symbreaks = FALSE,

  # Turn off unneeded trace lines inside the heat map:
  trace = 'none',

  # Color key with a histogram of the values drawn in black:
  density.info = 'histogram',
  denscol      = "black",
  keysize      = 1,
  # For color-key at the top:
  # ( "bottom.margin", "left.margin", "top.margin", "right.margin" )
  key.par = list(mar=c(3.5, 0, 3, 0)),

  # Custom layout: matrix of component positions, row heights, column widths:
  lmat = lmat_x,
  lhei = lhei_v,
  lwid = lwid_v,

  # Margins for the column names and the row names, respectively:
  margins = c(25.0, 0.0),

  # Label sizes and the x-axis caption:
  cexCol = 2.0,
  cexRow = 2.0,
  xlab   = 'Pctlead Dependence on 2 Types of Pctlag(Down/Up)',

  # I should separate the cells with thin white lines:
  sepcolor = 'white',
  sepwidth = c(0.1, 0.1),
  rowsep   = 1:row_i,
  colsep   = 1:2,

  # Print each value, rounded to 1 decimal, inside its cell:
  cellnote = round(moydow_x, digits=1),
  notecol  = 'black',
  notecex  = 1.8
)

# Close the png device so the file is written out:
dev.off()

# I should report the matrix behind the heatmap:
moydow_x

# The above matrix helps me see how pctlead depends on three independent features:
# Day-of-Week
# Month-of-Year
# pctlag1

'bye'

I ran it:


dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ R -f hm17.r

R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # hm17.r
> 
> # This script should show a nice demo of a heatmap in R.
> 
> # The heatmap helps me see how pctlead depends on three independent features:
> # Day-of-Week
> # Month-of-Year
> # pctlag1
> 
> # Demo:
> # R -f hm17.r
> 
> # ref:
> # http://stackoverflow.com/questions/24621070/heatmap-2-with-color-key-on-top
> 
> # I should do this once (similar to pip install):
> # install.packages("gplots", repos="http://cran.us.r-project.org")
> 
> # # I should get GSPC dates and prices:
> # gspc0_df = read.csv('http://tkrprice.herokuapp.com/static/gspc.csv')
> # 
> # # I should identify each row by Date instead of an integer:
> # # row.names(gspc0_df) = gspc0_df$Date
> # 
> # # I should order by Date:
> # gspc1_df = gspc0_df[order(gspc0_df$Date),]
> # 
> # head(gspc1_df)
> # 
> # # I should write the df to a csv:
> # write.csv(gspc1_df,'gspc1_df.csv', row.names=FALSE)
> 
> gspc2_df           = read.csv('gspc1_df.csv')
> gspc3_df           = data.frame(gspc2_df$Date,gspc2_df$Close)
> colnames(gspc3_df) = c('cdate','cp')
> 
> # I should compute pctlead,pctlag1 from cp
> len_i            = length(gspc3_df$cp)
> last_f           = gspc3_df$cp[len_i]
> leadp_v          = c(gspc3_df$cp, last_f)[1:len_i+1]
> gspc3_df$pctlead = 100 * (leadp_v - gspc3_df$cp) / gspc3_df$cp
> gspc3_df$pctlag1 = c(0, gspc3_df$pctlead)[1:len_i]
> 
> # I should get Month-of-Year, Day-of-Week from cdate.
> # I should get a column like this: 01_2 which corresponds to January_Tuesday.
> gspc3_df$moydow = format(as.Date(gspc3_df$cdate),"%m_%w")
> 
> # I should get rows after 1990-01-01:
> pred_v   = (as.Date(gspc3_df$cdate) > '1990-01-01')
> gspc4_df = gspc3_df[ pred_v , ]
> 
> # I should get rows where pctlag1 < 0:
> down_v = (gspc4_df$pctlag1 < 0)
> 
> # I should get rows where pctlag1 >= 0:
> up_v = (gspc4_df$pctlag1 >=0)
> 
> # I should use aggregate() to sum(pctlead) groupby Month-of-Year, Day-of-Week:
> moydow0_d_df = aggregate(pctlead ~ moydow, gspc4_df[down_v,], sum)
> moydow0_u_df = aggregate(pctlead ~ moydow, gspc4_df[up_v,  ], sum)
> 
> # I should order by moydow descending:
> moydow1_d_df   = moydow0_d_df[order(moydow0_d_df$moydow),]
> moydow1_u_df   = moydow0_u_df[order(moydow0_u_df$moydow),]
> 
> # I should hstack them:
> pctlead_after_down_pctlag = moydow1_d_df$pctlead
> pctlead_after_up_pctlag   = moydow1_u_df$pctlead
> moydow_df = data.frame(pctlead_after_down_pctlag,pctlead_after_up_pctlag)
> 
> # I should prepare df for feeding it to heatmap():
> row.names(moydow_df) = moydow1_d_df$moydow
> moydow_x             = data.matrix(moydow_df)
> 
> # I should create vectors full of color strings:
> library(gplots)

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess

> row_i = nrow(moydow_x) # number of rows (should be 12 x 5)
> col5_v  = rainbow(5, start=0, end=2/6)
> col60_v = rainbow(row_i, start=0, end=2/6)
> 
> # I should prep the layout for the heatmap.
> 
> # A default heatmap layout has 4 things:
> # - Color Key, which is like a legend
> # - Column Dendrogram, which I usually dont want
> # - Row Dendrogram,    which I usually dont want
> # - Heatmap,           which I always want
> # A default heatmap layout is controlled by 4 integers: 4,3,2,1
> # I use variables instead of integers:
> colorKey_i  = 4
> colDendro_i = 3
> rowDendro_i = 2
> heatmap_i   = 1
> # I like to add margins and each margin needs an integer:
> margin_left5_i = 5
> margin_left6_i = 6
> 
> # I want the layout to look like this:
> # -----------------------------------------------
> # | margin_left5 | colorKey | colDendro(unused) |
> # | margin_left6 | heatmap  | rowDendro(unused) |
> # -----------------------------------------------
> 
> # I should use above integers to create a layout_matrix:
> lmat_x = rbind(c(margin_left5_i, colorKey_i, colDendro_i)
+               ,c(margin_left6_i, heatmap_i,  rowDendro_i))
> 
> lmat_x
     [,1] [,2] [,3]
[1,]    5    4    3
[2,]    6    1    2
> 
> # The above layout has 2 rows and 3 columns.
> # I should specify height of each row:
> row1height_f = 0.8
> row2height_f = 5.0
> lhei_v       = c(row1height_f,row2height_f)
> # I should specify width of each column:
> col1width_f = 1.0
> col2width_f = 10.0
> col3width_f = 1.0
> lwid_v      = c(col1width_f, col2width_f, col3width_f)
> 
> # I should specify available colors and how they sort:
> color_v = rev(rainbow(30, start = 0/6, end = 4/6))
> # I should write the heatmap to png file:
> 
> png('hm17.png',width=800,height=2500)
> heatmap.2(x=moydow_x, Rowv=NULL,Colv=NULL
+   ,col    = color_v
+   ,scale  ="none"
+   ,margins=c(25.0,0.0) # ("margin.Y", "margin.X")
+   ,trace='none' # turns off unneeded trace lines inside the heat map
+   ,symkey      =FALSE 
+   ,symbreaks   =FALSE 
+   ,dendrogram  ='none'
+   ,density.info='histogram' 
+   ,denscol     ="black"
+   ,keysize     =1
+   # For color-key at the top:
+   #( "bottom.margin", "left.margin", "top.margin", "right.margin" )
+   ,key.par =list(mar=c(3.5,0,3,0))
+   ,lmat    =lmat_x, lhei=lhei_v, lwid=lwid_v
+   ,cexCol  =2.0
+   ,cexRow  =2.0
+   # I should separate the cells:
+   ,sepcolor='white'
+   ,sepwidth=c(0.1, 0.1)
+   ,rowsep  =c(1:row_i)
+   ,colsep  =c(1:2)
+   ,xlab = 'Pctlead Dependence on 2 Types of Pctlag(Down/Up)'
+   ,cellnote = round(moydow_x,1)
+   ,notecol  = 'black'
+   ,notecex  = 1.8
+ )
> dev.off()
null device 
          1 
> 
> # I should report:
> moydow_x
     pctlead_after_down_pctlag pctlead_after_up_pctlag
01_1                4.54556471              -0.6196479
01_2               16.21099647              -5.9309989
01_3                3.70490724               2.5064805
01_4              -11.00846356              -8.7556801
01_5                4.10580965              -5.6591545
02_1                8.35959782              -8.2736386
02_2               11.07737054              -2.5375395
02_3               -2.89501597              -0.8129677
02_4                2.43805434              -4.9514313
02_5               -5.27547368               3.1414040
03_1               18.34500693              -0.1618667
03_2                5.11231400              -4.4889308
03_3                3.90538646              12.1137822
03_4                4.28373338              -3.6481135
03_5                4.93315649               6.4659295
04_1               16.29967492              10.6624635
04_2                0.02734355              11.4182069
04_3               14.99438849              11.3218047
04_4              -19.00823101               1.9344332
04_5                8.48153374              -6.9066808
05_1                0.44378730              -4.6595065
05_2                5.01707776              -3.0768097
05_3               -1.83385909              -5.9505451
05_4                2.10139541              -2.1316273
05_5               12.95900364              12.1715435
06_1                6.59545062               2.0114993
06_2               -0.56336178               5.4908623
06_3                5.87933457               1.4400655
06_4               -4.31091434              -8.2961729
06_5                3.37615112             -10.3332052
07_1                0.25523748              -7.5592878
07_2                5.99835396               5.3003341
07_3               16.92087978              -1.7240495
07_4               -1.61526940              -4.5723903
07_5               -7.48825490               5.8153238
08_1               -7.02540246              -1.8983086
08_2               11.05163168              11.3477515
08_3               -8.88201315             -17.6169008
08_4                2.61393144              -5.1680903
08_5              -17.16819838               8.9962648
09_1               14.84082497             -10.6773323
09_2                0.69325066              -6.0640832
09_3              -12.21159664              -2.2577051
09_4                5.61086588              -0.3457646
09_5                9.18286083             -10.4618765
10_1                5.83802558              -2.8851917
10_2               -9.76690629             -11.9953413
10_3                6.81746108              15.8411905
10_4                8.28734977              11.4251985
10_5               11.13039911              14.8639589
11_1                8.39345040              -0.2540485
11_2               12.43801542             -10.3523163
11_3                1.12694549              11.1466439
11_4                3.18442896               7.3606821
11_5               -3.96786880               4.7496289
12_1               13.77764451               4.1822588
12_2               -3.58169191              13.9624099
12_3               -7.01324390              -4.3280039
12_4                8.45559008               8.4457124
12_5               10.15145354               6.4169704
> 
> # The above matrix helps me see how pctlead depends on three independent features:
> # Day-of-Week
> # Month-of-Year
> # pctlag1
> 
> 'bye'
[1] "bye"
> 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 

The heatmap looks like this:

Class07 Lab


ml4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen