Class07 Answer:

Build a heuristic pctlead model from pctlag1, and date-features

I started this lab by writing a script which gets prices from Yahoo and then builds features.

These are the same features I used to build the heatmaps.

The script looks like this:


# genf.r

# This script should create feat.csv full of features from dates and prices from Yahoo.

# Ref:
# http://www.ml4.us/cclasses/class07#hr

# Demo:
# R -f genf.r

get_prices = function(){
  # Download the GSPC price history, keep only the date and the closing
  # price (sorted by date), and save them as gspc3.csv in the working
  # directory with the column names cdate and cp.
  #
  # Returns the last rows of the saved table, so a top-level call prints them.
  raw_df    = read.csv('http://tkrprice.herokuapp.com/static/gspc.csv')

  # Row positions that put the table in ascending Date order:
  by_date_v = order(raw_df$Date)

  out_df    = data.frame(cdate = raw_df$Date[by_date_v],
                         cp    = raw_df$Close[by_date_v])
  write.csv(out_df, 'gspc3.csv', row.names=FALSE)

  # gspc3.csv should exist now; show the most recent rows:
  tail(out_df)
}

# I should download the prices into gspc3.csv (this also prints the last rows):
get_prices()

# I should compute pctlead,pctlag1 from cp
gspc3_df         = read.csv('gspc3.csv')
len_i            = length(gspc3_df$cp)
last_f           = gspc3_df$cp[len_i]

# leadp_v[i] is the price on the next row. The last row has no next row, so
# it reuses its own price and ends up with a pctlead of 0.
# seq_len() keeps the index arithmetic explicit and is safe for zero rows.
leadp_v          = c(gspc3_df$cp, last_f)[seq_len(len_i) + 1]
gspc3_df$pctlead = 100 * (leadp_v - gspc3_df$cp) / gspc3_df$cp

# pctlag1[i] is the pctlead of the previous row; the first row gets 0:
gspc3_df$pctlag1 = c(0, gspc3_df$pctlead)[seq_len(len_i)]

# I should parse the dates once and reuse them for every date feature:
date_v = as.Date(gspc3_df$cdate)

# I should get moydow (strings like '01_2': two-digit month, weekday number):
gspc3_df$moydow = format(date_v,"%m_%w")

# I should get moy without a leading zero.
# "%-m" is a platform-specific extension, so I strip the zero from "%m" instead:
gspc3_df$moy = sub('^0', '', format(date_v,"%m"))

# I should get dow (weekday number, Sunday is 0):
gspc3_df$dow = format(date_v,"%w")

write.csv(gspc3_df,'feat.csv', row.names=FALSE)

'bye'

I ran it:


dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ R -f genf.r

R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # genf.r
> 
> # This script should create feat.csv full of features from dates and prices from Yahoo.
> 
> # Ref:
> # http://www.ml4.us/cclasses/class07#hr
> 
> # Demo:
> # R -f genf.r
> 
> get_prices = function(){
+   # I should get GSPC dates and prices:
+   gspc0_df = read.csv('http://tkrprice.herokuapp.com/static/gspc.csv')
+   # I should order by Date:
+   gspc1_df = gspc0_df[order(gspc0_df$Date),]
+   # I should only use Date and Closing Price:
+   gspc3_df = data.frame(gspc1_df$Date,gspc1_df$Close)
+   colnames(gspc3_df) = c('cdate','cp')
+   write.csv(gspc3_df,'gspc3.csv', row.names=FALSE)
+   tail(gspc3_df)
+   # I should now see a new CSV:
+   # gspc3.csv
+ }
> 
> get_prices()
           cdate      cp
16811 2016-10-21 2141.16
16812 2016-10-24 2151.33
16813 2016-10-25 2143.16
16814 2016-10-26 2139.43
16815 2016-10-27 2133.04
16816 2016-10-28 2126.41
> # I should compute pctlead,pctlag1 from cp
> gspc3_df         = read.csv('gspc3.csv')
> len_i            = length(gspc3_df$cp)
> last_f           = gspc3_df$cp[len_i]
> leadp_v          = c(gspc3_df$cp, last_f)[1:len_i+1]
> gspc3_df$pctlead = 100 * (leadp_v - gspc3_df$cp) / gspc3_df$cp
> gspc3_df$pctlag1 = c(0, gspc3_df$pctlead)[1:len_i]
> 
> # I should get moydow:
> gspc3_df$moydow = format(as.Date(gspc3_df$cdate),"%m_%w")
> # I should get moy:
> gspc3_df$moy = format(as.Date(gspc3_df$cdate),"%-m")
> 
> # I should get dow:
> gspc3_df$dow = format(as.Date(gspc3_df$cdate),"%w")
> 
> write.csv(gspc3_df,'feat.csv', row.names=FALSE)
> 
> 'bye'
[1] "bye"
> 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 

Next, I wrote a script to build a series of models for a series of years:


# hr_model.r

# This script should use a function and a loop to create many models.
# This script depends on genf.r to run first to create feat.csv

# Ref:
# http://www.ml4.us/cclasses/class07#hr

# Demo:
# R -f hr_model.r

create_model = function(yr_i, size_i) {
  # Build the heuristic model for the target year yr_i and save it as a CSV.
  #
  # Args:
  #   yr_i:   target year; training rows come from the years before it.
  #   size_i: number of calendar years of training data
  #           (yr_i - size_i through yr_i - 1, inclusive).
  #
  # Returns:
  #   The name of the CSV written to the working directory, for example
  #   'model2015.csv'. The CSV has one row per moydow key and the columns
  #   moydow, pctlead_after_down_pctlag, pctlead_after_up_pctlag.
  #   A key seen only after down days (or only after up days) gets NA in
  #   the other column.
  #
  # Depends on feat.csv (created by genf.r) being in the working directory.

  # I should sum pctlead per moydow for one subset of the training rows.
  # aggregate() stops on zero rows, so an empty subset gets an empty table.
  sum_by_moydow = function(rows_df, sum_name_s) {
    if (nrow(rows_df) == 0) {
      sums_df = data.frame(moydow  = character(0),
                           pctlead = numeric(0),
                           stringsAsFactors = FALSE)
    } else {
      sums_df = aggregate(pctlead ~ moydow, rows_df, sum)
    }
    colnames(sums_df) = c('moydow', sum_name_s)
    sums_df
  }

  # I should load features from CSV:
  feat_df = read.csv('feat.csv')

  # I should use yr_i to compute end, start:
  yr_train_end_i   = yr_i - 1
  yr_train_start_i = yr_i - size_i

  # I should constrain the training data.
  yr_v     = strtoi(format(as.Date(feat_df$cdate),"%Y"))
  train_df = feat_df[ yr_v >= yr_train_start_i & yr_v <= yr_train_end_i , ]

  # I should get strings like this: '01_2' which corresponds to January_Tuesday.
  train_df$moydow = format(as.Date(train_df$cdate),"%m_%w")

  # I should split the rows by the sign of pctlag1 (0 counts as up).
  # which() drops rows where the test is NA instead of selecting NA rows.
  has_lead_v = !is.na(train_df$pctlead)
  down_df = train_df[ which(has_lead_v & train_df$pctlag1 <  0) , ]
  up_df   = train_df[ which(has_lead_v & train_df$pctlag1 >= 0) , ]

  # I should sum(pctlead) groupby Month-of-Year, Day-of-Week:
  mdown_df = sum_by_moydow(down_df, 'pctlead_after_down_pctlag')
  mup_df   = sum_by_moydow(up_df  , 'pctlead_after_up_pctlag')

  # I should join the two tables on moydow rather than stacking them by
  # position, so each sum stays on its own key even when a key is missing
  # from one of the tables. The result is sorted by moydow.
  model1_df = merge(mdown_df, mup_df, by='moydow', all=TRUE)
  model1_df$pctlead_after_down_pctlag = round(model1_df$pctlead_after_down_pctlag,2)
  model1_df$pctlead_after_up_pctlag   = round(model1_df$pctlead_after_up_pctlag  ,2)

  # The model is ready for use.
  # I should write it to CSV so a predictor function can ask for predictions later:
  csv_s = paste('model',yr_i,'.csv',sep='')
  write.csv(model1_df,csv_s, row.names=FALSE)

  # The model should be written as a csv now.
  # The csv should look like: 'model2015.csv'
  return(csv_s)
}

# I should build one model per target year.
# Each model is trained on the model_size_i calendar years before its year.
model_size_i = 25 # years
first_yr_i   = 2000
last_yr_i    = 2017
for (yr_i in first_yr_i:last_yr_i) {
  # create_model() writes the CSV and hands back its file name:
  fn_s = create_model(yr_i, model_size_i)
  print(fn_s)
}

'bye'

I ran it:


dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ R -f hr_model.r

R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # hr_model.r
> 
> # This script should use a function and a loop to create many models.
> # This script depends on genf.r to run first to create feat.csv
> 
> # Ref:
> # http://www.ml4.us/cclasses/class07#hr
> 
> # Demo:
> # R -f hr_model.r
> 
> create_model = function(yr_i, size_i) {
+   # I should load features from CSV:
+   feat_df = read.csv('feat.csv')
+ 
+   # I should use yr_i to compute end, start:
+   yr_train_end_i   = yr_i - 1
+   yr_train_start_i = yr_i - size_i
+   
+   # I should constrain the training data.
+   yr_v     = strtoi(format(as.Date(feat_df$cdate),"%Y"))
+   pred1_v  = (yr_v >= yr_train_start_i)
+   pred2_v  = (yr_v <= yr_train_end_i)
+   pred3_v  = (pred1_v & pred2_v)
+   train_df = feat_df[ pred3_v , ]
+   # I should build a model from train_df.
+   
+   # I should get strings like this: '01_2' which corresponds to January_Tuesday.
+   train_df$moydow = format(as.Date(train_df$cdate),"%m_%w")
+   
+   # I should get rows where pctlag1 < 0:
+   down_v = (train_df$pctlag1 < 0)
+   # I should get rows where pctlag1 >= 0:
+   up_v   = (train_df$pctlag1 >=0)
+   
+   # I should use aggregate() to sum(pctlead) groupby Month-of-Year, Day-of-Week:
+   mdown_df = aggregate(pctlead ~ moydow, train_df[down_v,], sum)
+   mup_df   = aggregate(pctlead ~ moydow, train_df[up_v,]  , sum)
+   # I should hstack them so I can use features to look up pctlead:
+   pctlead_after_down_pctlag = round(mdown_df$pctlead,2)
+   pctlead_after_up_pctlag   = round(mup_df$pctlead  ,2)
+   moydow                    = mup_df$moydow
+   model1_df = data.frame(moydow, pctlead_after_down_pctlag, pctlead_after_up_pctlag)
+   
+   # The model is ready for use.
+   # I should write it to CSV so a predictor function can ask for predictions later:
+   csv_s = paste('model',yr_i,'.csv',sep='')
+   write.csv(model1_df,csv_s, row.names=FALSE)
+ 
+   # The model should be written as a csv now.
+   # The csv should look like: 'model2015.csv'
+   return(csv_s)
+ }
> 
> # I should use a loop to create many models:
> model_size_i = 25 # years
> for (yr_i in c(2000:2016))
+ {
+   fn_s = create_model(yr_i,model_size_i)
+   print(fn_s)
+ }
[1] "model2000.csv"
[1] "model2001.csv"
[1] "model2002.csv"
[1] "model2003.csv"
[1] "model2004.csv"
[1] "model2005.csv"
[1] "model2006.csv"
[1] "model2007.csv"
[1] "model2008.csv"
[1] "model2009.csv"
[1] "model2010.csv"
[1] "model2011.csv"
[1] "model2012.csv"
[1] "model2013.csv"
[1] "model2014.csv"
[1] "model2015.csv"
[1] "model2016.csv"
> 
> 'bye'
[1] "bye"
> 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ head model2016.csv 
"moydow","pctlead_after_down_pctlag","pctlead_after_up_pctlag"
"01_1",2.13,-0.22
"01_2",14.98,1.78
"01_3",4.87,2.59
"01_4",-8.86,-8.92
"01_5",4.56,-1.47
"02_1",7.63,-6.85
"02_2",9.01,-4.98
"02_3",-1.06,-2.52
"02_4",0.12,-2.27
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 

One way to visualize a model is to see it as a table, for a specific year, with 60 rows and three columns.

Class07 Lab


ml4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen