Class07 Answer:

Build a Logistic Regression model from pctlag1, and date-features.

I started this lab by writing a script which gets prices from Yahoo and then builds features.

The features are the same features I used to build heatmaps:

The script looks like this:


# genf.r

# This script should create feat.csv full of features from dates and prices from Yahoo.

# Ref:
# http://www.ml4.us/cclasses/class07#hr

# Demo:
# R -f genf.r

get_prices = function(url_s='http://tkrprice.herokuapp.com/static/gspc.csv',
                      csv_s='gspc3.csv'){
  # This function should read daily prices, keep only the date and the
  # closing price, sort the rows oldest-first, and save them to a CSV.
  #
  # Args:
  #   url_s: URL or local path of a CSV which has Date and Close columns.
  #          Defaults to the GSPC price feed.
  #   csv_s: Path of the CSV to write. Defaults to gspc3.csv.
  #
  # Returns:
  #   The last six rows of the saved data frame (columns cdate, cp).
  #   When called at top level, R prints them so I can see the newest prices.

  # I should get GSPC dates and prices.
  # I keep Date as plain text so it is written back exactly as it was read:
  raw_df = read.csv(url_s, stringsAsFactors=FALSE)

  # I should order by Date.
  # I compare parsed dates, not strings, so the order is chronological:
  sorted_df = raw_df[order(as.Date(raw_df$Date)),]

  # I should only use Date and Closing Price:
  gspc3_df = data.frame(cdate=sorted_df$Date, cp=sorted_df$Close,
                        stringsAsFactors=FALSE)
  write.csv(gspc3_df, csv_s, row.names=FALSE)

  # I should now see a new CSV at csv_s (gspc3.csv by default).
  tail(gspc3_df)
}

get_prices()

# I should compute pctlead,pctlag1 from cp
gspc3_df = read.csv('gspc3.csv')
cp_v     = gspc3_df$cp
len_i    = length(cp_v)

# pctlead is the percent change from this row's close to the next row's close.
# The last row has no next close, so I pair it with its own price,
# which makes its pctlead 0.
leadp_v          = c(cp_v[-1], cp_v[len_i])
gspc3_df$pctlead = 100 * (leadp_v - cp_v) / cp_v

# pctlag1 is the previous row's pctlead.
# The first row has no previous row, so it gets 0.
gspc3_df$pctlag1 = c(0, head(gspc3_df$pctlead, -1))

# I should convert cdate once and reuse it for every date feature:
date_v = as.Date(gspc3_df$cdate)

# I should get moydow:
gspc3_df$moydow = format(date_v,"%m_%w")

# I should get moy.
# "%m" is zero-padded ("01".."12"), so I convert to integer to get 1..12.
# I avoid "%-m" because that flag only works on some platforms.
gspc3_df$moy = as.integer(format(date_v,"%m"))

# I should get dow:
gspc3_df$dow = format(date_v,"%w")

write.csv(gspc3_df,'feat.csv', row.names=FALSE)

'bye'

I ran it:


dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ R -f genf.r

R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # genf.r
> 
> # This script should create feat.csv full of features from dates and prices from Yahoo.
> 
> # Ref:
> # http://www.ml4.us/cclasses/class07#hr
> 
> # Demo:
> # R -f genf.r
> 
> get_prices = function(){
+   # I should get GSPC dates and prices:
+   gspc0_df = read.csv('http://tkrprice.herokuapp.com/static/gspc.csv')
+   # I should order by Date:
+   gspc1_df = gspc0_df[order(gspc0_df$Date),]
+   # I should only use Date and Closing Price:
+   gspc3_df = data.frame(gspc1_df$Date,gspc1_df$Close)
+   colnames(gspc3_df) = c('cdate','cp')
+   write.csv(gspc3_df,'gspc3.csv', row.names=FALSE)
+   tail(gspc3_df)
+   # I should now see a new CSV:
+   # gspc3.csv
+ }
> 
> get_prices()
           cdate      cp
16811 2016-10-21 2141.16
16812 2016-10-24 2151.33
16813 2016-10-25 2143.16
16814 2016-10-26 2139.43
16815 2016-10-27 2133.04
16816 2016-10-28 2126.41
> # I should compute pctlead,pctlag1 from cp
> gspc3_df         = read.csv('gspc3.csv')
> len_i            = length(gspc3_df$cp)
> last_f           = gspc3_df$cp[len_i]
> leadp_v          = c(gspc3_df$cp, last_f)[1:len_i+1]
> gspc3_df$pctlead = 100 * (leadp_v - gspc3_df$cp) / gspc3_df$cp
> gspc3_df$pctlag1 = c(0, gspc3_df$pctlead)[1:len_i]
> 
> # I should get moydow:
> gspc3_df$moydow = format(as.Date(gspc3_df$cdate),"%m_%w")
> # I should get moy:
> gspc3_df$moy = format(as.Date(gspc3_df$cdate),"%-m")
> 
> # I should get dow:
> gspc3_df$dow = format(as.Date(gspc3_df$cdate),"%w")
> 
> write.csv(gspc3_df,'feat.csv', row.names=FALSE)
> 
> 'bye'
[1] "bye"
> 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 
dan@e80:~/ml4us/public/class07 $ 

Next, I wrote a script to create Logistic Regression models for a series of years.

The script then uses the models to generate predictions for each year.

The predictions are stored in CSV files with names like predictions2016.csv.

Then, the predictions are aggregated so I can sum both the effectiveness and the accuracy of the models.

The script is listed below:


# logr_model.r

# This script should create a logistic regression model.

# Ref:
# http://www.ml4.us/cclasses/class07#hr

# Demo:
# R -f logr_model.r

train_test_logr = function(feat_df,yr_i,size_i){
  # This function should train a Logistic Regression model on the size_i years
  # before yr_i, predict every row of yr_i, and write the predictions to a CSV.
  #
  # Args:
  #   feat_df: data frame with columns cdate, pctlead, pctlag1, moy, dow.
  #            cdate must be something as.Date() can convert.
  #   yr_i:    the year to test on.
  #   size_i:  number of training years; I train on yr_i-size_i through yr_i-1.
  #
  # Returns:
  #   The name of the CSV which was written, e.g. 'predictions2016.csv'.
  #   The CSV holds the yr_i rows of feat_df plus three new columns:
  #   prediction, eff, accurate.
  #
  # Stops with an error if the training window contains no rows.

  # I should use yr_i to compute end, start:
  yr_train_end_i   = yr_i - 1
  yr_train_start_i = yr_i - size_i

  # I should get the year of every row once; both train and test use it.
  yr_v = as.integer(format(as.Date(feat_df$cdate),"%Y"))

  # I should constrain the training data.
  in_train_v = (yr_v >= yr_train_start_i) & (yr_v <= yr_train_end_i)
  train_df   = feat_df[ in_train_v , ]
  if (nrow(train_df) == 0){
    # Without this check glm() fails later with a message which is hard to read.
    stop('train_test_logr: no training rows between ', yr_train_start_i,
         ' and ', yr_train_end_i, call.=FALSE)
  }

  # I should build a model from train_df.
  # So, I should generate labels from pctlead.
  # A label is TRUE when pctlead is above the median of the training rows:
  train_df$labels = (train_df$pctlead > median(train_df$pctlead))

  # Now I should learn:
  mymodel = glm(labels ~ pctlag1 + moy + dow, data=train_df, family='binomial')

  # The above model assumes that each label relies somewhat on pctlag1,moy, and dow
  # The model returns the probability that label is TRUE
  # If the probability is above 0.5 I consider that a bullish prediction.
  # If the probability is below 0.5 I consider that a bearish prediction.

  # I should load test data
  test_df            = feat_df[ (yr_v == yr_i) , ]
  test_df$prediction = predict(mymodel,test_df, type='response')

  # eff is what I gain if I follow the prediction:
  # +pctlead when bullish, -pctlead when bearish, 0 when exactly 0.5.
  test_df$eff = sign(test_df$prediction-0.5) * test_df$pctlead

  # NOTE: >= means a row with eff of 0 (flat day, or a prediction of exactly
  # 0.5) is counted as accurate.
  test_df$accurate = (test_df$eff >= 0)

  # I should write predictions to CSV
  csv_s = paste0('predictions',yr_i,'.csv')
  write.csv(test_df,csv_s, row.names=FALSE)
  csv_s
} # train_test_logr = function(feat_df,yr_i,size_i)

# I should load the features which genf.r wrote:
feat_df = read.csv('feat.csv')
size_i  = 25

# Every loop below should walk the same test years:
test_yr_v = 2000:2018

# I should train, test, and save predictions for each year:
for (yr_i in test_yr_v){
  print(train_test_logr(feat_df,yr_i,size_i))
}

# I should report effectiveness, accuracy.
# This helper should read one year of predictions and return five totals.
year_totals = function(yr_i){
  p_df = read.csv(paste0('predictions',yr_i,'.csv'))
  c(eff_long = sum(p_df$pctlead),      # gain from always being long
    eff_logr = sum(p_df$eff),          # gain from following the model
    hit_long = sum(p_df$pctlead > 0),  # rows where long-only was right
    hit_logr = sum(p_df$accurate),     # rows where the model was right
    n_rows   = length(p_df$accurate))  # rows predicted
}

# I should add the yearly totals one year at a time, starting from zero:
totals_v = Reduce(`+`, lapply(test_yr_v, year_totals), numeric(5))

print('Long-Only Effectiveness:')
print(totals_v[['eff_long']])

print('Logistic-Regression Effectiveness:')
print(totals_v[['eff_logr']])

print('Long-Only Accuracy:')
acc_long_f = 100.0 * totals_v[['hit_long']] / totals_v[['n_rows']]
print(acc_long_f)

print('Logistic-Regression Accuracy:')
acc_logr_f = 100.0 * totals_v[['hit_logr']] / totals_v[['n_rows']]
print(acc_logr_f)

'bye'

I ran it:


dan@h80:~/ml4/public/class07 $ R -f logr_model.r

R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # logr_model.r
> 
> # This script should create a logistic regression model.
> 
> # Ref:
> # http://www.ml4.us/cclasses/class07#hr
> 
> # Demo:
> # R -f logr_model.r
> 
> train_test_logr = function(feat_df,yr_i,size_i){
+   # This function should train and then test using Logistic Regression and data in feat_df.
+   
+   # I should use yr_i to compute end, start:
+   yr_train_end_i   = yr_i - 1
+   yr_train_start_i = yr_i - size_i
+   
+   # I should constrain the training data.
+   yr_v     = strtoi(format(as.Date(feat_df$cdate),"%Y"))
+   pred1_v  = (yr_v >= yr_train_start_i)
+   pred2_v  = (yr_v <= yr_train_end_i)
+   pred3_v  = (pred1_v & pred2_v)
+   train_df = feat_df[ pred3_v , ]
+   # I should build a model from train_df.
+   # So, I should generate labels from pctlead:
+   train_df$labels = (train_df$pctlead > median(train_df$pctlead))
+   
+   # Now I should learn:
+   mymodel = glm(labels ~ pctlag1 + moy + dow, data=train_df, family='binomial')
+   
+   # The above model assumes that each label relies somewhat on pctlag1,moy, and dow
+   # The model returns the probability that label is TRUE
+   # If the probability is above 0.51 I consider that a bullish prediction.
+   # If the probability is below 0.49 I consider that a bearish prediction.
+   
+   # I should load test data
+   yr_test            = yr_i
+   yr_test_v          = strtoi(format(as.Date(feat_df$cdate),"%Y"))
+   pred_test_v        = (yr_test_v == yr_test)
+   test_df            = feat_df[pred_test_v , ]
+   test_df$prediction = predict(mymodel,test_df, type='response') 
+   test_df$eff        = sign(test_df$prediction-0.5) * test_df$pctlead
+   test_df$accurate   = (test_df$eff >= 0)
+   # I should write predictions to CSV
+   csv_s = paste('predictions',yr_test,'.csv',sep='')
+   write.csv(test_df,csv_s, row.names=FALSE)
+   return(csv_s)
+ } # train_test_logr = function(feat_df,yr_i,size_i)
> 
> # I should load features from CSV:
> feat_df = read.csv('feat.csv')
> size_i  = 25
> 
> for (yr_i in c(2000:2018)){
+   pf_s = train_test_logr(feat_df,yr_i,size_i)
+   print(pf_s)
+ }
[1] "predictions2000.csv"
[1] "predictions2001.csv"
[1] "predictions2002.csv"
[1] "predictions2003.csv"
[1] "predictions2004.csv"
[1] "predictions2005.csv"
[1] "predictions2006.csv"
[1] "predictions2007.csv"
[1] "predictions2008.csv"
[1] "predictions2009.csv"
[1] "predictions2010.csv"
[1] "predictions2011.csv"
[1] "predictions2012.csv"
[1] "predictions2013.csv"
[1] "predictions2014.csv"
[1] "predictions2015.csv"
[1] "predictions2016.csv"
[1] "predictions2017.csv"
[1] "predictions2018.csv"
> 
> # I should report effectiveness, accuracy:
> sum_eff_long_f      = 0
> sum_eff_logr_f      = 0
> sum_long_accuracy_i = 0
> sum_accuracy_i      = 0
> sum_all_i           = 0
> for (yr_i in c(2000:2018)){
+   csv_s = paste('predictions',yr_i,'.csv',sep='')
+   p_df = read.csv(csv_s)
+   sum_eff_long_f      = sum_eff_long_f + sum(p_df$pctlead)
+   sum_eff_logr_f      = sum_eff_logr_f + sum(p_df$eff)
+   sum_long_accuracy_i = sum_long_accuracy_i + sum((p_df$pctlead > 0))
+   sum_accuracy_i      = sum_accuracy_i + sum(p_df$accurate)
+   sum_all_i           = sum_all_i      + length(p_df$accurate)
+ }
> 
> print('Long-Only Effectiveness:')
[1] "Long-Only Effectiveness:"
> print(sum_eff_long_f)
[1] 97.48878
> 
> print('Logistic-Regression Effectiveness:')
[1] "Logistic-Regression Effectiveness:"
> print(sum_eff_logr_f)
[1] 165.8401
> 
> print('Long-Only Accuracy:')
[1] "Long-Only Accuracy:"
> acc_long_f = 100.0 * sum_long_accuracy_i / sum_all_i
> print(acc_long_f)
[1] 53.30697
> 
> print('Logistic-Regression Accuracy:')
[1] "Logistic-Regression Accuracy:"
> acc_logr_f = 100.0 * sum_accuracy_i / sum_all_i
> print(acc_logr_f)
[1] 51.15359
> 
> 'bye'
[1] "bye"
> 
dan@h80:~/ml4/public/class07 $ 
dan@h80:~/ml4/public/class07 $
dan@h80:~/ml4/public/class07 $

The above output tells me that Logistic-Regression is much more effective than the Long-Only model.

Long-Only, however, is about 2 percentage points more accurate (53.3% vs 51.2%).

Class07 Lab


ml4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen