Class04 Answer:

Build a features column consistent with this demo:

http://spark.apache.org/docs/latest/ml-pipeline.html#example-estimator-transformer-and-param

This lab requires some knowledge of Spark SQL window functions and Spark ML feature vectors.

{
/* ~/sparkapps/logr10/logr12j.scala
This script should download prices and predict daily direction of GSPC.
It should generate a label which I assume to be dependent on price calculations.
A label should classify an observation as down or up. Down is 0.0, up is 1.0.
It should generate independent features from slopes of moving averages of prices.
It should create a Logistic Regression model from many years of features.
Demo:
spark-shell -i logr12j.scala
*/

import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row
import sys.process._

// I should get prices:
// -f makes curl exit non-zero on an HTTP error instead of saving the error
// page as if it were the CSV; -L follows redirects.
// The method-call form `.!` runs the command and returns its exit status
// without relying on the postfix-operator language feature.
val curlExit = "/usr/bin/curl -f -L ml4.herokuapp.com/csv/GSPC.csv -o /tmp/gspc.csv".!

// Stop here rather than parse a missing, stale, or truncated file.
require(curlExit == 0, s"curl exited with status $curlExit; /tmp/gspc.csv was not refreshed")

// Reuse the SQLContext owned by the shell's SparkSession; building one with
// `new SQLContext(sc)` is deprecated.
val sqlContext = spark.sqlContext

// Load the CSV, taking column names from the header row and letting Spark
// infer the column types.
val dp10df = sqlContext
  .read
  .format("com.databricks.spark.csv")
  .option("header","true")
  .option("inferSchema","true")
  .load("/tmp/gspc.csv")

// Expose the prices to SQL as the temp view "tab".
dp10df.createOrReplaceTempView("tab")

// Sanity check: row count, date range and Close range of the loaded prices.
spark.sql("SELECT COUNT(Date),MIN(Date),MAX(Date),MIN(Close),MAX(Close)FROM tab").show

// I should compute a label I can use to classify observations.
// The pipeline below runs in four steps. Each step registers its result as
// the temp view "tab", so every query reads the output of the step before it.

// Step 1: pair each Close with the Close of the following row (leadp).
// sqls is a var because it is reassigned for every query.
var sqls = "SELECT Date,Close,LEAD(Close,1)OVER(ORDER BY Date) leadp FROM tab ORDER BY Date"
val dp11df = spark.sql(sqls)
dp11df.createOrReplaceTempView("tab")

// Step 2: pctlead is the percent change from this Close to the next one.
sqls = "SELECT Date,Close,100*(leadp-Close)/Close pctlead FROM tab ORDER BY Date"
val dp12df = spark.sql(sqls)
dp12df.createOrReplaceTempView("tab")

// Step 3: mavgN averages Close over the current row and the N rows before it
// (so up to N+1 rows), for N = 2..9.
val windowSizes = 2 to 9
val mavgColumns = windowSizes.map { n =>
  s",AVG(Close)OVER(ORDER BY Date ROWS BETWEEN $n PRECEDING AND CURRENT ROW) AS mavg$n"
}
sqls = mavgColumns.mkString("SELECT Date, Close, pctlead", "", " FROM tab ORDER BY Date")
val dp13df = spark.sql(sqls)
dp13df.createOrReplaceTempView("tab")

// Step 4: slpN is the one-row change of mavgN, relative to the current mavgN.
val slpColumns = windowSizes.map { n =>
  s",(mavg$n-LAG(mavg$n,1)OVER(ORDER BY Date))/mavg$n AS slp$n "
}
sqls = slpColumns.mkString("SELECT Date, Close, pctlead", "", " FROM tab ORDER BY Date")
val dp14df = spark.sql(sqls)
dp14df.createOrReplaceTempView("tab")

// For the class boundary, I should get the avg of pctlead over the training period.

val training_period = " WHERE Date BETWEEN'1986-01-01'AND'2015-12-31' "

sqls = "SELECT AVG(pctlead) FROM tab" + training_period

val class_df = spark.sql(sqls)

// AVG is null when no row in the period has a pctlead. Fail loudly in that
// case: casting the null with asInstanceOf[Double] would silently produce a
// boundary of 0.0.
val class_row = class_df.first()
require(!class_row.isNullAt(0), "AVG(pctlead) is null: no pctlead values in the training period")

val class_boundry = class_row.getDouble(0)

// I should compute label from pctlead:
// up (1.0) when pctlead is above the boundary, otherwise down (0.0).
// The argument is a Double, not a Float, so the pctlead column is compared
// at full precision instead of being narrowed first.

val pctlead2label = udf((pctlead: Double) => if (pctlead > class_boundry) 1.0 else 0.0)

// I should add the label to my DF of observations:

val dp15df = dp14df.withColumn("label",pctlead2label(col("pctlead")))

// I should copy slp-values into Vectors.dense():
// The arguments are Doubles, not Floats, so each slope reaches the vector
// with the same precision it has in its slp column.
// NOTE(review): rows with a null slope (the first row has no LAG value) are
// expected to get a null features value, as with the Float version - confirm.

val fill_vec = udf((
  slp2: Double,
  slp3: Double,
  slp4: Double,
  slp5: Double,
  slp6: Double,
  slp7: Double,
  slp8: Double,
  slp9: Double
) => Vectors.dense(slp2, slp3, slp4, slp5, slp6, slp7, slp8, slp9))

// The slp2..slp9 columns, in the same order as fill_vec's parameters.
val slpInputs = (2 to 9).map(n => col(s"slp$n"))

// I should add the features vector to my DF of observations:
val dp16df = dp15df.withColumn("features", fill_vec(slpInputs: _*))

dp16df.show

// UNDER CONSTRUCTION
}

I saw something like this:


dan@h80:~/ml4/public/class04/logr10 $ 
dan@h80:~/ml4/public/class04/logr10 $ 
dan@h80:~/ml4/public/class04/logr10 $ spark-shell -i logr12j.scala
Spark context Web UI available at http://192.168.1.80:4042
Spark context available as 'sc' (master = local[*], app id = local-1515735761790).
Spark session available as 'spark'.
Loading logr12j.scala...
warning: there was one deprecation warning; re-run with -deprecation for details
warning: there was one feature warning; re-run with -feature for details
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1252k  100 1252k    0     0   637k      0  0:00:01  0:00:01 --:--:-- 2328k
+-----------+-------------------+-------------------+----------+----------+
|count(Date)|          min(Date)|          max(Date)|min(Close)|max(Close)|
+-----------+-------------------+-------------------+----------+----------+
|      17116|1950-01-03 00:00:00|2018-01-09 00:00:00|     16.66|2753.52002|
+-----------+-------------------+-------------------+----------+----------+

+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|               Date|    Close|             pctlead|                slp2|                slp3|                slp4|                slp5|                slp6|                slp7|                slp8|                slp9|label|            features|
+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|1950-01-03 00:00:00|    16.66|  1.1404561824729968|                null|                null|                null|                null|                null|                null|                null|                null|  1.0|                null|
|1950-01-04 00:00:00|    16.85|  0.4747774480712065|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|  1.0|[0.00566994911059...|
|1950-01-05 00:00:00|    16.93| 0.29533372711164035|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|  1.0|[0.00346946856006...|
|1950-01-06 00:00:00|    16.98|   0.588928150765594| 0.00630417651694241|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|  1.0|[0.00630417652428...|
|1950-01-09 00:00:00|    17.08| -0.2927341920374689|0.004510688370268549|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|  0.0|[0.00451068859547...|
|1950-01-10 00:00:00|17.030001|  0.3523135436104863|0.001957349736595277|0.002646295168387165|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513|  1.0|[0.00195734971202...|
|1950-01-11 00:00:00|    17.09| -1.9309537741369123|0.002148437458038...|0.002346729211693775|0.004165447284167...|0.007235753291741252|0.007235753291741252|0.007235753291741252|0.007235753291741252|0.007235753291741252|  0.0|[0.00214843754656...|
|1950-01-12 00:00:00|    16.76| -0.5369928400954644|-0.00628930805248...|-0.00323719830433...|-0.00188345788870...|-1.96188931780665E-4|0.002606581564268871|0.002606581564268871|0.002606581564268871|0.002606581564268871|  0.0|[-0.0062893079593...|
|1950-01-13 00:00:00|    16.67|  0.2999340131973586|-0.00712591053048...|-0.00606957800045...|-0.00595114424942...|-0.00537872678936...|-0.00429310980215...|-0.00207253142749...|-0.00207253142749...|-0.00207253142749...|  1.0|[-0.0071259103715...|
|1950-01-16 00:00:00|16.719999|  0.8373325859648619|-0.00737788648809...|-0.00461038079432...|-0.00690068124480...|-0.00743607486767...|-0.00739358173654073|-0.00676682788171...|-0.00502348642542...|-0.00502348642542...|  1.0|[-0.0073778866790...|
|1950-01-17 00:00:00|16.860001|-0.05931790869999...|0.001990069651741...|-0.00343230861065...|-0.00605879719444...|-0.00830721782818...|-0.00906048137804598|-0.00925235039546...|-0.00887098111392...|-0.00743172660796...|  0.0|[0.00199006963521...|
|1950-01-18 00:00:00|    16.85| 0.11870029673588751|0.003569303985722708|0.001341281669150...|-0.00268256333830...|-0.00476901142573...|-0.00668512667660...|-0.00737705663189...|-0.00758404371584...|-0.00727273323397...|  1.0|[0.00356930401176...|
|1950-01-19 00:00:00|16.870001|  0.1778245300637511|0.002965638475063...|0.002971783016764...|0.003150089106239...|1.485884054094424...|-0.00174060069585...|-0.00341752595219...|-0.00406140822758...|-0.00427933723210...|  1.0|[0.00296563841402...|
|1950-01-20 00:00:00|     16.9|  0.1183431952662907|7.901817307350008E-4|0.002667471764449836|0.004505056179459153| 0.00484096014105035|0.002286412837645...|8.150711080299655E-4|-6.58614878720948...|-0.00124479545806...|  1.0|[7.90181744378060...|
|1950-01-23 00:00:00|    16.92| -0.3546040189125369|0.001380942959539...|8.883476326862416E-4|0.002665090277389...|0.004343110191346...|0.004780242325263554|0.002665094719201...|0.001414801084569...|1.184512863715192...|  0.0|[0.00138094299472...|
|1950-01-24 00:00:00|16.860001| -0.7117496612248245|-1.97316491765477...|1.480532894729239...|4.441213784120597...|0.002023212375330503|0.003574093158427946|0.004071080560441...|0.002286395195073107|0.001213933346737628|  0.0|[-1.9731649081222...|
|1950-01-25 00:00:00|    16.74|-0.05973715651133818|-0.00316706248679...|-0.00192822601708...|-0.00148324827227...|-0.00128548895552...|4.237614888207852E-5|0.001409077404196...|0.001878771520305...|2.966508410461777...|  0.0|[-0.0031670625321...|
|1950-01-26 00:00:00|    16.73|  0.5379557680812902|-0.00377508436767...|-0.00252788100330291|-0.00270632858429...|-0.00262701854829...|-0.00265534780969...|-0.00163568770801...|-5.12183387675194E-4|-1.48695908569830...|  1.0|[-0.0037750843912...|
|1950-01-27 00:00:00|    16.82|  1.1890606420927425|-7.95406641479493...|-0.00148920325406...|-0.00253164255351...| -0.0029287713636816|-0.00304223163217...|-0.00320179444226...|-0.00239927025466...|-0.00145941621058...|  1.0|[-7.9540663864463...|
|1950-01-30 00:00:00|    17.02|  0.1762573443008232|0.005536879572869...|0.002377046501262...|8.022463229833146E-4|-4.95320160450941...|-4.03268459366999...|-5.19997028673214...|-6.76822001023354...|2.970138166712838...|  1.0|[0.00553687941282...|
+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
only showing top 20 rows


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/
         
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_152)
Type in expressions to have them evaluated.
Type :help for more information.

scala> :quit
dan@h80:~/ml4/public/class04/logr10 $ 
dan@h80:~/ml4/public/class04/logr10 $

Class04 Lab


learn4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen