Class04 Answer:

Build a DataFrame full of training data.

This lab requires some prior knowledge of Spark SQL (window functions) and Spark ML (LogisticRegression).

{
/* ~/sparkapps/logr10/logr12l.scala
This script should download prices and predict daily direction of GSPC.
It should generate a label which I assume to be dependent on price calculations.
A label should classify an observation as down or up. Down is 0.0, up is 1.0.
It should generate independent features from slopes of moving averages of prices.
It should create a Logistic Regression model from many years of features.
Demo:
spark-shell -i logr12l.scala
*/

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.{col, udf}
import sys.process._

// I should get prices. ".!" (method call) avoids the postfix-operator
// feature warning that the bare trailing "!" produced in the original run.
"/usr/bin/curl -L ml4.herokuapp.com/csv/GSPC.csv -o /tmp/gspc.csv".!

// Read the CSV with Spark 2.x's built-in reader on the SparkSession.
// This replaces the deprecated "new SQLContext(sc)" (the source of the
// deprecation warning in the original run) and the obsolete
// "com.databricks.spark.csv" format, which is built into Spark since 2.0.
val dp10df = spark
  .read
  .option("header","true")
  .option("inferSchema","true")
  .csv("/tmp/gspc.csv")

// Register the raw prices as temp view "tab"; later steps re-bind this name.
dp10df.createOrReplaceTempView("tab")

// Sanity check: row count plus Date and Close ranges of the downloaded data.
spark.sql("SELECT COUNT(Date),MIN(Date),MAX(Date),MIN(Close),MAX(Close)FROM tab").show

// I should compute a label I can use to classify observations.
// Step 1: attach each row's next-day Close ("leadp") via the LEAD() window.
var sqls = "SELECT Date,Close,LEAD(Close,1)OVER(ORDER BY Date) leadp FROM tab ORDER BY Date"

val dp11df = spark.sql(sqls)
dp11df.createOrReplaceTempView("tab")

// Step 2: convert leadp into pctlead, the percent change from today's
// Close to tomorrow's Close.
sqls = "SELECT Date,Close,100*(leadp-Close)/Close pctlead FROM tab ORDER BY Date"

val dp12df = spark.sql(sqls)
dp12df.createOrReplaceTempView("tab")

// Build one SELECT that adds trailing moving averages of Close
// (mavg2..mavg9), where mavgN averages the current row and the N
// preceding rows. A loop replaces eight copy-pasted append lines;
// the resulting SQL string is identical to the hand-written version.
sqls = "SELECT Date, Close, pctlead" ++
  (2 to 9).map(n =>
    s",AVG(Close)OVER(ORDER BY Date ROWS BETWEEN $n PRECEDING AND CURRENT ROW) AS mavg$n"
  ).mkString ++
  " FROM tab ORDER BY Date"

val dp13df = spark.sql(sqls)
dp13df.createOrReplaceTempView("tab")

// Turn each moving average into a one-day relative slope:
// slpN = (mavgN - previous mavgN) / mavgN. These slopes become the
// independent features. Loop-built, byte-identical to the original SQL
// (including the trailing space after each alias).
sqls = "SELECT Date, Close, pctlead" ++
  (2 to 9).map(n =>
    s",(mavg$n-LAG(mavg$n,1)OVER(ORDER BY Date))/mavg$n AS slp$n "
  ).mkString ++
  " FROM tab ORDER BY Date"

val dp14df = spark.sql(sqls)
dp14df.createOrReplaceTempView("tab")

// For the class boundary, I should get the average of pctlead over the
// training period; observations above it are "up", at or below it "down".

val training_period = " WHERE Date BETWEEN'1986-01-01'AND'2015-12-31' "

sqls = "SELECT AVG(pctlead) FROM tab" ++ training_period

val class_df = spark.sql(sqls)

// The query returns a single row with one aggregate column;
// Row.getDouble(0) reads it without an explicit asInstanceOf cast.
val class_boundry = class_df.first().getDouble(0)

// I should compute label from pctlead: 1.0 = up, 0.0 = down.
// The UDF argument is Double, not Float: pctlead is computed as a Double
// in SQL, and a Float parameter forced Spark to down-cast the column,
// losing precision exactly where the value is compared to the Double
// class_boundry.
val pctlead2label = udf((pctlead: Double) => if (pctlead > class_boundry) 1.0 else 0.0)

// I should add the label to my DF of observations:

val dp15df = dp14df.withColumn("label", pctlead2label(col("pctlead")))

// I should copy slp-values into Vectors.dense() to build the "features"
// column. The arguments are Double, not Float: the slp columns are Doubles,
// and the original Float parameters truncated them — visible in the sample
// output, where slp2 = -3.16760357534345E-4 appears inside features as
// -3.1676034268457...E-4.
val fill_vec = udf((
  slp2: Double
  ,slp3: Double
  ,slp4: Double
  ,slp5: Double
  ,slp6: Double
  ,slp7: Double
  ,slp8: Double
  ,slp9: Double
  ) => Vectors.dense(
  slp2
  ,slp3
  ,slp4
  ,slp5
  ,slp6
  ,slp7
  ,slp8
  ,slp9
  )
)

val dp16df = dp15df.withColumn("features"
,fill_vec(
  col("slp2")
  ,col("slp3")
  ,col("slp4")
  ,col("slp5")
  ,col("slp6")
  ,col("slp7")
  ,col("slp8")
  ,col("slp9")
  )
)

// I should create a LogisticRegression instance. This instance is an
// 'Estimator'. Configure it fluently at construction: at most 1234
// iterations and an L2 regularization parameter of 0.01 (the setters
// mutate and return the same instance, so chaining is equivalent to
// calling them one by one).
val lr = new LogisticRegression().setMaxIter(1234).setRegParam(0.01)

// I should gather observations to learn from: restrict "tab" (now the
// labeled, featurized DF) to the training period.
dp16df.createOrReplaceTempView("tab")

val train_df = spark.sql("SELECT * FROM tab" ++ training_period)

train_df.show

// UNDER CONSTRUCTION
}

When I ran the script, the output looked something like this:


dan@h80:~/ml4/public/class04/logr10 $ spark-shell -i logr12l.scala
Spark context Web UI available at http://192.168.1.80:4042
Spark context available as 'sc' (master = local[*], app id = local-1515736544169).
Spark session available as 'spark'.
Loading logr12l.scala...
warning: there was one deprecation warning; re-run with -deprecation for details
warning: there was one feature warning; re-run with -feature for details
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1252k  100 1252k    0     0  1342k      0 --:--:-- --:--:-- --:--:-- 1886k
+-----------+-------------------+-------------------+----------+----------+
|count(Date)|          min(Date)|          max(Date)|min(Close)|max(Close)|
+-----------+-------------------+-------------------+----------+----------+
|      17116|1950-01-03 00:00:00|2018-01-09 00:00:00|     16.66|2753.52002|
+-----------+-------------------+-------------------+----------+----------+

+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|               Date|     Close|             pctlead|                slp2|                slp3|                slp4|                slp5|                slp6|                slp7|                slp8|                slp9|label|            features|
+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|1986-01-02 00:00:00|209.589996|  0.6154916859676822|-3.16760357534345...|0.002306341273205815|0.004840934249429745|0.005397301020063774|0.004184687696959427|0.003822090377624...|0.003651031295598...|0.003114737546081...|  1.0|[-3.1676034268457...|
|1986-01-03 00:00:00|210.880005|-0.10907198148066187|3.165999208548375E-4|0.001507548414174...|0.004014584746628398|0.006089532712067...|0.006601649874000632|0.005579095638870487|0.005269143012469958|0.005120895309813...|  0.0|[3.16599907819181...|
|1986-01-06 00:00:00|210.649994|  1.4953757843449127|-9.98233307439437...|-3.56113487816278...| 9.11679493672915E-4|0.003094336046097...|0.004999323397431154|0.005579293724448...|0.004779988690793448|0.004577390820826497|  1.0|[-9.9823332857340...|
|1986-01-07 00:00:00|213.800003|    -2.7268484182388|0.006626488575617283|0.002982535631734...|0.002906794022882...|0.003700554696382...|0.005593100628006...|0.007314310839640111|  0.0079008177161039|0.007248025865757...|  0.0|[0.00662648864090...|
|1986-01-08 00:00:00|207.970001| -0.8943597591269835|-0.00460137884507...|-0.00192101861050...|-0.00196845084085...|-0.00152574053767...|-4.84481541872379...|0.001458568120033...|0.003238611659559...|0.003984355019621...|  0.0|[-0.0046013789251...|
|1986-01-09 00:00:00|206.110001|-0.07277376123054835|-0.00723066981564...|-0.00568853112671...|-0.00451027417565...|-0.00506839310666875|-0.00505816659347...|-0.00441248017889...|-0.00287142088944...|-0.00139529295480...|  0.0|[-0.0072306697256...|
|1986-01-10 00:00:00|205.960007|  0.3690007643085806|-0.01264433889136317|-0.00562456458373...|-0.00682144190509...|-0.00658798001328...|-0.00757937654420...|-0.00796313909675...|-0.00769130544220...|-0.00653360683296...|  1.0|[-0.0126443393528...|
|1986-01-13 00:00:00|206.720001|-0.03870065770752...|-0.00202007140034...|-0.00856355159219...|-0.01068265844159...|-0.01228086814052...|-0.01253083588307...|-0.01374037551719489|-0.01435858111553632|-0.01433546598365...|  0.0|[-0.0020200714934...|
|1986-01-14 00:00:00|206.639999|  0.7839701934957979|8.557740651191889E-4|-0.00161128380009...|-0.00850223802379...|-0.01055207174714...|-0.01217547067557...|-0.01261160352677639|-0.01386078299957...|-0.01456936612849...|  1.0|[8.55774036608636...|
|1986-01-15 00:00:00|208.259995|  0.4369552587379967|0.003699990377561...|0.002597928894855...|0.001039168174583...|-0.00469643135077...|-0.00661826563290275|-0.00819860736557...|-0.00873497424119...|-0.00998090864936...|  1.0|[0.00369999045506...|
|1986-01-16 00:00:00|209.169998| -0.3537816164247401| 0.00392583689555115|0.003863781493574087|0.004619688046723...|0.003631070457537544|-0.00108503903396...|-0.00272632978139...|-0.00412593732871...|-0.00462452344439...|  0.0|[0.00392583711072...|
|1986-01-17 00:00:00|208.429993|-0.43179678080207606| 0.00286005502834624|0.002054044481454...| 0.00372370577279944|0.004716697982883165|0.004149272833234394|2.222012052046092...|-0.00115050132203...|-0.00235917914160...|  0.0|[0.00286005507223...|
|1986-01-20 00:00:00|207.529999| -0.8384358928272312|-0.00116775072653...|0.001067927400159466|0.002416631152580...|0.003923720857608531| 0.00489736746716456|0.004511678287086647|0.001102574778094...|-1.12806491189223...|  0.0|[-0.0011677506845...|
|1986-01-21 00:00:00|205.789993| -1.1176384072280974|-0.00543627676967...|-0.00297261114250...|-0.00132865284574...|-2.96868136178602...|9.627732968224614E-4|0.001817246583176...|0.001486955720767...|-0.00158380569359...|  0.0|[-0.0054362765513...|
|1986-01-22 00:00:00|203.490005| 0.37348026012383434|-0.00800892985526...|-0.00688283780334...|-0.00739662701028...|-0.00643044617037...|-0.00579571352501...|-0.00485919556564393|-0.00421157405778...|-0.00459503277343597|  1.0|[-0.0080089299008...|
|1986-01-23 00:00:00|    204.25|  1.0673160342717238|-0.00534611023208...|-0.00509097144578092|-0.00787761360148...|-0.00899649034214...|-0.00866822838456...|-0.00847075064600926|-0.00790576777626...|-0.00752685482495...|  1.0|[-0.0053461100906...|
|1986-01-24 00:00:00|206.429993|  0.4650516071082717|0.001042056762922...|-0.00134153618722...|-0.00442948563328...|-0.00708977803446...|-0.00835580751786...|-0.00831747899270...|-0.00833103155097...|-0.00797112672781...|  1.0|[0.00104205671232...|
|1986-01-27 00:00:00|207.389999|   1.166883172606611|0.006309955264742839|0.001947521794929...|-5.25822826789884...|-0.00290503230688...|-0.00511917138605...|-0.00622594943604...|-0.00621039745087...|-0.00623690712633...|  1.0|[0.00630995538085...|
|1986-01-28 00:00:00|209.809998| 0.22877603764144852|0.008915539805903143|0.007633948248948443|0.007247424835089...|0.005588568459058772|0.003782467138917...|0.001980963448579...|0.001068325803544961|0.001120935897967...|  1.0|[0.00891553983092...|
|1986-01-29 00:00:00|210.289993|-0.45650817060039667|0.006151492552096441|0.007242892751258105| 0.01058133535576863| 0.01096826776324734|0.010052340443109927|0.008825778432029634|0.007477394067115583|0.006835180732202236|  0.0|[0.00615149270743...|
+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
only showing top 20 rows


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/
         
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_152)
Type in expressions to have them evaluated.
Type :help for more information.

scala> 

scala> :quit
dan@h80:~/ml4/public/class04/logr10 $ 
dan@h80:~/ml4/public/class04/logr10 $

Class04 Lab


learn4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen