http://spark.apache.org/docs/latest/ml-pipeline.html#example-estimator-transformer-and-param
This lab requires some knowledge of Scala, Spark SQL, and the Spark ML API described at the link above.
{
/* ~/sparkapps/logr10/logr12j.scala
This script should download prices and predict daily direction of GSPC.
It should generate a label which I assume to be dependent on price calculations.
A label should classify an observation as down or up. Down is 0.0, up is 1.0.
It should generate independent features from slopes of moving averages of prices.
It should create a Logistic Regression model from many years of features.
Demo:
spark-shell -i logr12j.scala
*/
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row
import sys.process._
// Download the GSPC price history to /tmp/gspc.csv with curl.
// The method-call form `.!` runs the command and returns its exit status;
// it avoids the postfix-operator syntax, which scalac flags with a feature warning.
val curlExit = "/usr/bin/curl -L ml4.herokuapp.com/csv/GSPC.csv -o /tmp/gspc.csv".!
// Stop here if curl reported a failure, instead of going on to read a
// missing or stale /tmp/gspc.csv.
if (curlExit != 0) sys.error(s"curl exited with status $curlExit; could not fetch /tmp/gspc.csv")
// Take the SQLContext from the existing session rather than calling the
// deprecated SQLContext constructor; the name `sqlContext` is kept as before.
val sqlContext = spark.sqlContext
// Load the CSV: first line is the header, column types are inferred.
val dp10df = sqlContext
  .read
  .format("com.databricks.spark.csv")
  .option("header","true")
  .option("inferSchema","true")
  .load("/tmp/gspc.csv")
// Expose the prices to SQL as the temp view "tab".
dp10df.createOrReplaceTempView("tab")
// Sanity check: row count, date range, and Close range of what was loaded.
spark.sql("SELECT COUNT(Date),MIN(Date),MAX(Date),MIN(Close),MAX(Close)FROM tab").show()
// Raw material for the label: each day's percent change to the NEXT close.
// Step 1: attach the following row's Close as `leadp` (LEAD over Date order).
var sqls = "SELECT Date,Close,LEAD(Close,1)OVER(ORDER BY Date) leadp FROM tab ORDER BY Date"
val dp11df = {
  val withNextClose = spark.sql(sqls)
  // Re-point the temp view "tab" at this result so the next query reads it.
  withNextClose.createOrReplaceTempView("tab")
  withNextClose
}
// Step 2: pctlead = 100*(leadp-Close)/Close.
sqls = "SELECT Date,Close,100*(leadp-Close)/Close pctlead FROM tab ORDER BY Date"
val dp12df = {
  val withPctLead = spark.sql(sqls)
  withPctLead.createOrReplaceTempView("tab")
  withPctLead
}
// Add trailing averages of Close as columns mavg2 .. mavg9.
// Each mavgN uses the window "ROWS BETWEEN N PRECEDING AND CURRENT ROW".
// NOTE(review): that window spans N+1 rows (N preceding plus the current one),
// so mavg2 averages 3 closes - confirm this is the intended naming.
val mavgCols = (2 to 9).map { n =>
  s",AVG(Close)OVER(ORDER BY Date ROWS BETWEEN ${n} PRECEDING AND CURRENT ROW) AS mavg${n}"
}.mkString
sqls = "SELECT Date, Close, pctlead" ++ mavgCols ++ " FROM tab ORDER BY Date"
val dp13df = spark.sql(sqls)
// Re-point the temp view "tab" at the result that carries the mavg columns.
dp13df.createOrReplaceTempView("tab")
// Turn each moving average into a relative one-row slope:
//   slpN = (mavgN - previous row's mavgN) / mavgN,   for N = 2 .. 9.
// LAG has no previous row on the first date, so slpN is null there.
val slopeCols = (2 to 9).map { n =>
  s",(mavg${n}-LAG(mavg${n},1)OVER(ORDER BY Date))/mavg${n} AS slp${n} "
}.mkString
sqls = "SELECT Date, Close, pctlead" ++ slopeCols ++ " FROM tab ORDER BY Date"
val dp14df = spark.sql(sqls)
// Re-point the temp view "tab" at the result that carries the slp columns.
dp14df.createOrReplaceTempView("tab")
// For the class boundary, take the average of pctlead over the training period.
val training_period = " WHERE Date BETWEEN'1986-01-01'AND'2015-12-31' "
sqls = "SELECT AVG(pctlead) FROM tab"++training_period
val class_df = spark.sql(sqls)
// AVG yields SQL NULL when no row falls inside the training period. Casting that
// null with asInstanceOf[Double] would silently produce 0.0, so fail loudly instead.
val class_boundry: Double = {
  val avgRow = class_df.first()
  if (avgRow.isNullAt(0))
    sys.error("AVG(pctlead) is NULL: no rows with a pctlead value in the training period")
  else
    avgRow.getDouble(0)
}
// Map pctlead to a class label: 1.0 (up) when it is above the class boundary,
// otherwise 0.0 (down).
// The parameter is Double to match both the pctlead column and class_boundry;
// a Float parameter would narrow the column value before the comparison.
// NOTE(review): rows whose pctlead is null (no following close) presumably get
// a null label rather than 0.0/1.0 - confirm before training.
val pctlead2label = udf((pctlead: Double) => if (pctlead > class_boundry) 1.0 else 0.0)
// Add the label column to the DataFrame of observations.
val dp15df = dp14df.withColumn("label", pctlead2label(col("pctlead")))
// Pack the eight slope values (slp2 .. slp9, in that order) into one dense
// ML vector.
// The parameters are Double so the slope values reach Vectors.dense at full
// precision; Float parameters would narrow each value to single precision first.
val fill_vec = udf((
  slp2: Double,
  slp3: Double,
  slp4: Double,
  slp5: Double,
  slp6: Double,
  slp7: Double,
  slp8: Double,
  slp9: Double
) => Vectors.dense(slp2, slp3, slp4, slp5, slp6, slp7, slp8, slp9))
// Collect the slope columns slp2 .. slp9, in order, and hand them to fill_vec
// to produce the "features" vector column.
val slpCols = (2 to 9).map(n => col(s"slp${n}"))
val dp16df = dp15df.withColumn("features", fill_vec(slpCols: _*))
// Print the first rows of the labelled, vectorised observations.
dp16df.show()
// UNDER CONSTRUCTION
}
When I ran the script, I saw output like this:
dan@h80:~/ml4/public/class04/logr10 $
dan@h80:~/ml4/public/class04/logr10 $
dan@h80:~/ml4/public/class04/logr10 $ spark-shell -i logr12j.scala
Spark context Web UI available at http://192.168.1.80:4042
Spark context available as 'sc' (master = local[*], app id = local-1515735761790).
Spark session available as 'spark'.
Loading logr12j.scala...
warning: there was one deprecation warning; re-run with -deprecation for details
warning: there was one feature warning; re-run with -feature for details
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 1252k 100 1252k 0 0 637k 0 0:00:01 0:00:01 --:--:-- 2328k
+-----------+-------------------+-------------------+----------+----------+
|count(Date)| min(Date)| max(Date)|min(Close)|max(Close)|
+-----------+-------------------+-------------------+----------+----------+
| 17116|1950-01-03 00:00:00|2018-01-09 00:00:00| 16.66|2753.52002|
+-----------+-------------------+-------------------+----------+----------+
+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
| Date| Close| pctlead| slp2| slp3| slp4| slp5| slp6| slp7| slp8| slp9|label| features|
+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|1950-01-03 00:00:00| 16.66| 1.1404561824729968| null| null| null| null| null| null| null| null| 1.0| null|
|1950-01-04 00:00:00| 16.85| 0.4747774480712065|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106|0.005669949268875106| 1.0|[0.00566994911059...|
|1950-01-05 00:00:00| 16.93| 0.29533372711164035|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...|0.003469468675654...| 1.0|[0.00346946856006...|
|1950-01-06 00:00:00| 16.98| 0.588928150765594| 0.00630417651694241|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...|0.002472065658063...| 1.0|[0.00630417652428...|
|1950-01-09 00:00:00| 17.08| -0.2927341920374689|0.004510688370268549|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081|0.006191037735849081| 0.0|[0.00451068859547...|
|1950-01-10 00:00:00|17.030001| 0.3523135436104863|0.001957349736595277|0.002646295168387165|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513|0.006174669124159513| 1.0|[0.00195734971202...|
|1950-01-11 00:00:00| 17.09| -1.9309537741369123|0.002148437458038...|0.002346729211693775|0.004165447284167...|0.007235753291741252|0.007235753291741252|0.007235753291741252|0.007235753291741252|0.007235753291741252| 0.0|[0.00214843754656...|
|1950-01-12 00:00:00| 16.76| -0.5369928400954644|-0.00628930805248...|-0.00323719830433...|-0.00188345788870...|-1.96188931780665E-4|0.002606581564268871|0.002606581564268871|0.002606581564268871|0.002606581564268871| 0.0|[-0.0062893079593...|
|1950-01-13 00:00:00| 16.67| 0.2999340131973586|-0.00712591053048...|-0.00606957800045...|-0.00595114424942...|-0.00537872678936...|-0.00429310980215...|-0.00207253142749...|-0.00207253142749...|-0.00207253142749...| 1.0|[-0.0071259103715...|
|1950-01-16 00:00:00|16.719999| 0.8373325859648619|-0.00737788648809...|-0.00461038079432...|-0.00690068124480...|-0.00743607486767...|-0.00739358173654073|-0.00676682788171...|-0.00502348642542...|-0.00502348642542...| 1.0|[-0.0073778866790...|
|1950-01-17 00:00:00|16.860001|-0.05931790869999...|0.001990069651741...|-0.00343230861065...|-0.00605879719444...|-0.00830721782818...|-0.00906048137804598|-0.00925235039546...|-0.00887098111392...|-0.00743172660796...| 0.0|[0.00199006963521...|
|1950-01-18 00:00:00| 16.85| 0.11870029673588751|0.003569303985722708|0.001341281669150...|-0.00268256333830...|-0.00476901142573...|-0.00668512667660...|-0.00737705663189...|-0.00758404371584...|-0.00727273323397...| 1.0|[0.00356930401176...|
|1950-01-19 00:00:00|16.870001| 0.1778245300637511|0.002965638475063...|0.002971783016764...|0.003150089106239...|1.485884054094424...|-0.00174060069585...|-0.00341752595219...|-0.00406140822758...|-0.00427933723210...| 1.0|[0.00296563841402...|
|1950-01-20 00:00:00| 16.9| 0.1183431952662907|7.901817307350008E-4|0.002667471764449836|0.004505056179459153| 0.00484096014105035|0.002286412837645...|8.150711080299655E-4|-6.58614878720948...|-0.00124479545806...| 1.0|[7.90181744378060...|
|1950-01-23 00:00:00| 16.92| -0.3546040189125369|0.001380942959539...|8.883476326862416E-4|0.002665090277389...|0.004343110191346...|0.004780242325263554|0.002665094719201...|0.001414801084569...|1.184512863715192...| 0.0|[0.00138094299472...|
|1950-01-24 00:00:00|16.860001| -0.7117496612248245|-1.97316491765477...|1.480532894729239...|4.441213784120597...|0.002023212375330503|0.003574093158427946|0.004071080560441...|0.002286395195073107|0.001213933346737628| 0.0|[-1.9731649081222...|
|1950-01-25 00:00:00| 16.74|-0.05973715651133818|-0.00316706248679...|-0.00192822601708...|-0.00148324827227...|-0.00128548895552...|4.237614888207852E-5|0.001409077404196...|0.001878771520305...|2.966508410461777...| 0.0|[-0.0031670625321...|
|1950-01-26 00:00:00| 16.73| 0.5379557680812902|-0.00377508436767...|-0.00252788100330291|-0.00270632858429...|-0.00262701854829...|-0.00265534780969...|-0.00163568770801...|-5.12183387675194E-4|-1.48695908569830...| 1.0|[-0.0037750843912...|
|1950-01-27 00:00:00| 16.82| 1.1890606420927425|-7.95406641479493...|-0.00148920325406...|-0.00253164255351...| -0.0029287713636816|-0.00304223163217...|-0.00320179444226...|-0.00239927025466...|-0.00145941621058...| 1.0|[-7.9540663864463...|
|1950-01-30 00:00:00| 17.02| 0.1762573443008232|0.005536879572869...|0.002377046501262...|8.022463229833146E-4|-4.95320160450941...|-4.03268459366999...|-5.19997028673214...|-6.76822001023354...|2.970138166712838...| 1.0|[0.00553687941282...|
+-------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
only showing top 20 rows
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.2.1
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_152)
Type in expressions to have them evaluated.
Type :help for more information.
scala> :quit
dan@h80:~/ml4/public/class04/logr10 $
dan@h80:~/ml4/public/class04/logr10 $