Class03 Answer:

Write Pandas syntax which mimics this SQL:


CREATE TABLE prices4 AS 
SELECT
extract(year from cdate) yr
,cdate
,closep
FROM prices
WHERE extract(year from cdate) > 1999;

SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr;

SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY yr;
SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY COUNT(yr);

I did this lab by writing the script listed below:


"""
class03pd38.py

This script should mimic the SQL listed below:

-- GROUP BY
CREATE TABLE prices4 AS 
SELECT
extract(year from cdate) yr
,cdate
,closep
FROM prices
WHERE extract(year from cdate) > 1999;

SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr;

-- GROUP BY ORDER BY
SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY yr;
SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY COUNT(yr);
"""

# SQL lacks the ability to get data from the web.
# Pandas can get data from the web:

import pandas as pd
from datetime import datetime

prices_df         = pd.read_csv('http://ml4.us/csv/ibm.csv')
prices_df.columns = ['cdate_s', 'openp', 'highp', 'lowp', 'closep', 'adjp', 'volume']

# Mimic
# CREATE TABLE prices4 AS 
# SELECT
# extract(year from cdate) yr
# ,cdate
# ,closep
# FROM prices
# WHERE extract(year from cdate) > 1999;

extract_sr          = pd.to_datetime(prices_df.cdate_s)
pred_sr             = extract_sr.dt.year > 1999
prices4_df          = prices_df.copy()[pred_sr]
prices4_df['cdate'] = pd.to_datetime(prices4_df.cdate_s)
prices4_df['yr']    = prices4_df.cdate.dt.year
prices4_df          = prices4_df[['yr','cdate','closep']]

# Mimic
# SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr;

yr_df         = prices4_df[['yr','yr']]
yr_df.columns = ['yr','yrcount']
yr_gb         = yr_df.groupby('yr')

yr_gb_df = yr_gb.aggregate(len)
print(yr_gb_df)

# Mimic
# SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY yr;

yr_l      = pd.Series(yr_gb_df.index).tolist()
yr_cnt_df = pd.DataFrame({'yr':yr_l, 'yrcount':yr_gb_df.yrcount.tolist()})

# Now that yr is a column I can sort by yr:
yr_cnt_df_sorted = yr_cnt_df.sort_values(by=['yr'])
print(yr_cnt_df_sorted)

# Mimic
# SELECT yr, COUNT(yr) FROM prices4 GROUP BY yr ORDER BY COUNT(yr);
yr_cnt_df_sorted = yr_cnt_df.sort_values(by=['yrcount'])
print(yr_cnt_df_sorted)

'bye'

I saw this:


dan@h79:~/ml4/public/class03demos $ python class03pd38.py
      yrcount
yr           
2000      252
2001      248
2002      252
2003      252
2004      252
2005      252
2006      251
2007      251
2008      253
2009      252
2010      252
2011      252
2012      250
2013      252
2014      252
2015      252
2016      252
2017      114
      yr  yrcount
0   2000      252
1   2001      248
2   2002      252
3   2003      252
4   2004      252
5   2005      252
6   2006      251
7   2007      251
8   2008      253
9   2009      252
10  2010      252
11  2011      252
12  2012      250
13  2013      252
14  2014      252
15  2015      252
16  2016      252
17  2017      114
      yr  yrcount
17  2017      114
1   2001      248
12  2012      250
6   2006      251
7   2007      251
15  2015      252
14  2014      252
13  2013      252
11  2011      252
10  2010      252
0   2000      252
16  2016      252
5   2005      252
4   2004      252
3   2003      252
2   2002      252
9   2009      252
8   2008      253
dan@h79:~/ml4/public/class03demos $
dan@h79:~/ml4/public/class03demos $

Class03 Lab


ml4.us About Blog Contact Class01 Class02 Class03 Class04 Class05 Class06 Class07 Class08 Class09 Class10 dan101 Forum Google Hangout Vboxen