/* 
Below I show examples using data sets in Dielman (2004) Applied Regression Analysis.  
These data are publicly available from Cengage at the address below.
http://www.cengage.com/statistics/discipline_content/dataLibrary.html
*/


/* Read the TELEMARK6 workbook into work.telemark.
   Edit the path below so it points to your own copy of the file. */
proc import
    datafile="g:\Teaching\ECN410\ECN410.Beamers\Lecture4\TELEMARK6.xlsx"
    out=work.telemark
    dbms=xlsx
    replace;          /* overwrite work.telemark if it already exists */
  getnames=yes;       /* take variable names from the first row */
run;


/* The variables are 
MONTHS: number of months of employment
CALLS: average number of calls placed per day over a recent 20-day period.
*/

/* summary statistics for every numeric variable in telemark */
proc means data=telemark;
run;

/* simple linear regression of CALLS on MONTHS;
   the residuals are written back into telemark as the variable residual */
proc reg data=telemark;
  model CALLS = MONTHS;
  output out=telemark residual=residual;
run;
quit;

/* distribution of the residuals; the normal option adds tests for normality */
proc univariate normal data=telemark;
  var residual;
run;

/* the relationship looks nonlinear, so add the square of MONTHS as a
   second regressor and refit */
data telemark;
  set telemark;
  MONTHSSQ = MONTHS**2;
run;

proc reg data=telemark;
  model CALLS = MONTHS MONTHSSQ;
run;
quit;


/* Heteroskedasticity example: read the S&P500 workbook into work.sandp */
proc import
    datafile="g:\Teaching\ECN410\ECN410.Beamers\Lecture4\SP5006.xlsx"
    out=work.sandp
    dbms=xlsx
    replace;          /* overwrite work.sandp if it already exists */
  getnames=yes;       /* take variable names from the first row */
run;

/* list the variables in sandp to confirm how the import named them */
proc contents data=sandp;
run;

/* summary statistics for the numeric variables */
proc means data=sandp;
run;

/* add an observation counter and the one-period lag of the index */
data sandp;
  set sandp;
  n = _n_;          /* observation number */
  x = lag1(S_P);    /* S_P from the previous observation;
                       missing for the first observation */
run;

/* model the current index value as a function of the previous period's
   value, saving studentized residuals, raw residuals, and fitted values */
proc reg data=sandp;
  model s_p = x;
  output out=sandp rstudent=studentized residual=residual predicted=fitted;
run;
quit;

/* studentized residuals against fitted values */
proc sgplot data=sandp;
  scatter y=studentized x=fitted;
run;

/* raw residuals against observation number */
proc sgplot data=sandp;
  scatter y=residual x=n;
run;





/* Autocorrelation example: sales and advertising data, ABC Company 1967-2002.
   Read the workbook into work.sales. */
proc import
    datafile="g:\Teaching\ECN410\ECN410.Beamers\Lecture4\SALESADV6.xlsx"
    out=work.sales
    dbms=xlsx
    replace;          /* overwrite work.sales if it already exists */
  getnames=yes;       /* take variable names from the first row */
run;

/* number the observations so residuals can be plotted in data order */
data sales;
  set sales;
  n = _n_;
run;

/* regress sales on adv; the dw model option prints the Durbin-Watson
   statistic, and the diagnostics are written back into sales */
proc reg data=sales;
  model sales = adv / dw;
  output out=sales rstudent=studentized residual=residual predicted=fitted;
run;
quit;

/* studentized residuals against observation number */
proc sgplot data=sales;
  scatter y=studentized x=n;
run;




/* outliers, leverage, and influence */


/* outlier, leverage, and influence example using the SL6 data (return, beta, sigma) */
proc import
    datafile="g:\Teaching\ECN410\ECN410.Beamers\Lecture4\SL6.xlsx"
    out=work.sl
    dbms=xlsx
    replace;          /* overwrite work.sl if it already exists */
  getnames=yes;       /* take variable names from the first row */
run;


/* Regress return on beta and sigma and save per-observation diagnostics
   in the data set return_residuals:
     return_hat - predicted values
     residuals  - residuals
     r          - studentized residuals
     leverage   - leverage
     cd         - Cook's distance
*/
proc reg data=sl;
  model return = beta sigma;
  output out=return_residuals
         predicted=return_hat
         residual=residuals
         rstudent=r
         h=leverage
         cookd=cd;
run;
quit;

/* Sort the diagnostics so the most extreme observations are listed first:
   once by leverage and once by Cook's D. Both sorts are descending, so the
   highest value is the first row printed. The same approach works for the
   other diagnostic variables. */

proc sort data=return_residuals out=return_residuals_sorted;
  by descending leverage;
run;
proc print data=return_residuals_sorted;
run;


/* re-sort the same diagnostics by Cook's D; this overwrites the
   leverage-sorted copy, which has already been printed above */
proc sort data=return_residuals out=return_residuals_sorted;
  by descending cd;
run;
proc print data=return_residuals_sorted;
run;



/* Refit the model on a subset to compare the estimated coefficients with
   and without the high-influence observation.
   sl2 keeps only the rows of sl whose return is below 10 (presumably this
   drops the high-influence point; confirm against the sorted listings). */
data sl2;
  set sl;
  where return < 10;
run;

proc reg data=sl2;
  model return = beta sigma;
run;
quit;