/*
  Below I show examples using data sets in Dielman (2004) Applied Regression Analysis.
  These data are publicly available from Cengage at the address below.
  http://www.cengage.com/statistics/discipline_content/dataLibrary.html

  The script runs four independent examples, each importing one Excel workbook:
    1. TELEMARK6 - residual normality check and a quadratic term
    2. SP5006    - heteroskedasticity (regression on a lagged value)
    3. SALESADV6 - autocorrelation (Durbin-Watson statistic)
    4. SL6       - outliers, leverage, and influence
*/

/* Change this one directory to point to your files; every PROC IMPORT below reads from it. */
%let datadir = g:\Teaching\ECN410\ECN410.Beamers\Lecture4;


/* ------------------------------------------------------------------ */
/* 1. TELEMARK data                                                   */
/* ------------------------------------------------------------------ */
proc import datafile="&datadir.\TELEMARK6.xlsx"
    dbms=xlsx
    out=work.telemark
    replace;
    getnames=yes;
run;

/*
  The variables are
    MONTHS: number of months of employment
    CALLS:  average number of calls placed per day over recent 20 day period.
*/
proc means data=telemark;
run;

/* regress calls on months; the residuals are written back into telemark */
proc reg data=telemark;
    model calls = months;
    output out=telemark r=residual;
run;
quit;

/* the NORMAL option requests tests of normality for the residuals */
proc univariate data=telemark normal;
    var residual;
run;

/* relationship appears nonlinear, try adding another variable (months squared) */
data telemark;
    set telemark;
    monthssq = months**2;
run;

proc reg data=telemark;
    model calls = months monthssq;
run;
quit;


/* ------------------------------------------------------------------ */
/* 2. Heteroskedasticity example with S&P500 data                     */
/* ------------------------------------------------------------------ */
proc import datafile="&datadir.\SP5006.xlsx"
    dbms=xlsx
    out=work.sandp
    replace;
    getnames=yes;
run;

/* check variable name */
proc contents data=sandp;
run;

proc means data=sandp;
run;

/*
  Create a lagged value.
    n: observation number, used later as the horizontal axis of a residual plot
    x: the lagged value of S_P, i.e. the value of the previous observation
       (LAG returns a missing value for the first observation)
*/
data sandp;
    set sandp;
    n = _n_;
    x = lag(s_p);
run;

/* current stock index value is a function of the value in the previous time period */
proc reg data=sandp;
    model s_p = x;
    output out=sandp rstudent=studentized r=residual p=fitted;
run;
quit;

/* scatter plot: studentized residuals against fitted values */
proc sgplot data=sandp;
    scatter x=fitted y=studentized;
run;

/* scatter plot: raw residuals against observation number */
proc sgplot data=sandp;
    scatter x=n y=residual;
run;


/* ------------------------------------------------------------------ */
/* 3. Autocorrelation example with sales and advertising data,        */
/*    ABC Company 1967-2002                                           */
/* ------------------------------------------------------------------ */
proc import datafile="&datadir.\SALESADV6.xlsx"
    dbms=xlsx
    out=work.sales
    replace;
    getnames=yes;
run;

/* n: observation number, used as the horizontal axis of the residual plot */
data sales;
    set sales;
    n = _n_;
run;

/* use the dw option to get the Durbin-Watson statistic */
proc reg data=sales;
    model sales = adv / dw;
    output out=sales rstudent=studentized r=residual p=fitted;
run;
quit;

/* scatter plot: studentized residuals against observation number */
proc sgplot data=sales;
    scatter x=n y=studentized;
run;


/* ------------------------------------------------------------------ */
/* 4. Outliers, leverage, and influence (SL6 data)                    */
/* ------------------------------------------------------------------ */
proc import datafile="&datadir.\SL6.xlsx"
    dbms=xlsx
    out=work.sl
    replace;
    getnames=yes;
run;

/*
  Regress return on beta and sigma, creating a data set that has:
    predicted values:      return_hat
    residuals:             residuals
    studentized residuals: r
    leverage:              leverage
    Cook's distance:       cd
*/
proc reg data=sl;
    model return = beta sigma;
    output out=return_residuals
        p=return_hat
        r=residuals
        rstudent=r
        h=leverage
        cookd=cd;
run;
quit;

/*
  Sort the data to see which observation has the highest leverage; this can
  also be done for the other variables such as Cook's D. Both sorts are
  descending so the largest value is printed first.
*/
proc sort data=return_residuals out=return_residuals_sorted;
    by descending leverage;
run;

proc print data=return_residuals_sorted;
run;

proc sort data=return_residuals out=return_residuals_sorted;
    by descending cd;
run;

proc print data=return_residuals_sorted;
run;

/*
  Compare the estimated coefficients with and without the high influence
  observation: sl2 keeps only the rows where return is less than 10.
*/
data sl2;
    set sl;
    where return lt 10;
run;

proc reg data=sl2;
    model return = beta sigma;
run;
quit;