procedure tcolstat(tabfil,delim,colnr) char tabfil ="" {prompt="free format ASCII file"} char delim =" " {prompt="column separation character"} int colnr =1 {prompt="column to calculate statistics for"} char rows ="-" {prompt="rows to include in statistics (\"-\"=all)"} int nval =0 {prompt="number of column entries"} real average =0. {prompt="average of column entries"} real midpt =0. {prompt="midpoint (median) of column entries"} real stddev =0. {prompt="standard deviation"} real lquart =0. {prompt="value at lower quartile point (-34.135%)"} real uquart =0. {prompt="value at upper quartile point (+34.135%)"} real loctil =0. {prompt="value at lower octile point (-47.725%)"} real uoctil =0. {prompt="value at upper octile point (+47.725%)"} real min =0. {prompt="minimum of column entries"} real max =0. {prompt="maximum of column entries"} real sumx =0. {prompt="sum of column entries"} real sumx2 =0. {prompt="sum of squares of column entries"} bool verbose =no {prompt="print output to screen"} # @(#) task tcolstat Author: R.A. Jansen -- Jul 9 1996 # @(#) # @(#) task to calculate the column statistics in a free format ASCII # @(#) table. Sum, average and number of column entries are output task # @(#) parameters. Maximum number of lines is 32768. # @(#) Sep 24 1996 -- abandoned use of 'rdlist' in favor of 'fscan', to # @(#) speed this task up for tables containing more than # @(#) several tens of lines. # @(#) Jan 31 1997 -- Added check for commented lines (character '#' in # @(#) column 1). These lines are skipped. # @(#) Added calculation of standard deviation 'sigma' # @(#) and 'sum of squares'. Added verbose option. # @(#) Dec 2 1997 -- To speed this task up, sigma is now approximated # @(#) as half the difference of the 84% and 15% values, # @(#) saving another pass over all data values. There no # @(#) longer is a limit on the number of lines, but a # @(#) warning is given if the 'nval' exceeds 32768. begin string infile, sep, cstr, ostr, crows, blank, uline string cavg, cmed, csig, cmin, cmax, cnn, cnc struct line int ncol, NS, NN, nrow1, nrow2 real rsum, ssum, val, avg, rmin, rmax, med real hival, loval, msum, sigma, quart1, quart2, oct1, oct2 bool hundred infile = tabfil ncol = colnr sep = delim if ( sep == "" ) { sep = " " } if ( ncol > 15 ) { error (1, "Current limit on column number is 15.") } crows = rows if ( crows == "-" ) { nrow1 = 1 nrow2 = 9999999 } else { cparse (crows, delim="-") if ( cparse.nfields == 1 ) { nrow1 = int(cparse.field1) nrow2 = 9999999 } else if (cparse.nfields == 2 && cparse.field1 == "" ) { nrow1 = 1 nrow2 = int(cparse.field2) } else if (cparse.nfields == 2 ) { nrow1 = int(cparse.field1) nrow2 = int(cparse.field2) } else { error (1, "Could not parse \"rows="//crows//"\"!") } } # Check whether input file exist if ( !access(infile) ) { error (1, "File "//infile//" not found!") } # Input OK. Get number table lines and calculate column average print (infile//": calculating statistics of column "//str(ncol)//" ...") rmin = 9999999.9999 rmax = -9999999.9999 rsum = 0.00 ssum = 0.00 NN = 0 NS = 0 list = infile while ( fscan(list,line) != EOF ) { cstr = line if ( substr(cstr,1,1) != "#" ) { NS += 1 if ( NS >= nrow1 && NS <= nrow2 ) { NN += 1 cparse (cstr, delim=sep, nselect=ncol) val = real(cparse.selfield) rsum = rsum + val ssum = ssum + (val*val) if ( val <= rmin ) { rmin = val } if ( val >= rmax ) { rmax = val } print(val, >> "tmptclstt.val") if (verbose) { printf ("%1s", "\r processing line: "//str(NN)) } } if ( NS > 32768 ) { print ("WARNING: Number of lines exceeds 32768! Continuing...") } } } printf ("%1s\n", " ") avg = rsum/real(NN) sort ("tmptclstt.val", column=1, ignore_white=yes, numeric_sort=yes, reverse_sort=yes, > "tmptclstt.sort") NS = 0 list = "tmptclstt.sort" while ( fscan(list,line) != EOF ) { NS += 1 if ( NS == max(1,nint(0.0227495*NN))) { oct1 = real(line) } if ( NS == max(1,nint(0.1586470*NN))) { quart1 = real(line) } if ( NS == nint(0.5000000*NN) ) { med = real(line) } if ( NS == nint(0.8413530*NN) ) { quart2 = real(line) } if ( NS == nint(0.9772505*NN) ) { oct2 = real(line) } } sigma = 0.5*(quart1-quart2) if (NN<=25) { oct1 = INDEF oct2 = INDEF } list = "" delete ("tmptclstt.val", yes, verify=no) delete ("tmptclstt.sort", yes, verify=no) blank = " " cavg = str(0.00001*nint(avg*100000)) cmed = str(0.00001*nint(med*100000)) csig = str(0.00001*nint(sigma*100000)) cmin = str(0.00001*nint(rmin*100000)) cmax = str(0.00001*nint(rmax*100000)) cnc = str(ncol) cnn = str(NN) if (verbose) { # print results... uline = "========================================" uline = "#"//uline//uline ostr = infile//" "//cnc//" "//cnn ostr = ostr//" "//substr(blank,1,10-strlen(cavg))//cavg ostr = ostr//" "//substr(blank,1,10-strlen(cmed))//cmed ostr = ostr//" "//substr(blank,1,10-strlen(csig))//csig ostr = ostr//" "//substr(blank,1,10-strlen(cmin))//cmin ostr = ostr//" "//substr(blank,1,10-strlen(cmax))//cmax cstr = "#"//substr(blank,1,nint(0.5*strlen(infile))-3)//"file"//substr(blank,1,nint(0.5*strlen(infile))-4) cstr = cstr//" "//substr(blank,1,strlen(cnc)-3)//"col" cstr = cstr//" "//substr(blank,1,strlen(cnn)-5)//"nval" cstr = cstr//" "//substr(blank,1,max(1,strlen(cavg)-7))//"average" cstr = cstr//" "//substr(blank,1,max(5,strlen(cmed)-5))//"midpt" cstr = cstr//" "//substr(blank,1,max(4,strlen(csig)-6))//"stddev" cstr = cstr//" "//substr(blank,1,max(7,strlen(cmin)-3))//"min" cstr = cstr//" "//substr(blank,1,max(7,strlen(cmax)-3))//"max" print (cstr) print (substr(uline,1,strlen(cstr))) print (ostr) } # update task parameters nval = NN average = real(cavg) midpt = real(cmed) stddev = real(csig) min = real(cmin) max = real(cmax) sumx = rsum sumx2 = ssum lquart = quart2 uquart = quart1 loctil = oct2 uoctil = oct1 end