This code is cleaned up and runs turnkey from top to bottom. In order to run it you need to download 3 files from the following FTP site:
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/
v2.mean.z
v2.country.codes.z
v2.temperature.inv.z
Unzip them to the folder of your choice using a free zip software, and point the variable folder to the folder you chose.
The rest of this runs for me at that point. The results are compiled as seasonal anomaly in
nh – northern hemisphere
sh – southern hemisphere
glt – global temperature
These include the seasonal variation in their graphs, to anomalize there are three plotting routines at the bottom which create the variables
nha – northern hemisphere anomaly
sha – southern hemisphere anomaly
gla – global temperature anomaly
In addition to the above output, the array gridtem holds the temperature data in a 36 x 72 x columns matrix. The data is not anomalized but can be accessed for further processing at that point.
Documentation for the unique methods employed by Roman M used to assimilate this data is at the following links:
http://statpad.wordpress.com/2010/02/19/combining-stations-plan-b/
http://statpad.wordpress.com/2010/02/25/comparing-single-and-monthly-offsets/
http://statpad.wordpress.com/2010/03/04/weighted-seasonal-offsets/
http://statpad.wordpress.com/2010/03/18/anomaly-regression-%E2%80%93-do-it-right/
https://noconsensus.wordpress.com/2010/03/21/demonstration-of-improved-anomaly-calculation/
https://noconsensus.wordpress.com/2010/03/24/thermal-hammer/
################################################### ## gridded global trend calculation with offset#### ################################################### #### load external functions filtering used source("http://www.climateaudit.info/scripts/utilities.txt") #Steve McIntyre ################################################### ######## data load ################################ ################################################### folder="C:/agw/R/GHCN/" loc=paste(folder,"v2.mean",sep="") wd=c(3,5,3,1,4,5,5,5,5,5,5,5,5,5,5,5,5) ghmean=read.fwf(loc,widths=wd,comment.char="") loc=paste(folder,"v2.country.codes",sep="") wd=c(3,38) ccode=read.fwf(loc,widths=wd) loc=paste(folder,"v2.temperature.inv",sep="") wd=c(3,5,4,30,8,7,4,6,1,10,3,2,16,1) tinv=read.fwf(loc,widths=wd,comment.char="") stationnum=nrow(tinv) #7280 #save(ghmean, ccode, stationnum,tinv ,file="GHCN data.RData") #load("ghcn data.RData") ################################################### ######## functions ################################ ################################################### # Some functions below were written by Ryan O SteveM, NicL,RomanM ### Gausssian filter ff=function(x) { filter.combine.pad(x,truncated.gauss.weights(11) )[,2]#31 } ### Sum of squares ssq=function(x) {sum(x^2)} ################################################ ######## roman M offset series combination ##### ################################################ ##### version2.0 ### subfunction to do pseudoinverse psx.inv = function(mat,tol = NULL) { if (NCOL(mat)==1) return( mat /sum(mat^2)) msvd = svd(mat) dind = msvd$d if (is.null(tol)) { tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps } dind[dind<tol]=0 dind[dind>0] = 1/dind[dind>0] inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u) inv } ### subfunction to do offsets calcx.offset = function(tdat,wts) { ## new version nr = length(wts) delt.mat = !is.na(tdat) delt.vec = rowSums(delt.mat) row.miss= (delt.vec ==0) delt2 = delt.mat/(delt.vec+row.miss) co.mat = diag(colSums(delt.mat)) - (t(delt.mat)%*% delt2) co.vec = colSums(delt.mat*tdat,na.rm=T) - colSums(rowSums(delt.mat*tdat,na.rm=T)*delt2) co.mat[nr,] = wts co.vec[nr]=0 psx.inv(co.mat)%*%co.vec } temp.combine = function(tsdat, wts=NULL, all=T) { ### main routine nr = nrow(tsdat) nc = ncol(tsdat) dims = dim(tsdat) if (is.null(wts)) wts = rep(1,nc) wts=wts/sum(wts) off.mat = matrix(NA,12,nc) dat.tsp = tsp(tsdat) for (i in 1:12) off.mat[i,] = calcx.offset(window(tsdat,start=c(dat.tsp[1],i), deltat=1), wts) colnames(off.mat) = colnames(tsdat) rownames(off.mat) = month.abb matoff = matrix(NA,nr,nc) for (i in 1:nc) matoff[,i] = rep(off.mat[,i],length=nr) temp = rowMeans(tsdat-matoff,na.rm=T) pred=NULL residual=NULL if (all==T) { pred =c(temp) + matoff residual = tsdat-pred } list(temps = ts(temp,start=c(dat.tsp[1],1),freq=12),pred =pred, residual = residual, offsets=off.mat) } #pick out those series with have at least nn + 1 observations in every month #Outputs a logical vector with TRUE indicating that that sereis is OK dat.check = function(tsdat, nn=0) { good = rep(NA,ncol(tsdat)) for (i in 1:ncol(tsdat)) good[i]= (min(rowSums(!is.na(matrix(tsdat[,i],nrow=12))))>nn) good } ###################################################################### ##### slope plotting function with modifications for 12 month anomaly Jeff from Roman's,Ryan's, NicL and SteveM work ###################################################################### get.slope=function(dat=dat) { len=length(dat) month = factor(rep(1:12,as.integer(len/12)+1)) month=month[1:len] #first month by this method is not necessarily jan #the important bit is that there are 12 steps time2=time(dat) coes = coef(lm(dat~0+month+time2)) residual=ts(rep(NA,len),start=time2[1],deltat=1/12) fitline=array(NA,dim=c(len,12)) for(i in 1:12) { fitline[,i]=coes[13]*time2+coes[i] } anomaly=rep(NA,len) for(i in 1:len) { residual[i]=dat[i]-fitline[i,month[i]] } anomaly = residual+time2*coes[13]-mean(time2*coes[13],na.rm=TRUE) list(coes,residual,anomaly) } ### plot average slope using roman's methods plt.avg2=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled") { ### Get trend fm=get.slope(window(dat, st, en)) ### Initialize variables N=length(window(dat, st, en)) I=seq(1:N)/frequency(dat) rmI=ssq(I-mean(I)) ### Calculate sum of squared errors SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2) ### Calculate OLS standard errors seOLS=sqrt(SSE/rmI)*10 ### Calculate two-tailed 95% CI ci95=seOLS*1.964 ### Get lag-1 ACF r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]] ### Calculate CIs with Quenouille (Santer) adjustment for autocorrelation Q=(1-r1)/(1+r1) ciQ=ci95/sqrt(Q) maxx=max(time(fm[[3]])) minx=min(time(fm[[3]])) maxy=max(fm[[3]],na.rm=TRUE) miny=min(fm[[3]],na.rm=TRUE) ### Plot data plot(window(fm[[3]], st, en), main=main.t, ylab="Anomaly (Deg C)") lfit=lm(window(fm[[3]], st, en)~I(time(window(fm[[3]], st, en)))) ### Add trendline and x-axis abline(a=lfit[[1]][1],b=lfit[[1]][2], col=2); abline(fm, col=4, lwd=2) grid() ### Annotate text tx=paste("Slope: ", round(fm[[1]][13]*10, 4) ,"Deg C/Decade", "+/-", round(ciQ, 4) , "Deg C/Decade\n(corrected for AR(1) serial correlation)\nLag-1 value:" , round(r1, 3)) text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.8+miny, y.pos), tx, cex=.8, col=4, pos=4) } ######## function plots monthly data with seasonal signal intact 12 slope curves plt.seasonal.slopes=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled") { ### Get trend fm=get.slope(window(dat, st, en)) ### Initialize variables N=length(window(dat, st, en)) I=seq(1:N)/frequency(dat) rmI=ssq(I-mean(I)) ### Calculate sum of squared errors SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2) ### Calculate OLS standard errors seOLS=sqrt(SSE/rmI)*10 ### Calculate two-tailed 95% CI ci95=seOLS*1.964 ### Get lag-1 ACF r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]] ### Calculate CIs with Quenouille (Santer) adjustment for autocorrelation Q=(1-r1)/(1+r1) ciQ=ci95/sqrt(Q) maxx=max(time(dat)) minx=min(time(dat)) maxy=max(dat,na.rm=TRUE) miny=min(dat,na.rm=TRUE) ### Plot data plot(window(dat, st, en), main=main.t, ylab="Temperature (Deg C)") ### Add 12 trendlines and x-axis for(i in 1:12) { abline(a=fm[[1]][i],b=fm[[1]][13], col=2); } grid() ### Annotate text tx=paste("Slope: ", round(fm[[1]][13]*10, 4),"C/Dec" ,"+/-", round(ciQ, 4) ,"C/Dec(AR(1)Lag-1 val:" , round(r1, 3)) text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.05+miny, y.pos), tx, cex=.8, col=4, pos=4) } dat=nh plt.seasonal.slopes(dat,main.t="Typical Temperature Series") #savePlot("c:/agw/plots/12 line seasonal sloope.jpg",type="jpg") ########################################################### ### combine staton data by insuring whether each series is in fact the same ########################################################### getsinglestation=function(staid=60360, version=0) { staraw=NA #raw data smask= (ghmean[,2]==staid) mask= ghmean[smask,3]==version data=ghmean[smask,][mask,] noser= levels(factor(data[,4])) for(j in noser) { mask2=data[,4]==j startyear=min(data[mask2,5]) endyear=max(data[mask2,5]) subd=data[mask2,6:17] index=(data[mask2,5]-startyear)+1 dat=array(NA,dim=c(12,(endyear-startyear)+1)) for(k in 1:length(index)) { dat[,index[k]]=as.numeric(subd[k,]) } dim(dat)=c(length(dat),1) dat[dat==-9999]=NA dat=dat/10 rawd=ts(dat,start=startyear,deltat=1/12) if(max(time(rawd))>=2011) { print ("error series") rawd=NA } if(!is.ts(staraw)) { staraw=rawd }else{ staraw=ts.union(staraw,rawd) } } if(!is.null(ncol(staraw))) { allraw=ts(rowMeans(staraw,na.rm=TRUE),start=time(staraw)[1],freq=12) }else{ allraw=staraw } allraw } ############################################################## ############ Assemble stations into grid Jeff Id ############# ############################################################## ##assign stations to gridcells gridval=array(NA,dim=c(stationnum,2)) gridval[,1]=as.integer((tinv[,5]+90)/5)+1 #lat gridval[,2]=as.integer((tinv[,6]+180)/5)+1 #lon #calculate gridcell trends gridtem=array(NA,dim=c(36,72,211*12)) for(ii in 1:72) { for(jj in 1:36) { maska=gridval[,2]==ii maskb=gridval[,1]==jj maskc= maska & maskb sta=NA ##initialize to non TS for(i in (1:stationnum)[maskc]) { #get all station trends for station ID rawsta = getsinglestation(tinv[i,2],tinv[i,3]) #place all curves side by side in a ts structure if(!is.ts(sta)) { sta=rawsta }else{ ts.union(sta,rawsta) } } if(is.ts(sta))#if at least one time series exists { sta=window(sta,start=1800) #ignore pre-1800 index=as.integer(((time(sta)-1800)*12+.002)+1) #calculate time index for insertion into array if(!is.null(ncol(sta))) #is station more than one column { gridtem[jj,ii,index]=temp.combine(sta)$temps }else{ gridtem[jj,ii,index]=sta } print(jj) #progress debug tit=paste("ii=",ii,"jj=",jj) plot(ts(gridtem[jj,ii,],start=1800,freq=12),main=tit) #debug } } print ("COLUMN") #progress debug print(ii) #progress debug } ########################################### ## calculate global trends using roman's offset combine on each grid ########################################### rgl=gridtem dim(rgl)=c(36*72,2532) rgl=t(rgl) rgl=ts(rgl,start=1800,deltat=1/12) mask=!is.na(colMeans(rgl,na.rm=TRUE)) weight=sin(seq(2.5,180, 5)*(pi)/180) #cell weights wt=rep(weight,72) maskb=array(FALSE,dim=c(36,72)) maskb[1:18,]=TRUE dim(maskb)=(36*72) maskc=mask & maskb sh=temp.combine(rgl[,maskc],wt[maskc])$temps maskc=mask & !maskb nh=temp.combine(rgl[,maskc],wt[maskc])$temps un=ts.union(nh,sh) glt=temp.combine(un)$temps # nh - northern hemisphere trend, sh - southern, glt=global trend ########################################### #### plotting and anomaly calculation ########################################### nha=get.slope(nh) plot(window(nha[[3]],1900),main="Northern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year") #savePlot("c:/agw/plots/northern hemisphere anom.jpg",type="jpg") sha=get.slope(sh) plot(window(sha[[3]],1900),main="Southern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year") #savePlot("c:/agw/plots/Southern hemisphere anom.jpg",type="jpg") gla= get.slope(glt) plot(ff(window(nha[[3]],1978)),main="Hemispheric Temperature Anomaly",ylab="Anomaly Deg C",xlab="Year") lines(ff(window(sha[[3]],1978)),col=2) legend("bottomright",c("Northern","Southern"),col=c(1,2),pch=10)
As a newbie on this blog I probably missed something in the lead up to this thread. Presumably one needs some form of Windows to run this code. Can it be run in Linux (I have Ubuntu 8.04).
I don’t have a lot of time today, but the language is R. It’s free and easy to find by google.
You need to slap a version number in there somewhere, my good man.
Gallop, go to this site for the binary distribution.
You guys are awesome! Thanks.
Jeff
Posted this on the annoucement site but maybe this is where I should post this.
Congratulations on the baby boy and to Mom Id.
I have used your program to look at all the British Columbia sites in the GHCN v2.mean database. I have done the seasonal means and improved anomaly for all of them.
Just some things that I noticed with compiling an Excel worksheet on the results.
Just talking BC here. I have no idea if other parts of the GHCN data base show these kind of results for other areas. I do not know how your program uses all the sites in the database.
1. The record is poor. Most records stop in 1990. Very few go beyond 2000 or to 2010.
2. Most records are for the period ~1950 to 1990.
3. The above records can show large decadal seasonal/anomaly slope and large +/-
4. There are few records of the 1930s/40s – at time of increased temperatures historically.
Would these out of date, and somewhat time specific (1950 t0 1990) data have an effect on your results for the whole database if my findings for BC are the rule rather than the exception?
I have my records in an Excel worksheet ( includes all the slopes and +/- data)which I could send you if you would be interested as well as all the plots (22 Mb folder.
By the way, the only US area I did was 72235 version 1 thru 12. They show much more up to date data ie to 2006.
Some results after sorting for record length:
Length
of record Slope +/-
NEW WESTMINSTER, BC 1874 1980 106 0.0989 0.0329
ESTEVAN POINT 1908 2010 102 0.0382 0.0278
PRINCE RUPERT 1908 2010 102 -0.0678 0.0369
AGASSIZ CDA,BC 1889 1990 101 0.0722 0.0397
QUATSINO,BC 1895 1990 95 0.0903 0.0364
BELLA COOLA,BC 1895 1990 95 0.1108 0.0481
FORT ST JAMES,BC 1895 1990 95 0.2206 0.069
BIG CREEK,BC 1893 1986 93 -0.0986 0.0639
BARKERVILLE,BC 1888 1980 92 0.0059 0.0499
ATLIN,BC 1899 1990 91 0.0819 0.085
MCINNES IS.,B 1954 1990 36 0.1455 0.1697
CASSIAR,BC 1954 1990 36 0.5108 0.2783
BARRIERE,BC 1955 1990 35 0.1116 0.2574
TLELL,BC 1957 1990 33 0.055 0.2046
ETHELDA BAY,BC 1957 1990 33 -0.1716 0.1793
HOLBERG,BC 1958 1990 32 0.1028 0.1724
This must be having some effect on the data – yes/no?
http://surfacestations.googlecode.com/files/RuralStations_1km.kml
I got some R code for building kml files… cool stuff
Jeff can you run a Conus Study?
Conus?
Click to access HadGHCND_paper.pdf
Interesting work on the correlation between stations..
Hey Jeff,
I’ve been asking Roman M over at his blog about the code for the offsets (i.e. the hammer) and how to use it for a set of annual series. (Just a plain set of annual series). I am more or less at the beginner stage of R so I think I had trouble understanding some of his description so I was wondering (since you’re using this practically) if you could help me better understand what the function is doing. In particular I was wondering what sort of input do I need. For example I have a .csv file of 31 temperature time series and I would like to use this method to just simply combine them. What sort of header should I have? i.e. should it be:
Year Station A Station B
1880 -1 -5
1881
etc…
The code he gave me was:
###function for calculating matrix pseudo inverse for any r x c size
psx.inv = function(mat,tol = NULL) {
# if (is.null(dim(mat))) return( mat /sum(mat^2))
if (NCOL(mat)==1) return( mat /sum(mat^2))
msvd = svd(mat)
dind = msvd$d
if (is.null(tol)) {tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps}
dind[dind>tol] = 1/dind[dind>tol]
inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u)
inv}
### single offset calculation routine
# also used by monthly routine
calc.offset = function(tdat) {
delt.mat = !is.na(tdat)
delt.vec = rowSums(delt.mat)
co.mat = diag(colSums(delt.vec*delt.mat))-(t(delt.mat)%*%delt.mat)
co.vec = colSums(delt.vec*delt.mat*tdat,na.rm=T)-colSums(rowSums(delt.mat*tdat,na.rm=T)*delt.mat)
offs = psx.inv(co.mat)%*%co.vec
c(offs)-mean(offs)}
Jeff, in the meantime NOAA had removed v2.mean temperature data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/ (and from ../v1 too). Do you have a backup copy?
I doubt it but I will check my other computer.