the Air Vent

Because the world needs another opinion

GHCN Global Temperatures – Compiled Using Offset Anomalies

Posted by Jeff Id on March 28, 2010

This code is cleaned up and runs turnkey from top to bottom. In order to run it you need to download 3 files from the following FTP site:

ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/
v2.mean.z
v2.country.codes.z
v2.temperature.inv.z

Unzip them to the folder of your choice using a free zip software, and point the variable folder to the folder you chose.

The rest of this runs for me at that point.  The results are compiled as seasonal anomaly in

nh – northern hemisphere

sh – southern hemisphere

glt – global temperature

These include the seasonal variation in their graphs, to anomalize there are three plotting routines at the bottom which create the variables

nha – northern hemisphere anomaly

sha – southern hemisphere anomaly

gla – global temperature anomaly

In addition to the above output, the array gridtem holds the temperature data in a 36 x 72 x columns matrix.  The data is not anomalized but can be accessed for further processing at that point.

Documentation for the unique methods employed by Roman M used to assimilate this data is at the following links:

http://statpad.wordpress.com/2010/02/19/combining-stations-plan-b/

http://statpad.wordpress.com/2010/02/25/comparing-single-and-monthly-offsets/

http://statpad.wordpress.com/2010/03/04/weighted-seasonal-offsets/

http://statpad.wordpress.com/2010/03/18/anomaly-regression-%E2%80%93-do-it-right/

http://noconsensus.wordpress.com/2010/03/21/demonstration-of-improved-anomaly-calculation/

http://noconsensus.wordpress.com/2010/03/24/thermal-hammer/


###################################################
## gridded global trend calculation with offset####
###################################################

#### load external functions filtering used
source("http://www.climateaudit.info/scripts/utilities.txt")  #Steve McIntyre

###################################################
######## data load ################################
###################################################
folder="C:/agw/R/GHCN/"

loc=paste(folder,"v2.mean",sep="")
wd=c(3,5,3,1,4,5,5,5,5,5,5,5,5,5,5,5,5)
ghmean=read.fwf(loc,widths=wd,comment.char="")

loc=paste(folder,"v2.country.codes",sep="")
wd=c(3,38)
ccode=read.fwf(loc,widths=wd)

loc=paste(folder,"v2.temperature.inv",sep="")
wd=c(3,5,4,30,8,7,4,6,1,10,3,2,16,1)
tinv=read.fwf(loc,widths=wd,comment.char="")
stationnum=nrow(tinv) #7280

#save(ghmean, ccode, stationnum,tinv ,file="GHCN data.RData")
#load("ghcn data.RData")

###################################################
######## functions ################################
###################################################
# Some functions below were written by Ryan O SteveM, NicL,RomanM

### Gausssian filter
ff=function(x)
{
	filter.combine.pad(x,truncated.gauss.weights(11) )[,2]#31
}

### Sum of squares
ssq=function(x) {sum(x^2)}

################################################
######## roman M offset series combination #####
################################################
##### version2.0
### subfunction to do pseudoinverse
psx.inv = function(mat,tol = NULL)
{
 	if (NCOL(mat)==1) return( mat /sum(mat^2))
	msvd = svd(mat)
 	dind = msvd$d
	if (is.null(tol))
	{
		tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps
	}
 	dind[dind<tol]=0
 	dind[dind>0] = 1/dind[dind>0]
 	inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u)
	inv
}
### subfunction to do offsets
calcx.offset = function(tdat,wts)
{
	## new version
 	nr = length(wts)
 	delt.mat = !is.na(tdat)
 	delt.vec = rowSums(delt.mat)
 	row.miss= (delt.vec ==0)
 	delt2 = delt.mat/(delt.vec+row.miss)
 	co.mat = diag(colSums(delt.mat)) - (t(delt.mat)%*% delt2)
 	co.vec = colSums(delt.mat*tdat,na.rm=T) - colSums(rowSums(delt.mat*tdat,na.rm=T)*delt2)
 	co.mat[nr,] = wts
 	co.vec[nr]=0
 	psx.inv(co.mat)%*%co.vec
}

temp.combine = function(tsdat, wts=NULL, all=T)
{
	### main routine
	nr = nrow(tsdat)
	nc = ncol(tsdat)
 	dims = dim(tsdat)
 	if (is.null(wts)) wts = rep(1,nc)
 	wts=wts/sum(wts)
 	off.mat = matrix(NA,12,nc)
 	dat.tsp = tsp(tsdat)
 	for (i in 1:12) off.mat[i,] = calcx.offset(window(tsdat,start=c(dat.tsp[1],i), deltat=1), wts)
 	colnames(off.mat) = colnames(tsdat)
 	rownames(off.mat) = month.abb
 	matoff = matrix(NA,nr,nc)
 	for (i in 1:nc) matoff[,i] = rep(off.mat[,i],length=nr)
 	temp = rowMeans(tsdat-matoff,na.rm=T)
 	pred=NULL
 	residual=NULL
 	if (all==T)
	{
		pred =c(temp) + matoff
 		residual = tsdat-pred
	}
	list(temps = ts(temp,start=c(dat.tsp[1],1),freq=12),pred =pred, residual = residual, offsets=off.mat)
}

#pick out those series with have at least nn + 1 observations in every month
#Outputs a logical vector with TRUE indicating that that sereis is OK
dat.check = function(tsdat, nn=0)
{
	good = rep(NA,ncol(tsdat))
	for (i in 1:ncol(tsdat)) good[i]= (min(rowSums(!is.na(matrix(tsdat[,i],nrow=12))))>nn)
	good
}

######################################################################
##### slope plotting function with modifications for 12 month anomaly Jeff from Roman's,Ryan's, NicL and SteveM work
######################################################################
get.slope=function(dat=dat)
{
	len=length(dat)
	month = factor(rep(1:12,as.integer(len/12)+1))
	month=month[1:len]	#first month by this method is not necessarily jan
					#the important bit is that there are 12 steps
	time2=time(dat)
	coes = coef(lm(dat~0+month+time2))
	residual=ts(rep(NA,len),start=time2[1],deltat=1/12)
	fitline=array(NA,dim=c(len,12))
	for(i in 1:12)
	{
		fitline[,i]=coes[13]*time2+coes[i]
	}
	anomaly=rep(NA,len)
	for(i in 1:len)
	{
		residual[i]=dat[i]-fitline[i,month[i]]
	}
	anomaly = residual+time2*coes[13]-mean(time2*coes[13],na.rm=TRUE)
	list(coes,residual,anomaly)
}

### plot average slope using roman's methods
plt.avg2=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled") {

	###  Get trend
	fm=get.slope(window(dat, st, en))

	###  Initialize variables
	N=length(window(dat, st, en))
	I=seq(1:N)/frequency(dat)
	rmI=ssq(I-mean(I))

	###  Calculate sum of squared errors
	SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2)

	###  Calculate OLS standard errors
	seOLS=sqrt(SSE/rmI)*10

	###  Calculate two-tailed 95% CI
	ci95=seOLS*1.964

	###  Get lag-1 ACF
	r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]]

	###  Calculate CIs with Quenouille (Santer) adjustment for autocorrelation
	Q=(1-r1)/(1+r1)
	ciQ=ci95/sqrt(Q)

	maxx=max(time(fm[[3]]))
	minx=min(time(fm[[3]]))
	maxy=max(fm[[3]],na.rm=TRUE)
	miny=min(fm[[3]],na.rm=TRUE)

	###  Plot data
	plot(window(fm[[3]], st, en), main=main.t, ylab="Anomaly (Deg C)")
	lfit=lm(window(fm[[3]], st, en)~I(time(window(fm[[3]], st, en))))

	###  Add trendline and x-axis
	abline(a=lfit[[1]][1],b=lfit[[1]][2], col=2); abline(fm, col=4, lwd=2)
	grid()
	###  Annotate text
	tx=paste("Slope: ", round(fm[[1]][13]*10, 4)
		,"Deg C/Decade", "+/-", round(ciQ, 4)
		, "Deg C/Decade\n(corrected for AR(1) serial correlation)\nLag-1 value:"
		, round(r1, 3))
	text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.8+miny, y.pos), tx, cex=.8, col=4, pos=4)
}
######## function plots monthly data with seasonal signal intact 12 slope curves
plt.seasonal.slopes=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled")
{
	###  Get trend
	fm=get.slope(window(dat, st, en))

	###  Initialize variables
	N=length(window(dat, st, en))
	I=seq(1:N)/frequency(dat)
	rmI=ssq(I-mean(I))

	###  Calculate sum of squared errors
	SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2)

	###  Calculate OLS standard errors
	seOLS=sqrt(SSE/rmI)*10

	###  Calculate two-tailed 95% CI
	ci95=seOLS*1.964

	###  Get lag-1 ACF
	r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]]

	###  Calculate CIs with Quenouille (Santer) adjustment for autocorrelation
	Q=(1-r1)/(1+r1)
	ciQ=ci95/sqrt(Q)

	maxx=max(time(dat))
	minx=min(time(dat))
	maxy=max(dat,na.rm=TRUE)
	miny=min(dat,na.rm=TRUE)

	###  Plot data
	plot(window(dat, st, en), main=main.t, ylab="Temperature (Deg C)")

	###  Add 12 trendlines and x-axis
	for(i in 1:12)
	{
		abline(a=fm[[1]][i],b=fm[[1]][13], col=2);
	}
	grid()
	###  Annotate text
	tx=paste("Slope: ", round(fm[[1]][13]*10, 4),"C/Dec"
		,"+/-", round(ciQ, 4)
		,"C/Dec(AR(1)Lag-1 val:"
		, round(r1, 3))
	text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.05+miny, y.pos), tx, cex=.8, col=4, pos=4)
}
dat=nh
plt.seasonal.slopes(dat,main.t="Typical Temperature Series")
#savePlot("c:/agw/plots/12 line seasonal sloope.jpg",type="jpg")

###########################################################
### combine staton data by insuring whether each series is in fact the same
###########################################################
getsinglestation=function(staid=60360, version=0)
{
	staraw=NA
	#raw data
	smask= (ghmean[,2]==staid)
	mask= ghmean[smask,3]==version
	data=ghmean[smask,][mask,]
	noser= levels(factor(data[,4]))

	for(j in noser)
	{
		mask2=data[,4]==j
		startyear=min(data[mask2,5])
		endyear=max(data[mask2,5])
		subd=data[mask2,6:17]
		index=(data[mask2,5]-startyear)+1
		dat=array(NA,dim=c(12,(endyear-startyear)+1))
		for(k in 1:length(index))
		{
			dat[,index[k]]=as.numeric(subd[k,])
		}
		dim(dat)=c(length(dat),1)

		dat[dat==-9999]=NA
		dat=dat/10
		rawd=ts(dat,start=startyear,deltat=1/12)
		if(max(time(rawd))>=2011)
		{
			print ("error series")
			rawd=NA
		}

		if(!is.ts(staraw))
		{
			staraw=rawd
		}else{
			staraw=ts.union(staraw,rawd)
		}
	}
	if(!is.null(ncol(staraw)))
	{
		allraw=ts(rowMeans(staraw,na.rm=TRUE),start=time(staraw)[1],freq=12)
	}else{
		allraw=staraw
	}

	allraw
}

##############################################################
############ Assemble stations into grid Jeff Id #############
##############################################################

##assign stations to gridcells
gridval=array(NA,dim=c(stationnum,2))
gridval[,1]=as.integer((tinv[,5]+90)/5)+1  #lat
gridval[,2]=as.integer((tinv[,6]+180)/5)+1  #lon

#calculate gridcell trends
gridtem=array(NA,dim=c(36,72,211*12))

for(ii in 1:72)
{
	for(jj in 1:36)
	{
		maska=gridval[,2]==ii
		maskb=gridval[,1]==jj
		maskc= maska & maskb
		sta=NA ##initialize to non TS
		for(i in (1:stationnum)[maskc])
		{
			#get all station trends for station ID
			rawsta = getsinglestation(tinv[i,2],tinv[i,3])

			#place all curves side by side in a ts structure
			if(!is.ts(sta))
			{
				sta=rawsta
			}else{
				ts.union(sta,rawsta)
			}
		}

		if(is.ts(sta))#if at least one time series exists
		{

			sta=window(sta,start=1800)	#ignore pre-1800
			index=as.integer(((time(sta)-1800)*12+.002)+1)  #calculate time index for insertion into array
			if(!is.null(ncol(sta)))  #is station more than one column
			{
				gridtem[jj,ii,index]=temp.combine(sta)$temps
			}else{
				gridtem[jj,ii,index]=sta
			}
			print(jj)  #progress debug
			tit=paste("ii=",ii,"jj=",jj)
			plot(ts(gridtem[jj,ii,],start=1800,freq=12),main=tit)  #debug
		}
	}
	print ("COLUMN")  #progress debug
	print(ii) 		#progress debug
}

###########################################
## calculate global trends using roman's offset combine on each grid
###########################################
rgl=gridtem
dim(rgl)=c(36*72,2532)
rgl=t(rgl)
rgl=ts(rgl,start=1800,deltat=1/12)
mask=!is.na(colMeans(rgl,na.rm=TRUE))
weight=sin(seq(2.5,180, 5)*(pi)/180)	#cell weights
wt=rep(weight,72)
maskb=array(FALSE,dim=c(36,72))
maskb[1:18,]=TRUE
dim(maskb)=(36*72)
maskc=mask & maskb
sh=temp.combine(rgl[,maskc],wt[maskc])$temps
maskc=mask & !maskb
nh=temp.combine(rgl[,maskc],wt[maskc])$temps
un=ts.union(nh,sh)
glt=temp.combine(un)$temps

# nh - northern hemisphere trend, sh - southern, glt=global trend

###########################################
#### plotting and anomaly calculation
###########################################

nha=get.slope(nh)
plot(window(nha[[3]],1900),main="Northern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year")
#savePlot("c:/agw/plots/northern hemisphere anom.jpg",type="jpg")

sha=get.slope(sh)
plot(window(sha[[3]],1900),main="Southern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year")
#savePlot("c:/agw/plots/Southern hemisphere anom.jpg",type="jpg")

gla= get.slope(glt)

plot(ff(window(nha[[3]],1978)),main="Hemispheric Temperature Anomaly",ylab="Anomaly Deg C",xlab="Year")
lines(ff(window(sha[[3]],1978)),col=2)
legend("bottomright",c("Northern","Southern"),col=c(1,2),pch=10)


13 Responses to “GHCN Global Temperatures – Compiled Using Offset Anomalies”

  1. gallopingcamel said

    As a newbie on this blog I probably missed something in the lead up to this thread. Presumably one needs some form of Windows to run this code. Can it be run in Linux (I have Ubuntu 8.04).

  2. Jeff Id said

    I don’t have a lot of time today, but the language is R. It’s free and easy to find by google.

  3. Carrick said

    You need to slap a version number in there somewhere, my good man.

  4. Carrick said

    Gallop, go to this site for the binary distribution.

  5. gallopingcamel said

    You guys are awesome! Thanks.

  6. SteveWH said

    Jeff

    Posted this on the annoucement site but maybe this is where I should post this.

    Congratulations on the baby boy and to Mom Id.
    I have used your program to look at all the British Columbia sites in the GHCN v2.mean database. I have done the seasonal means and improved anomaly for all of them.
    Just some things that I noticed with compiling an Excel worksheet on the results.
    Just talking BC here. I have no idea if other parts of the GHCN data base show these kind of results for other areas. I do not know how your program uses all the sites in the database.
    1. The record is poor. Most records stop in 1990. Very few go beyond 2000 or to 2010.
    2. Most records are for the period ~1950 to 1990.
    3. The above records can show large decadal seasonal/anomaly slope and large +/-
    4. There are few records of the 1930s/40s – at time of increased temperatures historically.
    Would these out of date, and somewhat time specific (1950 t0 1990) data have an effect on your results for the whole database if my findings for BC are the rule rather than the exception?

    I have my records in an Excel worksheet ( includes all the slopes and +/- data)which I could send you if you would be interested as well as all the plots (22 Mb folder.

    By the way, the only US area I did was 72235 version 1 thru 12. They show much more up to date data ie to 2006.

    Some results after sorting for record length:

    Length
    of record Slope +/-
    NEW WESTMINSTER, BC 1874 1980 106 0.0989 0.0329
    ESTEVAN POINT 1908 2010 102 0.0382 0.0278
    PRINCE RUPERT 1908 2010 102 -0.0678 0.0369
    AGASSIZ CDA,BC 1889 1990 101 0.0722 0.0397
    QUATSINO,BC 1895 1990 95 0.0903 0.0364
    BELLA COOLA,BC 1895 1990 95 0.1108 0.0481
    FORT ST JAMES,BC 1895 1990 95 0.2206 0.069
    BIG CREEK,BC 1893 1986 93 -0.0986 0.0639
    BARKERVILLE,BC 1888 1980 92 0.0059 0.0499
    ATLIN,BC 1899 1990 91 0.0819 0.085

    MCINNES IS.,B 1954 1990 36 0.1455 0.1697
    CASSIAR,BC 1954 1990 36 0.5108 0.2783
    BARRIERE,BC 1955 1990 35 0.1116 0.2574
    TLELL,BC 1957 1990 33 0.055 0.2046
    ETHELDA BAY,BC 1957 1990 33 -0.1716 0.1793
    HOLBERG,BC 1958 1990 32 0.1028 0.1724

    This must be having some effect on the data – yes/no?

  7. http://surfacestations.googlecode.com/files/RuralStations_1km.kml

    I got some R code for building kml files… cool stuff

    Jeff can you run a Conus Study?

  8. Conus?

  9. http://hadobs.metoffice.com/hadghcnd/HadGHCND_paper.pdf

    Interesting work on the correlation between stations..

  10. [...] anomaly programs to examine these issues. These include Zeke Hausfather, Tamino, Nike Stokes, JeffId/RomanM and Chad. These independent methods have confirmed the general trends presented by CRUTEM [...]

  11. Robert said

    Hey Jeff,
    I’ve been asking Roman M over at his blog about the code for the offsets (i.e. the hammer) and how to use it for a set of annual series. (Just a plain set of annual series). I am more or less at the beginner stage of R so I think I had trouble understanding some of his description so I was wondering (since you’re using this practically) if you could help me better understand what the function is doing. In particular I was wondering what sort of input do I need. For example I have a .csv file of 31 temperature time series and I would like to use this method to just simply combine them. What sort of header should I have? i.e. should it be:

    Year Station A Station B
    1880 -1 -5
    1881

    etc…

    The code he gave me was:

    ###function for calculating matrix pseudo inverse for any r x c size

    psx.inv = function(mat,tol = NULL) {
    # if (is.null(dim(mat))) return( mat /sum(mat^2))
    if (NCOL(mat)==1) return( mat /sum(mat^2))
    msvd = svd(mat)
    dind = msvd$d
    if (is.null(tol)) {tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps}
    dind[dind>tol] = 1/dind[dind>tol]
    inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u)
    inv}

    ### single offset calculation routine
    # also used by monthly routine

    calc.offset = function(tdat) {
    delt.mat = !is.na(tdat)
    delt.vec = rowSums(delt.mat)
    co.mat = diag(colSums(delt.vec*delt.mat))-(t(delt.mat)%*%delt.mat)
    co.vec = colSums(delt.vec*delt.mat*tdat,na.rm=T)-colSums(rowSums(delt.mat*tdat,na.rm=T)*delt.mat)
    offs = psx.inv(co.mat)%*%co.vec
    c(offs)-mean(offs)}

  12. Hugo.Mildenberger@web.de said

    Jeff, in the meantime NOAA had removed v2.mean temperature data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/ (and from ../v1 too). Do you have a backup copy?

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

 
Follow

Get every new post delivered to your Inbox.

Join 140 other followers

%d bloggers like this: