GHCN Global Temperatures – Compiled Using Offset Anomalies

This code is cleaned up and runs turnkey from top to bottom. In order to run it you need to download 3 files from the following FTP site:

ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/
v2.mean.z
v2.country.codes.z
v2.temperature.inv.z

Unzip them to the folder of your choice using a free zip software, and point the variable folder to the folder you chose.

The rest of this runs for me at that point. The results are compiled as seasonal anomaly in

nh – northern hemisphere

sh – southern hemisphere

glt – global temperature

These include the seasonal variation in their graphs, to anomalize there are three plotting routines at the bottom which create the variables

nha – northern hemisphere anomaly

sha – southern hemisphere anomaly

gla – global temperature anomaly

In addition to the above output, the array gridtem holds the temperature data in a 36 x 72 x columns matrix. The data is not anomalized but can be accessed for further processing at that point.

Documentation for the unique methods employed by Roman M used to assimilate this data is at the following links:

http://statpad.wordpress.com/2010/02/19/combining-stations-plan-b/

http://statpad.wordpress.com/2010/02/25/comparing-single-and-monthly-offsets/

http://statpad.wordpress.com/2010/03/04/weighted-seasonal-offsets/

http://statpad.wordpress.com/2010/03/18/anomaly-regression-%E2%80%93-do-it-right/

https://noconsensus.wordpress.com/2010/03/21/demonstration-of-improved-anomaly-calculation/

https://noconsensus.wordpress.com/2010/03/24/thermal-hammer/


###################################################
## gridded global trend calculation with offset####
###################################################

#### load external functions filtering used
source("http://www.climateaudit.info/scripts/utilities.txt")  #Steve McIntyre

###################################################
######## data load ################################
###################################################
folder="C:/agw/R/GHCN/"

loc=paste(folder,"v2.mean",sep="")
wd=c(3,5,3,1,4,5,5,5,5,5,5,5,5,5,5,5,5)
ghmean=read.fwf(loc,widths=wd,comment.char="")

loc=paste(folder,"v2.country.codes",sep="")
wd=c(3,38)
ccode=read.fwf(loc,widths=wd)

loc=paste(folder,"v2.temperature.inv",sep="")
wd=c(3,5,4,30,8,7,4,6,1,10,3,2,16,1)
tinv=read.fwf(loc,widths=wd,comment.char="")
stationnum=nrow(tinv) #7280

#save(ghmean, ccode, stationnum,tinv ,file="GHCN data.RData")
#load("ghcn data.RData")

###################################################
######## functions ################################
###################################################
# Some functions below were written by Ryan O SteveM, NicL,RomanM

### Gausssian filter
ff=function(x)
{
	filter.combine.pad(x,truncated.gauss.weights(11) )[,2]#31
}

### Sum of squares
ssq=function(x) {sum(x^2)}

################################################
######## roman M offset series combination #####
################################################
##### version2.0
### subfunction to do pseudoinverse
psx.inv = function(mat,tol = NULL)
{
 	if (NCOL(mat)==1) return( mat /sum(mat^2))
	msvd = svd(mat)
 	dind = msvd$d
	if (is.null(tol))
	{
		tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps
	}
 	dind[dind<tol]=0
 	dind[dind>0] = 1/dind[dind>0]
 	inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u)
	inv
}
### subfunction to do offsets
calcx.offset = function(tdat,wts)
{
	## new version
 	nr = length(wts)
 	delt.mat = !is.na(tdat)
 	delt.vec = rowSums(delt.mat)
 	row.miss= (delt.vec ==0)
 	delt2 = delt.mat/(delt.vec+row.miss)
 	co.mat = diag(colSums(delt.mat)) - (t(delt.mat)%*% delt2)
 	co.vec = colSums(delt.mat*tdat,na.rm=T) - colSums(rowSums(delt.mat*tdat,na.rm=T)*delt2)
 	co.mat[nr,] = wts
 	co.vec[nr]=0
 	psx.inv(co.mat)%*%co.vec
}

temp.combine = function(tsdat, wts=NULL, all=T)
{
	### main routine
	nr = nrow(tsdat)
	nc = ncol(tsdat)
 	dims = dim(tsdat)
 	if (is.null(wts)) wts = rep(1,nc)
 	wts=wts/sum(wts)
 	off.mat = matrix(NA,12,nc)
 	dat.tsp = tsp(tsdat)
 	for (i in 1:12) off.mat[i,] = calcx.offset(window(tsdat,start=c(dat.tsp[1],i), deltat=1), wts)
 	colnames(off.mat) = colnames(tsdat)
 	rownames(off.mat) = month.abb
 	matoff = matrix(NA,nr,nc)
 	for (i in 1:nc) matoff[,i] = rep(off.mat[,i],length=nr)
 	temp = rowMeans(tsdat-matoff,na.rm=T)
 	pred=NULL
 	residual=NULL
 	if (all==T)
	{
		pred =c(temp) + matoff
 		residual = tsdat-pred
	}
	list(temps = ts(temp,start=c(dat.tsp[1],1),freq=12),pred =pred, residual = residual, offsets=off.mat)
}

#pick out those series with have at least nn + 1 observations in every month
#Outputs a logical vector with TRUE indicating that that sereis is OK
dat.check = function(tsdat, nn=0)
{
	good = rep(NA,ncol(tsdat))
	for (i in 1:ncol(tsdat)) good[i]= (min(rowSums(!is.na(matrix(tsdat[,i],nrow=12))))>nn)
	good
}

######################################################################
##### slope plotting function with modifications for 12 month anomaly Jeff from Roman's,Ryan's, NicL and SteveM work
######################################################################
get.slope=function(dat=dat)
{
	len=length(dat)
	month = factor(rep(1:12,as.integer(len/12)+1))
	month=month[1:len]	#first month by this method is not necessarily jan
					#the important bit is that there are 12 steps
	time2=time(dat)
	coes = coef(lm(dat~0+month+time2))
	residual=ts(rep(NA,len),start=time2[1],deltat=1/12)
	fitline=array(NA,dim=c(len,12))
	for(i in 1:12)
	{
		fitline[,i]=coes[13]*time2+coes[i]
	}
	anomaly=rep(NA,len)
	for(i in 1:len)
	{
		residual[i]=dat[i]-fitline[i,month[i]]
	}
	anomaly = residual+time2*coes[13]-mean(time2*coes[13],na.rm=TRUE)
	list(coes,residual,anomaly)
}

### plot average slope using roman's methods
plt.avg2=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled") {

	###  Get trend
	fm=get.slope(window(dat, st, en))

	###  Initialize variables
	N=length(window(dat, st, en))
	I=seq(1:N)/frequency(dat)
	rmI=ssq(I-mean(I))

	###  Calculate sum of squared errors
	SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2)

	###  Calculate OLS standard errors
	seOLS=sqrt(SSE/rmI)*10

	###  Calculate two-tailed 95% CI
	ci95=seOLS*1.964

	###  Get lag-1 ACF
	r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]]

	###  Calculate CIs with Quenouille (Santer) adjustment for autocorrelation
	Q=(1-r1)/(1+r1)
	ciQ=ci95/sqrt(Q)

	maxx=max(time(fm[[3]]))
	minx=min(time(fm[[3]]))
	maxy=max(fm[[3]],na.rm=TRUE)
	miny=min(fm[[3]],na.rm=TRUE)

	###  Plot data
	plot(window(fm[[3]], st, en), main=main.t, ylab="Anomaly (Deg C)")
	lfit=lm(window(fm[[3]], st, en)~I(time(window(fm[[3]], st, en))))

	###  Add trendline and x-axis
	abline(a=lfit[[1]][1],b=lfit[[1]][2], col=2); abline(fm, col=4, lwd=2)
	grid()
	###  Annotate text
	tx=paste("Slope: ", round(fm[[1]][13]*10, 4)
		,"Deg C/Decade", "+/-", round(ciQ, 4)
		, "Deg C/Decade\n(corrected for AR(1) serial correlation)\nLag-1 value:"
		, round(r1, 3))
	text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.8+miny, y.pos), tx, cex=.8, col=4, pos=4)
}
######## function plots monthly data with seasonal signal intact 12 slope curves
plt.seasonal.slopes=function(dat, st=1850, en=2011, y.pos=NA, x.pos=NA, main.t="Untitled")
{
	###  Get trend
	fm=get.slope(window(dat, st, en))

	###  Initialize variables
	N=length(window(dat, st, en))
	I=seq(1:N)/frequency(dat)
	rmI=ssq(I-mean(I))

	###  Calculate sum of squared errors
	SSE=ssq(fm[[2]][!is.na(fm[[2]])])/(N-2)

	###  Calculate OLS standard errors
	seOLS=sqrt(SSE/rmI)*10

	###  Calculate two-tailed 95% CI
	ci95=seOLS*1.964

	###  Get lag-1 ACF
	r1=acf(fm [[2]], lag.max=1, plot=FALSE,na.action=na.pass)$acf[[2]]

	###  Calculate CIs with Quenouille (Santer) adjustment for autocorrelation
	Q=(1-r1)/(1+r1)
	ciQ=ci95/sqrt(Q)

	maxx=max(time(dat))
	minx=min(time(dat))
	maxy=max(dat,na.rm=TRUE)
	miny=min(dat,na.rm=TRUE)

	###  Plot data
	plot(window(dat, st, en), main=main.t, ylab="Temperature (Deg C)")

	###  Add 12 trendlines and x-axis
	for(i in 1:12)
	{
		abline(a=fm[[1]][i],b=fm[[1]][13], col=2);
	}
	grid()
	###  Annotate text
	tx=paste("Slope: ", round(fm[[1]][13]*10, 4),"C/Dec"
		,"+/-", round(ciQ, 4)
		,"C/Dec(AR(1)Lag-1 val:"
		, round(r1, 3))
	text(x=ifelse(is.na(x.pos), (maxx-minx)*.2+minx,x.pos), y=ifelse(is.na(y.pos), (maxy-miny)*.05+miny, y.pos), tx, cex=.8, col=4, pos=4)
}
dat=nh
plt.seasonal.slopes(dat,main.t="Typical Temperature Series")
#savePlot("c:/agw/plots/12 line seasonal sloope.jpg",type="jpg")

###########################################################
### combine staton data by insuring whether each series is in fact the same
###########################################################
getsinglestation=function(staid=60360, version=0)
{
	staraw=NA
	#raw data
	smask= (ghmean[,2]==staid)
	mask= ghmean[smask,3]==version
	data=ghmean[smask,][mask,]
	noser= levels(factor(data[,4]))

	for(j in noser)
	{
		mask2=data[,4]==j
		startyear=min(data[mask2,5])
		endyear=max(data[mask2,5])
		subd=data[mask2,6:17]
		index=(data[mask2,5]-startyear)+1
		dat=array(NA,dim=c(12,(endyear-startyear)+1))
		for(k in 1:length(index))
		{
			dat[,index[k]]=as.numeric(subd[k,])
		}
		dim(dat)=c(length(dat),1)

		dat[dat==-9999]=NA
		dat=dat/10
		rawd=ts(dat,start=startyear,deltat=1/12)
		if(max(time(rawd))>=2011)
		{
			print ("error series")
			rawd=NA
		}

		if(!is.ts(staraw))
		{
			staraw=rawd
		}else{
			staraw=ts.union(staraw,rawd)
		}
	}
	if(!is.null(ncol(staraw)))
	{
		allraw=ts(rowMeans(staraw,na.rm=TRUE),start=time(staraw)[1],freq=12)
	}else{
		allraw=staraw
	}

	allraw
}

##############################################################
############ Assemble stations into grid Jeff Id #############
##############################################################

##assign stations to gridcells
gridval=array(NA,dim=c(stationnum,2))
gridval[,1]=as.integer((tinv[,5]+90)/5)+1  #lat
gridval[,2]=as.integer((tinv[,6]+180)/5)+1  #lon

#calculate gridcell trends
gridtem=array(NA,dim=c(36,72,211*12))

for(ii in 1:72)
{
	for(jj in 1:36)
	{
		maska=gridval[,2]==ii
		maskb=gridval[,1]==jj
		maskc= maska & maskb
		sta=NA ##initialize to non TS
		for(i in (1:stationnum)[maskc])
		{
			#get all station trends for station ID
			rawsta = getsinglestation(tinv[i,2],tinv[i,3])

			#place all curves side by side in a ts structure
			if(!is.ts(sta))
			{
				sta=rawsta
			}else{
				ts.union(sta,rawsta)
			}
		}

		if(is.ts(sta))#if at least one time series exists
		{

			sta=window(sta,start=1800)	#ignore pre-1800
			index=as.integer(((time(sta)-1800)*12+.002)+1)  #calculate time index for insertion into array
			if(!is.null(ncol(sta)))  #is station more than one column
			{
				gridtem[jj,ii,index]=temp.combine(sta)$temps
			}else{
				gridtem[jj,ii,index]=sta
			}
			print(jj)  #progress debug
			tit=paste("ii=",ii,"jj=",jj)
			plot(ts(gridtem[jj,ii,],start=1800,freq=12),main=tit)  #debug
		}
	}
	print ("COLUMN")  #progress debug
	print(ii) 		#progress debug
}

###########################################
## calculate global trends using roman's offset combine on each grid
###########################################
rgl=gridtem
dim(rgl)=c(36*72,2532)
rgl=t(rgl)
rgl=ts(rgl,start=1800,deltat=1/12)
mask=!is.na(colMeans(rgl,na.rm=TRUE))
weight=sin(seq(2.5,180, 5)*(pi)/180)	#cell weights
wt=rep(weight,72)
maskb=array(FALSE,dim=c(36,72))
maskb[1:18,]=TRUE
dim(maskb)=(36*72)
maskc=mask & maskb
sh=temp.combine(rgl[,maskc],wt[maskc])$temps
maskc=mask & !maskb
nh=temp.combine(rgl[,maskc],wt[maskc])$temps
un=ts.union(nh,sh)
glt=temp.combine(un)$temps

# nh - northern hemisphere trend, sh - southern, glt=global trend

###########################################
#### plotting and anomaly calculation
###########################################

nha=get.slope(nh)
plot(window(nha[[3]],1900),main="Northern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year")
#savePlot("c:/agw/plots/northern hemisphere anom.jpg",type="jpg")

sha=get.slope(sh)
plot(window(sha[[3]],1900),main="Southern Hemisphere Anomaly",ylab="Anomaly Deg C",xlab="Year")
#savePlot("c:/agw/plots/Southern hemisphere anom.jpg",type="jpg")

gla= get.slope(glt)

plot(ff(window(nha[[3]],1978)),main="Hemispheric Temperature Anomaly",ylab="Anomaly Deg C",xlab="Year")
lines(ff(window(sha[[3]],1978)),col=2)
legend("bottomright",c("Northern","Southern"),col=c(1,2),pch=10)


Rate this:
Share this:
Facebook
X
Like Loading...


	Related

Jeff

Posted this on the annoucement site but maybe this is where I should post this.

Congratulations on the baby boy and to Mom Id.
I have used your program to look at all the British Columbia sites in the GHCN v2.mean database. I have done the seasonal means and improved anomaly for all of them.
Just some things that I noticed with compiling an Excel worksheet on the results.
Just talking BC here. I have no idea if other parts of the GHCN data base show these kind of results for other areas. I do not know how your program uses all the sites in the database.
1. The record is poor. Most records stop in 1990. Very few go beyond 2000 or to 2010.
2. Most records are for the period ~1950 to 1990.
3. The above records can show large decadal seasonal/anomaly slope and large +/-
4. There are few records of the 1930s/40s – at time of increased temperatures historically.
Would these out of date, and somewhat time specific (1950 t0 1990) data have an effect on your results for the whole database if my findings for BC are the rule rather than the exception?

I have my records in an Excel worksheet ( includes all the slopes and +/- data)which I could send you if you would be interested as well as all the plots (22 Mb folder.

By the way, the only US area I did was 72235 version 1 thru 12. They show much more up to date data ie to 2006.

Some results after sorting for record length:

Length
of record Slope +/-
NEW WESTMINSTER, BC 1874 1980 106 0.0989 0.0329
ESTEVAN POINT 1908 2010 102 0.0382 0.0278
PRINCE RUPERT 1908 2010 102 -0.0678 0.0369
AGASSIZ CDA,BC 1889 1990 101 0.0722 0.0397
QUATSINO,BC 1895 1990 95 0.0903 0.0364
BELLA COOLA,BC 1895 1990 95 0.1108 0.0481
FORT ST JAMES,BC 1895 1990 95 0.2206 0.069
BIG CREEK,BC 1893 1986 93 -0.0986 0.0639
BARKERVILLE,BC 1888 1980 92 0.0059 0.0499
ATLIN,BC 1899 1990 91 0.0819 0.085

MCINNES IS.,B 1954 1990 36 0.1455 0.1697
CASSIAR,BC 1954 1990 36 0.5108 0.2783
BARRIERE,BC 1955 1990 35 0.1116 0.2574
TLELL,BC 1957 1990 33 0.055 0.2046
ETHELDA BAY,BC 1957 1990 33 -0.1716 0.1793
HOLBERG,BC 1958 1990 32 0.1028 0.1724

This must be having some effect on the data – yes/no?

13 thoughts on “GHCN Global Temperatures – Compiled Using Offset Anomalies”

gallopingcamel says:

March 28, 2010 at 3:31 pm

As a newbie on this blog I probably missed something in the lead up to this thread. Presumably one needs some form of Windows to run this code. Can it be run in Linux (I have Ubuntu 8.04).

Jeff Id says:

March 28, 2010 at 4:00 pm

I don’t have a lot of time today, but the language is R. It’s free and easy to find by google.

Carrick says:

March 28, 2010 at 4:44 pm

You need to slap a version number in there somewhere, my good man.

Carrick says:

March 28, 2010 at 5:13 pm

Gallop, go to this site for the binary distribution.

gallopingcamel says:

March 29, 2010 at 12:07 am

You guys are awesome! Thanks.

SteveWH says:

March 31, 2010 at 11:31 pm

Jeff

Posted this on the annoucement site but maybe this is where I should post this.

Congratulations on the baby boy and to Mom Id.
I have used your program to look at all the British Columbia sites in the GHCN v2.mean database. I have done the seasonal means and improved anomaly for all of them.
Just some things that I noticed with compiling an Excel worksheet on the results.
Just talking BC here. I have no idea if other parts of the GHCN data base show these kind of results for other areas. I do not know how your program uses all the sites in the database.
1. The record is poor. Most records stop in 1990. Very few go beyond 2000 or to 2010.
2. Most records are for the period ~1950 to 1990.
3. The above records can show large decadal seasonal/anomaly slope and large +/-
4. There are few records of the 1930s/40s – at time of increased temperatures historically.
Would these out of date, and somewhat time specific (1950 t0 1990) data have an effect on your results for the whole database if my findings for BC are the rule rather than the exception?

I have my records in an Excel worksheet ( includes all the slopes and +/- data)which I could send you if you would be interested as well as all the plots (22 Mb folder.

By the way, the only US area I did was 72235 version 1 thru 12. They show much more up to date data ie to 2006.

Some results after sorting for record length:

Length
of record Slope +/-
NEW WESTMINSTER, BC 1874 1980 106 0.0989 0.0329
ESTEVAN POINT 1908 2010 102 0.0382 0.0278
PRINCE RUPERT 1908 2010 102 -0.0678 0.0369
AGASSIZ CDA,BC 1889 1990 101 0.0722 0.0397
QUATSINO,BC 1895 1990 95 0.0903 0.0364
BELLA COOLA,BC 1895 1990 95 0.1108 0.0481
FORT ST JAMES,BC 1895 1990 95 0.2206 0.069
BIG CREEK,BC 1893 1986 93 -0.0986 0.0639
BARKERVILLE,BC 1888 1980 92 0.0059 0.0499
ATLIN,BC 1899 1990 91 0.0819 0.085

MCINNES IS.,B 1954 1990 36 0.1455 0.1697
CASSIAR,BC 1954 1990 36 0.5108 0.2783
BARRIERE,BC 1955 1990 35 0.1116 0.2574
TLELL,BC 1957 1990 33 0.055 0.2046
ETHELDA BAY,BC 1957 1990 33 -0.1716 0.1793
HOLBERG,BC 1958 1990 32 0.1028 0.1724

This must be having some effect on the data – yes/no?

steven mosher says:

April 5, 2010 at 2:18 pm

http://surfacestations.googlecode.com/files/RuralStations_1km.kml

I got some R code for building kml files… cool stuff

Jeff can you run a Conus Study?

steven mosher says:

April 9, 2010 at 3:23 am

Conus?

steven mosher says:

April 12, 2010 at 2:40 am

Click to access HadGHCND_paper.pdf

Interesting work on the correlation between stations..

Pingback: D’Aleo at Heartland: An Apple a Day « The Whiteboard
Robert says:

March 9, 2011 at 3:55 pm

Hey Jeff,
I’ve been asking Roman M over at his blog about the code for the offsets (i.e. the hammer) and how to use it for a set of annual series. (Just a plain set of annual series). I am more or less at the beginner stage of R so I think I had trouble understanding some of his description so I was wondering (since you’re using this practically) if you could help me better understand what the function is doing. In particular I was wondering what sort of input do I need. For example I have a .csv file of 31 temperature time series and I would like to use this method to just simply combine them. What sort of header should I have? i.e. should it be:

Year Station A Station B
1880 -1 -5
1881

etc…

The code he gave me was:

###function for calculating matrix pseudo inverse for any r x c size

psx.inv = function(mat,tol = NULL) {
# if (is.null(dim(mat))) return( mat /sum(mat^2))
if (NCOL(mat)==1) return( mat /sum(mat^2))
msvd = svd(mat)
dind = msvd$d
if (is.null(tol)) {tol = max(NROW(mat),NCOL(mat))*max(dind)*.Machine$double.eps}
dind[dind>tol] = 1/dind[dind>tol]
inv = msvd$v %*% diag(dind, length(dind)) %*% t(msvd$u)
inv}

### single offset calculation routine
# also used by monthly routine

calc.offset = function(tdat) {
delt.mat = !is.na(tdat)
delt.vec = rowSums(delt.mat)
co.mat = diag(colSums(delt.vec*delt.mat))-(t(delt.mat)%*%delt.mat)
co.vec = colSums(delt.vec*delt.mat*tdat,na.rm=T)-colSums(rowSums(delt.mat*tdat,na.rm=T)*delt.mat)
offs = psx.inv(co.mat)%*%co.vec
c(offs)-mean(offs)}

Hugo.Mildenberger@web.de says:

May 17, 2012 at 1:55 pm

Jeff, in the meantime NOAA had removed v2.mean temperature data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/ (and from ../v1 too). Do you have a backup copy?

1. Jeff Condon says:
  
  May 17, 2012 at 2:03 pm
  
  I doubt it but I will check my other computer.