http://ww2.amstat.org/publications/jse/v21n1/witt.pdf
http://nsidc.org/research/bios/fetterer.html
ftp://sidads.colorado.edu/DATASETS/NOAA/G02135/north/monthly/data/N_08_extent_v3.0.csv
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
#from scipy import stats
data = pd.read_csv('N_09_extent_v3.0.csv', dtype={'year': np.int32, 'extent': np.double})
data.head()
print(data.shape)
year = data['year']
extent = data[' extent']
plt.figure(figsize=(6,4))
plt.plot(year, extent, 'o')
npoints = data.shape[0]
print('number of data points = ', npoints)
def fitfunction(t,coeffs):
fit = 0
for i,c in enumerate(coeffs):
fit += c*t**i
return fit
ndata = 20 #use the first ndata points for the fit
year1 = year[:ndata]
extent1 = extent[:ndata]
A = np.array([
1+0*year1,
year1
]).T
b = np.array(extent1)
x = la.solve(A.T@A,A.T@b)
plt.plot(year1, fitfunction(year1,x))
plt.plot(year, extent, 'o')
plt.figure(figsize=(10,8))
plt.plot(year, extent, 'o')
for ndata in range(22, npoints):
year1 = year[:ndata]
extent1 = extent[:ndata]
A = np.array([
1+0*year1,
year1
]).T
b = np.array(extent1)
x = la.solve(A.T@A,A.T@b)
plt.plot(year1, fitfunction(year1,x), label='%d' % (year[0]+ndata))
plt.legend()
ndata = 26 #use the first ndata points for the fit
year1 = year[:ndata]
extent1 = extent[:ndata]
A = np.array([
1+0*year1,
year1,
year1**2
]).T
b = np.array(extent1)
x = la.solve(A.T@A,A.T@b)
print(x)
plt.plot(year, fitfunction(year,x))
plt.plot(year, extent, 'o')
Let's try to use the least square function from scipy
coeffs,residual,rank,sval=np.linalg.lstsq(A,b,rcond=None)
plt.plot(year, fitfunction(year,coeffs))
plt.plot(year, extent, 'o')
print(coeffs)
Seems to work with lstsq
... what could be the issue with the Normal Equations method above?
Let's check the condition number of the matrix A
print(la.cond(A))
print(x)
print(la.norm(A@x-b))
The matrix A becomes closer to singular as the number of columns increases (i.e., as the number of coefficients for the fit increase). We can scale the years, to mitigate this situation:
year2 = year - 1980
extent2 = extent
A = np.array([
1+0*year2,
year2,
year2**2
]).T
b = np.array(extent2)
x = la.solve(A.T@A,A.T@b)
print(la.cond(A))
print(x)
plt.plot(year2, fitfunction(year2,x))
plt.plot(year2, extent, 'o')