Data Driven Modeling


PhD seminar series at Chair for Computer Aided Architectural Design (CAAD), ETH Zurich

Vahid Moosavi


Seventh Session


08 November 2016

Topics to be discussed

  • Self Organizing Maps
In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
pd.__version__
import sys
from scipy import stats
%matplotlib inline
In [2]:
# A two dimensional example
fig = plt.figure()

N = 100
d0 = 1.6*np.random.randn(N,2)
d0[:,0]= d0[:,0] - 3
plt.plot(d0[:,0],d0[:,1],'.b')

d1 = 1.6*np.random.randn(N,2)+7.6
plt.plot(d1[:,0],d1[:,1],'.b')

d2 = 1.6*np.random.randn(N,2)
d2[:,0]= d2[:,0] + 5
d2[:,1]= d2[:,1] + 1
plt.plot(d2[:,0],d2[:,1],'.b')

d3 = 1.6*np.random.randn(N,2)
d3[:,0]= d3[:,0] - 5
d3[:,1]= d3[:,1] + 4
plt.plot(d3[:,0],d3[:,1],'.b')


d4 = 1.8*np.random.randn(N,2)
d4[:,0]= d4[:,0] - 15
d4[:,1]= d4[:,1] + 14
plt.plot(d4[:,0],d4[:,1],'.b')


d5 = 1.8*np.random.randn(N,2)
d5[:,0]= d5[:,0] + 25
d5[:,1]= d5[:,1] + 14
plt.plot(d5[:,0],d5[:,1],'.b')
Data1 = np.concatenate((d0,d1,d2,d3,d4,d5))
fig.set_size_inches(7,7)

Self Organizing Map (SOM)

main concepts:

  • Vector quantization
  • Manifold Learning and high dimensional space indexing
  • Topology preserving dimensionality reduction

The Original Algorithm

Online learning

0- Assume we have a grid (e.g. 2D), where each node has an n-dimensional weight vector w and a 2-D index (x, y)

1- Initialize the weight vectors w with random values

2- For several iterations, repeat the following steps:

  • project each data point onto the grid and find the node most similar to it (using the selected similarity measure). This node is called the Best Matching Unit (BMU)
  • taking the BMU as the center, the BMU and its neighboring nodes adapt their weight vectors toward the data vector, scaled by a selected learning rate
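
In compact form, this is the standard online SOM update rule, which the code below implements with a Gaussian neighborhood:

$$w_j(t+1) = w_j(t) + \alpha(t)\, h_{j,b}(t)\,\bigl(x - w_j(t)\bigr), \qquad h_{j,b}(t) = \exp\!\left(-\frac{d_{\mathrm{grid}}(j,b)^2}{2\, r(t)^2}\right)$$

where b is the BMU of the data vector x, d_grid(j,b) is the distance between nodes j and b on the grid, and both the learning rate α(t) and the neighborhood radius r(t) decrease over the training iterations.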
In [14]:
def normalize_by(data_raw, data, method='var'):
    
    me = np.mean(data_raw, axis = 0)
    st = np.std(data_raw, axis = 0)
    if method == 'var':
        n_data = (data-me)/st
        return n_data
        
def denormalize_by(data_by, n_vect, n_method = 'var'):
    #based on the normalization
    if n_method == 'var':
        me = np.mean(data_by, axis = 0)
        st = np.std(data_by, axis = 0)
        vect = n_vect* st + me
        return vect 
    else:
        print 'data is not normalized before'
        return n_vect  
    
def random_initialize(mapsize,data):
    nnodes = mapsize[0]*mapsize[1]
    dim = data.shape[1]
    
    #It produces random values in the range of min- max of each dimension based on a uniform distribution
    mn = np.tile(np.min(data, axis =0)-1.5, (nnodes,1))
    mx = np.tile(np.max(data, axis =0)+1.5, (nnodes,1))
    init_W = mn + (mx-mn)*(np.random.rand(nnodes,dim))
    return init_W

def rect_dist(mapsize,bmu):
    #the way we consider the list of nodes in a planar grid is that node0 is on top left corner,
    #nodemapsz[1]-1 is top right corner and then it goes to the second row. 
    #no. of rows is map_size[0] and no. of cols is map_size[1]
    
    rows = mapsize[0]
    cols = mapsize[1]
    
    #bmu should be an integer between 0 and no_nodes-1
    if 0<=bmu<(rows*cols):
        c_bmu = int(bmu%cols)
        r_bmu = int(bmu/cols)
    else:
        print 'wrong bmu'
        return np.zeros(rows*cols)
      
    #calculating the grid distance
    if np.logical_and(rows>0 , cols>0): 
        r,c = np.arange(0, rows, 1)[:,np.newaxis] , np.arange(0,cols, 1)
        dist2 = (r-r_bmu)**2 + (c-c_bmu)**2
        return dist2.ravel()
    else:
        print 'please consider the above mentioned errors'
        return np.zeros((rows,cols)).ravel()


def calculate_UD2(mapsize):
    nnodes = mapsize[0]*mapsize[1]
    UD2 = np.zeros((nnodes, nnodes))
    for i in range(nnodes):
        UD2[i,:] = rect_dist(mapsize,i).reshape(1,nnodes)
    return UD2

def calculate_neigh_activation(UD2,bmu,radius):
    # Gaussian neighborhood function
    return np.exp(-1.0*UD2[bmu]/(2.0*radius**2))


def find_bmu(data,W):
    import scipy.spatial.distance as DIST
    Dists = DIST.cdist(data,W)
    ind_sets = np.argmin(Dists,axis=1)
    return Dists,ind_sets





#Learning rate adaptation
def calc_alpha(t,alpha0,T,method='linear'):
    if method =='linear':
        return alpha0*(1-t/float(T))
    if method =='power':
        return alpha0*(.005/alpha0)**(float(t)/T)
    if method =='inv':
        return alpha0/(1+100*t/float(T))
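
A quick, illustrative sanity check of the grid-distance and neighborhood helpers (not part of the original notebook); the 3x4 map size and node 5 as BMU are arbitrary choices:

toy_mapsize = [3, 4]
toy_UD2 = calculate_UD2(toy_mapsize)          # squared grid distances between all 12 nodes
toy_bmu = 5                                   # node 5 sits at row 1, column 1 of the grid
print toy_UD2[toy_bmu].reshape(3, 4)          # squared grid distance of every node to the BMU
print calculate_neigh_activation(toy_UD2, toy_bmu, radius=2.0).reshape(3, 4)   # Gaussian weights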
    
In [57]:
def train_som_online(data,mapsize,trainlen):
    # mapsize = [10,10]
    dlen = len(data) 
    
    #normalize the data
    data_n = normalize_by(data, data, method='var');
      
    #Initialize the map
    W = random_initialize(mapsize,data_n);
    UD2 = calculate_UD2(mapsize);
    alpha0 = .5
    if min(mapsize)==1:
        radiusin = max(mapsize)/float(2)
    else:
        radiusin = max(mapsize)/float(6)
    
    # radiusfin = max([1,radiusin/float(4)])   # alternative schedule, overridden by the next line
    radiusfin = radiusin/float(10)
    print radiusin,radiusfin

    # trainlen= 200*nnodes/float(dlen)
    # trainlen = int(np.ceil(trainlen))
    radius = np.linspace(radiusin, radiusfin, trainlen)
    qe = []
    Ws = []
    W_denorm = denormalize_by(data, W.copy(), n_method = 'var')
    Ws.append(W_denorm.copy())
    for i in range(trainlen):
        
        #all or a sample 
        data_e = np.random.permutation(data_n)[:]
        qe_t = np.zeros((data_e.shape[0],1))
        for j, d in enumerate(data_e):
            #project using a distance function
            #find bmu
            
            #Competition between nodes
            Dists,bmu = find_bmu(d[np.newaxis,:],W)
            qe_t[j] = np.min(Dists)
            
            # Adapt nodes
            #distance to bmu in the lower dimensional space
            neighboring_effect = calculate_neigh_activation(UD2,bmu,radius[i])
            #learning rate
            alpha = calc_alpha(i,alpha0,trainlen,method='inv')
            #Adaptation
            W[:] = W[:] + neighboring_effect.T * alpha * (d-W[:])
        W_denorm = denormalize_by(data, W.copy(), n_method = 'var')
        Ws.append(W_denorm.copy())
        if i%10==0:
            print 'iteration {}: alpha: {}, radius: {} and quantization error is {}'.format(i,alpha,radius[i],np.mean(qe_t))
        qe.append(np.mean(qe_t))
    Ws = np.asarray(Ws)
    return W, qe,Ws
In [434]:
data = Data1
# Data = np.random.randint(0,2,size = (500,3))   # alternative random binary dataset (unused)
In [435]:
data = Data1
mapsize = [40,40]
trainlen = 200
W, qe,Ws = train_som_online(data,mapsize,trainlen)
codebook = denormalize_by(data, W, n_method = 'var')
6.66666666667 0.666666666667
iteration 0: alpha: 0.5, radius: 6.66666666667 and quantization error is 0.118186899882
iteration 10: alpha: 0.0833333333333, radius: 6.36515912898 and quantization error is 0.0978878664771
iteration 20: alpha: 0.0454545454545, radius: 6.06365159129 and quantization error is 0.0906245838952
iteration 30: alpha: 0.03125, radius: 5.7621440536 and quantization error is 0.08590481532
iteration 40: alpha: 0.0238095238095, radius: 5.46063651591 and quantization error is 0.0820396115341
iteration 50: alpha: 0.0192307692308, radius: 5.15912897822 and quantization error is 0.0748734499173
iteration 60: alpha: 0.0161290322581, radius: 4.85762144054 and quantization error is 0.0700469994541
iteration 70: alpha: 0.0138888888889, radius: 4.55611390285 and quantization error is 0.0647056462594
iteration 80: alpha: 0.0121951219512, radius: 4.25460636516 and quantization error is 0.0611252921953
iteration 90: alpha: 0.0108695652174, radius: 3.95309882747 and quantization error is 0.0556555269744
iteration 100: alpha: 0.00980392156863, radius: 3.65159128978 and quantization error is 0.0517942198458
iteration 110: alpha: 0.00892857142857, radius: 3.35008375209 and quantization error is 0.0480460081038
iteration 120: alpha: 0.00819672131148, radius: 3.04857621441 and quantization error is 0.0442862072837
iteration 130: alpha: 0.00757575757576, radius: 2.74706867672 and quantization error is 0.0414536197213
iteration 140: alpha: 0.00704225352113, radius: 2.44556113903 and quantization error is 0.0387689534516
iteration 150: alpha: 0.00657894736842, radius: 2.14405360134 and quantization error is 0.036193964836
iteration 160: alpha: 0.00617283950617, radius: 1.84254606365 and quantization error is 0.0337516643049
iteration 170: alpha: 0.00581395348837, radius: 1.54103852596 and quantization error is 0.0317958630539
iteration 180: alpha: 0.00549450549451, radius: 1.23953098827 and quantization error is 0.0301403734082
iteration 190: alpha: 0.00520833333333, radius: 0.938023450586 and quantization error is 0.0284984199562
In [436]:
plt.plot(qe)
Out[436]:
[<matplotlib.lines.Line2D at 0x147049050>]
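
As a quick check (not in the original notebook), the final quantization error can also be computed directly from the denormalized codebook and the raw data; this value is in the original data scale, so it is not directly comparable to the normalized qe curve above:

Dists, _ = find_bmu(data, codebook)
print 'final quantization error (raw data scale):', np.mean(np.min(Dists, axis=1))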
In [437]:
# Animate the SOM codebook during training
# (this cell is adapted from the matplotlib double pendulum animation example,
#  originally translated from http://www.physics.usyd.edu.au/~wheat/dpend_html/solve_dpend.c)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

# data_n = normalize_by(data, data, method='var');
fig = plt.figure()
ax = fig.add_subplot(111, autoscale_on=False, xlim=(data.min(axis=0)[0],data.max(axis=0)[0]), ylim=(data.min(axis=0)[1],data.max(axis=0)[1]))
# ax.grid()

dataplt, = ax.plot([], [], '.k',markersize=3)
somplt, = ax.plot([], [], 'o',markersize=3,alpha=.5,markerfacecolor='None',markeredgecolor='green')
time_template = 'iteration = %d'
time_text = ax.text(0.05, 0.9, '', transform=ax.transAxes)


def init():
    dataplt.set_data([], [])
    somplt.set_data([], [])
    time_text.set_text('')
    return dataplt,somplt, time_text


def animate(i):
    dataplt.set_data(data[:,0],data[:,1])
    somplt.set_data(Ws[i][:,0],Ws[i][:,1])
    
    time_text.set_text(time_template % (i))
    return dataplt,somplt, time_text

ani = animation.FuncAnimation(fig, animate, np.arange(0, trainlen),
                              interval=25, blit=True, init_func=init)

ani.save('./Images/SOM.mp4', fps=15, extra_args=['-vcodec', 'libx264'],dpi=200)
plt.close()
In [1]:
from IPython.display import HTML
HTML("""
<video width="600" height="400" controls>
  <source src="files/Images/SOM.mp4" type="video/mp4">
</video>
""")
Out[1]:

Batch training algorithm

Faster and parallelizable
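
Roughly, the batch algorithm replaces the point-by-point updates with one weighted average per epoch: all data points are assigned to their BMUs at once, and each node's vector becomes the neighborhood-weighted mean of the whole dataset. A minimal numpy sketch of one such epoch, reusing find_bmu and the UD2 matrix from calculate_UD2 above (an illustration, not sompylib's own implementation):

def batch_som_epoch(data_n, W, UD2, radius):
    # assign every (normalized) data point to its best matching unit
    _, bmus = find_bmu(data_n, W)                      # bmus has shape (dlen,)
    # Gaussian neighborhood weight between every node and every BMU
    H = np.exp(-1.0*UD2/(2.0*radius**2))               # (nnodes, nnodes)
    S = H[:, bmus]                                     # (nnodes, dlen)
    # each node becomes the neighborhood-weighted mean of all data points
    denom = S.sum(axis=1)[:, np.newaxis]
    return S.dot(data_n) / np.maximum(denom, 1e-12)

Because the BMU search and the weighted sums are plain matrix operations, the data can be split into chunks and processed in parallel, which is what makes the batch version faster than the online one.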

In [439]:
import sompylib.sompy as SOM
msz11 =40
msz10 = 40
X = Data1
som1 = SOM.SOM('', X, mapsize = [msz10, msz11],norm_method = 'var',initmethod='pca')
som1.init_map()
# som1.train(trainlen=None,n_job = 1, shared_memory = 'no',verbose='final')
som1.train(n_job = 1, shared_memory = 'no',verbose='final')


codebook1 = som1.codebook[:]
codebook1_n = SOM.denormalize_by(som1.data_raw, codebook1, n_method = 'var')
Total time elapsed: 9.754000 secodns
final quantization error: 0.015409
In [440]:
fig = plt.figure()
plt.plot(data[:,0],data[:,1],'.')
plt.plot(codebook1_n[:,0],codebook1_n[:,1],'.r')
fig.set_size_inches(7,7)

Applications of SOM

  • Dimensionality Reduction and Visualization of High Dimensional Spaces
    • Feature selection
    • Multi-criteria Optimization
  • Clustering
  • Classification
  • Space transformation similar to PCA, K-means, Sparse Coding,...
  • Function approximation
  • Transfer function: multidirectional function approximation
  • Non-parametric probability density estimation
    • filling missing values (see the sketch after this list)
    • resampling
  • Data Reduction and Abstraction similar to K-means with a large K
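
As a small illustration of one of these uses, filling missing values, here is a minimal sketch (a hypothetical helper, not sompylib's API): a record with missing entries is matched to its BMU using only its observed dimensions, and the missing entries are read off that BMU's codebook vector, e.g. using codebook1_n from the batch-trained map above.

def fill_missing(x, codebook):
    # x: 1-D record with np.nan marking the missing entries
    # codebook: (nnodes, dim) codebook in the original data scale
    obs = ~np.isnan(x)
    # squared distance to every node, using the observed dimensions only
    d2 = ((codebook[:, obs] - x[obs])**2).sum(axis=1)
    bmu = np.argmin(d2)
    x_filled = x.copy()
    x_filled[~obs] = codebook[bmu, ~obs]   # take the missing dimensions from the BMU
    return x_filled

# for the 2-D toy data: guess the second coordinate of a point whose first coordinate is -5.0
print fill_missing(np.array([-5.0, np.nan]), codebook1_n)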

Applications of SOM

Dimensionality Reduction and Visualization of High Dimensional Spaces

  • Feature selection
  • Multi-criteria Optimization

A new way of visualizing the space

In [441]:
som1.hit_map()
In [94]:
som1.view_map(text_size=8)

An example with high dimensional data

Feature extraction

In many real engineering applications, we are interested in knowing which factors are important in an experiment.

Imagine that each experiment takes a few hours; trying all combinations of the values of the different variables makes the experimental space explode!

In the following case, we have 5 parameters and one label that records the outcome class of each experiment.

In [442]:
path = "./Data/CFD.csv"
D = pd.read_csv(path)
# D  = D[['Rent','ZIP','Year built','Living space','lng','lat']]
D.head()
Out[442]:
P1 P2 P3 P4 P5 Label
0 11.00 7.50 5.75 0.001750 0.000500 1
1 5.75 11.00 9.25 0.001125 0.001750 1
2 9.25 9.25 11.00 0.001750 0.003000 1
3 9.25 11.00 11.00 0.001750 0.002375 1
4 11.00 11.00 11.00 0.000500 0.002375 1

If we use pairwise scatter plots and show the labels with colors, we usually don't see anything.

In [443]:
c = 1
fig = plt.figure(figsize=(20,20))
for i in range(0,4):
    for j in range(i,5):
        plt.subplot(5,4,c)
        c = c + 1
        plt.scatter(D.values[:,i],D.values[:,j],marker='o',s=70,edgecolors=None,alpha=1.,vmin=0,vmax=1,c=plt.cm.RdYlBu_r(D.Label.values[:].astype(float)))
        plt.xlim((D.values[:,i].min(),D.values[:,i].max()))
        plt.ylim((D.values[:,j].min(),D.values[:,j].max()))
        plt.xlabel(D.columns[i])
        plt.ylabel(D.columns[j])

If we compute the usual pairwise correlations:

In [444]:
D.corr()
Out[444]:
P1 P2 P3 P4 P5 Label
P1 1.000000 -0.020883 -0.089509 -0.036360 0.031264 0.383061
P2 -0.020883 1.000000 0.082728 -0.084491 0.042166 0.168023
P3 -0.089509 0.082728 1.000000 -0.023677 0.012584 0.144121
P4 -0.036360 -0.084491 -0.023677 1.000000 -0.118082 -0.220599
P5 0.031264 0.042166 0.012584 -0.118082 1.000000 -0.020029
Label 0.383061 0.168023 0.144121 -0.220599 -0.020029 1.000000

But now, if we train a SOM on the parameters and visualize them together with their labels:

In [630]:
import sompylib1.sompy as SOM1

msz11 =70
msz10 = 70

X = D.values[:,:-1]

som2 = SOM.SOM('', X, mapsize = [msz10, msz11],norm_method = 'var',initmethod='pca')
# som1 = SOM1.SOM('', X, mapsize = [msz10, msz11],norm_method = 'var',initmethod='pca')
som2.init_map()
som2.train(n_job = 1, shared_memory = 'no',verbose='final')
codebook1 = som2.codebook[:]
codebook1_n = SOM.denormalize_by(som2.data_raw, codebook1, n_method = 'var')
som2.compname= [D.columns[:-1]]
Total time elapsed: 231.507000 secodns
final quantization error: 0.116649
In [631]:
som2.view_map(text_size=8)
In [632]:
# Project the data onto the trained map and label each BMU with
# the median label of the data points that are mapped to it
som2.cluster_labels = np.asarray(["" for i in range(som2.nnodes)])
bmus = som2.project_data(X)
Df = pd.DataFrame(data=bmus,columns=['bmu'])
Df['dlabel'] = D['Label']
Df.head()
gb = Df.groupby(by='bmu').median()
sel_bmus = gb.index.values[:]
print sel_bmus.shape
sel_bmus_label = gb.dlabel.values[:]
som2.cluster_labels[sel_bmus]= sel_bmus_label
(725,)
In [633]:
def viewmap_withlabel(som,data=None):
    if hasattr(som, 'cluster_labels'):
        codebook = getattr(som, 'cluster_labels')

    else:
        print 'clustering based on default parameters...'
        codebook = som.cluster()
    msz =  getattr(som, 'mapsize')
    fig = plt.figure(figsize=(20,14))
    dim = som.codebook.shape[1]
    no_row_in_plot = dim/3 + 1 #3 columns per row is arbitrarily selected
    if no_row_in_plot <=1:
        no_col_in_plot = dim
    else:
        no_col_in_plot = 3
    
    
    for k in range(dim):
        ax = plt.subplot(no_row_in_plot,no_col_in_plot,k+1)
    
        if data is None:
#             data_tr = getattr(som, 'data_raw')
#             proj = som.project_data(data_tr)
#             coord = som.ind_to_xy(proj)
            cents = som.ind_to_xy(np.arange(0,msz[0]*msz[1]))
            for i, txt in enumerate(codebook):
                ax.annotate(txt, (cents[i,1],cents[i,0]),size=8, va="center")
        if data is not None:
            proj = som.project_data(data)
            cents = som.ind_to_xy(proj)
            label = codebook[proj]
            for i, txt in enumerate(label):
                ax.annotate(txt, (cents[i,1],cents[i,0]),size=8, va="center")

        ax.set_yticklabels([])
        ax.set_xticklabels([])
        ax.imshow(som.codebook[:,k].reshape(msz[0],msz[1])[::],alpha=1,cmap=plt.cm.RdYlBu_r)
#         ax.imshow(som.codebook[:,k].reshape(msz[0],msz[1])[::],alpha=1,cmap=plt.cm.Accent_r)
    plt.subplots_adjust(hspace = .001,wspace=.01)
#     plt.tight_layout()
In [634]:
viewmap_withlabel(som2)