# exp_2.py 15.11.17

#!/usr/bin/python
# -*- coding: utf-8 -*-

# --------------------------------------------------
# File Name: exp_1i.py
# Location: 
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017  6:28:25 PM
# Author(s): Mike Stout 
# Copyright 2017 The Author(s) All Rights Reserved
# Credits: 
# --------------------------------------------------

import numpy as np
import pandas as pd
import re
# Console display settings for pandas objects.
# Option names are fully qualified: the bare 'precision' pattern is ambiguous
# on newer pandas releases, which raise an OptionError for it.
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)
pd.set_option('display.chop_threshold', 0)
pd.set_option('display.precision', 4)
pd.set_option('display.colheader_justify', 'right')
# A preceding max_colwidth = -1 setting was removed: it was overridden by this
# line straight away, and newer pandas releases reject -1.
pd.set_option('display.max_colwidth', 32)

import matplotlib.pyplot as plt 
# Select matplotlib's 'ggplot' style sheet for figures drawn through pyplot.
plt.style.use('ggplot')



#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance

import editdistance

# Short alias: ed(a, b) calls editdistance.eval(a, b).
ed = editdistance.eval

#-------------------------------------------------------------------------------

from scipy.stats import entropy # == KL Divergence


#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std

# Encode lower- and upper-case letters to the same values:
# 'a'/'A' -> 0, 'b'/'B' -> 1, ... 'z'/'Z' -> 25.
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = list(zip(lc, range(len(lc))))
uc_lut = [(letter.upper(), code) for letter, code in lc_lut]
lut = dict(lc_lut)
lut.update(uc_lut)

#def encode(c): return ord(c.lower())*200

def enc(c):
    """Return the numeric code used to represent the character ``c``.

    The lower-cased character is looked up in ``lut``; letters found there
    yield their table value (an int). Any character missing from the table
    yields ``ord(c) * 2.0`` (a float).
    """
    try:
        return lut[c.lower()]
    except KeyError:
        # Only a missing table entry selects the fallback; other errors
        # (e.g. a non-string argument) are no longer silently swallowed.
        return ord(c) * 2.0

def encode(s):
    """Encode the string ``s`` as a numpy array with one ``enc`` code per character."""
    # Build a real list first: on Python 3 ``map`` returns an iterator, which
    # np.array would wrap as a single 0-d object instead of a 1-D vector.
    return np.array([enc(c) for c in s])

def splits(n, f, xs):
    """Cut ``xs`` into consecutive pieces of length ``n`` and label each piece.

    Every piece is prefixed with ``f``, a colon and a tab. The last piece may
    be shorter than ``n``. Returns the labelled pieces as a list.
    """
    labelled = []
    for start in range(0, len(xs), n):
        stop = start + n
        labelled.append(f + ":\t" + xs[start:stop])
    return labelled

def fix(xs, i, path):
    """Return the symbol of ``xs`` for step ``i`` of an index path.

    When step ``i`` points at the same index as step ``i - 1`` the gap
    character '_' is returned instead of repeating the symbol; otherwise the
    result is ``xs[path[i]]``. The first step (``i == 0``) is never a gap.
    """
    gap_char = '_'
    repeats_previous = i > 0 and path[i] == path[i - 1]
    return gap_char if repeats_previous else xs[path[i]]


def recover(xs, path):
    """Rebuild a string from ``xs`` by following the index sequence ``path``.

    Each step contributes ``fix(xs, i, path)``, i.e. either a symbol of ``xs``
    or the gap character. Newlines in the result are replaced by spaces.
    """
    # range() instead of xrange() so the function runs on Python 2 and 3.
    aligned = ''.join(fix(xs, i, path) for i in range(len(path)))
    return aligned.replace("\n", " ")

def dtw(x, y):
    """Align two strings with mlpy's ``dtw_subsequence``.

    Both strings are turned into numeric vectors with ``encode`` before the
    call. The returned path is used to rebuild gap-padded versions of ``x``
    and ``y`` via ``recover`` (``path[0]`` indexes ``x``, ``path[1]`` indexes
    ``y``).

    Returns:
        ``(dist, (x_aligned, y_aligned))`` where ``dist`` is the first value
        returned by ``dtw_subsequence``.
    """
    # The middle return value (cost) is not needed here.
    dist, _cost, path = dtw_subsequence(encode(x), encode(y))
    x_aligned = recover(x, path[0])
    y_aligned = recover(y, path[1])
    return dist, (x_aligned, y_aligned)

#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
import numpy as np

def jsd(x,y):
    """Jensen-Shannon divergence between the encodings of two strings.

    ``x`` and ``y`` are encoded with ``encode`` and each vector is rescaled
    to sum to 1 so it can be treated as a distribution. The result is
    ``0.5 * (KL(P || M) + KL(Q || M))`` with ``M`` the midpoint of ``P`` and
    ``Q``.

    Assumes both strings encode to vectors of equal length (the caller in
    this file passes DTW-aligned strings); a vector summing to zero yields
    NaN.
    """
    P = np.asarray(encode(x), dtype=float)
    Q = np.asarray(encode(y), dtype=float)

    # Normalise BEFORE mixing. scipy's entropy() rescales each argument on
    # its own, so a mixture built from the raw vectors would be weighted by
    # their sums instead of being the equal-weight midpoint JSD requires.
    P = P / P.sum()
    Q = Q / Q.sum()

    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))


#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score

def nmi(x, y):
    """Normalised mutual information between the encodings of two strings.

    Each string is converted with ``encode`` and the two resulting vectors
    are handed to sklearn's ``normalized_mutual_info_score``.
    """
    labels_x, labels_y = encode(x), encode(y)
    return normalized_mutual_info_score(labels_x, labels_y)

#-------------------------------------------------------------------------------

# Characters that separate words: space , ; : . > < ? ! # @ newline tab and
# the apostrophe. Raw string, so the regex engine (not Python) reads \n and \t.
_WORD_SEPARATORS = re.compile(r"[ ,;:.><?!#@\n\t']")


def words(s):
    """Split ``s`` on the separator characters and return the non-empty tokens.

    Always returns a list, so callers can take its length and slice it on
    both Python 2 and Python 3 (``filter`` is lazy on Python 3).
    """
    return [token for token in _WORD_SEPARATORS.split(s) if token != '']



# Function-word list; its single column is renamed to 'word'.
f_words = pd.read_csv("CK_200_function_words.csv")
f_words.columns = ['word']

# Main table: keep only the 'Full-a' subset, then split it by the value of
# the Edtn column into the 'Q' rows and the 'F' rows.
df = pd.read_csv("df.csv")
df = df[df['Subset'] == 'Full-a']
qf = df[df['Edtn'] == 'Q']
ff = df[df['Edtn'] == 'F']

def proc(r):
    """Render one row as its ``spkr`` value, a space, then its ``txt`` value."""
    speaker, text = r.spkr, r.txt
    return speaker + " " + text


# Number of words taken for each window.
wSize = 200


def mkWindow(i, xs):
    """Join the words ``xs[i:i + wSize]`` with single spaces.

    Fewer than ``wSize`` words are used when the slice runs past the end of
    ``xs``; an empty slice gives an empty string.
    """
    return ' '.join(xs[i:i + wSize])
def window(df, n_windows=100):
    """Build sliding word windows over the text contained in ``df``.

    Every row is rendered with ``proc``; the rendered rows are concatenated
    (each preceded by a space) and tokenised with ``words``. Window ``i`` is
    ``mkWindow(i, ws)``, i.e. the words starting at offset ``i``.

    Args:
        df: DataFrame whose rows provide the attributes read by ``proc``.
        n_windows: number of consecutive start offsets (0 .. n_windows - 1).
            The default of 100 is the value that used to be hard-coded; the
            original inline comment suggests ``len(ws) - wSize`` as the
            intended full range.

    Returns:
        numpy array holding the ``n_windows`` window strings.
    """
    samples = df.apply(proc, axis=1)

    # One join instead of repeated += (which copies the growing string).
    txt_full = ''.join(' ' + sample for sample in samples)

    ws = words(txt_full)

    # Output is kept identical to the former print statement.
    # NOTE(review): the first field is the repr of the ``ed`` function
    # object, which looks like a debugging leftover - confirm before removing.
    print("%s %s %s" % (ed, "#words: ", len(ws)))

    windows = [mkWindow(i, ws) for i in range(n_windows)]
    return np.array(windows)

q_ws = window(qf)
f_ws = window(ff)
print("%s %s" % (q_ws.shape, f_ws.shape))

# res   : per Q window, the best-matching F position and score for each metric
# res_a : per Q window, the full (metric x F window) score matrix
res = []
res_a = []

for k, q in enumerate(q_ws):

    # Progress indicator: index of the Q window being processed.
    print(k)

    pairs = []
    for f in f_ws:

        edist = ed(q, f)
        dist, al = dtw(q, f)

        # For nmi and jsd the two inputs must be of equal length,
        # so the window strings (concatenations of wSize words) are
        # aligned first and the aligned versions are compared.
        q_, f_ = al
        nmi_score = nmi(q_, f_)
        jsd_score = jsd(q_, f_)

        # Row order here fixes the metric order used everywhere below.
        dat = nmi_score, edist, jsd_score, dist
        pairs.append(dat)

    # After the transpose: one row per metric, one column per F window.
    pairs = np.array(pairs).T

    # Best match per metric: nmi (row 0) is a similarity, so take the
    # maximum; the remaining rows are distances, so take the minimum.
    maxs = [np.argmax(pairs[0])]
    mins = [np.argmin(row) for row in pairs[1:]]

    posns = maxs + mins

    scores = [pairs[i][p] for i, p in enumerate(posns)]

    # list(...) so that real pairs are stored on Python 3 as well,
    # where zip() is lazy.
    res.append(list(zip(posns, scores)))
    res_a.append(pairs)

def selectMetric(a, i):
    """Collect element ``i`` of every item in ``a`` into a numpy array."""
    picked = []
    for ys in a:
        picked.append(ys[i])
    return np.array(picked)

res = np.array(res)
res_a = np.array(res_a)
# Single-argument print calls give the same output on Python 2 and 3.
print(res_a)
print(res_a.shape)
#res = pd.DataFrame([res])
#res.to_csv('res_scores.csv', header=True, index=False, encoding='utf-8')


# Same order as the rows of each per-window score matrix stored in res_a
# (nmi, edit distance, jsd, dtw distance).
metricNames = "Normalised Mutual Information Score", "Edit Distance", "Jensen Shannon Divergence",  "DTW Distance"
plotTitles = [ name + " Map" for name in metricNames ]

from myPlot import hmap

# One heat map per metric; the score matrix of metric i is passed to hmap.
# NOTE(review): the meaning of hmap's positional arguments is defined in
# myPlot and is not visible here - confirm before changing them.
for i, title in enumerate(plotTitles):
    arr = selectMetric(res_a, i)
    print("%s %s" % (title, arr.shape))
    hmap(title, title, "Q", "F", arr, 'YlOrRd', 0)