#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_1i.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017 6:28:25 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_colwidth', 32)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance
import editdistance
ed = editdistance.eval
#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std
# Enclde Lower and upercase letters to same values ...
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [ (a, i) for i,a in enumerate(lc) ]
uc_lut = [ (a.upper(), i) for a,i in lc_lut ]
lut = dict( lc_lut + uc_lut )
#def encode(c): return ord(c.lower())*200
def enc(c):
try: val = lut[c.lower()]
except: val = ord(c) * 2.
return val
def encode(s):
return np.array(map(enc, s))
#return np.array(map(float, map(ord, s)))
def splits(n, f, xs):
return [ f+":\t" + xs[i:i+n] for i in range(0, len(xs), n)]
def fix(xs, i, path):
if i>0 and path[i]==path[i-1]: return '_' # gapChar
else: return xs[path[i]]
def recover(xs, path):
return ''.join([ fix(xs, i, path) for i in xrange(len(path))]).replace("\n"," ")
def dtw(x,y):
x_ = encode(x)
y_ = encode(y)
dist, cost, path = dtw_subsequence(x_, y_)
x = recover(x, path[0])
y = recover(y, path[1])
al = (x,y)
return dist, al
#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
import numpy as np
def jsd(x,y):
P = encode(x)
Q = encode(y)
M = 0.5 * (P + Q)
return 0.5 * (entropy(P, M) + entropy(Q, M))
#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score
def nmi(x,y):
x = encode(x)
y = encode(y)
return normalized_mutual_info_score(x,y)
#-------------------------------------------------------------------------------
def words(s):
#return s.split(' ')
return filter(lambda x: x!='', re.split('[ ,;:.\>\<?!#@\n\t\']', s))
f_words=pd.read_csv("CK_200_function_words.csv")
f_words.columns = ['word']
df=pd.read_csv("df.csv")
df = df[ (df.Subset == 'Full-a')]
qf = df[ (df.Edtn == 'Q')]
ff = df[ (df.Edtn == 'F')]
def proc(r):
return r.spkr + " " + r.txt
def mkWindow(i,xs):
w = xs[i:i+wSize]
s = ' '.join(w)
return s
wSize = 200
def window(df):
samples = df.apply(lambda row: proc(row), axis=1)
#print samples
txt_full = ''
for sample in samples:
txt_full += (' ' + sample)
ws = words(txt_full)
windows=[]
print ed, "#words: ", len(ws)
for i in xrange(100): # len(ws)-wSize):
windows.append(mkWindow(i,ws))
windows = np.array(windows)
#print windows
return windows
q_ws = window(qf)
f_ws = window(ff)
print q_ws.shape, f_ws.shape
res = []
res_a = []
k=0
for q in q_ws:
print k
k+=1
pairs = []
for f in f_ws:
edist = ed(q,f)
dist, al = dtw(q,f)
# For nmi and jsd the matrix must by NxN
# .. so strings (concatenations of 2000 words) must be aligned ...
q_,f_ = al
nmi_score = nmi(q_,f_)
jsd_score = jsd(q_,f_)
#print q_
#print f_
dat = nmi_score, edist, jsd_score, dist
pairs.append(dat)
pairs = np.array(pairs).T
mins = map(np.argmin, pairs)[1:]
# .. for nmi need to find max not min
maxs = [map(np.argmax, pairs)[0]]
posns = maxs + mins
scores = []
for i,p in enumerate(posns):
scores.append(pairs[i][p])
res.append(zip(posns, scores))
res_a.append(pairs)
def selectMetric(a,i):
return np.array([ ys[i] for ys in a ])
res = np.array(res)
res_a = np.array(res_a)
print res_a
print res_a.shape
#res = pd.DataFrame([res])
#res.to_csv('res_scores.csv', header=True, index=False, encoding='utf-8')
metricNames = "Normalised Mutual Information Score", "Edit Distance", "Jensen Shannon Divergence", "DTW Distance"
plotTitles = [ name + " Map" for name in metricNames ]
from myPlot import hmap
for i, title in enumerate(plotTitles):
arr = selectMetric(res_a, i)
print title, arr.shape
hmap(title, title, "Q", "F", arr, 'YlOrRd', 0)