#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_2a.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Fri, Dec 1, 2017 3:55:22 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_colwidth', 32)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
def outputTable(filename, df):
df = pd.DataFrame(df)
filename = filename.replace(' ', '_')
html = df.to_html()
with open('data/'+filename+'.txt', 'w') as text_file:
text_file.write(html)
#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance
import editdistance
ed = editdistance.eval
#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std
# Encode Lower and upercase letters to same values ...
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [ (a, i) for i,a in enumerate(lc) ]
uc_lut = [ (a.upper(), i) for a,i in lc_lut ]
lut = dict( lc_lut + uc_lut )
#def encode(c): return ord(c.lower())*200
def enc(c):
try: val = lut[c.lower()]
except: val = ord(c) * 2.
return val
def encode(s):
return np.array(map(enc, s))
#return np.array(map(float, map(ord, s)))
def splits(n, f, xs):
return [ f+":\t" + xs[i:i+n] for i in range(0, len(xs), n)]
def fix(xs, i, path):
if i>0 and path[i]==path[i-1]: return '_' # gapChar
else: return xs[path[i]]
def recover(xs, path):
return ''.join([ fix(xs, i, path) for i in xrange(len(path))]).replace("\n"," ")
def dtw(x,y):
x_ = encode(x)
y_ = encode(y)
dist, cost, path = dtw_subsequence(x_, y_)
x = recover(x, path[0])
y = recover(y, path[1])
al = (x,y)
return dist, al
#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
import numpy as np
def jsd(x,y):
P = encode(x)
Q = encode(y)
M = 0.5 * (P + Q)
return 0.5 * (entropy(P, M) + entropy(Q, M))
#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score
def nmi(x,y):
x = encode(x)
y = encode(y)
return normalized_mutual_info_score(x,y)
#-------------------------------------------------------------------------------
def words(s):
#return s.split(' ')
return filter(lambda x: x!='', re.split('[ ,;:.\>\<?!#@\n\t\']', s))
#f_words=pd.read_csv("CK_200_function_words.csv")
#f_words.columns = ['word']
df=pd.read_csv("df.csv")
print df
df = df[ (df.Subset == 'Full-a')]
print df
qf = df[ (df.Edtn == 'Q')]
ff = df[ (df.Edtn == 'F')]
print qf
print ff
def proc(r):
#return r.spkr + " " + r.txt
return r.txt
def labels(r):
return r.Scene + " " + str(r['sp#']) + " " + r.spkr
'''
def mkWindow(i,xs):
print xs
w = xs[i:i+wSize]
s = ' '.join(w)
#print s
#exit()
return s
'''
#wSize = 200
def window(df):
samples = df[['Scene','spkr','txt']][:30]
samples['sample'] = df.apply(lambda row: proc(row), axis=1)
print samples[:10]
# NB use speeches as the windows ....
windows = samples['sample']
windows = np.array(windows)
samples['labs'] = df.apply(lambda row: labels(row), axis=1)
labs = samples['labs']
return labs, windows
q_labs, q_ws = window(qf)
f_labs, f_ws = window(ff)
q_labs = np.array(q_labs)
f_labs = np.array(f_labs)
res = []
res_a = []
k=0
for q in q_ws:
print k
k+=1
pairs = []
for f in f_ws:
if 1:# qs==fs:
edist = ed(q,f)
dist, al = dtw(q,f)
# For nmi and jsd the matrix must by NxN
# .. so strings (concatenations of 2000 words) must be aligned ...
q_,f_ = al
nmi_score = nmi(q_,f_)
jsd_score = jsd(q_,f_)
#print q_
#print f_
#dat = nmi_score, 1/float(edist+1), jsd_score, 1/float(dist+1)
dat = nmi_score, edist, jsd_score, dist
#print dat
pairs.append(dat)
pairs = np.array(pairs).T
mins = map(np.argmin, pairs)[1:]
# .. for nmi need to find max not min
maxs = [map(np.argmax, pairs)[0]]
posns = maxs + mins
scores = []
for i,p in enumerate(posns):
scores.append(pairs[i][p])
res.append(zip(posns, scores))
res_a.append(pairs)
def selectMetric(a,i):
return np.array([ ys[i] for ys in a ])
res = np.array(res)
res_a = np.array(res_a)
print res_a
print res_a.shape
#res = pd.DataFrame([res])
#res.to_csv('res_scores.csv', header=True, index=False, encoding='utf-8')
metricNames = "Normalised Mutual Information Score", "Edit Distance", "Jensen Shannon Divergence", "DTW Distance"
metrics = 'nmi ed jsd dtw'.split(' ')
from myPlot import hmap
res = []
for i, metric in enumerate(metrics):
arr = selectMetric(res_a, i)
print metric, arr.shape
title = metricNames[i]
hmap(title, title, "Q", "F", arr, q_labs, f_labs, 'YlOrRd', 0)
outputTable(title, arr)