#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_1i.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017 5:54:01 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import numpy as np
import pandas as pd
# ---- pandas / matplotlib display configuration --------------------------
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify', 'right')
# Cap displayed cell width at 64 chars -- speech texts are long.
# (The original set max_colwidth to -1 and then immediately to 64;
#  the first call was dead code and has been removed.)
pd.set_option('display.max_colwidth', 64)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Per-speech dataframe produced by an earlier pipeline stage.
# NOTE(review): expected columns include Edtn, Subset, Act, Scene,
# Speaker, sp#, #chars, txt -- confirm against the producer of df.csv.
df = pd.read_csv("df.csv")
# Speeches common to both the Quarto (Q) and Folio (F) editions.
qc = df[(df.Edtn == 'Q') & (df.Subset == 'Common')]
fc = df[(df.Edtn == 'F') & (df.Subset == 'Common')]
columns = ['Act','Scene','Speaker', 'sp#','#chars', 'txt']
qc = qc[columns]
fc = fc[columns]
# Pair Q and F speeches by location/speaker.  The overlapping column
# names ('sp#', '#chars', 'txt') get pandas' default _x (Q) / _y (F)
# suffixes; later code (L106) relies on that exact column layout.
res = qc.merge(fc, on=['Act','Scene','Speaker'])
# N.B. Speech numbers must be at least fairly close ....
# this tolerance still allows lots of mismatches, filtered further below.
res = res[ (abs(res['sp#_x'] - res['sp#_y']) < 6)]
print qc.shape, fc.shape, res.shape
#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance
import editdistance
def calcEditDistance(r):
    """Levenshtein edit distance between a row's paired Q and F speech texts."""
    q_txt = r.txt_x
    f_txt = r.txt_y
    return editdistance.eval(q_txt, f_txt)
res['ed'] = res.apply(lambda row: calcEditDistance(row), axis=1)
# N.B. For speeches to match the e.d. must be lowish ...
# so remove any mismatches by this criterion ....
res = res[ (res.ed < res['#chars_x']/1.5 ) ]
print res.shape
#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std
# Lookup table: each letter (both cases) -> its alphabet index 0-25.
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [(ch, idx) for idx, ch in enumerate(lc)]
uc_lut = [(ch.upper(), idx) for ch, idx in lc_lut]
lut = dict(lc_lut + uc_lut)
def enc(c):
    """Numerically encode one character.

    Letters of either case map to their alphabet index 0-25 via the
    module-level `lut`; any other character falls back to twice its
    ordinal (a float), which keeps it well clear of the 0-25 letter band.
    """
    try:
        val = lut[c.lower()]
    except KeyError:  # not a letter -> scaled-ordinal fallback
        # (was a bare `except:`, which would also swallow KeyboardInterrupt
        #  and genuine bugs; narrowed to the expected KeyError)
        val = ord(c) * 2.
    return val
def encode(s):
    """Encode string `s` as a numeric numpy array, one value per character.

    Uses a list comprehension instead of `np.array(map(enc, s))`: identical
    in Python 2, but under Python 3 `map` returns an iterator and np.array
    would silently produce a useless 0-d object array.
    """
    return np.array([enc(ch) for ch in s])
def splits(n, f, xs):
    """Chop `xs` into chunks of at most `n` chars, each prefixed 'f:<TAB>'."""
    prefix = f + ":\t"
    return [prefix + xs[start:start + n] for start in range(0, len(xs), n)]
def fix(xs, i, path):
    """Character of `xs` selected by path[i]; a repeated index renders as
    the gap character '_' (DTW paths repeat an index where one sequence
    pauses against the other)."""
    is_gap = i > 0 and path[i] == path[i - 1]
    return '_' if is_gap else xs[path[i]]
def recover(xs, path):
    """Reconstruct the aligned text for `xs` along one side of a DTW path.

    A repeated path index (the other sequence advanced while this one
    paused) renders as the gap character '_'; newlines are flattened to
    spaces so alignments print on a single line.

    The py2-only `xrange` is gone and the tiny `fix` helper is inlined,
    making the function self-contained and portable to Python 3.
    """
    out = []
    prev = None
    for idx in path:
        out.append('_' if idx == prev else xs[idx])
        prev = idx
    return ''.join(out).replace("\n", " ")
def dtw(r):
    """Subsequence-DTW alignment of a row's Q text against its F text.

    Returns (distance, (aligned_q, aligned_f)); the aligned strings use
    '_' for gaps (see recover()).
    """
    q_txt = r.txt_x
    f_txt = r.txt_y
    dist, cost, path = dtw_subsequence(encode(q_txt), encode(f_txt))
    aligned_q = recover(q_txt, path[0])
    aligned_f = recover(f_txt, path[1])
    return dist, (aligned_q, aligned_f)
# Run the expensive DTW alignment ONCE per row.  The original applied
# dtw() twice -- once per output column -- doubling the dominant cost.
_dtw_pairs = [dtw(row) for _, row in res.iterrows()]
res['dtw'] = [dist for dist, _ in _dtw_pairs]
res['alignment'] = [alignment for _, alignment in _dtw_pairs]
#-------------------------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence
import numpy as np
def jsd(r):
    """Jensen-Shannon divergence between a row's two DTW-aligned texts.

    scipy's entropy() normalises each input vector to a probability
    distribution internally, so P/Q need not be pre-normalised.
    NOTE(review): this treats the raw character-code vectors themselves
    as distributions rather than, say, character frequencies -- confirm
    that is the intended reading of JSD here.
    """
    x, y = r.alignment  # ... the aligned texts
    P = encode(x)
    Q = encode(y)
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))

# Pass jsd directly; the original wrapped it in a redundant lambda.
res['jsd'] = res.apply(jsd, axis=1)
#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score
def nmi(r):
x,y = r.alignment # dtw(r)[1] # ... the aligned texts
q = encode(x)
f = encode(y)
return normalized_mutual_info_score(q,f)
res['nmi'] = res.apply(lambda row: nmi(row), axis=1)
# N.B. Also for speeches to match the m.i. must be highish ...
# so remove mismatches ....
res = res[ (res.nmi > .7) ]
print res.shape
#-------------------------------------------------------------------------------
# Main ....
# ok so we think we have matching speeches in Q and F
# so we can now do pairwise metrics ....
# Rename the merge's _x/_y suffixed columns to explicit q/f names
# (exactly 14 names, matching the merged frame's column count/order).
res.columns = ['Act','Scene','Speaker','sp#', '#chars_q', "q_txt", 'sp#_', '#chars_f', 'f_txt', 'ed', 'dtw','alignment', 'jsd','nmi']
print res
# Persist the matched, scored speech pairs for downstream analysis.
res.to_csv('res.csv', header=True, index=False, encoding='utf-8')
# One heat map + table per similarity metric, Speaker x Scene.
plotTitles = "Edit Distance", "DTW Distance", "Jensen Shannon Divergence", "Normalised Mutual Information Score"
columns = "ed", "dtw", "jsd", "nmi"
metrics = zip(plotTitles, columns)
import myPlot
for title, metric in metrics:
    # Colour-scale floor: NMI was already filtered to > .7, so anchor there.
    zero = 0
    if metric == 'nmi': zero = .7
    # Calc table w margins (row/column means via margins=True) ...
    pt = res.pivot_table(
        values=metric
        , columns=['Scene']
        , index=['Speaker']
        , aggfunc=np.mean
        #, fill_value=0
        , margins=True
        )
    pt = pt.T  # transpose -> Scenes as rows, Speakers as columns
    #print pt
    # Rescale JSD (tiny values) so it is visible on the heat map.
    if metric == 'jsd':
        pt = pt *100
    # NOTE(review): myPlot is a project-local module; hmap presumably
    # renders a heat map and outputTable writes the table -- confirm.
    myPlot.hmap(title, title, "Speaker", "Scene", pt, 'YlOrRd', zero)
    myPlot.outputTable(title, pt)
#-------------------------------------------------------------------------------
r_max = res['#chars_f'].argmin()
# Find sp w worst m.i. ....
r_max = res['nmi'].argmin()
r = res.ix[r_max]
print r
x,y = r.alignment
#print x +'\n'+ y
n = 100
aa = splits(n, "Q", x)
bb = splits(n, "F", y)
zz = [ "\n"+a+"\n"+b for a,b in zip(aa,bb) ]
for z in zz: print z
#-------------------------------------------------------------------------------
# Pairs plots ...
# for info metrics ..
import seaborn as sns
sns.set(style="ticks", color_codes=True)
plt.xticks(rotation=90)
# Pairwise scatter matrix of the four metrics, coloured by speaker.
sns.pairplot(res[['Speaker','ed','dtw','jsd','nmi']], hue="Speaker", palette="husl")
filename = "Speaker by Scene Q F Information Metrics".replace(' ', '_')
# NOTE(review): assumes a 'data/' directory already exists -- confirm.
plt.savefig('data/'+filename+'.png')
#plt.show()