#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_1i.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017 5:54:01 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
"""Pair up 'Common' speeches from the Q and F editions and score each pair.

Reads ``df.csv``, selects the rows whose ``Subset`` is 'Common' for the
'Q' and 'F' values of ``Edtn``, joins them on Act/Scene/Speaker and adds
the character-level edit distance between the two texts as column ``ed``.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import editdistance  # https://github.com/aflc/editdistance

# Full option names are used: abbreviated keys such as 'precision' are
# resolved by pattern matching and fail once they match more than one option.
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.chop_threshold', 0)
pd.set_option('display.precision', 4)
pd.set_option('display.colheader_justify', 'right')
# Only the final width is set; the earlier `-1` setting was overridden
# straight away and is not accepted by recent pandas releases.
pd.set_option('display.max_colwidth', 64)

plt.style.use('ggplot')

df = pd.read_csv("df.csv")

# Speeches flagged 'Common', split by edition.
qc = df[(df.Edtn == 'Q') & (df.Subset == 'Common')]
fc = df[(df.Edtn == 'F') & (df.Subset == 'Common')]

columns = ['Act', 'Scene', 'Speaker', 'sp#', '#chars', 'txt']
qc = qc[columns]
fc = fc[columns]

# Every Q speech is paired with every F speech sharing Act/Scene/Speaker;
# the overlapping columns get the suffixes _x (Q) and _y (F).
res = qc.merge(fc, on=['Act', 'Scene', 'Speaker'])

# N.B. Speech number must be at least fairly close ....
# this allows lots of mismatches ....
res = res[abs(res['sp#_x'] - res['sp#_y']) < 6]

# Single formatted argument so the output is the same with the print
# statement and the print function.
print("%s %s %s" % (qc.shape, fc.shape, res.shape))

#-------------------------------------------------------------------------------


def calcEditDistance(r):
    """Return the edit distance between the Q and F texts of one pair.

    r -- a row of the merged frame; its ``txt_x`` and ``txt_y`` fields are
         passed to ``editdistance.eval``.
    """
    return editdistance.eval(r.txt_x, r.txt_y)


res['ed'] = res.apply(calcEditDistance, axis=1)

# N.B. For speeches to match the e.d. must be lowish ...
# so remove any mismatches by this criterion ....
# Keep a pair only when its edit distance is below the Q text length / 1.5.
res = res[res.ed < res['#chars_x'] / 1.5]
print(res.shape)

#-------------------------------------------------------------------------------

import numpy as np
from mlpy import dtw_subsequence, dtw_std
from scipy.stats import entropy  # == KL Divergence
from sklearn.metrics import normalized_mutual_info_score

# Letter -> alphabet position (0..25), for both cases.
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [(a, i) for i, a in enumerate(lc)]
uc_lut = [(a.upper(), i) for a, i in lc_lut]
lut = dict(lc_lut + uc_lut)


def enc(c):
    """Map one character to a numeric code.

    Letters (case-insensitively) give their alphabet position from ``lut``;
    any other character gives ``ord(c) * 2.`` as a float.
    """
    try:
        return lut[c.lower()]
    except KeyError:
        # Not a letter a-z.
        return ord(c) * 2.


def encode(s):
    """Return a numpy array holding ``enc(c)`` for every character of s."""
    # A list is built explicitly: np.array() over a lazy map object would
    # produce a 0-d object array instead of a numeric vector.
    return np.array([enc(c) for c in s])


def splits(n, f, xs):
    """Cut xs into chunks of n characters, each prefixed with f + ':' + tab."""
    return [f + ":\t" + xs[i:i + n] for i in range(0, len(xs), n)]


def fix(xs, i, path):
    """Return the character for step i of a warping path over xs.

    When step i points at the same index as step i-1 the gap character '_'
    is returned instead of repeating the character.
    """
    if i > 0 and path[i] == path[i - 1]:
        return '_'  # gapChar
    return xs[path[i]]


def recover(xs, path):
    """Rebuild the aligned text for xs along path, with newlines as spaces."""
    aligned = ''.join(fix(xs, i, path) for i in range(len(path)))
    return aligned.replace("\n", " ")


def dtw(r):
    """Align the Q and F texts of one pair with ``dtw_subsequence``.

    r -- a row with ``txt_x`` (Q text) and ``txt_y`` (F text).

    Returns ``(dist, (q_aligned, f_aligned))``: the distance reported by
    ``dtw_subsequence`` and the two texts rebuilt along the warping path.
    """
    q = r.txt_x
    f = r.txt_y
    # path is used as a pair of index sequences, one per input
    # (presumably of equal length -- confirm against the mlpy docs).
    dist, cost, path = dtw_subsequence(encode(q), encode(f))
    al = (recover(q, path[0]), recover(f, path[1]))
    return dist, al


# Run the alignment once per row and derive both columns from that result;
# previously each row was aligned twice.
dtw_results = res.apply(dtw, axis=1)
res['dtw'] = dtw_results.map(lambda pair: pair[0])
res['alignment'] = dtw_results.map(lambda pair: pair[1])

#-------------------------------------------------------------------------------


def jsd(r):
    """Return 0.5 * (KL(P, M) + KL(Q, M)) for the encoded aligned texts.

    P and Q are the ``encode``d halves of ``r.alignment`` and M is their
    element-wise mean.
    NOTE(review): M is the mean of the raw code vectors, and the
    normalisation is left to ``scipy.stats.entropy`` -- confirm intended.
    """
    x, y = r.alignment  # ... the aligned texts
    P = encode(x)
    Q = encode(y)
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))


res['jsd'] = res.apply(jsd, axis=1)

#-------------------------------------------------------------------------------


def nmi(r):
    """Return the normalised mutual information score of the aligned texts.

    Both halves of ``r.alignment`` are ``encode``d and passed to
    ``normalized_mutual_info_score``.
    """
    x, y = r.alignment  # ... the aligned texts
    return normalized_mutual_info_score(encode(x), encode(y))


res['nmi'] = res.apply(nmi, axis=1)

# N.B. Also for speeches to match the m.i. must be highish ...
# so remove mismatches ....
# Pairs at or below this normalised mutual information are discarded; the
# same value is passed to the NMI heat map as its `zero` argument.
NMI_MIN = .7

res = res[res.nmi > NMI_MIN]
print(res.shape)

#-------------------------------------------------------------------------------
# Main ....

# ok so we think we have matching speeches in Q and F
# so we can now do pairwise metrics ....

# Positional rename of all 14 columns (merge suffixes _x/_y become q/f names).
res.columns = ['Act', 'Scene', 'Speaker',
               'sp#', '#chars_q', "q_txt",
               'sp#_', '#chars_f', 'f_txt',
               'ed', 'dtw', 'alignment', 'jsd', 'nmi']
print(res)

res.to_csv('res.csv', header=True, index=False, encoding='utf-8')

plotTitles = ("Edit Distance", "DTW Distance",
              "Jensen Shannon Divergence",
              "Normalised Mutual Information Score")
columns = "ed", "dtw", "jsd", "nmi"
metrics = zip(plotTitles, columns)

import myPlot

for title, metric in metrics:

    zero = NMI_MIN if metric == 'nmi' else 0

    # Calc table w margins ...
    pt = res.pivot_table(
        values=metric,
        columns=['Scene'],
        index=['Speaker'],
        aggfunc=np.mean,
        # fill_value=0,
        margins=True,
    )

    # Transpose so scenes become rows and speakers become columns.
    pt = pt.T

    if metric == 'jsd':
        pt = pt * 100

    myPlot.hmap(title, title, "Speaker", "Scene", pt, 'YlOrRd', zero)
    myPlot.outputTable(title, pt)

#-------------------------------------------------------------------------------

# Find sp w worst m.i. ....
# idxmin() returns the index label and .loc looks it up by label, so the
# right row is selected even though filtering left gaps in the index.
r_max = res['nmi'].idxmin()
r = res.loc[r_max]
print(r)

x, y = r.alignment

# Print the two aligned texts interleaved, n characters per line.
n = 100
aa = splits(n, "Q", x)
bb = splits(n, "F", y)
zz = ["\n" + a + "\n" + b for a, b in zip(aa, bb)]
for z in zz:
    print(z)

#-------------------------------------------------------------------------------
# Pairs plots ...
# for info metrics ..

import seaborn as sns

sns.set(style="ticks", color_codes=True)
plt.xticks(rotation=90)

sns.pairplot(res[['Speaker', 'ed', 'dtw', 'jsd', 'nmi']],
             hue="Speaker", palette="husl")

filename = "Speaker by Scene Q F Information Metrics".replace(' ', '_')
plt.savefig('data/' + filename + '.png')
#plt.show()