#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_1i.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017 6:19:52 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------

import numpy as np
import pandas as pd
import re

pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify', 'right')
pd.set_option('display.max_colwidth', 32)

import matplotlib.pyplot as plt
plt.style.use('ggplot')

df = pd.read_csv("df.csv")
print df

# Load the 200 function words; keep only the word itself (the first
# space-separated token) from each entry.
f_words = pd.read_csv("CK_200_function_words.csv")
f_words.columns = ['word']

def fix_pos(r):
    w = r.word
    return w.split(' ')[0]

f_words['word'] = f_words.apply(lambda row: fix_pos(row), axis=1)

#print df # .info()

# Drop the 'Full-a' subset; keep every other edition/subset combination.
#df = df[ ((df.Subset == 'Only')) | ((df.Edtn == 'Q') & (df.Subset == 'Common')) ]
df = df[df.Subset != 'Full-a']
print df

#df = df[ (df.Subset == 'Only') ]

df['Text'] = df['Edition'] + ' ' + df['Subset']
columns = ['Text', 'txt']
df = df[columns]
print df

def words(s):
    """Tokenise s on whitespace and punctuation, dropping empty strings."""
    #return s.split(' ')
    return filter(lambda x: x != '', re.split('[ ,;:.\>\<?!#@\n\t\']', s))

def calcFreq(w, r):
    """Frequency of word w in the row's text, normalised by the text's
    length in characters."""
    s = r.txt
    n = float(len(s))
    ws = words(s)
    f_ws = filter(lambda x: x == w, ws)
    return len(f_ws) / n

print f_words

# One frequency column per function word.
for w in f_words.word:
    df[w] = df.apply(lambda row: calcFreq(w, row), axis=1)

# Keep only the frequency columns for the PCA.
df_ = df[[column for column in df.columns if column not in columns]]

# PCA ...
import sklearn.decomposition
pca = sklearn.decomposition.PCA(n_components=3)
pca.fit(df_)
projection = pca.transform(df_)
x = projection[:, 0]
y = projection[:, 1]
z = projection[:, 2]

# Groups ...
labels = df['Text']
df = pd.DataFrame(dict(x=x, y=y, z=z, label=labels))
groups = df.groupby('label')

# 3D Scatter Plot ...
from mpl_toolkits.mplot3d import Axes3D
plt3d = plt.figure().gca(projection='3d')

# Center the axes ...
plt3d.autoscale(enable=False, axis='both')  # ... needed to change the Z-axis
o_x = 0
o_y = 0
o_z = 0
delta = .1
plt3d.set_xbound(o_x - delta, o_x + delta)
plt3d.set_ybound(o_y - delta, o_y + delta)
plt3d.set_zbound(o_z - delta, o_z + delta)
plt3d.set_xlabel('Pr Comp 1')
plt3d.set_ylabel('Pr Comp 2')
plt3d.set_zlabel('Pr Comp 3')

markers = 'o', '^', 'D', '*'
import matplotlib.cm as cm
colors = cm.rainbow(np.linspace(0, 1, len(markers)))
colors = 'magenta', 'b', 'r', 'g'

labels = []
for i, (name, group) in enumerate(groups):
    labels.append(name)
    # Calculate centroids ...
    c_x = np.mean(group.x)
    c_y = np.mean(group.y)
    c_z = np.mean(group.z)
    print c_x, c_y, c_z
    # One large marker for the group centroid, small translucent markers
    # for the individual points.
    plt3d.scatter(c_x, c_y, c_z, marker=markers[i], color=colors[i], s=500)
    plt3d.scatter(group.x, group.y, group.z,
                  marker=markers[i], color=colors[i], s=20, alpha=.2)
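# Aside (illustrative, not in the original script): before reading much into
# the scatter, it is worth checking how much variance the three retained
# components actually capture. explained_variance_ratio_ is a standard
# attribute of a fitted sklearn PCA.
print pca.explained_variance_ratio_
print 'Total variance explained:', pca.explained_variance_ratio_.sum()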
# Fake a legend for 3d scatter ...
proxy = []
for c, m in zip(colors, markers):
    proxy.append(plt.Line2D([0], [0], linestyle="none", color=c, marker=m))
plt3d.legend(proxy, labels, numpoints=1)

plt.suptitle("PCA of 200 Function Words in Lr Speeches")
plt.show()
exit()

#------------------------------------
# NOTE: everything below this exit() is unreachable as written and is kept
# for reference. The Q/F comparison further down assumes qc and fc
# DataFrames (Quarto and Folio speeches) defined elsewhere.

import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline
sns.set(context="paper", font="monospace")

labels = df['Text']
print labels
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')

# Plot
fig, ax = plt.subplots()
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group.x, group.y, marker='.', linestyle='', ms=12, label=name)
ax.legend()

'''
from scipy.cluster.vq import kmeans2
centroids, ks = kmeans2(np.array(df), 2, 10)
print centroids
exit()
colors = ['r', 'g'] # , 'b']
plt.scatter(*df.T, c=np.choose(ks, colors))
plt.scatter(*centroids.T, c=colors, marker='v')
'''

plt.show()
fig.savefig('tmp.png')
exit()

'''
def approx(r): return len(r['txt']) / 10
#qc['fuzzymatch'] = qc.apply(lambda r: approx(r), axis=1)
#fc['fuzzymatch'] = fc.apply(lambda r: approx(r), axis=1)
'''

# Pair Q and F speeches on Act, Scene and Speaker, then keep pairs whose
# speech numbers differ by at most 5.
res = qc.merge(fc, on=['Act','Scene','Speaker'])  # ,'fuzzymatch'])
res = res[abs(res['sp#_x'] - res['sp#_y']) < 6]
print qc.shape, fc.shape, res.shape

#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance
import editdistance

def calcEditDistance(r):
    return editdistance.eval(r.txt_x, r.txt_y)

res['ed'] = res.apply(lambda row: calcEditDistance(row), axis=1)

# Discard pairs whose edit distance is two thirds or more of the Q text length.
res = res[res.ed < res['#chars_x'] / 1.5]
print res.shape

#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std

# Character encoding for the DTW: a-z map to 0-25 (case-insensitively);
# any other character maps to twice its ordinal.
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [(a, i) for i, a in enumerate(lc)]
uc_lut = [(a.upper(), i) for a, i in lc_lut]
lut = dict(lc_lut + uc_lut)

#def encode(c): return ord(c.lower())*200

def enc(c):
    try:
        val = lut[c.lower()]
    except:
        val = ord(c) * 2.
    return val

def encode(s):
    return np.array(map(enc, s))
    #return np.array(map(float, map(ord, s)))

def splits(n, f, xs):
    """Break xs into n-character chunks, each prefixed with the label f."""
    return [f + ":\t" + xs[i:i + n] for i in range(0, len(xs), n)]

def fix(xs, i, path):
    # A repeated index in the warping path marks a gap in the alignment.
    if i > 0 and path[i] == path[i-1]:
        return '_'  # gapChar
    else:
        return xs[path[i]]

def recover(xs, path):
    """Rebuild the aligned text from a DTW warping path, with '_' for gaps."""
    return ''.join([fix(xs, i, path) for i in xrange(len(path))]).replace("\n", " ")

def dtw(r):
    q = r.txt_x
    f = r.txt_y
    q_ = encode(q)
    f_ = encode(f)
    dist, cost, path = dtw_subsequence(q_, f_)
    q_ = recover(q, path[0])
    f_ = recover(f, path[1])
    al = (q_, f_)
    return dist, al

res['dtw'] = res.apply(lambda row: dtw(row)[0], axis=1)
res['alignment'] = res.apply(lambda row: dtw(row)[1], axis=1)

#-------------------------------------------------------------------------------
from scipy.stats import entropy  # == KL Divergence

def jsd(r):
    """Jensen-Shannon divergence between the two aligned texts' encodings."""
    x, y = r.alignment  # ... the aligned texts
    P = encode(x)
    Q = encode(y)
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))

res['jsd'] = res.apply(lambda row: jsd(row), axis=1)
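# Aside (illustrative sketch, not part of the original pipeline): jsd() relies
# on scipy's entropy() normalising its inputs, and on the two encoded strings
# having equal length, which holds here because recover() pads both texts to
# the length of the DTW warping path. _FakeRow below is a hypothetical
# stand-in for a DataFrame row.
class _FakeRow(object):
    alignment = ('abca', 'abcb')
print jsd(_FakeRow())  # small but non-zero: the texts differ in one position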
#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score

def nmi(r):
    x, y = r.alignment  # dtw(r)[1] # ... the aligned texts
    q = encode(x)
    f = encode(y)
    return normalized_mutual_info_score(q, f)

res['nmi'] = res.apply(lambda row: nmi(row), axis=1)

# Keep only pairs with a reasonably high mutual information score.
res = res[res.nmi > .7]
print res.shape

#-------------------------------------------------------------------------------
res.columns = ['Act', 'Scene', 'Speaker', 'sp#', '#chars_q', 'q_txt',
               'sp#_', '#chars_f', 'f_txt', 'ed', 'dtw', 'alignment',
               'jsd', 'nmi']
print res
res.to_csv('res.csv', header=True, index=False, encoding='utf-8')

plotTitles = ("Edit Distance", "DTW Distance",
              "Jensen Shannon Divergence",
              "Normalised Mutual Information Score")
columns = "ed", "dtw", "jsd", "nmi"
metrics = zip(plotTitles, columns)

import myPlot
for title, metric in metrics:
    zero = 0
    if metric == 'nmi': zero = .7
    # Calc table w margins ...
    pt = res.pivot_table(values=metric,
                         columns=['Scene'],
                         index=['Speaker'],
                         aggfunc=np.mean,
                         #fill_value=0,
                         margins=True)
    pt = pt.T
    #print pt
    if metric == 'jsd':
        pt = pt * 100  # rescale for a readable heatmap
    myPlot.hmap(title, title, "Speaker", "Scene", pt, 'YlOrRd', zero)
    myPlot.outputTable(title, pt)

# Inspect the matched pair with the lowest NMI score.
#r_max = res['#chars_f'].argmin()
r_max = res['nmi'].argmin()
r = res.loc[r_max]
print r

x, y = r.alignment
#print x + '\n' + y

# Print the alignment in interleaved 100-character chunks.
n = 100
aa = splits(n, "Q", x)
bb = splits(n, "F", y)
zz = ["\n" + a + "\n" + b for a, b in zip(aa, bb)]
for z in zz:
    print z

import seaborn as sns
sns.set(style="ticks", color_codes=True)
plt.xticks(rotation=90)
sns.pairplot(res[['Speaker', 'ed', 'dtw', 'jsd', 'nmi']],
             hue="Speaker", palette="husl")
#plt.show()
filename = "Speaker by Scene Q F Information Metrics".replace(' ', '_')
plt.savefig('data/' + filename + '.png')
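# Aside (toy strings, illustrative only): how splits() interleaves two aligned
# texts for side-by-side reading; '_' marks a gap, mirroring what recover()
# produces. The strings below are hypothetical, not taken from the data.
def demo_alignment_display():
    q_al = "poore Tom's a cold"  # hypothetical aligned Q reading
    f_al = "poore Tom_s a cold"  # hypothetical aligned F reading
    for a, b in zip(splits(8, "Q", q_al), splits(8, "F", f_al)):
        print "\n" + a + "\n" + b
#demo_alignment_display()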