#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: exp_1i.py
# Location:
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Nov 15, 2017 6:19:52 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.max_colwidth', 32)  # truncate long speech texts in printouts
import matplotlib.pyplot as plt
plt.style.use('ggplot')
df = pd.read_csv("df.csv")
print df
f_words=pd.read_csv("CK_200_function_words.csv")
f_words.columns = ['word']
def fix_pos(r):
    # Keep only the token before the first space (drop any trailing POS tag).
    w = r.word
    return w.split(' ')[0]
f_words['word'] = f_words.apply(fix_pos, axis=1)
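# Quick sanity check on a toy row (not project data), assuming entries look
# like 'word POS': only the token before the first space survives.
assert fix_pos(pd.Series({'word': 'the det'})) == 'the'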
#print df # .info()
#df = df[ ((df.Subset == 'Only')) | ((df.Edtn == 'Q') & (df.Subset == 'Common')) ]
df = df[ (df.Subset != 'Full-a')]
print df
#df = df[ (df.Subset == 'Only') ]
df['Text'] = df['Edition'] + ' ' + df['Subset']
columns = ['Text', 'txt']
df = df[columns]
print df
def words(s):
    # Tokenise on whitespace and common punctuation, dropping empty strings.
    return filter(lambda x: x != '', re.split("[ ,;:.><?!#@\n\t']", s))
def calcFreq(w, r):
    # Frequency of word w in the row's text, normalised by the text's
    # length in characters (not by its word count).
    s = r.txt
    n = float(len(s))
    ws = words(s)
    f_ws = filter(lambda x: x == w, ws)
    return len(f_ws) / n
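# Sanity check on a toy string (illustrative only): 'the' occurs twice in a
# 19-character text, so its per-character frequency is 2/19.
demo = pd.Series({'txt': 'the cat and the hat'})
assert words(demo.txt) == ['the', 'cat', 'and', 'the', 'hat']
assert abs(calcFreq('the', demo) - 2 / 19.) < 1e-9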
print f_words
# One frequency column per function word (simple but slow: one full pass
# over the texts per word).
for w in f_words.word:
    df[w] = df.apply(lambda row: calcFreq(w, row), axis=1)
df_ = df[[column for column in df.columns if column not in columns]]
# PCA ...
import sklearn.decomposition
pca = sklearn.decomposition.PCA(n_components = 3)
pca.fit(df_)
projection = pca.transform(df_)
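# How much structure the 3-D projection keeps (sklearn exposes this as
# explained_variance_ratio_); worth eyeballing before trusting the plot:
print 'explained variance ratios:', pca.explained_variance_ratio_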
x = projection[:,0]
y = projection[:,1]
z = projection[:,2]
# Groups ...
labels = df['Text']
df = pd.DataFrame(dict(x=x, y=y, z=z, label=labels))
groups = df.groupby('label')
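# Quick check that every label actually has members before plotting:
print groups.size()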
# 3D Scatter Plot ...
from mpl_toolkits.mplot3d import Axes3D
plt3d = plt.figure().gca(projection='3d')
# Center the axes ...
plt3d.autoscale(enable=False, axis='both')  # otherwise the Z-axis bounds set below are ignored
o_x = 0
o_y = 0
o_z = 0
delta = .1
plt3d.set_xbound(o_x-delta, o_x+delta)
plt3d.set_ybound(o_y-delta, o_y+delta)
plt3d.set_zbound(o_z-delta, o_z+delta)
plt3d.set_xlabel('Pr Comp 1')
plt3d.set_ylabel('Pr Comp 2')
plt3d.set_zlabel('Pr Comp 3')
markers = 'o', '^', 'D', '*'
# An evenly spaced rainbow palette is an alternative:
#   import matplotlib.cm as cm
#   colors = cm.rainbow(np.linspace(0, 1, len(markers)))
colors = 'magenta', 'b', 'r', 'g'
labels=[]
for i, (name,group) in enumerate(groups):
labels.append(name)
# Calculate centroids ....
c_x = np.mean(group.x)
c_y = np.mean(group.y)
c_z = np.mean(group.z)
print c_x, c_y, c_z
plt3d.scatter(c_x, c_y, c_z, marker=markers[i], color=colors[i], s=500)
plt3d.scatter(group.x, group.y, group.z , marker=markers[i] , color=colors[i], s=20, alpha=.2)
# Fake a legend for 3d scatter ....
proxy = []
for c,m in zip(colors, markers):
proxy.append(plt.Line2D([0],[0], linestyle="none", color=c, marker=m))
plt3d.legend(proxy, labels, numpoints = 1)
plt.suptitle("PCA of 200 Function Words in King Lear Speeches")
plt.show()
exit()
# NB: everything below is unreachable scratch code (an earlier 2-D plot and
# a Q/F speech-alignment pipeline), kept for reference.
#------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline
sns.set(context="paper", font="monospace")
labels = df['Text']
print labels
df = pd.DataFrame(dict(x=x, y=y, label=labels))
groups = df.groupby('label')
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
ax.plot(group.x, group.y, marker='.', linestyle='', ms=12, label=name)
ax.legend()
'''
from scipy.cluster.vq import kmeans2
centroids, ks = kmeans2(np.array(df), 2, 10)
print centroids
exit()
colors = ['r', 'g'] # , 'b']
plt.scatter(*df.T, c=np.choose(ks, colors))
plt.scatter(*centroids.T, c=colors, marker='v')
'''
plt.show()
fig.savefig('tmp.png')
exit()
'''
def approx(r):
return len(r['txt']) /10
#qc['fuzzymatch'] = qc.apply(lambda r: approx(r), axis=1)
#fc['fuzzymatch'] = fc.apply(lambda r: approx(r), axis=1)
'''
# NB: qc and fc (presumably the Quarto and Folio speech tables) are not
# defined in this file; this block assumes they were built elsewhere.
res = qc.merge(fc, on=['Act', 'Scene', 'Speaker'])  # ,'fuzzymatch'])
res = res[ (abs(res['sp#_x'] - res['sp#_y']) < 6)]
print qc.shape, fc.shape, res.shape
#-------------------------------------------------------------------------------
# https://github.com/aflc/editdistance
import editdistance
def calcEditDistance(r):
return editdistance.eval(r.txt_x, r.txt_y)
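# editdistance.eval is the Levenshtein distance; the classic
# kitten -> sitting example needs three edits:
assert editdistance.eval('kitten', 'sitting') == 3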
res['ed'] = res.apply(lambda row: calcEditDistance(row), axis=1)
res = res[ (res.ed < res['#chars_x'] / 1.5) ]  # keep pairs whose edit distance is under ~2/3 of the Q text length
print res.shape
#-------------------------------------------------------------------------------
from mlpy import dtw_subsequence, dtw_std
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = [ (a, i) for i,a in enumerate(lc) ]
uc_lut = [ (a.upper(), i) for a,i in lc_lut ]
lut = dict( lc_lut + uc_lut )
#def encode(c): return ord(c.lower())*200
def enc(c):
    # Letters map case-insensitively to 0..25; anything else falls back
    # to a scaled character code.
    try:
        val = lut[c.lower()]
    except KeyError:
        val = ord(c) * 2.
    return val
def encode(s):
    # Encode a string as a numeric vector, one value per character.
    return np.array(map(enc, s))
    #return np.array(map(float, map(ord, s)))
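# Sanity check: letters map case-insensitively onto 0..25, so 'A' and 'a'
# get the same code:
assert list(encode('Ab')) == [0, 1]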
def splits(n, f, xs):
    # Break xs into n-character chunks, each prefixed with the label f.
    return [ f + ":\t" + xs[i:i+n] for i in range(0, len(xs), n) ]
def fix(xs, i, path):
if i>0 and path[i]==path[i-1]: return '_' # gapChar
else: return xs[path[i]]
def recover(xs, path):
return ''.join([ fix(xs, i, path) for i in xrange(len(path))]).replace("\n"," ")
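# recover() writes one output character per path step and a '_' gap where
# the path stalls, e.g. (toy path, not real DTW output):
assert recover('abc', [0, 1, 1, 2]) == 'ab_c'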
def dtw(r):
q = r.txt_x
f = r.txt_y
q_ = encode(q)
f_ = encode(f)
dist, cost, path = dtw_subsequence(q_, f_)
q_ = recover(q, path[0])
f_ = recover(f, path[1])
al = (q_,f_)
return dist, al
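# Toy illustration (made-up strings): the query 'abc' occurs verbatim in
# 'xxabcxx', so subsequence DTW should find a near-zero-cost match.
toy_dist, _, _ = dtw_subsequence(encode('abc'), encode('xxabcxx'))
print 'toy subsequence DTW distance:', toy_dist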
# Compute the DTW distance and alignment once per row (dtw() is expensive,
# so avoid applying it twice).
dtw_res = [ dtw(row) for _, row in res.iterrows() ]
res['dtw'] = [ d for d, _ in dtw_res ]
res['alignment'] = [ a for _, a in dtw_res ]
#-------------------------------------------------------------------------------
from scipy.stats import entropy  # the two-argument form computes KL divergence
def jsd(r):
    # Jensen-Shannon divergence between the two aligned texts' encodings
    # (scipy's entropy() normalises its inputs to distributions).
    x, y = r.alignment
    P = encode(x)
    Q = encode(y)
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
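# The JSD of a text with itself should be zero (toy row, not project data):
assert abs(jsd(pd.Series({'alignment': ('abc', 'abc')}))) < 1e-12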
res['jsd'] = res.apply(lambda row: jsd(row), axis=1)
#-------------------------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score
def nmi(r):
    # Normalised mutual information between the aligned texts, treating the
    # encoded characters as label sequences.
    x, y = r.alignment
    q = encode(x)
    f = encode(y)
    return normalized_mutual_info_score(q, f)
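# Identical label sequences have maximal NMI (toy check):
assert abs(nmi(pd.Series({'alignment': ('abc', 'abc')})) - 1.0) < 1e-9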
res['nmi'] = res.apply(lambda row: nmi(row), axis=1)
res = res[ (res.nmi > .7) ]
print res.shape
#-------------------------------------------------------------------------------
res.columns = ['Act','Scene','Speaker','sp#', '#chars_q', "q_txt", 'sp#_', '#chars_f', 'f_txt', 'ed', 'dtw','alignment', 'jsd','nmi']
print res
res.to_csv('res.csv', header=True, index=False, encoding='utf-8')
plotTitles = "Edit Distance", "DTW Distance", "Jensen Shannon Divergence", "Normalised Mutual Information Score"
columns = "ed", "dtw", "jsd", "nmi"
metrics = zip(plotTitles, columns)
import myPlot
for title, metric in metrics:
zero = 0
if metric == 'nmi': zero = .7
    # Calc table with margins ...
pt = res.pivot_table(
values=metric
, columns=['Scene']
, index=['Speaker']
, aggfunc=np.mean
#, fill_value=0
, margins=True
)
pt = pt.T
#print pt
    if metric == 'jsd':
        pt = pt * 100  # scale the tiny JSD values up for readability
myPlot.hmap(title, title, "Speaker", "Scene", pt, 'YlOrRd', zero)
myPlot.outputTable(title, pt)
# Pull out the worst-scoring alignment (lowest NMI) for inspection.
# r_min = res['#chars_f'].idxmin()  # alternative: the shortest Folio speech
r_min = res['nmi'].idxmin()
r = res.loc[r_min]  # .loc rather than the deprecated .ix
print r
x,y = r.alignment
#print x +'\n'+ y
n = 100
aa = splits(n, "Q", x)
bb = splits(n, "F", y)
zz = [ "\n"+a+"\n"+b for a,b in zip(aa,bb) ]
for z in zz: print z
import seaborn as sns
sns.set(style="ticks", color_codes=True)
plt.xticks(rotation=90)
sns.pairplot(res[['Speaker','ed','dtw','jsd','nmi']], hue="Speaker", palette="husl")
#plt.show()
filename = "Speaker by Scene Q F Information Metrics".replace(' ', '_')
plt.savefig('data/'+filename+'.png')