# calcMetrics.py 31.1.18

#!/usr/bin/python
# -*- coding: utf-8 -*-

# --------------------------------------------------
# File Name: exp_2a.py
# Location: 
# Purpose:
# Creation Date: 29-10-2017
# Last Modified: Wed, Jan 31, 2018  9:18:26 PM
# Author(s): Mike Stout 
# Copyright 2017 The Author(s) All Rights Reserved
# Credits: 
# --------------------------------------------------

import sys
import numpy as np

#----------------------------------------------------------------
# https://github.com/aflc/editdistance

import editdistance

def ed(q, f):
    """Return the edit distance between q and f as a float.

    The raw value from editdistance.eval is divided by a scale factor that
    is currently fixed at 1.0, i.e. the distance is not normalised.
    """
    raw = editdistance.eval(q, f)
    # Unnormalised; float(len(q) + len(f)) would be the normalising scale.
    scale = 1.
    return raw / scale

#----------------------------------------------------------------

from mlpy import dtw_subsequence, dtw_std

# Encode lower- and uppercase letters to the same values:
# 'a'/'A' -> 0, 'b'/'B' -> 1, ..., 'z'/'Z' -> 25.
lc = 'abcdefghijklmnopqrstuvwxyz'
lc_lut = list(zip(lc, range(len(lc))))
uc_lut = [(letter.upper(), code) for letter, code in lc_lut]
lut = {}
lut.update(lc_lut)
lut.update(uc_lut)

#def encode(c): return ord(c.lower())*200

def encChar(c):
    """Return the numeric code for the single character c.

    Letters found in the module-level `lut` table (after lower-casing) map
    to their alphabet index; any other character falls back to ord(c).

    Only the expected lookup miss (KeyError) triggers the fallback, so
    unrelated errors and interrupts are no longer silently swallowed.
    """
    try:
        return lut[c.lower()]
    except KeyError:
        return ord(c)  # * 2

def isOK(c):
    """Return True when c occurs in the accepted-character string.

    The accepted string holds the vowels in both cases plus lowercase
    's' and 't'. Because this is a substring test, the empty string and
    any contiguous slice of the accepted string (e.g. "st") also pass.
    """
    accepted = "aeiouAEIOUst"
    return c in accepted

def encString(s):
    """Encode the string s as a single float.

    Only the characters accepted by isOK are kept; their encChar codes are
    summed and the reciprocal of that sum is returned. A sum of zero (for
    example when no character is kept) yields 0.0 instead of dividing by
    zero.
    """
    total = sum(encChar(ch) for ch in s if isOK(ch))
    if total != 0:
        return 1.0 / total
    return 0.



def encode(s):
    """Return a NumPy array holding encString(item) for every item of s.

    A list is built explicitly before handing it to np.array: on Python 3
    `map` returns a lazy iterator, which np.array would wrap as a single
    0-d object instead of a 1-d numeric array.
    """
    return np.array([encString(item) for item in s])

'''
def splits(n, f, xs):
    return [ f+":\t" + xs[i:i+n] for i in range(0, len(xs), n)]

def fix(xs, i, path):
    if i+1<len(path) and path[i]!=path[i+1]: return xs[path[i]]
    else: return "_" # xs[path[i]]


def recover(xs, ys, xp, yp):

    xl,yl,flip = len(xp),len(yp),False
    k = xl
    if xl<yl: xl,yl,Flip = yl,xl.True

    res = []
    xoff = 0, 0
    for i in xrange(yl):
        while xp[i+xoff]==yp[i+yoff]: res.append((xp[i+xoff],xp[i+yoff]))
        while xp[i+xoff]!=yp[i+yoff]: xoff+=1
             
    

    res = []
    for i in xrange(k):

        x = xs[i]
        y = ys[i]
        
        if xs[i]==ys[i]: = x,y # res.append((x,y))
        if xs[i+1]==xs[i]: 
            for xy in recover(xs, ys[i+1:], xs, yp[i+1:]):
                
    if ys[i+1]==ys[i]: return recover(xs, ys[i+1:], xs, yp[i+1:])
    re
        

    path_ = (path + path[:1]) # [::-1]
    xs_ = (xs + xs[:1])
    
    return ([ fix(xs_, i, path_) for i in xrange(len(path_))])[::-1]
'''


def rl_enc(input_string):
    """Run-length encode a sequence.

    Consecutive equal items are collapsed into (item, count) tuples, in
    order of appearance.

    Parameters:
        input_string: any iterable (a string or a list of items).

    Returns:
        A list of (item, run_length) tuples; an empty list for empty input.

    Unlike the previous implementation this does not fail on empty input
    and does not drop runs whose item is falsy (such as '' or 0).
    """
    runs = []
    for item in input_string:
        if runs and runs[-1][0] == item:
            # Same item as the run in progress: extend it.
            runs[-1][1] += 1
        else:
            # A new run starts here.
            runs.append([item, 1])
    return [tuple(run) for run in runs]

def rl_dec(lst):
    """Flatten (item, count) runs into a list with placeholders.

    Every run contributes its item once. A run whose count is not 1 is
    followed by one placeholder string: str((run_index, count - 1)),
    e.g. "(0, 2)". The item is not repeated count times.
    """
    out = []
    for idx, (symbol, run_len) in enumerate(lst):
        out.append(symbol)
        if run_len != 1:
            out.append(str((idx, run_len - 1)))
    return out

''' 
def rl_dec(lst):
    q = ""
    for character, count in lst:
        q += character * count
    return q
'''

def recover(xs):
    """Run-length encode xs with rl_enc, then expand the runs with rl_dec."""
    runs = rl_enc(xs)
    return rl_dec(runs)

def decode(xs, x):
    """Return the list of x[i] for every index i in xs, in order."""
    picked = []
    for idx in xs:
        picked.append(x[idx])
    return picked



def fixLen(indexed_pair):
    """Replace '_' placeholders in an enumerated pair by the pair's index.

    Parameters:
        indexed_pair: a tuple (i, (x, y)), as produced by
            enumerate(zip(...)).

    Returns:
        The tuple (x, y) where any element equal to '_' has been replaced
        by str(i).

    The argument is unpacked in the body rather than in the signature,
    because tuple parameters are a syntax error on Python 3; callers still
    pass one positional tuple.
    """
    i, (x, y) = indexed_pair
    if x == '_':
        x = str(i)  # *(len(y)) # -len(x))
    if y == '_':
        y = str(i)  # *(len(x)) # -len(y))
    return x, y

def dtw(a, b):
    """Align a and b with mlpy's dtw_std and return (dist, alignment).

    The shorter sequence is always passed first to dtw_std; when a is the
    longer one the two are swapped internally and swapped back in the
    result, so the returned alignment is always ordered (for a, for b).

    Returns:
        dist: the first value returned by dtw_std.
        alignment: a pair of tuples built from the warping path via
            decode -> recover -> fixLen.

    NOTE(review): both sequences are reversed before encoding, but the
    path indices are then applied to the unreversed inputs -- confirm
    this is intended.
    """
    swapped = len(a) > len(b)
    if swapped:
        shorter, longer = b, a
    else:
        shorter, longer = a, b

    enc_short = encode(shorter[::-1])
    enc_long = encode(longer[::-1])
    # The cost matrix is returned by dtw_std but not used here.
    dist, _cost, (path_short, path_long) = dtw_std(enc_short, enc_long, dist_only=False)

    al_short = recover(decode(path_short, shorter))
    al_long = recover(decode(path_long, longer))

    fixed_pairs = [fixLen(entry) for entry in enumerate(zip(al_short, al_long))]
    al_short, al_long = zip(*fixed_pairs)

    # Undo the internal swap so the result lines up with (a, b).
    alignment = (al_long, al_short) if swapped else (al_short, al_long)
    return dist, alignment

#----------------------------------------------------------------
from scipy.stats import entropy # == KL Divergence

def jsd(x, y):
    """Return the Jensen-Shannon divergence between the encodings of x and y.

    Both inputs are encoded with encode() and each encoding is normalised
    to sum to 1 before the mixture M = (P + Q) / 2 is formed.
    scipy.stats.entropy normalises each of its arguments independently, so
    mixing the unnormalised vectors would compare P and Q against a
    mixture weighted by their raw sums instead of the equal-weight
    mixture the divergence is defined on.

    The encodings must have the same length. If an encoding sums to zero
    the result is nan.
    """
    P = np.asarray(encode(x), dtype=float)
    Q = np.asarray(encode(y), dtype=float)

    # Turn both encodings into probability distributions first.
    P = P / P.sum()
    Q = Q / Q.sum()

    M = 0.5 * (P + Q)

    # entropy(p, q) with two arguments is the KL divergence KL(p || q).
    return 0.5 * (entropy(P, M) + entropy(Q, M))


#----------------------------------------------------------------
from sklearn.metrics import normalized_mutual_info_score

def nmi(x, y):
    """Return sklearn's normalized mutual information score for x and y.

    Both inputs are converted with encode() and the resulting arrays are
    passed to normalized_mutual_info_score as the two labelings.
    """
    labels_x = encode(x)
    labels_y = encode(y)
    return normalized_mutual_info_score(labels_x, labels_y)