#!/usr/bin/python
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: analyseXML.py
# Location:
# Purpose: Analyse TEI-XML encodings of a play's Folio (F) and Quarto (Q)
#          texts: per-speech verse/prose metrics, TKL word counts, plots
#          and tables.
# Creation Date: 06-07-2017
# Last Modified: Thu, Jan 25, 2018 5:38:23 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
import codecs
import xml.dom.minidom as minidom
import sys
import re
# Command-line arguments: path prefix, play abbreviation, and the Folio
# and Quarto XML filenames.
prefix, play, f_filename, q_filename = sys.argv[1:]
doTKL = 1      # generate the TKL-word plots/tables
doMetrics = 1  # generate the per-metric plots/tables
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('chop_threshold', 0)
pd.set_option('precision', 4)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
six_colors = ['red', 'coral', 'lightcoral', 'blue', 'slateblue', 'royalblue']
two_colors = ['red', 'blue']
def hack_utf8_to_ascii(s):
    # Lossy conversion: silently drop any non-ASCII characters.
    return s.encode('ascii', 'ignore')
def words(s):
    # Split on whitespace, punctuation and the markup characters used below.
    return filter(lambda x: x != '', re.split(r"[ ,;:.<>?!#@\n\t'~%]", s))
def words_(s):
    # Split on whitespace only.
    return filter(lambda x: x != '', re.split(r'[ \n\t]', s))
# Example (illustrative):
#   words('hello,how are you?I am fine,thank you. And you?')
#   -> ['hello', 'how', 'are', 'you', 'I', 'am', 'fine', 'thank', 'you', 'And', 'you']
# Groups of variant word forms tracked by the TKL analysis ('which_rel' is
# the relative-'which' token produced in get_all_text below) ...
tkl_word_groups = map(words,
    [ 'which_rel that who'
    , 'do does doth'
    , 'these this those'
    , 'thine thy'
    ])
for g in tkl_word_groups: print g
def getTklWordGroup(w):
    # Return the first word group containing w (IndexError if none).
    def ok(g):
        return w in g
    return filter(ok, tkl_word_groups)[0]
all_tkl_word_tokens = [ w for kws in tkl_word_groups for w in kws ]
function_words = \
[ 'for'
, 'in'
]
def isFunctionWord(w):
    # NB: despite the name, this currently tests membership of the TKL token
    # list; the original function_words check is kept commented out.
    #res = w in function_words
    res = w in all_tkl_word_tokens
    return res
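# Illustrative, given the groups above:
#   isFunctionWord('thy') -> True
#   isFunctionWord('for') -> False  (only TKL tokens count with the current check)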
# Verse fragments are tagged with '~' by get_all_text below; prose is not.
def isProse(s): return (s!=' ' and '~' not in s)
def isVerse(s): return (s!=' ' and '~' in s)
def getVerseLines(txt):
    # Split on the line delimiters inserted by get_all_text; keep verse only.
    txt_ = re.split('[@#%]',txt)
    return filter(isVerse, txt_)
def getProseLines(txt):
    # Split on the line delimiters inserted by get_all_text; keep prose only.
    txt_ = re.split('[@#%]',txt)
    return filter(isProse, txt_)
def getVerseWords(txt):
xs = ' '.join(getVerseLines(txt))
return words(xs)
def getProseWords(txt):
xs = ' '.join(getProseLines(txt))
return words(xs)
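# Illustrative, using the markers produced by get_all_text below:
#   txt = 'POLONIUS %My lord #as you say @~To be or not'
#   getVerseLines(txt) -> ['~To be or not']
#   getProseLines(txt) -> ['POLONIUS ', 'My lord ', 'as you say ']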
def divide(a,b):
    # Safe division: return 0 instead of raising on bad input or b == 0.
    try: res = float(a) / float(b)
    except (TypeError, ValueError, ZeroDivisionError): res = 0
    return res
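# e.g. divide(10, 4) -> 2.5 ; divide(3, 0) -> 0 (instead of raising).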
# https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
def entropy(labels):
    # Shannon entropy in bits: H = -sum_i p_i * log2(p_i), where p_i is the
    # relative frequency of each distinct label.
    prob_dict = {x:labels.count(x)/float(len(labels)) for x in labels}
    probs = np.array(list(prob_dict.values()))
    return - probs.dot(np.log2(probs))
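# Worked example: entropy(['a','a','b','b']) has p_a = p_b = 0.5, so
# H = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit; entropy(['a','a']) -> 0.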
def analyse(a):
txt = a[-1]
num_chars = len(txt)
ws = words(txt)
num_words = len(ws)
    # Only count verse lines (delimited by '@' and tagged '~') ...
verse_lines = getVerseLines(txt)
num_verse_lines = len(verse_lines) #or 1
verse_words = getVerseWords(txt)
num_verse_words = len(verse_words) #or 1
w_per_vl = divide(num_verse_words, num_verse_lines)
    # Only count prose lines (delimited by '%' or '#') ...
prose_lines = getProseLines(txt)
#print "PROSE", prose_lines
num_prose_lines = len(prose_lines) #or 1
prose_words = getProseWords(txt)
num_prose_words = len(prose_words) #or 1
w_per_pl = divide(num_prose_words, num_prose_lines)
func_words = filter(isFunctionWord, ws)
num_func_words = float(len(func_words))
#print num_prose_lines, num_prose_words, num_verse_lines, num_verse_words, txt
#v2p = float(num_verse_words) - float(num_prose_words)
v2p = float(w_per_vl) - float(w_per_pl)
ent = entropy(ws)
tkl_analysis = []
    for w in all_tkl_word_tokens:
        # Case-insensitive substring match (so 'do' also matches 'does' etc.).
        pred = lambda x: w.lower() in x.lower()
        f_words = filter(pred, ws)
num_f_words = float(len(f_words))
tkl_analysis.append(num_f_words)
return [num_chars, num_words, num_verse_words, num_prose_words, num_verse_lines, num_prose_lines, v2p, w_per_vl, w_per_pl, num_func_words, ent] + tkl_analysis
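# The stats vector above lines up with metric_names + all_tkl_word_tokens,
# the column names assigned to the DataFrame further down.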
#-------------------------------------------------------------------------------
# Make Canonical Name LUT ...
with open(prefix+"/"+play+'/'+play+'-names.txt', 'r') as fh:
name_lut = []
i=0
    for line in fh:
        # The header row supplies the two edition column labels.
        if i==0: _,col1,col2 = line[:-1].split('\t')
        if i>0 and len(line)>1:
            # Each data row: canonical name, Q aliases, F aliases (comma-separated).
            n,q,f = line.split('\t')
            qs = q.split(',')
            fs = f.split(',')
            for k in qs: name_lut.append((col1+'_'+k.strip(),n))
            for k in fs: name_lut.append((col2+'_'+k.strip(),n))
        i+=1
name_lut = dict(name_lut)
print name_lut
nameIds = dict([ (k,i) for i,k in enumerate(name_lut.keys() ) ])
print nameIds
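# Illustrative shape (assuming a names file with edition columns 'Q' and 'F'):
#   name_lut maps '<edition column>_<alias>' to a canonical name,
#   e.g. {'Q_Ham.': 'Hamlet', 'F_Ham.': 'Hamlet', ...}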
#exit()
#-------------------------------------------------------------------------------
# Make Canonical Scene Act LUT ...
with open(prefix+"/"+play+'/'+play+'-scenes.txt', 'r') as fh:
scene_lut = []
inv_scene_lut = []
act_lut = []
i=0
    for line in fh:
        # Data rows start with 'P'(rologue), '-' or a digit.
        if i>0 and line[0] in ['P','-']+map(str,xrange(10)):
            q,f = line[:-1].split('\t')
            q_str = q
            print ">", q,f
            # Sentinel codes: Prologue -> -1, Epilogue -> 99, absent ('-') -> 0.
            if q=="Prologue": q = -1
            elif q=="Epilogue": q = 99
            elif q=="-": q = 0
            elif "." in q_str: q = i   # dotted Q entries get a synthetic number
            else: q = int(q)
            if f=="-": a,f = -1,-1
            elif f=="Epilogue": a,f = 99,99
            elif f=="Prologue": a,f = -1,-1
            elif "." not in f:
                a = 1                  # no act division: treat as act 1
                f = int(f)
            else: a,f = map(int, f.split('.') )
            scene_lut.append(( (a,f),q))   # (F act, F scene) -> Q scene
            inv_scene_lut.append((q,f))    # Q scene -> F scene
            act_lut.append(( q, a) )       # Q scene -> F act
        i+=1
scene_lut = dict(scene_lut)
print scene_lut
inv_scene_lut = dict(inv_scene_lut)
print inv_scene_lut
act_lut = dict(act_lut)
print act_lut
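# Illustrative shapes (values depend on the play's scenes file):
#   scene_lut     {(F act, F scene): Q scene}   e.g. {(1, 1): 1, (1, 2): 2, ...}
#   inv_scene_lut {Q scene: F scene}
#   act_lut       {Q scene: F act}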
#exit()
#-------------------------------------------------------------------------------
#which_as_relative =
# '<seg subtype='relative' type='homograph'>which_rel</seg>'
def get_all_text( node ):
# do not include stage directions ...
if node.nodeName == 'stage': return ''
if node.nodeType == node.TEXT_NODE:
return node.data
else:
txt = ''
        for child_node in node.childNodes:
            # Mark the start of added ('add') text ...
            if child_node.nodeName == 'add': txt += '>'
            # Add a delimiter '@' (and verse tag '~') for verse lines ...
            if child_node.nodeName == 'l': txt += '@~'
            # Add a delimiter '%' for prose paragraphs ...
            if child_node.nodeName == 'p': txt += '%'
            # Add a delimiter '#' for prose line breaks ...
            if child_node.nodeName == 'lb': txt += '#'
            # Rewrite relative 'which' as the token 'which_rel' ...
            if child_node.nodeName == 'seg' \
                    and child_node.attributes['type'].value=='homograph' \
                    and child_node.attributes['subtype'].value=='relative' \
                    and child_node.firstChild.nodeValue.lower() == 'which':
                child_node.firstChild.replaceWholeText('which_rel')
            t = get_all_text( child_node )
            # Mark the end of added text ...
            if child_node.nodeName == 'add': t += '<'
            txt += ' '+t+' '
txt = hack_utf8_to_ascii(txt.replace('\n',' '))
txt = ' '.join(words_(txt))
return txt
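# Illustrative: a speech containing one verse line and one prose paragraph
# comes back roughly as '@~ To be or not to be % Ay my lord' (markers
# prepended, whitespace collapsed, non-ASCII dropped).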
def getSpeaker(xml):
    # Return the text of the first <speaker> element.
    for el in xml.getElementsByTagName('speaker'):
        return get_all_text(el)
def getSps(ed, xml):
sps = []
for n,el in enumerate(xml.getElementsByTagName('sp')):
who = el.getAttribute('who')
s = getSpeaker(el)
speaker_ = name_lut[ed+'_'+who]
        # Full speech text with the leading speaker name stripped off ...
        t = get_all_text(el)[len(s):]
        sps.append([n+1, who, speaker_, s, t])
return sps
def fixSplitSpeeches(ed, act, scene, sps):
    # Report consecutive speeches that share a 'who' attribute (speeches
    # split by stage directions); sps itself is returned unchanged.
    # NB: mode 'w' means each call overwrites the previous report file.
    pairs = zip(sps, sps[1:])
    with open(ed+'_splitSpeeches.txt', 'w') as text_file:
        for a,b in pairs:
            if a[1]==b[1]: # ... the 'who' attributes match?
s = '\n\n'+ ','.join(map(str, [ed,act,scene]))
text_file.write(s)
s = '\n'+ ','.join(map(str, a))
text_file.write(s)
s = '\n'+ ','.join(map(str, b))
text_file.write(s)
#exit()
return sps
def getScenes(ed, act, subset, xml):
scs = []
for el in xml.getElementsByTagName('div2'):
if el.getAttribute('type') == 'scene':
scene = el.getAttribute('n')
            print 'scene: ', scene
sps = getSps(ed, el)
# Find speeches split by stage directions ....
if subset not in ["Only", "Common"]:
sps = fixSplitSpeeches(ed, act, scene, sps)
for sp in sps:
scs.append([int(scene)] + sp)
return scs
def proc(f):
global play
    xml = minidom.parse(prefix+'/'+play+'/'+f)
    # Filename convention: <Play>_<Edition>_<Subset>.xml
    play, ed, Subset = f.split('.')[0].split('_')
    print ed, Subset
acts = []
for el in xml.getElementsByTagName('div1'):
if el.getAttribute('type') == 'act':
act = el.getAttribute('n')
print 'act: ', act
scs = getScenes(ed, act, Subset, el)
            for sc in scs:
                acts.append([ ed, Subset, int(act)] + sc)
        # Some texts place scenes directly at div1 (no act level) ...
        if el.getAttribute('type') == 'scene':
n = el.getAttribute('n')
sps = getSps(ed, el)
for sp in sps:
acts.append([ed, Subset, int(n), 0] + sp)
print acts
#exit()
return acts
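# proc() returns one row per speech:
#   [edition, subset, act (div1), scene (div2), speech number, 'who' id,
#    canonical speaker, speaker text, speech text]
# matching the 'Edtn' .. 'txt' column names assigned below.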
#-------------------------------------------------------------------------------
res = []
#for f in ['F_HC.xml', 'Q_HC.xml']:
#for f in 'F_plus.xml F_minus.xml Q_plus.xml Q_minus.xml'.split(' '):
for f in [f_filename, q_filename]:
print f
rs = proc(f)
print len(rs)
for r in rs:
stats = analyse(r)
res.append(stats + r)
#exit()
df = pd.DataFrame(res)
metric_names = ['#chars','#words'
, '#v_words', '#p_words'
, '#v_lines', '#p_lines'
, 'v2p', '#w_per_vl'
, '#w_per_pl', '#f_words'
, 'ent'
]
colNames = metric_names + \
all_tkl_word_tokens + \
['Edtn','Subset', 'div1', 'div2','sp#','who_','Speaker', 'spkr','txt']
df.columns = colNames
# fix numeric sorting of columns ...
for c in ['div1', 'div2','sp#']:
i = colNames.index(c)
df[df.columns[i]] = df[df.columns[i]].astype(int)
# Fix ordering of editions: prefix non-F editions with a space so they sort
# before 'F' in grouped tables and plots ...
def fixEd(r):
    if r.Edtn=='F': return 'F'
    else: return ' '+r.Edtn
df['Edition'] = df.apply(lambda row: fixEd(row), axis=1)
#-------------------------------------------------------------------------------
def mkPlot(filename, title, xlab, ylab, df, rot, k):
    # k = number of index levels: two colors for one level, six for two ...
    colors = two_colors
    if k==2: colors = six_colors
df.plot(kind='bar', width=.9, rot=rot, color=colors) # .legend()
plt.suptitle(title, fontsize=12)
plt.legend(prop={'size': 6})
plt.xlabel(xlab, fontsize=6)
plt.ylabel(ylab, fontsize=10)
    # The second element of each index tuple provides the bar labels ...
    bars = [ x[1] for x in df.index.tolist()]
plt.tick_params(axis=u'both', which=u'both',length=0)
plt.xticks(range(len(bars)), bars, size='small')
#plt.show()
plt.tight_layout()
filename = filename.replace(' ', '_')
plt.savefig(play+'_data/'+filename+'.png')
plt.close()
def outputTable(filename, df):
filename = filename.replace(' ', '_')
html = df.to_html()
with open(play+'_data/'+filename+'.txt', 'w') as text_file:
text_file.write(html)
# Calculate Proportions...
#-------------------------------------------------------------------------------
# Assign F Acts to Q ...
def assignAct(df, r):
    if play=="Rom": return 1                # Rom is treated as a single act
    else:
        if r.Edtn == 'F': return r.div1
        else:
            if r.div2: return r.div1        # Q rows that already carry an act
            else: return act_lut[ r.div1 ]  # map a Q scene number to its F act
df['Act'] = df.apply(lambda row: assignAct(df, row), axis=1)
#-------------------------------------------------------------------------------
# Fix F div# 17 -> Q div# 18
print scene_lut
def assignDiv(df, r):
    # Map each row to a canonical scene number via scene_lut ...
    if play=="Rom":
        return scene_lut[(1, r.div1)]
    if r.Edtn[:1] == 'Q':
        if r.div2:
            return scene_lut[(r.div1, r.div2)]
        else:
            return r.div1
    else: return scene_lut[ (r.div1, r.div2) ]
df['Div'] = df.apply(lambda row: assignDiv(df, row), axis=1)
def fix_zero(s):
    if s=="0": s="_"   # show a missing scene number as '_'
    return s
def assignScene(df, r):
    # Build a label '<Div> (<Act>.<Scene>)', e.g. ' 3 (1.3)' ...
    d = r.Div
    print d
    #print inv_scene_lut
    s = inv_scene_lut[ d ]
    if d<10: d = ' '+str(d)   # pad to width 2 so labels align and sort
    else: d = str(d)
    return d + ' (' + str(r.Act) + '.'+ fix_zero(str(s)) + ')'
df['Scene'] = df.apply(lambda row: assignScene(df, row), axis=1)
print df
df.to_csv(play+'_df.csv', header=True, index=False, encoding='utf-8')
#-------------------------------------------------------------------------------
# Fix Subset names ....
#df['Subset'] = df['Subset'].replace('Full-a',' Full-XML')
df['Subset'] = df['Subset'].replace('Full',' Full-XML')
#df['Subset'] = df['Subset'].replace('Common','b) Common')
#df['Subset'] = df['Subset'].replace('Only','c) Only')
#-------------------------------------------------------------------------------
'''
def calcWordsPerVerseLine(r):
return float(r['#words']) / float(r['#v_lines'])
def calcWordsPerProseLine(r):
return float(r['#words']) / float(r['#p_lines'])
df['#w_per_vl'] = df.apply(lambda row: calcWordsPerVerseLine(row), axis=1)
df['#w_per_pl'] = df.apply(lambda row: calcWordsPerProseLine(row), axis=1)
'''
#-------------------------------------------------------------------------------
# Pairs Plots ...
'''
def pmi(dff, x, y):
df = dff.copy()
df['f_x'] = df.groupby(x)[x].transform('count')
df['f_y'] = df.groupby(y)[y].transform('count')
df['f_xy'] = df.groupby([x, y])[x].transform('count')
df['pmi'] = np.log(len(df.index) * df['f_xy'] / (df['f_x'] * df['f_y']) )
return df
df_ = df.copy()
df_ = pmi(df_, 'Speaker', 'which_rel')
df_['dataset'] = df['Edition'] + ' ' + df['Subset']
what = ['pmi', 'v2p'] # , '#v_words', '#p_words']
cats = ['dataset', 'Scene']
# Set categorical data to numeric ...
for c in cats:
df_[c] = pd.Categorical(df_[c])
df_[c] = df_[c].cat.codes
df_ = df_[ ['Speaker'] + cats + what ]
print df_
import seaborn as sns
sns.set(style="ticks", color_codes=True)
dataset_colors = [ 'royalblue', 'lightcoral', 'blue', 'red', 'slateblue', 'coral']
#sns.pairplot(df_, hue="dataset", palette=dataset_colors) # "husl")
sns.pairplot(df_, hue="Speaker", palette="husl")
plt.show()
exit()
print df_
pairs(df_)
exit()
'''
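# PMI in the disabled block above: pmi(x,y) = log( N * f_xy / (f_x * f_y) ),
# i.e. log( p(x,y) / (p(x) p(y)) ); positive when x and y co-occur more
# often than independence would predict.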
#-------------------------------------------------------------------------------
# X by Y by Z ...
# words, lines, speeches, speakers
# scene, act, speaker
# count, mean, median
def expand(xs):
    # Join with ' and ': ['Edition','Subset'] -> 'Edition and Subset'.
    return ' and '.join(xs)
def processMetric(metric, x,y,z, aggFun):
#title = 'Words by Act'
title = metric + ' for each ' + expand(y) + ' by ' + expand(z)
print title
    df_ = df
    # When comparing editions alone, restrict to the Full-XML subset ...
    if z==['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML']
pt = df_.pivot_table(
values=x
, columns=y
, index=z
, aggfunc=aggFun
, fill_value=0
)
pt = pt.T
filename = title
    bs = y[0]
    # Long category names need vertical tick labels ...
    if bs in ['Scene', 'Speaker'] : rot = 90
    else: rot = 0
mkPlot(filename, title, bs, metric, pt, rot, len(z) )
    # Recompute the table with margins (row/column totals) ...
pt = df_.pivot_table(
values=x
, columns=y
, index=z
, aggfunc=aggFun
, fill_value=0
, margins=True
)
pt = pt.T
print pt
print '='*30
outputTable(filename, pt)
def count_uniq(xs):
    # Number of distinct non-null values.
    return len(xs.dropna().unique())
# Reduced metric set (kept for quick experiments; not used below) ...
metrics0 = [ \
  ('Number of Prose Words','#p_words', np.sum)
  #, ('Mean entropy of speeches', 'ent', np.mean)
  #, ('Median entropy of speeches', 'ent', np.median)
  ]
metrics = \
[ ('Mean of verse words per line minus prose words per line', 'v2p', np.mean)
, ('Median of verse words per line minus prose words per line', 'v2p', np.median)
, ('Number of Words','#words', np.sum)
, ('Number of Prose Words','#p_words', np.sum)
, ('Number of Verse Words','#v_words', np.sum)
, ('Number of Prose Lines','#p_lines', np.sum)
, ('Number of Verse Lines','#v_lines', np.sum)
, ('Mean Words per Speech','#words', np.mean)
, ('Median Words per Speech','#words', np.median)
#, ('Number of Prose Lines','#p_lines', np.sum)
, ('Mean Prose Lines per Speech','#p_lines', np.mean)
, ('Median Prose Lines per Speech','#p_lines', np.median)
#, ('Number of Verse Lines','#v_lines', np.sum)
, ('Mean Verse Lines per Speech','#v_lines', np.mean)
, ('Median Verse Lines per Speech','#v_lines', np.median)
, ('Number of Speeches','sp#', len)
, ('Number of Speakers','who_', count_uniq)
#, ('Number of words per prose line', '#w_per_pl', np.sum)
, ('Mean number of words per prose line', '#w_per_pl', np.mean)
, ('Median number of words per prose line', '#w_per_pl', np.median)
#, ('Number of words per verse line', '#w_per_vl', np.sum)
, ('Mean number of words per verse line', '#w_per_vl', np.mean)
, ('Median number of words per verse line', '#w_per_vl', np.median)
, ('Mean entropy of speeches', 'ent', np.mean)
, ('Median entropy of speeches', 'ent', np.median)
]
if doMetrics:
for x in metrics:
for y in ['Act', 'Scene', 'Speaker']:
for z in [['Edition']]: # , ['Edition','Subset']]:
                t,x_, aggFun = x
                # Counting distinct speakers per Speaker is degenerate; skip it ...
                if not (y=='Speaker' and x_=='who_'):
                    processMetric(t , [x_] , [y] , z, aggFun)
#-------------------------------------------------------------------------------
# TKL words by Act/Scene ...
def processTKLWord(w, div, subtype):
title = w + ' for each ' + div + ' by ' + expand(subtype)
print title
    df_ = df
    # When comparing editions alone, restrict to the Full-XML subset ...
    if subtype==['Edition']:
        df_ = df_[ df_['Subset'] == ' Full-XML']
    # Total word counts: the denominator for the proportions below ...
    tot = df_.pivot_table(
values= ['#words']
, columns=[div]
, index=subtype
, aggfunc=np.sum
, fill_value=0
)
    # (Disabled) Make proportions relative to full-xml counts for each edition ...
    if 0:
if not subtype==['Edition']:
a = np.array([np.array(tot)[0]]).T
a = np.append(a,a, axis=1)
a = np.append(a,a, axis=1)
a = a.T
a = pd.DataFrame(a[:3])
b = np.array([np.array(tot)[3]]).T
b = np.append(b,b, axis=1)
b = np.append(b,b, axis=1)
b = b.T
b = pd.DataFrame(b[:3])
m = np.append(a,b, axis=0)
tot[tot.columns] = m
print tot
exit()
    # (Disabled) Make proportions relative to the mean of full-xml counts per edition ...
    if 0:
if not subtype==['Edition']:
df_F = df[ (df.Edtn == 'F') & (df.Subset== ' Full-XML') ]
num_f_words = np.sum(df_F['#words'])
df_Q = df[ (df.Edtn[:1] == ' ') & (df.Subset== ' Full-XML') ]
print df_Q
exit()
num_q_words = np.sum(df_Q['#words'])
mean_num_words = (num_f_words + num_q_words) / 2
tot = tot*0 + mean_num_words
#print tot
#exit()
pt = df_.pivot_table(
values= [w]
, columns=[div]
, index=subtype
, aggfunc=np.sum
, fill_value=0
)
print pt
print tot
    # Express counts as a percentage of the group's total word tokens ...
    prop_pt = (pt / tot.values) * 100
    prop_pt = prop_pt.T
    print prop_pt
filename = title
title = 'Proportion of \'' + w + '\' for each ' + div + ' by ' + expand(subtype)
mkPlot(filename, title, div, 'Proportion of Word Tokens', prop_pt, 0 , len(subtype))
pt.columns = pt.columns.set_levels(['count'], level=0)
outputTable(filename+'_count', pt.T)
tot.columns = tot.columns.set_levels(['token group total'], level=0)
outputTable(filename+'_tot', tot.T)
prop_pt = prop_pt.T
prop_pt.columns = prop_pt.columns.set_levels(['proportion'], level=0)
prop_pt = prop_pt.T
outputTable(filename+'_prop', prop_pt)
if doTKL:
for div in ['Act', 'Scene']:
#for subtype in [['Edition'], ['Edition', 'Subset']]:
for subtype in [['Edition', 'Subset']]:
#for w in ['which_rel']:
for w in all_tkl_word_tokens:
processTKLWord(w, div, subtype)
#-------------------------------------------------------------------------------
# Calc does to doth Ratio
title = 'Ratio of Does to Doth by Act'
print title
#df['doesDothRatio'] = df['does'].astype(float) / df['doth'].astype(float)
pt_does = df.pivot_table(
values=['does']
, columns=['Act']
, index=['Edition']
, aggfunc=np.sum
, fill_value=0
)
pt_doth = df.pivot_table(
values=['doth']
, columns=['Act']
, index=['Edition']
, aggfunc=np.sum
, fill_value=0
)
pt = pt_does / pt_doth.values
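# NB: numpy division semantics apply here: acts with zero 'doth' tokens
# yield inf (or NaN for 0/0) in the ratio table.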
pt = pt.T
print pt
filename = title
mkPlot(filename, title, 'Act', 'Ratio', pt, 0, 1)
outputTable(filename, pt)
#-------------------------------------------------------------------------------
# Heatmaps
import seaborn as sns; sns.set()
sns.set(font_scale = .8)
# Number of words ...
def hmap(filename, title, xlab, ylab, pt, pal):
ax = plt.axes()
sns.heatmap(pt, ax = ax, center=0.0, annot=True, annot_kws={'size':5}
, fmt='.1f', linewidths=0.4, cmap=pal)
    # Strip the leading value-name prefix (e.g. '#words-') from the y tick labels ...
    labels = [item.get_text()[7:] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
ax.set_title(title, fontsize=12)
plt.xlabel(xlab, fontsize=10)
plt.ylabel(ylab, fontsize=10)
plt.tight_layout()
filename = filename.replace(' ', '_')
plt.savefig(play+'_data/'+filename+'.png')
plt.close()
return pt
def procHeatMap(ed, df):
title = 'Number of words'+' '+ed
print title
pt = df.pivot_table(
values=['#words']
, columns=['Speaker']
, index=['Scene']
, aggfunc=np.sum
, fill_value=0
)
#pt = df.pivot('Speaker', 'Scene', '#words')
pt = pt.T
#print pt
#exit()
filename = title
    # Plotting can fail on degenerate tables; still write the table below ...
    try: hmap(filename, title, 'Scene', 'Speaker', pt, 'YlOrRd')
    except: pass
outputTable(filename, pt)
pt = pd.DataFrame(pt)
#pt.columns = pt.columns.droplevel()
return pt
folio = df['Edtn'] == 'F'
df_F = df[folio]
pt_F = procHeatMap('F', df_F)
print pt_F
quarto = df['Edtn'] != 'F'
df_Q = df[quarto]
pt_Q = procHeatMap('Q', df_Q)
print pt_Q
pt = pt_Q - pt_F
#pt[17] = pt_Q[17]
print pt
#exit()
# Blank out cells where both editions are zero: set Q+F == 0 cells to NaN
# (left unpainted by the heatmap), everything else to 1, then multiply ...
a = pt_Q + pt_F
#a[17] = pt_Q[17]
a[ a==0 ] = np.nan
a[ a>0 ] = 1.
pt = pt * a
print pt
title = 'Number of words Q - F'
filename = title
# As above: skip the plot on failure but still write the table ...
try: hmap(filename, title, 'Scene', 'Speaker', pt, 'RdBu_r')
except: pass
outputTable(filename, pt)